From a33c26aa394fdc89068f66270a76313f5d1b96af Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Mon, 27 Oct 2025 14:49:14 +0100 Subject: [PATCH 01/64] Add SSE4.2 implementation --- cpp/src/arrow/util/bpacking_benchmark.cc | 7 +++++++ cpp/src/arrow/util/bpacking_simd_default.cc | 16 +++++++++++++- cpp/src/arrow/util/bpacking_simd_internal.h | 23 +++++++++++++++++++++ cpp/src/arrow/util/bpacking_test.cc | 8 +++++++ 4 files changed, 53 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/util/bpacking_benchmark.cc b/cpp/src/arrow/util/bpacking_benchmark.cc index 7094656eeef..8508a5b715c 100644 --- a/cpp/src/arrow/util/bpacking_benchmark.cc +++ b/cpp/src/arrow/util/bpacking_benchmark.cc @@ -173,6 +173,13 @@ BENCHMARK_CAPTURE(BM_UnpackUint32, NeonUnaligned, false, &unpack_neon) ->ArgsProduct(kBitWidthsNumValues32); BENCHMARK_CAPTURE(BM_UnpackUint64, NeonUnaligned, false, &unpack_neon) ->ArgsProduct(kBitWidthsNumValues64); +#elif defined(ARROW_HAVE_SSE4_2) +BENCHMARK_CAPTURE(BM_UnpackUint16, Sse42Unaligned, false, &unpack_sse4_2) + ->ArgsProduct(kBitWidthsNumValues16); +BENCHMARK_CAPTURE(BM_UnpackUint32, Sse42Unaligned, false, &unpack_sse4_2) + ->ArgsProduct(kBitWidthsNumValues32); +BENCHMARK_CAPTURE(BM_UnpackUint64, Sse42Unaligned, false, &unpack_sse4_2) + ->ArgsProduct(kBitWidthsNumValues64); #endif BENCHMARK_CAPTURE(BM_UnpackUint16, DynamicAligned, true, &unpack) diff --git a/cpp/src/arrow/util/bpacking_simd_default.cc b/cpp/src/arrow/util/bpacking_simd_default.cc index e9af823807d..691d43f783b 100644 --- a/cpp/src/arrow/util/bpacking_simd_default.cc +++ b/cpp/src/arrow/util/bpacking_simd_default.cc @@ -16,7 +16,7 @@ // under the License. 
#include "arrow/util/bpacking_dispatch_internal.h" -#if defined(ARROW_HAVE_NEON) +#if defined(ARROW_HAVE_NEON) || defined(ARROW_HAVE_SSE4_2) # include "arrow/util/bpacking_simd128_generated_internal.h" #endif #include "arrow/util/bpacking_simd_internal.h" @@ -37,6 +37,20 @@ template void unpack_neon(const uint8_t*, uint16_t*, int, int, int); template void unpack_neon(const uint8_t*, uint32_t*, int, int, int); template void unpack_neon(const uint8_t*, uint64_t*, int, int, int); +#elif defined(ARROW_HAVE_SSE4_2) + +template +void unpack_sse4_2(const uint8_t* in, Uint* out, int batch_size, int num_bits, + int bit_offset) { + return unpack_jump(in, out, batch_size, num_bits, bit_offset); +} + +template void unpack_sse4_2(const uint8_t*, bool*, int, int, int); +template void unpack_sse4_2(const uint8_t*, uint8_t*, int, int, int); +template void unpack_sse4_2(const uint8_t*, uint16_t*, int, int, int); +template void unpack_sse4_2(const uint8_t*, uint32_t*, int, int, int); +template void unpack_sse4_2(const uint8_t*, uint64_t*, int, int, int); + #endif } // namespace arrow::internal diff --git a/cpp/src/arrow/util/bpacking_simd_internal.h b/cpp/src/arrow/util/bpacking_simd_internal.h index 755b001140b..e9fafabfcc7 100644 --- a/cpp/src/arrow/util/bpacking_simd_internal.h +++ b/cpp/src/arrow/util/bpacking_simd_internal.h @@ -45,6 +45,29 @@ extern template ARROW_TEMPLATE_EXPORT void unpack_neon( extern template ARROW_TEMPLATE_EXPORT void unpack_neon( const uint8_t* in, uint64_t* out, int batch_size, int num_bits, int bit_offset); +#elif defined(ARROW_HAVE_SSE4_2) + +template +ARROW_EXPORT void unpack_sse4_2(const uint8_t* in, Uint* out, int batch_size, + int num_bits, int bit_offset); + +extern template ARROW_TEMPLATE_EXPORT void unpack_sse4_2(const uint8_t* in, + bool* out, int batch_size, + int num_bits, + int bit_offset); + +extern template ARROW_TEMPLATE_EXPORT void unpack_sse4_2( + const uint8_t* in, uint8_t* out, int batch_size, int num_bits, int bit_offset); + +extern 
template ARROW_TEMPLATE_EXPORT void unpack_sse4_2( + const uint8_t* in, uint16_t* out, int batch_size, int num_bits, int bit_offset); + +extern template ARROW_TEMPLATE_EXPORT void unpack_sse4_2( + const uint8_t* in, uint32_t* out, int batch_size, int num_bits, int bit_offset); + +extern template ARROW_TEMPLATE_EXPORT void unpack_sse4_2( + const uint8_t* in, uint64_t* out, int batch_size, int num_bits, int bit_offset); + #endif #if defined(ARROW_HAVE_AVX2) || defined(ARROW_HAVE_RUNTIME_AVX2) diff --git a/cpp/src/arrow/util/bpacking_test.cc b/cpp/src/arrow/util/bpacking_test.cc index a3ab4d6dda2..7d487c35d15 100644 --- a/cpp/src/arrow/util/bpacking_test.cc +++ b/cpp/src/arrow/util/bpacking_test.cc @@ -247,6 +247,14 @@ TEST_P(TestUnpack, Unpack16Scalar) { this->TestAll(&unpack_scalar); } TEST_P(TestUnpack, Unpack32Scalar) { this->TestAll(&unpack_scalar); } TEST_P(TestUnpack, Unpack64Scalar) { this->TestAll(&unpack_scalar); } +#if defined(ARROW_HAVE_SSE4_2) +TEST_P(TestUnpack, UnpackBoolSse4_2) { this->TestAll(&unpack_sse4_2); } +TEST_P(TestUnpack, Unpack8Sse4_2) { this->TestAll(&unpack_sse4_2); } +TEST_P(TestUnpack, Unpack16Sse4_2) { this->TestAll(&unpack_sse4_2); } +TEST_P(TestUnpack, Unpack32Sse4_2) { this->TestAll(&unpack_sse4_2); } +TEST_P(TestUnpack, Unpack64Sse4_2) { this->TestAll(&unpack_sse4_2); } +#endif + #if defined(ARROW_HAVE_RUNTIME_AVX2) TEST_P(TestUnpack, UnpackBoolAvx2) { if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2)) { From 09e98351fb651ceaf86d8ee0af5ac527514d33ef Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Tue, 28 Oct 2025 16:20:44 +0100 Subject: [PATCH 02/64] Add unpack uint8_t benchmark --- cpp/src/arrow/util/bpacking_benchmark.cc | 45 ++++++++++++++++++++---- 1 file changed, 38 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_benchmark.cc b/cpp/src/arrow/util/bpacking_benchmark.cc index 8508a5b715c..f5881bec078 100644 --- a/cpp/src/arrow/util/bpacking_benchmark.cc +++ b/cpp/src/arrow/util/bpacking_benchmark.cc 
@@ -97,9 +97,14 @@ void BM_Unpack(benchmark::State& state, bool aligned, UnpackFunc unpack, bo constexpr int32_t kMinRange = 64; constexpr int32_t kMaxRange = 32768; +constexpr std::initializer_list kBitWidths8 = {1, 2, 8}; constexpr std::initializer_list kBitWidths16 = {1, 2, 8, 13}; constexpr std::initializer_list kBitWidths32 = {1, 2, 8, 20}; constexpr std::initializer_list kBitWidths64 = {1, 2, 8, 20, 47}; +static const std::vector> kBitWidthsNumValues8 = { + kBitWidths8, + benchmark::CreateRange(kMinRange, kMaxRange, /*multi=*/32), +}; static const std::vector> kBitWidthsNumValues16 = { kBitWidths16, benchmark::CreateRange(kMinRange, kMaxRange, /*multi=*/32), @@ -113,6 +118,11 @@ static const std::vector> kBitWidthsNumValues64 = { benchmark::CreateRange(kMinRange, kMaxRange, /*multi=*/32), }; +/// Nudge for MSVC template inside BENCHMARK_CAPTURE macro. +void BM_UnpackUint8(benchmark::State& state, bool aligned, UnpackFunc unpack, + bool skip = false, std::string skip_msg = "") { + return BM_Unpack(state, aligned, unpack, skip, std::move(skip_msg)); +} /// Nudge for MSVC template inside BENCHMARK_CAPTURE macro. 
void BM_UnpackUint16(benchmark::State& state, bool aligned, UnpackFunc unpack, bool skip = false, std::string skip_msg = "") { @@ -129,6 +139,8 @@ void BM_UnpackUint64(benchmark::State& state, bool aligned, UnpackFunc return BM_Unpack(state, aligned, unpack, skip, std::move(skip_msg)); } +BENCHMARK_CAPTURE(BM_UnpackUint8, ScalarUnaligned, false, &unpack_scalar) + ->ArgsProduct(kBitWidthsNumValues8); BENCHMARK_CAPTURE(BM_UnpackUint16, ScalarUnaligned, false, &unpack_scalar) ->ArgsProduct(kBitWidthsNumValues16); BENCHMARK_CAPTURE(BM_UnpackUint32, ScalarUnaligned, false, &unpack_scalar) @@ -136,7 +148,22 @@ BENCHMARK_CAPTURE(BM_UnpackUint32, ScalarUnaligned, false, &unpack_scalar) ->ArgsProduct(kBitWidthsNumValues64); +#if defined(ARROW_HAVE_SSE4_2) +BENCHMARK_CAPTURE(BM_UnpackUint8, Sse42Unaligned, false, &unpack_sse4_2) + ->ArgsProduct(kBitWidthsNumValues8); +BENCHMARK_CAPTURE(BM_UnpackUint16, Sse42Unaligned, false, &unpack_sse4_2) + ->ArgsProduct(kBitWidthsNumValues16); +BENCHMARK_CAPTURE(BM_UnpackUint32, Sse42Unaligned, false, &unpack_sse4_2) + ->ArgsProduct(kBitWidthsNumValues32); +BENCHMARK_CAPTURE(BM_UnpackUint64, Sse42Unaligned, false, &unpack_sse4_2) + ->ArgsProduct(kBitWidthsNumValues64); +#endif + #if defined(ARROW_HAVE_RUNTIME_AVX2) +BENCHMARK_CAPTURE(BM_UnpackUint8, Avx2Unaligned, false, &unpack_avx2, + !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2), + "Avx2 not available") + ->ArgsProduct(kBitWidthsNumValues8); BENCHMARK_CAPTURE(BM_UnpackUint16, Avx2Unaligned, false, &unpack_avx2, !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2), "Avx2 not available") @@ -152,6 +179,10 @@ BENCHMARK_CAPTURE(BM_UnpackUint64, Avx2Unaligned, false, &unpack_avx2, #endif #if defined(ARROW_HAVE_RUNTIME_AVX512) +BENCHMARK_CAPTURE(BM_UnpackUint8, Avx512Unaligned, false, &unpack_avx512, + !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512), + "Avx512 not available") + ->ArgsProduct(kBitWidthsNumValues8); BENCHMARK_CAPTURE(BM_UnpackUint16, Avx512Unaligned, false, 
&unpack_avx512, !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512), "Avx512 not available") @@ -167,21 +198,21 @@ BENCHMARK_CAPTURE(BM_UnpackUint64, Avx512Unaligned, false, &unpack_avx512) + ->ArgsProduct(kBitWidthsNumValues8); BENCHMARK_CAPTURE(BM_UnpackUint16, NeonUnaligned, false, &unpack_neon) ->ArgsProduct(kBitWidthsNumValues16); BENCHMARK_CAPTURE(BM_UnpackUint32, NeonUnaligned, false, &unpack_neon) ->ArgsProduct(kBitWidthsNumValues32); BENCHMARK_CAPTURE(BM_UnpackUint64, NeonUnaligned, false, &unpack_neon) ->ArgsProduct(kBitWidthsNumValues64); -#elif defined(ARROW_HAVE_SSE4_2) -BENCHMARK_CAPTURE(BM_UnpackUint16, Sse42Unaligned, false, &unpack_sse4_2) - ->ArgsProduct(kBitWidthsNumValues16); -BENCHMARK_CAPTURE(BM_UnpackUint32, Sse42Unaligned, false, &unpack_sse4_2) - ->ArgsProduct(kBitWidthsNumValues32); -BENCHMARK_CAPTURE(BM_UnpackUint64, Sse42Unaligned, false, &unpack_sse4_2) - ->ArgsProduct(kBitWidthsNumValues64); #endif +BENCHMARK_CAPTURE(BM_UnpackUint8, DynamicAligned, true, &unpack) + ->ArgsProduct(kBitWidthsNumValues8); +BENCHMARK_CAPTURE(BM_UnpackUint8, DynamicUnaligned, false, &unpack) + ->ArgsProduct(kBitWidthsNumValues8); + BENCHMARK_CAPTURE(BM_UnpackUint16, DynamicAligned, true, &unpack) ->ArgsProduct(kBitWidthsNumValues16); BENCHMARK_CAPTURE(BM_UnpackUint16, DynamicUnaligned, false, &unpack) From af24a60100136b0cff3b0b9da0836b58c3f45283 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 30 Oct 2025 16:46:20 +0100 Subject: [PATCH 03/64] Add bool unpack benchmark --- cpp/src/arrow/util/bpacking_benchmark.cc | 34 ++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_benchmark.cc b/cpp/src/arrow/util/bpacking_benchmark.cc index f5881bec078..39dd60f18e1 100644 --- a/cpp/src/arrow/util/bpacking_benchmark.cc +++ b/cpp/src/arrow/util/bpacking_benchmark.cc @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. 
+#include #include #include @@ -86,10 +87,10 @@ void BM_Unpack(benchmark::State& state, bool aligned, UnpackFunc unpack, bo const uint8_t* packed_ptr = GetNextAlignedByte(packed.data(), sizeof(Int)) + (aligned ? 0 : 1); - std::vector unpacked(num_values, 0); + auto unpacked = std::make_unique(num_values); for (auto _ : state) { - unpack(packed_ptr, unpacked.data(), num_values, bit_width, /* bit_offset = */ 0); + unpack(packed_ptr, unpacked.get(), num_values, bit_width, /* bit_offset = */ 0); benchmark::ClobberMemory(); } state.SetItemsProcessed(num_values * state.iterations()); @@ -101,6 +102,11 @@ constexpr std::initializer_list kBitWidths8 = {1, 2, 8}; constexpr std::initializer_list kBitWidths16 = {1, 2, 8, 13}; constexpr std::initializer_list kBitWidths32 = {1, 2, 8, 20}; constexpr std::initializer_list kBitWidths64 = {1, 2, 8, 20, 47}; + +static const std::vector> kBitWidthsNumValuesBool = { + {0, 1}, + benchmark::CreateRange(kMinRange, kMaxRange, /*multi=*/32), +}; static const std::vector> kBitWidthsNumValues8 = { kBitWidths8, benchmark::CreateRange(kMinRange, kMaxRange, /*multi=*/32), @@ -118,6 +124,11 @@ static const std::vector> kBitWidthsNumValues64 = { benchmark::CreateRange(kMinRange, kMaxRange, /*multi=*/32), }; +/// Nudge for MSVC template inside BENCHMARK_CAPTURE macro. +void BM_UnpackBool(benchmark::State& state, bool aligned, UnpackFunc unpack, + bool skip = false, std::string skip_msg = "") { + return BM_Unpack(state, aligned, unpack, skip, std::move(skip_msg)); +} /// Nudge for MSVC template inside BENCHMARK_CAPTURE macro. 
void BM_UnpackUint8(benchmark::State& state, bool aligned, UnpackFunc unpack, bool skip = false, std::string skip_msg = "") { @@ -139,6 +150,8 @@ void BM_UnpackUint64(benchmark::State& state, bool aligned, UnpackFunc return BM_Unpack(state, aligned, unpack, skip, std::move(skip_msg)); } +BENCHMARK_CAPTURE(BM_UnpackBool, ScalarUnaligned, false, &unpack_scalar) + ->ArgsProduct(kBitWidthsNumValuesBool); BENCHMARK_CAPTURE(BM_UnpackUint8, ScalarUnaligned, false, &unpack_scalar) ->ArgsProduct(kBitWidthsNumValues8); BENCHMARK_CAPTURE(BM_UnpackUint16, ScalarUnaligned, false, &unpack_scalar) @@ -149,6 +162,8 @@ BENCHMARK_CAPTURE(BM_UnpackUint64, ScalarUnaligned, false, &unpack_scalarArgsProduct(kBitWidthsNumValues64); #if defined(ARROW_HAVE_SSE4_2) +BENCHMARK_CAPTURE(BM_UnpackBool, Sse42Unaligned, false, &unpack_sse4_2) + ->ArgsProduct(kBitWidthsNumValuesBool); BENCHMARK_CAPTURE(BM_UnpackUint8, Sse42Unaligned, false, &unpack_sse4_2) ->ArgsProduct(kBitWidthsNumValues8); BENCHMARK_CAPTURE(BM_UnpackUint16, Sse42Unaligned, false, &unpack_sse4_2) @@ -160,6 +175,10 @@ BENCHMARK_CAPTURE(BM_UnpackUint64, Sse42Unaligned, false, &unpack_sse4_2, + !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2), + "Avx2 not available") + ->ArgsProduct(kBitWidthsNumValuesBool); BENCHMARK_CAPTURE(BM_UnpackUint8, Avx2Unaligned, false, &unpack_avx2, !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2), "Avx2 not available") @@ -179,6 +198,10 @@ BENCHMARK_CAPTURE(BM_UnpackUint64, Avx2Unaligned, false, &unpack_avx2, #endif #if defined(ARROW_HAVE_RUNTIME_AVX512) +BENCHMARK_CAPTURE(BM_UnpackBool, Avx512Unaligned, false, &unpack_avx512, + !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512), + "Avx512 not available") + ->ArgsProduct(kBitWidthsNumValuesBool); BENCHMARK_CAPTURE(BM_UnpackUint8, Avx512Unaligned, false, &unpack_avx512, !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512), "Avx512 not available") @@ -198,6 +221,8 @@ BENCHMARK_CAPTURE(BM_UnpackUint64, Avx512Unaligned, false, &unpack_avx512) + 
->ArgsProduct(kBitWidthsNumValuesBool); BENCHMARK_CAPTURE(BM_UnpackUint8, NeonUnaligned, false, &unpack_neon) ->ArgsProduct(kBitWidthsNumValues8); BENCHMARK_CAPTURE(BM_UnpackUint16, NeonUnaligned, false, &unpack_neon) @@ -208,6 +233,11 @@ BENCHMARK_CAPTURE(BM_UnpackUint64, NeonUnaligned, false, &unpack_neon) ->ArgsProduct(kBitWidthsNumValues64); #endif +BENCHMARK_CAPTURE(BM_UnpackBool, DynamicAligned, true, &unpack) + ->ArgsProduct(kBitWidthsNumValuesBool); +BENCHMARK_CAPTURE(BM_UnpackBool, DynamicUnaligned, false, &unpack) + ->ArgsProduct(kBitWidthsNumValuesBool); + BENCHMARK_CAPTURE(BM_UnpackUint8, DynamicAligned, true, &unpack) ->ArgsProduct(kBitWidthsNumValues8); BENCHMARK_CAPTURE(BM_UnpackUint8, DynamicUnaligned, false, &unpack) From 2caf2e11ad127bc32f5ad2718f8fbf041fb10c68 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Tue, 25 Nov 2025 10:42:28 +0100 Subject: [PATCH 04/64] Bias benchmarks toward small scale --- cpp/src/arrow/util/bpacking_benchmark.cc | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_benchmark.cc b/cpp/src/arrow/util/bpacking_benchmark.cc index 39dd60f18e1..e053226fbfb 100644 --- a/cpp/src/arrow/util/bpacking_benchmark.cc +++ b/cpp/src/arrow/util/bpacking_benchmark.cc @@ -96,32 +96,32 @@ void BM_Unpack(benchmark::State& state, bool aligned, UnpackFunc unpack, bo state.SetItemsProcessed(num_values * state.iterations()); } -constexpr int32_t kMinRange = 64; -constexpr int32_t kMaxRange = 32768; +/// Currently, the minimum unpack SIMD kernel size is 32 and the bit packing encoder will +/// not emit runs larger than 512 (though other implementation might), so we biased the +/// benchmarks towards a rather small scale. 
+static const auto kNumValuesRange = benchmark::CreateRange(32, 512, 2); constexpr std::initializer_list kBitWidths8 = {1, 2, 8}; constexpr std::initializer_list kBitWidths16 = {1, 2, 8, 13}; constexpr std::initializer_list kBitWidths32 = {1, 2, 8, 20}; constexpr std::initializer_list kBitWidths64 = {1, 2, 8, 20, 47}; static const std::vector> kBitWidthsNumValuesBool = { - {0, 1}, - benchmark::CreateRange(kMinRange, kMaxRange, /*multi=*/32), -}; + {0, 1}, kNumValuesRange}; static const std::vector> kBitWidthsNumValues8 = { kBitWidths8, - benchmark::CreateRange(kMinRange, kMaxRange, /*multi=*/32), + kNumValuesRange, }; static const std::vector> kBitWidthsNumValues16 = { kBitWidths16, - benchmark::CreateRange(kMinRange, kMaxRange, /*multi=*/32), + kNumValuesRange, }; static const std::vector> kBitWidthsNumValues32 = { kBitWidths32, - benchmark::CreateRange(kMinRange, kMaxRange, /*multi=*/32), + kNumValuesRange, }; static const std::vector> kBitWidthsNumValues64 = { kBitWidths64, - benchmark::CreateRange(kMinRange, kMaxRange, /*multi=*/32), + kNumValuesRange, }; /// Nudge for MSVC template inside BENCHMARK_CAPTURE macro. From a9789b897369210f51a5cac0c968ac479dd6a7ff Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Mon, 20 Oct 2025 16:31:10 +0200 Subject: [PATCH 05/64] Add Kernel plan builder --- .../arrow/util/bpacking_simd_impl_internal.h | 232 ++++++++++++++++++ 1 file changed, 232 insertions(+) create mode 100644 cpp/src/arrow/util/bpacking_simd_impl_internal.h diff --git a/cpp/src/arrow/util/bpacking_simd_impl_internal.h b/cpp/src/arrow/util/bpacking_simd_impl_internal.h new file mode 100644 index 00000000000..298b965999a --- /dev/null +++ b/cpp/src/arrow/util/bpacking_simd_impl_internal.h @@ -0,0 +1,232 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include + +#include + +#include "arrow/util/bit_util.h" + +namespace arrow::internal { + +// https://github.com/fast-pack/LittleIntPacker/blob/master/src/horizontalpacking32.c +// TODO +// - No zero and full size unpack here +// - _mm_cvtepi8_epi32 +// - var rshifts no avail on SSE +// - no need for while loop (for up to 8 is sufficient) +// - no need for the top functions +// - Shifts per swizzle can be improved when self.packed_max_byte_spread == 1 and the +// byte can be reused (when val_bit_width divides packed_max_byte_spread). + +/// Compute the maximum spread in bytes that a packed integer can cover. +/// +/// This is assuming contiguous packed integer starting on a byte aligned boundary. +/// This function is non-monotonic, for instance three bit integers will be split on the +/// first byte boundary (hence having a spread of two bytes) while four bit integer will +/// be well behaved and never spread over byte boundary (hence having a spread of one). +constexpr int PackedMaxSpreadBytes(int width) { + int max = static_cast(bit_util::BytesForBits(width)); + int start = width; + while (start % 8 != 0) { + const int byte_start = start / 8; + const int byte_end = (start + width - 1) / 8; // inclusive end bit + const int spread = byte_end - byte_start + 1; + max = spread > max ? 
spread : max; + start += width; + } + return max; +} + +struct KernelShape { + const int simd_bit_size_; + const int unpacked_bit_size_; + const int packed_bit_size_; + const int packed_max_spread_bytes_ = PackedMaxSpreadBytes(packed_bit_size_); + + /// Properties of an SIMD batch + constexpr int simd_bit_size() const { return simd_bit_size_; } + constexpr int simd_byte_size() const { return simd_bit_size_ / 8; } + + /// Properties of the unpacked integers + constexpr int unpacked_bit_size() const { return unpacked_bit_size_; } + constexpr int unpacked_byte_size() const { return unpacked_bit_size_ / 8; } + constexpr int unpacked_per_simd() const { return simd_bit_size_ / unpacked_bit_size_; } + + /// Properties of the packed integers + constexpr int packed_bit_size() const { return packed_bit_size_; } + constexpr int packed_max_spread_bytes() const { return packed_max_spread_bytes_; } +}; + +template +struct KernelTraits { + static constexpr KernelShape kShape = { + /* .simd_bit_size_= */ kSimdBitSize, + /* .unpacked_bit_size= */ 8 * sizeof(UnpackedUint), + /* .packed_bit_size_= */ kPackedBitSize, + }; + + using unpacked_type = UnpackedUint; + using simd_batch = xsimd::make_sized_batch_t; + using simd_bytes = xsimd::make_sized_batch_t; + using arch_type = typename simd_batch::arch_type; +}; + +struct KernelPlanSize { + int reads_per_kernel_; + int swizzles_per_read_; + int shifts_per_swizzle_; + + constexpr int reads_per_kernel() const { return reads_per_kernel_; } + + constexpr int swizzles_per_read() const { return swizzles_per_read_; } + constexpr int swizzles_per_kernel() const { + return swizzles_per_read_ * reads_per_kernel(); + } + + constexpr int shifts_per_swizzle() const { return shifts_per_swizzle_; } + constexpr int shifts_per_read() const { + return shifts_per_swizzle_ * swizzles_per_read(); + } + constexpr int shifts_per_kernel() const { + return shifts_per_read() * reads_per_kernel(); + } +}; + +constexpr KernelPlanSize BuildPlanSize(const 
KernelShape& shape) { + const int shifts_per_swizzle = + shape.unpacked_byte_size() / shape.packed_max_spread_bytes(); + + const int vals_per_swizzle = shifts_per_swizzle * shape.unpacked_per_simd(); + + const auto swizzles_per_read_for_offset = [&](int bit_offset) -> int { + const int vals_per_simd = + (shape.simd_bit_size() - bit_offset) / shape.packed_bit_size(); + return vals_per_simd / vals_per_swizzle; + }; + + // If after a whole swizzle reading iteration we fall unaligned, the remaining + // iterations will start with an aligned first value, reducing the effective capacity of + // the SIMD batch. + // We must check that our read iteration size still works with subsequent misalignment + // by looping until aligned. + // One may think that using such large reading iterations risks overshooting an aligned + // byte and increasing the total number of values extracted, but in practice reading + // iterations increase by factors of 2 and are quickly multiple of 8 (and aligned after + // the first one). 
+ int swizzles_per_read = swizzles_per_read_for_offset(0); + int reads_per_kernel = 0; + int packed_start_bit = 0; + do { + int new_swizzles_per_read = swizzles_per_read_for_offset(packed_start_bit % 8); + if (new_swizzles_per_read <= swizzles_per_read) { + swizzles_per_read = new_swizzles_per_read; + packed_start_bit = 0; + reads_per_kernel = 0; + } + int bits_per_read = swizzles_per_read * vals_per_swizzle * shape.packed_bit_size(); + packed_start_bit += bits_per_read; + reads_per_kernel += 1; + } while (packed_start_bit % 8 != 0); + + return { + /* .reads_per_kernel_= */ reads_per_kernel, + /* .swizzles_per_read_= */ swizzles_per_read, + /* .shifts_per_swizzle_= */ shifts_per_swizzle, + }; +} + +template +struct KernelPlan { + using Traits = KernelTraits; + static constexpr auto kShape = Traits::kShape; + static constexpr auto kPlanSize = BuildPlanSize(kShape); + + using ReadsPerKernel = std::array; + + using Swizzle = std::array; + using SwizzlesPerRead = std::array; + using SwizzlesPerKernel = std::array; + + using Shift = std::array; + using ShiftsPerSwizzle = std::array; + using ShiftsPerRead = std::array; + using ShitsPerKernel = std::array; + + static constexpr int unpacked_per_shifts() { return kShape.unpacked_per_simd(); } + static constexpr int unpacked_per_swizzle() { + return unpacked_per_shifts() * kPlanSize.shifts_per_swizzle(); + } + static constexpr int unpacked_per_read() { + return unpacked_per_swizzle() * kPlanSize.swizzle_per_read(); + } + static constexpr int unpacked_per_kernel() { + return unpacked_per_read() * kPlanSize.reads_per_kernel(); + } + + ReadsPerKernel reads; + SwizzlesPerKernel swizzles; + ShitsPerKernel shifts; + UnpackedUint mask = bit_util::LeastSignificantBitMask(kPackedBitSize); +}; + +template +constexpr KernelPlan BuildPlan() { + using Plan = KernelPlan; + constexpr auto kShape = Plan::kShape; + constexpr auto kPlanSize = Plan::kPlanSize; + + Plan plan = {}; + + int packed_start_bit = 0; + for (int r = 0; r < 
kPlanSize.reads_per_kernel(); ++r) { + plan.reads.at(r) = packed_start_bit / 8; + packed_start_bit = packed_start_bit % 8; + + for (int sw = 0; sw < kPlanSize.swizzles_per_read(); ++sw) { + for (int sh = 0; sh < kPlanSize.shifts_per_swizzle(); ++sh) { + const int sh_offset_bytes = sh * kShape.packed_max_spread_bytes(); + const int sh_offset_bits = 8 * sh_offset_bytes; + + for (int u = 0; u < kShape.unpacked_per_simd(); ++u) { + const int packed_start_byte = packed_start_bit / 8; + const int u_offset_byte = u * kShape.unpacked_byte_size(); + const int sw_offset_byte = sh_offset_bytes + u_offset_byte; + + // Looping over the multiple bytes needed for current values + for (int b = 0; b < kShape.packed_max_spread_bytes(); ++b) { + plan.swizzles.at(r).at(sw).at(sw_offset_byte + b) = packed_start_byte + b; + } + // Shift is a single value but many packed values may be swizzles to a sing + // unpacked value + plan.shifts.at(r).at(sw).at(sh).at(u) = sh_offset_bits + packed_start_bit % 8; + + packed_start_bit += kShape.packed_bit_size(); + } + } + } + } + + return plan; +} + +} // namespace arrow::internal From 2c4f9f17a68c0b4faef88eec7e65ba36c9245c40 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Fri, 24 Oct 2025 13:11:27 +0200 Subject: [PATCH 06/64] Add simd kernel --- .../arrow/util/bpacking_simd_impl_internal.h | 104 +++++++++++++----- 1 file changed, 79 insertions(+), 25 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_simd_impl_internal.h b/cpp/src/arrow/util/bpacking_simd_impl_internal.h index 298b965999a..97fe42d6f34 100644 --- a/cpp/src/arrow/util/bpacking_simd_impl_internal.h +++ b/cpp/src/arrow/util/bpacking_simd_impl_internal.h @@ -25,6 +25,7 @@ #include #include "arrow/util/bit_util.h" +#include "arrow/util/bpacking_dispatch_internal.h" namespace arrow::internal { @@ -38,30 +39,11 @@ namespace arrow::internal { // - Shifts per swizzle can be improved when self.packed_max_byte_spread == 1 and the // byte can be reused (when val_bit_width divides 
packed_max_byte_spread). -/// Compute the maximum spread in bytes that a packed integer can cover. -/// -/// This is assuming contiguous packed integer starting on a byte aligned boundary. -/// This function is non-monotonic, for instance three bit integers will be split on the -/// first byte boundary (hence having a spread of two bytes) while four bit integer will -/// be well behaved and never spread over byte boundary (hence having a spread of one). -constexpr int PackedMaxSpreadBytes(int width) { - int max = static_cast(bit_util::BytesForBits(width)); - int start = width; - while (start % 8 != 0) { - const int byte_start = start / 8; - const int byte_end = (start + width - 1) / 8; // inclusive end bit - const int spread = byte_end - byte_start + 1; - max = spread > max ? spread : max; - start += width; - } - return max; -} - struct KernelShape { const int simd_bit_size_; const int unpacked_bit_size_; const int packed_bit_size_; - const int packed_max_spread_bytes_ = PackedMaxSpreadBytes(packed_bit_size_); + const int packed_max_spread_bytes_ = PackedMaxSpreadBytes(packed_bit_size_, 0); /// Properties of an SIMD batch constexpr int simd_bit_size() const { return simd_bit_size_; } @@ -138,7 +120,7 @@ constexpr KernelPlanSize BuildPlanSize(const KernelShape& shape) { int packed_start_bit = 0; do { int new_swizzles_per_read = swizzles_per_read_for_offset(packed_start_bit % 8); - if (new_swizzles_per_read <= swizzles_per_read) { + if (new_swizzles_per_read < swizzles_per_read) { swizzles_per_read = new_swizzles_per_read; packed_start_bit = 0; reads_per_kernel = 0; @@ -177,7 +159,7 @@ struct KernelPlan { return unpacked_per_shifts() * kPlanSize.shifts_per_swizzle(); } static constexpr int unpacked_per_read() { - return unpacked_per_swizzle() * kPlanSize.swizzle_per_read(); + return unpacked_per_swizzle() * kPlanSize.swizzles_per_read(); } static constexpr int unpacked_per_kernel() { return unpacked_per_read() * kPlanSize.reads_per_kernel(); @@ -199,8 +181,8 @@ 
constexpr KernelPlan BuildPlan() { int packed_start_bit = 0; for (int r = 0; r < kPlanSize.reads_per_kernel(); ++r) { - plan.reads.at(r) = packed_start_bit / 8; - packed_start_bit = packed_start_bit % 8; + const int read_start_byte = packed_start_bit / 8; + plan.reads.at(r) = read_start_byte; for (int sw = 0; sw < kPlanSize.swizzles_per_read(); ++sw) { for (int sh = 0; sh < kPlanSize.shifts_per_swizzle(); ++sh) { @@ -209,12 +191,13 @@ constexpr KernelPlan BuildPlan() { for (int u = 0; u < kShape.unpacked_per_simd(); ++u) { const int packed_start_byte = packed_start_bit / 8; + const int packed_byte_in_read = packed_start_byte - read_start_byte; const int u_offset_byte = u * kShape.unpacked_byte_size(); const int sw_offset_byte = sh_offset_bytes + u_offset_byte; // Looping over the multiple bytes needed for current values for (int b = 0; b < kShape.packed_max_spread_bytes(); ++b) { - plan.swizzles.at(r).at(sw).at(sw_offset_byte + b) = packed_start_byte + b; + plan.swizzles.at(r).at(sw).at(sw_offset_byte + b) = packed_byte_in_read + b; } // Shift is a single value but many packed values may be swizzles to a sing // unpacked value @@ -229,4 +212,75 @@ constexpr KernelPlan BuildPlan() { return plan; } +template +struct Kernel { + static constexpr auto kPlan = BuildPlan(); + static constexpr auto kPlanSize = kPlan.kPlanSize; + static constexpr auto kShape = kPlan.kShape; + using Traits = typename decltype(kPlan)::Traits; + using unpacked_type = typename Traits::unpacked_type; + using simd_batch = typename Traits::simd_batch; + using simd_bytes = typename Traits::simd_bytes; + using arch_type = typename Traits::arch_type; + + static constexpr int kValuesUnpacked = kPlan.unpacked_per_kernel(); + + template + static void unpack_one_shift_impl(const simd_batch& words, unpacked_type* out) { + struct MakeShifts { + static constexpr unpacked_type get(int i, int n) { + return kPlan.shifts.at(kReadIdx).at(kSwizzleIdx).at(kShiftIdx).at(i); + } + }; + + constexpr auto kShifts = + 
xsimd::make_batch_constant(); + constexpr auto kMask = kPlan.mask; + constexpr auto kOutOffset = (kReadIdx * kPlan.unpacked_per_read() + + kSwizzleIdx * kPlan.unpacked_per_swizzle() + + kShiftIdx * kPlan.unpacked_per_shifts()); + + const auto vals = (words >> kShifts) & kMask; + xsimd::store_unaligned(out + kOutOffset, vals); + } + + template + static void unpack_one_swizzle_impl(const simd_bytes& bytes, unpacked_type* out, + std::integer_sequence) { + struct MakeSwizzles { + static constexpr int get(int i, int n) { + return kPlan.swizzles.at(kReadIdx).at(kSwizzleIdx).at(i); + } + }; + + constexpr auto kSwizzles = + xsimd::make_batch_constant(); + + const auto swizzled = xsimd::swizzle(bytes, kSwizzles); + const auto words = xsimd::bitwise_cast(swizzled); + (unpack_one_shift_impl(words, out), ...); + } + + template + static void unpack_one_read_impl(const uint8_t* in, unpacked_type* out, + std::integer_sequence) { + using ShiftSeq = std::make_integer_sequence; + const auto bytes = simd_bytes::load_unaligned(in + kPlan.reads.at(kReadIdx)); + (unpack_one_swizzle_impl(bytes, out, ShiftSeq{}), ...); + } + + template + static void unpack_all_impl(const uint8_t* in, unpacked_type* out, + std::integer_sequence) { + using SwizzleSeq = std::make_integer_sequence; + (unpack_one_read_impl(in, out, SwizzleSeq{}), ...); + } + + static const uint8_t* unpack(const uint8_t* in, unpacked_type* out) { + using ReadSeq = std::make_integer_sequence; + unpack_all_impl(in, out, ReadSeq{}); + return in + (kPlan.unpacked_per_kernel() * kShape.packed_bit_size()) / 8; + } +}; + } // namespace arrow::internal From d79d399d95c60ed5c3886de2a1e2c23bd77c0250 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Mon, 27 Oct 2025 12:45:07 +0100 Subject: [PATCH 07/64] Handle rshifts on SSE2 --- .../arrow/util/bpacking_simd_impl_internal.h | 64 +++++++++++++++---- 1 file changed, 53 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_simd_impl_internal.h 
b/cpp/src/arrow/util/bpacking_simd_impl_internal.h index 97fe42d6f34..13b6600e5ef 100644 --- a/cpp/src/arrow/util/bpacking_simd_impl_internal.h +++ b/cpp/src/arrow/util/bpacking_simd_impl_internal.h @@ -31,11 +31,9 @@ namespace arrow::internal { // https://github.com/fast-pack/LittleIntPacker/blob/master/src/horizontalpacking32.c // TODO -// - No zero and full size unpack here // - _mm_cvtepi8_epi32 -// - var rshifts no avail on SSE +// - no _mm_srlv_epi32 (128bit) in xsimd with AVX2 required arch // - no need for while loop (for up to 8 is sufficient) -// - no need for the top functions // - Shifts per swizzle can be improved when self.packed_max_byte_spread == 1 and the // byte can be reused (when val_bit_width divides packed_max_byte_spread). @@ -212,6 +210,19 @@ constexpr KernelPlan BuildPlan() { return plan; } +/// Simple constexpr maximum element suited for non empty arrays. +template +constexpr T max_value(const std::array& arr) { + static_assert(N > 0); + T out = 0; + for (const T& v : arr) { + if (v > out) { + out = v; + } + } + return out; +} + template struct Kernel { static constexpr auto kPlan = BuildPlan(); @@ -227,20 +238,51 @@ struct Kernel { template static void unpack_one_shift_impl(const simd_batch& words, unpacked_type* out) { - struct MakeShifts { - static constexpr unpacked_type get(int i, int n) { - return kPlan.shifts.at(kReadIdx).at(kSwizzleIdx).at(kShiftIdx).at(i); - } - }; + static constexpr auto kRightShiftsArr = + kPlan.shifts.at(kReadIdx).at(kSwizzleIdx).at(kShiftIdx); + + constexpr bool kHasSse2 = xsimd::supported_architectures::contains(); + constexpr bool kHasAvx2 = xsimd::supported_architectures::contains(); + + // Intel x86-64 does not have variable right shifts before AVX2. + // Instead, since we know the packed value can safely be left shifted up to the + // maximum already in the batch, we use a multiplication to emulate a left shits, + // followed by a static right shift. 
+ // Trick from Daniel Lemire and Leonid Boytsov, Decoding billions of integers per + // second through vectorization, Software Practice & Experience 45 (1), 2015. + // http://arxiv.org/abs/1209.2137 + simd_batch shifted; + if constexpr (kHasSse2 && !kHasAvx2) { + static constexpr unpacked_type kMaxRightShift = max_value(kRightShiftsArr); + + struct MakeMults { + static constexpr unpacked_type get(int i, int n) { + // Equivalent to left shift of kMaxRightShift - kRightShifts.at(i). + return unpacked_type{1} << (kMaxRightShift - kRightShiftsArr.at(i)); + } + }; + + constexpr auto kMults = + xsimd::make_batch_constant(); + + shifted = (words * kMults) >> kMaxRightShift; + } else { + struct MakeRightShifts { + static constexpr unpacked_type get(int i, int n) { return kRightShiftsArr.at(i); } + }; + + constexpr auto kRightShifts = + xsimd::make_batch_constant(); + + shifted = words >> kRightShifts; + } - constexpr auto kShifts = - xsimd::make_batch_constant(); constexpr auto kMask = kPlan.mask; constexpr auto kOutOffset = (kReadIdx * kPlan.unpacked_per_read() + kSwizzleIdx * kPlan.unpacked_per_swizzle() + kShiftIdx * kPlan.unpacked_per_shifts()); - const auto vals = (words >> kShifts) & kMask; + const auto vals = shifted & kMask; xsimd::store_unaligned(out + kOutOffset, vals); } From 61ab1b2ab3fddc8b04119d6f0de35d59ce61cf9b Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Mon, 27 Oct 2025 14:49:38 +0100 Subject: [PATCH 08/64] Use new kernel when possible in generated 128 code --- .../bpacking_simd128_generated_internal.h | 23689 +--------------- cpp/src/arrow/util/bpacking_simd_codegen.py | 44 +- 2 files changed, 931 insertions(+), 22802 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_simd128_generated_internal.h b/cpp/src/arrow/util/bpacking_simd128_generated_internal.h index c171da11c6f..6421bd30682 100644 --- a/cpp/src/arrow/util/bpacking_simd128_generated_internal.h +++ b/cpp/src/arrow/util/bpacking_simd128_generated_internal.h @@ -27,6 +27,7 @@ #include 
#include "arrow/util/ubsan.h" +#include "arrow/util/bpacking_simd_impl_internal.h" namespace arrow::internal { namespace { @@ -84,23050 +85,1473 @@ struct Simd128UnpackerForWidth { template<> struct Simd128UnpackerForWidth { - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; + using Dispatch = Kernel; - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x1; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 1-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{ 0, 1, 2, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { + return Dispatch::unpack(in, out); + } +}; - // extract 1-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{ 4, 5, 6, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; +template<> +struct Simd128UnpackerForWidth { - // extract 1-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{ 8, 9, 10, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + using Dispatch = Kernel; - // extract 1-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{ 12, 13, 14, 15 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - // 
extract 1-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{ 16, 17, 18, 19 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { + return Dispatch::unpack(in, out); + } +}; - // extract 1-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{ 20, 21, 22, 23 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; +template<> +struct Simd128UnpackerForWidth { - // extract 1-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{ 24, 25, 26, 27 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + using Dispatch = Kernel; - // extract 1-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{ 28, 29, 30, 31 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - in += 1 * 4; - return in; + static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { + return Dispatch::unpack(in, out); } }; template<> -struct Simd128UnpackerForWidth { +struct Simd128UnpackerForWidth { - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; + using Dispatch = Kernel; - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x3; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // 
extract 2-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{ 0, 2, 4, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { + return Dispatch::unpack(in, out); + } +}; - // extract 2-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{ 8, 10, 12, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; +template<> +struct Simd128UnpackerForWidth { - // extract 2-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{ 16, 18, 20, 22 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + using Dispatch = Kernel; - // extract 2-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{ 24, 26, 28, 30 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - // extract 2-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - }; - shifts = simd_batch{ 0, 2, 4, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { + return Dispatch::unpack(in, out); + } +}; - // extract 2-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - }; - shifts = simd_batch{ 8, 10, 12, 
14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; +template<> +struct Simd128UnpackerForWidth { - // extract 2-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - }; - shifts = simd_batch{ 16, 18, 20, 22 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + using Dispatch = Kernel; - // extract 2-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - }; - shifts = simd_batch{ 24, 26, 28, 30 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - in += 2 * 4; - return in; + static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { + return Dispatch::unpack(in, out); } }; template<> -struct Simd128UnpackerForWidth { +struct Simd128UnpackerForWidth { - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; + using Dispatch = Kernel; + + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x7; + return Dispatch::unpack(in, out); + } +}; - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 3-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{ 0, 3, 6, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; +template<> +struct Simd128UnpackerForWidth { - // extract 3-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{ 12, 15, 18, 21 }; - results = (words >> 
shifts) & masks; - results.store_unaligned(out); - out += 4; + using Dispatch = Kernel; - // extract 3-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 30 | SafeLoadAs(in + 4 * 1) << 2, - SafeLoadAs(in + 4 * 1), - }; - shifts = simd_batch{ 24, 27, 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - // extract 3-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - }; - shifts = simd_batch{ 4, 7, 10, 13 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { + return Dispatch::unpack(in, out); + } +}; - // extract 3-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - }; - shifts = simd_batch{ 16, 19, 22, 25 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; +template<> +struct Simd128UnpackerForWidth { - // extract 3-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 31 | SafeLoadAs(in + 4 * 2) << 1, - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - }; - shifts = simd_batch{ 28, 0, 2, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + using Dispatch = Kernel; - // extract 3-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - }; - shifts = simd_batch{ 8, 11, 14, 17 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - // extract 3-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in 
+ 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - }; - shifts = simd_batch{ 20, 23, 26, 29 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { + return Dispatch::unpack(in, out); + } +}; - in += 3 * 4; - return in; +template<> +struct Simd128UnpackerForWidth { + + using Dispatch = Kernel; + + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; + + static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { + return Dispatch::unpack(in, out); } }; template<> -struct Simd128UnpackerForWidth { +struct Simd128UnpackerForWidth { - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; + using Dispatch = Kernel; + + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0xf; + return Dispatch::unpack(in, out); + } +}; - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 4-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), +template<> +struct Simd128UnpackerForWidth { + + using Dispatch = Kernel; + + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; + + static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { + return Dispatch::unpack(in, out); + } +}; + +template<> +struct Simd128UnpackerForWidth { + + using Dispatch = Kernel; + + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; + + static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { + return Dispatch::unpack(in, out); + } +}; + +template<> +struct Simd128UnpackerForWidth { + + using Dispatch = Kernel; + + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; + + static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { + return Dispatch::unpack(in, 
out); + } +}; + +template<> +struct Simd128UnpackerForWidth { + + using Dispatch = Kernel; + + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; + + static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { + return Dispatch::unpack(in, out); + } +}; + +template<> +struct Simd128UnpackerForWidth { + + using Dispatch = Kernel; + + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; + + static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { + return Dispatch::unpack(in, out); + } +}; + +template<> +struct Simd128UnpackerForWidth { + + using Dispatch = Kernel; + + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; + + static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { + return Dispatch::unpack(in, out); + } +}; + +template<> +struct Simd128UnpackerForWidth { + + using Dispatch = Kernel; + + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; + + static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { + return Dispatch::unpack(in, out); + } +}; + +template<> +struct Simd128UnpackerForWidth { + + using Dispatch = Kernel; + + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; + + static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { + return Dispatch::unpack(in, out); + } +}; + +template<> +struct Simd128UnpackerForWidth { + + using Dispatch = Kernel; + + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; + + static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { + return Dispatch::unpack(in, out); + } +}; + +template<> +struct Simd128UnpackerForWidth { + + using Dispatch = Kernel; + + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; + + static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { + return Dispatch::unpack(in, out); + } +}; + +template<> +struct Simd128UnpackerForWidth { + + using Dispatch = Kernel; + + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; + + static const 
uint8_t* unpack(const uint8_t* in, uint32_t* out) { + return Dispatch::unpack(in, out); + } +}; + +template<> +struct Simd128UnpackerForWidth { + + using Dispatch = Kernel; + + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; + + static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { + return Dispatch::unpack(in, out); + } +}; + +template<> +struct Simd128UnpackerForWidth { + + using Dispatch = Kernel; + + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; + + static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { + return Dispatch::unpack(in, out); + } +}; + +template<> +struct Simd128UnpackerForWidth { + + using Dispatch = Kernel; + + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; + + static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { + return Dispatch::unpack(in, out); + } +}; + +template<> +struct Simd128UnpackerForWidth { + + using Dispatch = Kernel; + + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; + + static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { + return Dispatch::unpack(in, out); + } +}; + +template<> +struct Simd128UnpackerForWidth { + + using simd_batch = xsimd::make_sized_batch_t; + static constexpr int kValuesUnpacked = 32; + + static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { + constexpr uint32_t kMask = 0x7ffffff; + + simd_batch masks(kMask); + simd_batch words, shifts; + simd_batch results; + // extract 27-bit bundles 0 to 3 + words = simd_batch{ SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 27 | SafeLoadAs(in + 4 * 1) << 5, + SafeLoadAs(in + 4 * 1) >> 22 | SafeLoadAs(in + 4 * 2) << 10, + SafeLoadAs(in + 4 * 2) >> 17 | SafeLoadAs(in + 4 * 3) << 15, }; - shifts = simd_batch{ 0, 4, 8, 12 }; + shifts = simd_batch{ 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - // extract 4-bit bundles 4 to 7 + // extract 27-bit bundles 4 to 7 words = simd_batch{ - SafeLoadAs(in + 4 
* 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 3) >> 12 | SafeLoadAs(in + 4 * 4) << 20, + SafeLoadAs(in + 4 * 4) >> 7 | SafeLoadAs(in + 4 * 5) << 25, + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 29 | SafeLoadAs(in + 4 * 6) << 3, }; - shifts = simd_batch{ 16, 20, 24, 28 }; + shifts = simd_batch{ 0, 0, 2, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - // extract 4-bit bundles 8 to 11 + // extract 27-bit bundles 8 to 11 words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 6) >> 24 | SafeLoadAs(in + 4 * 7) << 8, + SafeLoadAs(in + 4 * 7) >> 19 | SafeLoadAs(in + 4 * 8) << 13, + SafeLoadAs(in + 4 * 8) >> 14 | SafeLoadAs(in + 4 * 9) << 18, + SafeLoadAs(in + 4 * 9) >> 9 | SafeLoadAs(in + 4 * 10) << 23, }; - shifts = simd_batch{ 0, 4, 8, 12 }; + shifts = simd_batch{ 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - // extract 4-bit bundles 12 to 15 + // extract 27-bit bundles 12 to 15 words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10) >> 31 | SafeLoadAs(in + 4 * 11) << 1, + SafeLoadAs(in + 4 * 11) >> 26 | SafeLoadAs(in + 4 * 12) << 6, + SafeLoadAs(in + 4 * 12) >> 21 | SafeLoadAs(in + 4 * 13) << 11, }; - shifts = simd_batch{ 16, 20, 24, 28 }; + shifts = simd_batch{ 4, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - // extract 4-bit bundles 16 to 19 + // extract 27-bit bundles 16 to 19 words = simd_batch{ - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 13) >> 16 | SafeLoadAs(in + 4 * 14) << 16, + SafeLoadAs(in + 4 * 14) >> 11 | SafeLoadAs(in + 4 * 15) << 21, + SafeLoadAs(in + 4 * 15) >> 6 | 
SafeLoadAs(in + 4 * 16) << 26, + SafeLoadAs(in + 4 * 16), }; - shifts = simd_batch{ 0, 4, 8, 12 }; + shifts = simd_batch{ 0, 0, 0, 1 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - // extract 4-bit bundles 20 to 23 + // extract 27-bit bundles 20 to 23 words = simd_batch{ - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 16) >> 28 | SafeLoadAs(in + 4 * 17) << 4, + SafeLoadAs(in + 4 * 17) >> 23 | SafeLoadAs(in + 4 * 18) << 9, + SafeLoadAs(in + 4 * 18) >> 18 | SafeLoadAs(in + 4 * 19) << 14, + SafeLoadAs(in + 4 * 19) >> 13 | SafeLoadAs(in + 4 * 20) << 19, }; - shifts = simd_batch{ 16, 20, 24, 28 }; + shifts = simd_batch{ 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - // extract 4-bit bundles 24 to 27 + // extract 27-bit bundles 24 to 27 words = simd_batch{ - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 20) >> 8 | SafeLoadAs(in + 4 * 21) << 24, + SafeLoadAs(in + 4 * 21), + SafeLoadAs(in + 4 * 21) >> 30 | SafeLoadAs(in + 4 * 22) << 2, + SafeLoadAs(in + 4 * 22) >> 25 | SafeLoadAs(in + 4 * 23) << 7, }; - shifts = simd_batch{ 0, 4, 8, 12 }; + shifts = simd_batch{ 0, 3, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - // extract 4-bit bundles 28 to 31 + // extract 27-bit bundles 28 to 31 words = simd_batch{ - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 23) >> 20 | SafeLoadAs(in + 4 * 24) << 12, + SafeLoadAs(in + 4 * 24) >> 15 | SafeLoadAs(in + 4 * 25) << 17, + SafeLoadAs(in + 4 * 25) >> 10 | SafeLoadAs(in + 4 * 26) << 22, + SafeLoadAs(in + 4 * 26), }; - shifts = simd_batch{ 16, 20, 24, 28 }; + shifts = simd_batch{ 0, 0, 0, 5 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - in += 4 * 4; + in += 27 
* 4; return in; } }; template<> -struct Simd128UnpackerForWidth { +struct Simd128UnpackerForWidth { + + using Dispatch = Kernel; + + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; + + static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { + return Dispatch::unpack(in, out); + } +}; + +template<> +struct Simd128UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 32; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x1f; + constexpr uint32_t kMask = 0x1fffffff; simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; - // extract 5-bit bundles 0 to 3 + // extract 29-bit bundles 0 to 3 words = simd_batch{ SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 29 | SafeLoadAs(in + 4 * 1) << 3, + SafeLoadAs(in + 4 * 1) >> 26 | SafeLoadAs(in + 4 * 2) << 6, + SafeLoadAs(in + 4 * 2) >> 23 | SafeLoadAs(in + 4 * 3) << 9, }; - shifts = simd_batch{ 0, 5, 10, 15 }; + shifts = simd_batch{ 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - // extract 5-bit bundles 4 to 7 + // extract 29-bit bundles 4 to 7 words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 30 | SafeLoadAs(in + 4 * 1) << 2, - SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 3) >> 20 | SafeLoadAs(in + 4 * 4) << 12, + SafeLoadAs(in + 4 * 4) >> 17 | SafeLoadAs(in + 4 * 5) << 15, + SafeLoadAs(in + 4 * 5) >> 14 | SafeLoadAs(in + 4 * 6) << 18, + SafeLoadAs(in + 4 * 6) >> 11 | SafeLoadAs(in + 4 * 7) << 21, }; - shifts = simd_batch{ 20, 25, 0, 3 }; + shifts = simd_batch{ 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - // extract 5-bit bundles 8 to 11 + // extract 29-bit bundles 8 to 11 words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), 
- SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 7) >> 8 | SafeLoadAs(in + 4 * 8) << 24, + SafeLoadAs(in + 4 * 8) >> 5 | SafeLoadAs(in + 4 * 9) << 27, + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9) >> 31 | SafeLoadAs(in + 4 * 10) << 1, }; - shifts = simd_batch{ 8, 13, 18, 23 }; + shifts = simd_batch{ 0, 0, 2, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - // extract 5-bit bundles 12 to 15 + // extract 29-bit bundles 12 to 15 words = simd_batch{ - SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 10) >> 28 | SafeLoadAs(in + 4 * 11) << 4, + SafeLoadAs(in + 4 * 11) >> 25 | SafeLoadAs(in + 4 * 12) << 7, + SafeLoadAs(in + 4 * 12) >> 22 | SafeLoadAs(in + 4 * 13) << 10, + SafeLoadAs(in + 4 * 13) >> 19 | SafeLoadAs(in + 4 * 14) << 13, }; - shifts = simd_batch{ 0, 1, 6, 11 }; + shifts = simd_batch{ 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - // extract 5-bit bundles 16 to 19 + // extract 29-bit bundles 16 to 19 words = simd_batch{ - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2) >> 31 | SafeLoadAs(in + 4 * 3) << 1, + SafeLoadAs(in + 4 * 14) >> 16 | SafeLoadAs(in + 4 * 15) << 16, + SafeLoadAs(in + 4 * 15) >> 13 | SafeLoadAs(in + 4 * 16) << 19, + SafeLoadAs(in + 4 * 16) >> 10 | SafeLoadAs(in + 4 * 17) << 22, + SafeLoadAs(in + 4 * 17) >> 7 | SafeLoadAs(in + 4 * 18) << 25, }; - shifts = simd_batch{ 16, 21, 26, 0 }; + shifts = simd_batch{ 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - // extract 5-bit bundles 20 to 23 + // extract 29-bit bundles 20 to 23 words = simd_batch{ - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 18) >> 4 | SafeLoadAs(in + 4 * 19) << 28, + SafeLoadAs(in + 4 * 19), + SafeLoadAs(in + 4 
* 19) >> 30 | SafeLoadAs(in + 4 * 20) << 2, + SafeLoadAs(in + 4 * 20) >> 27 | SafeLoadAs(in + 4 * 21) << 5, }; - shifts = simd_batch{ 4, 9, 14, 19 }; + shifts = simd_batch{ 0, 1, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - // extract 5-bit bundles 24 to 27 + // extract 29-bit bundles 24 to 27 words = simd_batch{ - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 29 | SafeLoadAs(in + 4 * 4) << 3, - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 21) >> 24 | SafeLoadAs(in + 4 * 22) << 8, + SafeLoadAs(in + 4 * 22) >> 21 | SafeLoadAs(in + 4 * 23) << 11, + SafeLoadAs(in + 4 * 23) >> 18 | SafeLoadAs(in + 4 * 24) << 14, + SafeLoadAs(in + 4 * 24) >> 15 | SafeLoadAs(in + 4 * 25) << 17, }; - shifts = simd_batch{ 24, 0, 2, 7 }; + shifts = simd_batch{ 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - // extract 5-bit bundles 28 to 31 + // extract 29-bit bundles 28 to 31 words = simd_batch{ - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 25) >> 12 | SafeLoadAs(in + 4 * 26) << 20, + SafeLoadAs(in + 4 * 26) >> 9 | SafeLoadAs(in + 4 * 27) << 23, + SafeLoadAs(in + 4 * 27) >> 6 | SafeLoadAs(in + 4 * 28) << 26, + SafeLoadAs(in + 4 * 28), }; - shifts = simd_batch{ 12, 17, 22, 27 }; + shifts = simd_batch{ 0, 0, 0, 3 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - in += 5 * 4; + in += 29 * 4; return in; } }; template<> -struct Simd128UnpackerForWidth { +struct Simd128UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 32; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x3f; + constexpr uint32_t kMask = 0x3fffffff; simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; - // extract 6-bit bundles 0 to 3 + // extract 30-bit bundles 0 to 3 words = 
simd_batch{ SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 30 | SafeLoadAs(in + 4 * 1) << 2, + SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, + SafeLoadAs(in + 4 * 2) >> 26 | SafeLoadAs(in + 4 * 3) << 6, }; - shifts = simd_batch{ 0, 6, 12, 18 }; + shifts = simd_batch{ 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - // extract 6-bit bundles 4 to 7 + // extract 30-bit bundles 4 to 7 words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 30 | SafeLoadAs(in + 4 * 1) << 2, - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 3) >> 24 | SafeLoadAs(in + 4 * 4) << 8, + SafeLoadAs(in + 4 * 4) >> 22 | SafeLoadAs(in + 4 * 5) << 10, + SafeLoadAs(in + 4 * 5) >> 20 | SafeLoadAs(in + 4 * 6) << 12, + SafeLoadAs(in + 4 * 6) >> 18 | SafeLoadAs(in + 4 * 7) << 14, }; - shifts = simd_batch{ 24, 0, 4, 10 }; + shifts = simd_batch{ 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - // extract 6-bit bundles 8 to 11 + // extract 30-bit bundles 8 to 11 words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, - SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 7) >> 16 | SafeLoadAs(in + 4 * 8) << 16, + SafeLoadAs(in + 4 * 8) >> 14 | SafeLoadAs(in + 4 * 9) << 18, + SafeLoadAs(in + 4 * 9) >> 12 | SafeLoadAs(in + 4 * 10) << 20, + SafeLoadAs(in + 4 * 10) >> 10 | SafeLoadAs(in + 4 * 11) << 22, }; - shifts = simd_batch{ 16, 22, 0, 2 }; + shifts = simd_batch{ 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - // extract 6-bit bundles 12 to 15 + // extract 30-bit bundles 12 to 15 words = simd_batch{ - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 11) >> 8 | SafeLoadAs(in + 4 * 12) << 24, + 
SafeLoadAs(in + 4 * 12) >> 6 | SafeLoadAs(in + 4 * 13) << 26, + SafeLoadAs(in + 4 * 13) >> 4 | SafeLoadAs(in + 4 * 14) << 28, + SafeLoadAs(in + 4 * 14), }; - shifts = simd_batch{ 8, 14, 20, 26 }; + shifts = simd_batch{ 0, 0, 0, 2 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - // extract 6-bit bundles 16 to 19 + // extract 30-bit bundles 16 to 19 words = simd_batch{ - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 15), + SafeLoadAs(in + 4 * 15) >> 30 | SafeLoadAs(in + 4 * 16) << 2, + SafeLoadAs(in + 4 * 16) >> 28 | SafeLoadAs(in + 4 * 17) << 4, + SafeLoadAs(in + 4 * 17) >> 26 | SafeLoadAs(in + 4 * 18) << 6, }; - shifts = simd_batch{ 0, 6, 12, 18 }; + shifts = simd_batch{ 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - // extract 6-bit bundles 20 to 23 + // extract 30-bit bundles 20 to 23 words = simd_batch{ - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 30 | SafeLoadAs(in + 4 * 4) << 2, - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 18) >> 24 | SafeLoadAs(in + 4 * 19) << 8, + SafeLoadAs(in + 4 * 19) >> 22 | SafeLoadAs(in + 4 * 20) << 10, + SafeLoadAs(in + 4 * 20) >> 20 | SafeLoadAs(in + 4 * 21) << 12, + SafeLoadAs(in + 4 * 21) >> 18 | SafeLoadAs(in + 4 * 22) << 14, }; - shifts = simd_batch{ 24, 0, 4, 10 }; + shifts = simd_batch{ 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - // extract 6-bit bundles 24 to 27 + // extract 30-bit bundles 24 to 27 words = simd_batch{ - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4) >> 28 | SafeLoadAs(in + 4 * 5) << 4, - SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 22) >> 16 | SafeLoadAs(in + 4 * 23) << 16, + SafeLoadAs(in + 4 * 23) >> 14 | SafeLoadAs(in + 4 * 24) << 18, + SafeLoadAs(in + 4 * 24) >> 12 | SafeLoadAs(in + 4 * 25) << 20, + SafeLoadAs(in + 4 * 25) >> 10 | 
SafeLoadAs(in + 4 * 26) << 22, }; - shifts = simd_batch{ 16, 22, 0, 2 }; + shifts = simd_batch{ 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - // extract 6-bit bundles 28 to 31 + // extract 30-bit bundles 28 to 31 words = simd_batch{ - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 26) >> 8 | SafeLoadAs(in + 4 * 27) << 24, + SafeLoadAs(in + 4 * 27) >> 6 | SafeLoadAs(in + 4 * 28) << 26, + SafeLoadAs(in + 4 * 28) >> 4 | SafeLoadAs(in + 4 * 29) << 28, + SafeLoadAs(in + 4 * 29), }; - shifts = simd_batch{ 8, 14, 20, 26 }; + shifts = simd_batch{ 0, 0, 0, 2 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - in += 6 * 4; + in += 30 * 4; return in; } }; template<> -struct Simd128UnpackerForWidth { +struct Simd128UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 32; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x7f; + constexpr uint32_t kMask = 0x7fffffff; simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; - // extract 7-bit bundles 0 to 3 + // extract 31-bit bundles 0 to 3 words = simd_batch{ SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 31 | SafeLoadAs(in + 4 * 1) << 1, + SafeLoadAs(in + 4 * 1) >> 30 | SafeLoadAs(in + 4 * 2) << 2, + SafeLoadAs(in + 4 * 2) >> 29 | SafeLoadAs(in + 4 * 3) << 3, }; - shifts = simd_batch{ 0, 7, 14, 21 }; + shifts = simd_batch{ 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - // extract 7-bit bundles 4 to 7 + // extract 31-bit bundles 4 to 7 words = simd_batch{ - SafeLoadAs(in + 4 * 0) >> 28 | SafeLoadAs(in + 4 * 1) << 4, - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 3) >> 28 | SafeLoadAs(in + 4 
* 4) << 4, + SafeLoadAs(in + 4 * 4) >> 27 | SafeLoadAs(in + 4 * 5) << 5, + SafeLoadAs(in + 4 * 5) >> 26 | SafeLoadAs(in + 4 * 6) << 6, + SafeLoadAs(in + 4 * 6) >> 25 | SafeLoadAs(in + 4 * 7) << 7, }; - shifts = simd_batch{ 0, 3, 10, 17 }; + shifts = simd_batch{ 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - // extract 7-bit bundles 8 to 11 + // extract 31-bit bundles 8 to 11 words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 31 | SafeLoadAs(in + 4 * 2) << 1, - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 7) >> 24 | SafeLoadAs(in + 4 * 8) << 8, + SafeLoadAs(in + 4 * 8) >> 23 | SafeLoadAs(in + 4 * 9) << 9, + SafeLoadAs(in + 4 * 9) >> 22 | SafeLoadAs(in + 4 * 10) << 10, + SafeLoadAs(in + 4 * 10) >> 21 | SafeLoadAs(in + 4 * 11) << 11, }; - shifts = simd_batch{ 24, 0, 6, 13 }; + shifts = simd_batch{ 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - // extract 7-bit bundles 12 to 15 + // extract 31-bit bundles 12 to 15 words = simd_batch{ - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2) >> 27 | SafeLoadAs(in + 4 * 3) << 5, - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 11) >> 20 | SafeLoadAs(in + 4 * 12) << 12, + SafeLoadAs(in + 4 * 12) >> 19 | SafeLoadAs(in + 4 * 13) << 13, + SafeLoadAs(in + 4 * 13) >> 18 | SafeLoadAs(in + 4 * 14) << 14, + SafeLoadAs(in + 4 * 14) >> 17 | SafeLoadAs(in + 4 * 15) << 15, }; - shifts = simd_batch{ 20, 0, 2, 9 }; + shifts = simd_batch{ 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - // extract 7-bit bundles 16 to 19 + // extract 31-bit bundles 16 to 19 words = simd_batch{ - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 30 | SafeLoadAs(in + 4 * 4) << 2, - SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 15) >> 16 | SafeLoadAs(in + 4 * 16) << 16, + SafeLoadAs(in + 4 * 16) >> 15 | SafeLoadAs(in + 4 * 17) << 
17, + SafeLoadAs(in + 4 * 17) >> 14 | SafeLoadAs(in + 4 * 18) << 18, + SafeLoadAs(in + 4 * 18) >> 13 | SafeLoadAs(in + 4 * 19) << 19, }; - shifts = simd_batch{ 16, 23, 0, 5 }; + shifts = simd_batch{ 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - // extract 7-bit bundles 20 to 23 + // extract 31-bit bundles 20 to 23 words = simd_batch{ - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4) >> 26 | SafeLoadAs(in + 4 * 5) << 6, - SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 19) >> 12 | SafeLoadAs(in + 4 * 20) << 20, + SafeLoadAs(in + 4 * 20) >> 11 | SafeLoadAs(in + 4 * 21) << 21, + SafeLoadAs(in + 4 * 21) >> 10 | SafeLoadAs(in + 4 * 22) << 22, + SafeLoadAs(in + 4 * 22) >> 9 | SafeLoadAs(in + 4 * 23) << 23, }; - shifts = simd_batch{ 12, 19, 0, 1 }; + shifts = simd_batch{ 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - // extract 7-bit bundles 24 to 27 + // extract 31-bit bundles 24 to 27 words = simd_batch{ - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5) >> 29 | SafeLoadAs(in + 4 * 6) << 3, + SafeLoadAs(in + 4 * 23) >> 8 | SafeLoadAs(in + 4 * 24) << 24, + SafeLoadAs(in + 4 * 24) >> 7 | SafeLoadAs(in + 4 * 25) << 25, + SafeLoadAs(in + 4 * 25) >> 6 | SafeLoadAs(in + 4 * 26) << 26, + SafeLoadAs(in + 4 * 26) >> 5 | SafeLoadAs(in + 4 * 27) << 27, }; - shifts = simd_batch{ 8, 15, 22, 0 }; + shifts = simd_batch{ 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - // extract 7-bit bundles 28 to 31 + // extract 31-bit bundles 28 to 31 words = simd_batch{ - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 27) >> 4 | SafeLoadAs(in + 4 * 28) << 28, + SafeLoadAs(in + 4 * 28) >> 3 | SafeLoadAs(in + 4 * 29) << 29, + SafeLoadAs(in + 4 * 29) >> 2 | SafeLoadAs(in + 4 * 30) << 30, + SafeLoadAs(in + 4 * 30), 
}; - shifts = simd_batch{ 4, 11, 18, 25 }; + shifts = simd_batch{ 0, 0, 0, 1 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - in += 7 * 4; + in += 31 * 4; return in; } }; + template<> -struct Simd128UnpackerForWidth { +struct Simd128UnpackerForWidth { - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; + using Dispatch = Kernel; - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0xff; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 8-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{ 0, 8, 16, 24 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); + } +}; - // extract 8-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - }; - shifts = simd_batch{ 0, 8, 16, 24 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; +template<> +struct Simd128UnpackerForWidth { - // extract 8-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - }; - shifts = simd_batch{ 0, 8, 16, 24 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + using Dispatch = Kernel; - // extract 8-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - }; - shifts = simd_batch{ 0, 8, 16, 24 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static 
constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - // extract 8-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - }; - shifts = simd_batch{ 0, 8, 16, 24 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); + } +}; - // extract 8-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), - }; - shifts = simd_batch{ 0, 8, 16, 24 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; +template<> +struct Simd128UnpackerForWidth { - // extract 8-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6), - }; - shifts = simd_batch{ 0, 8, 16, 24 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + using Dispatch = Kernel; - // extract 8-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7), - }; - shifts = simd_batch{ 0, 8, 16, 24 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - in += 8 * 4; - return in; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); } }; template<> -struct Simd128UnpackerForWidth { +struct Simd128UnpackerForWidth { - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; + using Dispatch = Kernel; - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x1ff; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - simd_batch masks(kMask); 
- simd_batch words, shifts; - simd_batch results; - // extract 9-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 27 | SafeLoadAs(in + 4 * 1) << 5, - }; - shifts = simd_batch{ 0, 9, 18, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); + } +}; - // extract 9-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 31 | SafeLoadAs(in + 4 * 2) << 1, - }; - shifts = simd_batch{ 4, 13, 22, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; +template<> +struct Simd128UnpackerForWidth { - // extract 9-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2) >> 26 | SafeLoadAs(in + 4 * 3) << 6, - SafeLoadAs(in + 4 * 3), - }; - shifts = simd_batch{ 8, 17, 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + using Dispatch = Kernel; - // extract 9-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 30 | SafeLoadAs(in + 4 * 4) << 2, - SafeLoadAs(in + 4 * 4), - }; - shifts = simd_batch{ 12, 21, 0, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - // extract 9-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4) >> 25 | SafeLoadAs(in + 4 * 5) << 7, - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), - }; - shifts = simd_batch{ 16, 0, 2, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return 
Dispatch::unpack(in, out); + } +}; - // extract 9-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5) >> 29 | SafeLoadAs(in + 4 * 6) << 3, - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6), - }; - shifts = simd_batch{ 20, 0, 6, 15 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; +template<> +struct Simd128UnpackerForWidth { - // extract 9-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 6) >> 24 | SafeLoadAs(in + 4 * 7) << 8, - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7), - }; - shifts = simd_batch{ 0, 1, 10, 19 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + using Dispatch = Kernel; - // extract 9-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 7) >> 28 | SafeLoadAs(in + 4 * 8) << 4, - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8), - }; - shifts = simd_batch{ 0, 5, 14, 23 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - in += 9 * 4; - return in; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); } }; template<> -struct Simd128UnpackerForWidth { +struct Simd128UnpackerForWidth { - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; + using Dispatch = Kernel; - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x3ff; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 10-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 30 | SafeLoadAs(in + 4 * 1) << 2, - }; - shifts = simd_batch{ 0, 10, 20, 0 }; - results = (words >> 
shifts) & masks; - results.store_unaligned(out); - out += 4; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); + } +}; - // extract 10-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, - SafeLoadAs(in + 4 * 2), - }; - shifts = simd_batch{ 8, 18, 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; +template<> +struct Simd128UnpackerForWidth { - // extract 10-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2) >> 26 | SafeLoadAs(in + 4 * 3) << 6, - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - }; - shifts = simd_batch{ 16, 0, 4, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + using Dispatch = Kernel; - // extract 10-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 3) >> 24 | SafeLoadAs(in + 4 * 4) << 8, - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - }; - shifts = simd_batch{ 0, 2, 12, 22 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - // extract 10-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5) >> 30 | SafeLoadAs(in + 4 * 6) << 2, - }; - shifts = simd_batch{ 0, 10, 20, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); + } +}; - // extract 10-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6) >> 28 | SafeLoadAs(in + 4 * 7) << 4, - SafeLoadAs(in + 4 * 7), - }; - shifts = simd_batch{ 8, 18, 0, 6 }; - results = (words >> shifts) & 
masks; - results.store_unaligned(out); - out += 4; +template<> +struct Simd128UnpackerForWidth { - // extract 10-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7) >> 26 | SafeLoadAs(in + 4 * 8) << 6, - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8), - }; - shifts = simd_batch{ 16, 0, 4, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + using Dispatch = Kernel; - // extract 10-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 8) >> 24 | SafeLoadAs(in + 4 * 9) << 8, - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9), - }; - shifts = simd_batch{ 0, 2, 12, 22 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - in += 10 * 4; - return in; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); } }; template<> -struct Simd128UnpackerForWidth { +struct Simd128UnpackerForWidth { - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; + using Dispatch = Kernel; - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x7ff; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 11-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 22 | SafeLoadAs(in + 4 * 1) << 10, - SafeLoadAs(in + 4 * 1), - }; - shifts = simd_batch{ 0, 11, 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); + } +}; - // extract 11-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 23 | SafeLoadAs(in + 4 * 
2) << 9, - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - }; - shifts = simd_batch{ 12, 0, 2, 13 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; +template<> +struct Simd128UnpackerForWidth { - // extract 11-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 2) >> 24 | SafeLoadAs(in + 4 * 3) << 8, - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 25 | SafeLoadAs(in + 4 * 4) << 7, - }; - shifts = simd_batch{ 0, 3, 14, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + using Dispatch = Kernel; - // extract 11-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4) >> 26 | SafeLoadAs(in + 4 * 5) << 6, - SafeLoadAs(in + 4 * 5), - }; - shifts = simd_batch{ 4, 15, 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - // extract 11-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5) >> 27 | SafeLoadAs(in + 4 * 6) << 5, - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6), - }; - shifts = simd_batch{ 16, 0, 6, 17 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); + } +}; - // extract 11-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 6) >> 28 | SafeLoadAs(in + 4 * 7) << 4, - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7) >> 29 | SafeLoadAs(in + 4 * 8) << 3, - }; - shifts = simd_batch{ 0, 7, 18, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; +template<> +struct Simd128UnpackerForWidth { - // extract 11-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8) >> 30 
| SafeLoadAs(in + 4 * 9) << 2, - SafeLoadAs(in + 4 * 9), - }; - shifts = simd_batch{ 8, 19, 0, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + using Dispatch = Kernel; - // extract 11-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9) >> 31 | SafeLoadAs(in + 4 * 10) << 1, - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10), - }; - shifts = simd_batch{ 20, 0, 10, 21 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - in += 11 * 4; - return in; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); } }; template<> -struct Simd128UnpackerForWidth { +struct Simd128UnpackerForWidth { - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; + using Dispatch = Kernel; - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0xfff; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 12-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 24 | SafeLoadAs(in + 4 * 1) << 8, - SafeLoadAs(in + 4 * 1), - }; - shifts = simd_batch{ 0, 12, 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); + } +}; - // extract 12-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - }; - shifts = simd_batch{ 16, 0, 8, 20 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; +template<> +struct Simd128UnpackerForWidth { - 
// extract 12-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 24 | SafeLoadAs(in + 4 * 4) << 8, - SafeLoadAs(in + 4 * 4), - }; - shifts = simd_batch{ 0, 12, 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + using Dispatch = Kernel; - // extract 12-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4) >> 28 | SafeLoadAs(in + 4 * 5) << 4, - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), - }; - shifts = simd_batch{ 16, 0, 8, 20 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - // extract 12-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6) >> 24 | SafeLoadAs(in + 4 * 7) << 8, - SafeLoadAs(in + 4 * 7), - }; - shifts = simd_batch{ 0, 12, 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); + } +}; - // extract 12-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7) >> 28 | SafeLoadAs(in + 4 * 8) << 4, - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8), - }; - shifts = simd_batch{ 16, 0, 8, 20 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; +template<> +struct Simd128UnpackerForWidth { - // extract 12-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9) >> 24 | SafeLoadAs(in + 4 * 10) << 8, - SafeLoadAs(in + 4 * 10), - }; - shifts = simd_batch{ 0, 12, 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + using Dispatch = Kernel; - // extract 12-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 10), - 
SafeLoadAs(in + 4 * 10) >> 28 | SafeLoadAs(in + 4 * 11) << 4, - SafeLoadAs(in + 4 * 11), - SafeLoadAs(in + 4 * 11), - }; - shifts = simd_batch{ 16, 0, 8, 20 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - in += 12 * 4; - return in; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); } }; template<> -struct Simd128UnpackerForWidth { +struct Simd128UnpackerForWidth { - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; + using Dispatch = Kernel; - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x1fff; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 13-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 26 | SafeLoadAs(in + 4 * 1) << 6, - SafeLoadAs(in + 4 * 1), - }; - shifts = simd_batch{ 0, 13, 0, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); + } +}; - // extract 13-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 1) >> 20 | SafeLoadAs(in + 4 * 2) << 12, - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2) >> 27 | SafeLoadAs(in + 4 * 3) << 5, - }; - shifts = simd_batch{ 0, 1, 14, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; +template<> +struct Simd128UnpackerForWidth { - // extract 13-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 21 | SafeLoadAs(in + 4 * 4) << 11, - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - }; - shifts = simd_batch{ 8, 0, 2, 15 }; - results = (words >> 
shifts) & masks; - results.store_unaligned(out); - out += 4; + using Dispatch = Kernel; - // extract 13-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 4) >> 28 | SafeLoadAs(in + 4 * 5) << 4, - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5) >> 22 | SafeLoadAs(in + 4 * 6) << 10, - SafeLoadAs(in + 4 * 6), - }; - shifts = simd_batch{ 0, 9, 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - // extract 13-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6) >> 29 | SafeLoadAs(in + 4 * 7) << 3, - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7) >> 23 | SafeLoadAs(in + 4 * 8) << 9, - }; - shifts = simd_batch{ 16, 0, 10, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); + } +}; - // extract 13-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8) >> 30 | SafeLoadAs(in + 4 * 9) << 2, - SafeLoadAs(in + 4 * 9), - }; - shifts = simd_batch{ 4, 17, 0, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; +template<> +struct Simd128UnpackerForWidth { - // extract 13-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 9) >> 24 | SafeLoadAs(in + 4 * 10) << 8, - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10) >> 31 | SafeLoadAs(in + 4 * 11) << 1, - }; - shifts = simd_batch{ 0, 5, 18, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + using Dispatch = Kernel; - // extract 13-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 11), - SafeLoadAs(in + 4 * 11) >> 25 | SafeLoadAs(in + 4 * 12) << 7, - SafeLoadAs(in + 4 * 12), - SafeLoadAs(in + 4 * 12), - }; - shifts = simd_batch{ 12, 0, 6, 19 
}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - in += 13 * 4; - return in; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); } }; template<> -struct Simd128UnpackerForWidth { +struct Simd128UnpackerForWidth { - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; + using Dispatch = Kernel; - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x3fff; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 14-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 28 | SafeLoadAs(in + 4 * 1) << 4, - SafeLoadAs(in + 4 * 1), - }; - shifts = simd_batch{ 0, 14, 0, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); + } +}; - // extract 14-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 1) >> 24 | SafeLoadAs(in + 4 * 2) << 8, - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2) >> 20 | SafeLoadAs(in + 4 * 3) << 12, - SafeLoadAs(in + 4 * 3), - }; - shifts = simd_batch{ 0, 6, 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; +template<> +struct Simd128UnpackerForWidth { - // extract 14-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 30 | SafeLoadAs(in + 4 * 4) << 2, - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4) >> 26 | SafeLoadAs(in + 4 * 5) << 6, - }; - shifts = simd_batch{ 16, 0, 12, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + using Dispatch = Kernel; - // extract 14-bit bundles 12 
to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5) >> 22 | SafeLoadAs(in + 4 * 6) << 10, - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6), - }; - shifts = simd_batch{ 8, 0, 4, 18 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - // extract 14-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7) >> 28 | SafeLoadAs(in + 4 * 8) << 4, - SafeLoadAs(in + 4 * 8), - }; - shifts = simd_batch{ 0, 14, 0, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); + } +}; - // extract 14-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 8) >> 24 | SafeLoadAs(in + 4 * 9) << 8, - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9) >> 20 | SafeLoadAs(in + 4 * 10) << 12, - SafeLoadAs(in + 4 * 10), - }; - shifts = simd_batch{ 0, 6, 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; +template<> +struct Simd128UnpackerForWidth { - // extract 14-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10) >> 30 | SafeLoadAs(in + 4 * 11) << 2, - SafeLoadAs(in + 4 * 11), - SafeLoadAs(in + 4 * 11) >> 26 | SafeLoadAs(in + 4 * 12) << 6, - }; - shifts = simd_batch{ 16, 0, 12, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + using Dispatch = Kernel; - // extract 14-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 12), - SafeLoadAs(in + 4 * 12) >> 22 | SafeLoadAs(in + 4 * 13) << 10, - SafeLoadAs(in + 4 * 13), - SafeLoadAs(in + 4 * 13), - }; - shifts = simd_batch{ 8, 0, 4, 18 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static constexpr int kValuesUnpacked = 
Dispatch::kValuesUnpacked; - in += 14 * 4; - return in; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); } }; template<> -struct Simd128UnpackerForWidth { +struct Simd128UnpackerForWidth { - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; + using Dispatch = Kernel; - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x7fff; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 15-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 30 | SafeLoadAs(in + 4 * 1) << 2, - SafeLoadAs(in + 4 * 1), - }; - shifts = simd_batch{ 0, 15, 0, 13 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); + } +}; - // extract 15-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2) >> 26 | SafeLoadAs(in + 4 * 3) << 6, - SafeLoadAs(in + 4 * 3), - }; - shifts = simd_batch{ 0, 11, 0, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; +template<> +struct Simd128UnpackerForWidth { - // extract 15-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 3) >> 24 | SafeLoadAs(in + 4 * 4) << 8, - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4) >> 22 | SafeLoadAs(in + 4 * 5) << 10, - SafeLoadAs(in + 4 * 5), - }; - shifts = simd_batch{ 0, 7, 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + using Dispatch = Kernel; - // extract 15-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 5) >> 20 | SafeLoadAs(in + 4 * 6) << 12, - SafeLoadAs(in + 4 * 6), - 
SafeLoadAs(in + 4 * 6) >> 18 | SafeLoadAs(in + 4 * 7) << 14, - SafeLoadAs(in + 4 * 7), - }; - shifts = simd_batch{ 0, 3, 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - // extract 15-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7) >> 31 | SafeLoadAs(in + 4 * 8) << 1, - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8) >> 29 | SafeLoadAs(in + 4 * 9) << 3, - }; - shifts = simd_batch{ 16, 0, 14, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); + } +}; - // extract 15-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9) >> 27 | SafeLoadAs(in + 4 * 10) << 5, - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10) >> 25 | SafeLoadAs(in + 4 * 11) << 7, - }; - shifts = simd_batch{ 12, 0, 10, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; +template<> +struct Simd128UnpackerForWidth { - // extract 15-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 11), - SafeLoadAs(in + 4 * 11) >> 23 | SafeLoadAs(in + 4 * 12) << 9, - SafeLoadAs(in + 4 * 12), - SafeLoadAs(in + 4 * 12) >> 21 | SafeLoadAs(in + 4 * 13) << 11, - }; - shifts = simd_batch{ 8, 0, 6, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + using Dispatch = Kernel; - // extract 15-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 13), - SafeLoadAs(in + 4 * 13) >> 19 | SafeLoadAs(in + 4 * 14) << 13, - SafeLoadAs(in + 4 * 14), - SafeLoadAs(in + 4 * 14), - }; - shifts = simd_batch{ 4, 0, 2, 17 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - in += 15 * 4; - return in; + static 
const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); } }; template<> -struct Simd128UnpackerForWidth { +struct Simd128UnpackerForWidth { - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; + using Dispatch = Kernel; - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0xffff; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 16-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - }; - shifts = simd_batch{ 0, 16, 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); + } +}; - // extract 16-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - }; - shifts = simd_batch{ 0, 16, 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; +template<> +struct Simd128UnpackerForWidth { - // extract 16-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), - }; - shifts = simd_batch{ 0, 16, 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + using Dispatch = Kernel; - // extract 16-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7), - }; - shifts = simd_batch{ 0, 16, 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - // extract 16-bit bundles 16 to 19 - words 
= simd_batch{ - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9), - }; - shifts = simd_batch{ 0, 16, 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); + } +}; - // extract 16-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 11), - SafeLoadAs(in + 4 * 11), - }; - shifts = simd_batch{ 0, 16, 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; +template<> +struct Simd128UnpackerForWidth { - // extract 16-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 12), - SafeLoadAs(in + 4 * 12), - SafeLoadAs(in + 4 * 13), - SafeLoadAs(in + 4 * 13), - }; - shifts = simd_batch{ 0, 16, 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + using Dispatch = Kernel; - // extract 16-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 14), - SafeLoadAs(in + 4 * 14), - SafeLoadAs(in + 4 * 15), - SafeLoadAs(in + 4 * 15), - }; - shifts = simd_batch{ 0, 16, 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - in += 16 * 4; - return in; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); } }; template<> -struct Simd128UnpackerForWidth { +struct Simd128UnpackerForWidth { - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; + using Dispatch = Kernel; - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x1ffff; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 17-bit bundles 0 to 3 
- words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 17 | SafeLoadAs(in + 4 * 1) << 15, - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 19 | SafeLoadAs(in + 4 * 2) << 13, - }; - shifts = simd_batch{ 0, 0, 2, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); + } +}; - // extract 17-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2) >> 21 | SafeLoadAs(in + 4 * 3) << 11, - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 23 | SafeLoadAs(in + 4 * 4) << 9, - }; - shifts = simd_batch{ 4, 0, 6, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; +template<> +struct Simd128UnpackerForWidth { - // extract 17-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4) >> 25 | SafeLoadAs(in + 4 * 5) << 7, - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5) >> 27 | SafeLoadAs(in + 4 * 6) << 5, - }; - shifts = simd_batch{ 8, 0, 10, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + using Dispatch = Kernel; - // extract 17-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6) >> 29 | SafeLoadAs(in + 4 * 7) << 3, - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7) >> 31 | SafeLoadAs(in + 4 * 8) << 1, - }; - shifts = simd_batch{ 12, 0, 14, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - // extract 17-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 8) >> 16 | SafeLoadAs(in + 4 * 9) << 16, - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9) >> 18 | SafeLoadAs(in + 4 * 10) << 14, - SafeLoadAs(in + 4 * 10), - }; - shifts = simd_batch{ 0, 1, 0, 3 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 4; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); + } +}; - // extract 17-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 10) >> 20 | SafeLoadAs(in + 4 * 11) << 12, - SafeLoadAs(in + 4 * 11), - SafeLoadAs(in + 4 * 11) >> 22 | SafeLoadAs(in + 4 * 12) << 10, - SafeLoadAs(in + 4 * 12), - }; - shifts = simd_batch{ 0, 5, 0, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; +template<> +struct Simd128UnpackerForWidth { - // extract 17-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 12) >> 24 | SafeLoadAs(in + 4 * 13) << 8, - SafeLoadAs(in + 4 * 13), - SafeLoadAs(in + 4 * 13) >> 26 | SafeLoadAs(in + 4 * 14) << 6, - SafeLoadAs(in + 4 * 14), - }; - shifts = simd_batch{ 0, 9, 0, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + using Dispatch = Kernel; - // extract 17-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 14) >> 28 | SafeLoadAs(in + 4 * 15) << 4, - SafeLoadAs(in + 4 * 15), - SafeLoadAs(in + 4 * 15) >> 30 | SafeLoadAs(in + 4 * 16) << 2, - SafeLoadAs(in + 4 * 16), - }; - shifts = simd_batch{ 0, 13, 0, 15 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - in += 17 * 4; - return in; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); } }; template<> -struct Simd128UnpackerForWidth { +struct Simd128UnpackerForWidth { - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; + using Dispatch = Kernel; - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x3ffff; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 
18-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 18 | SafeLoadAs(in + 4 * 1) << 14, - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 22 | SafeLoadAs(in + 4 * 2) << 10, - }; - shifts = simd_batch{ 0, 0, 4, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); + } +}; - // extract 18-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2) >> 26 | SafeLoadAs(in + 4 * 3) << 6, - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 30 | SafeLoadAs(in + 4 * 4) << 2, - }; - shifts = simd_batch{ 8, 0, 12, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; +template<> +struct Simd128UnpackerForWidth { - // extract 18-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 4) >> 16 | SafeLoadAs(in + 4 * 5) << 16, - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5) >> 20 | SafeLoadAs(in + 4 * 6) << 12, - SafeLoadAs(in + 4 * 6), - }; - shifts = simd_batch{ 0, 2, 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + using Dispatch = Kernel; - // extract 18-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 6) >> 24 | SafeLoadAs(in + 4 * 7) << 8, - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7) >> 28 | SafeLoadAs(in + 4 * 8) << 4, - SafeLoadAs(in + 4 * 8), - }; - shifts = simd_batch{ 0, 10, 0, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - // extract 18-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9) >> 18 | SafeLoadAs(in + 4 * 10) << 14, - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10) >> 22 | SafeLoadAs(in + 4 * 11) << 10, - }; - shifts = simd_batch{ 0, 0, 4, 0 }; - results = (words >> shifts) & 
masks; - results.store_unaligned(out); - out += 4; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); + } +}; - // extract 18-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 11), - SafeLoadAs(in + 4 * 11) >> 26 | SafeLoadAs(in + 4 * 12) << 6, - SafeLoadAs(in + 4 * 12), - SafeLoadAs(in + 4 * 12) >> 30 | SafeLoadAs(in + 4 * 13) << 2, - }; - shifts = simd_batch{ 8, 0, 12, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; +template<> +struct Simd128UnpackerForWidth { - // extract 18-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 13) >> 16 | SafeLoadAs(in + 4 * 14) << 16, - SafeLoadAs(in + 4 * 14), - SafeLoadAs(in + 4 * 14) >> 20 | SafeLoadAs(in + 4 * 15) << 12, - SafeLoadAs(in + 4 * 15), - }; - shifts = simd_batch{ 0, 2, 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + using Dispatch = Kernel; - // extract 18-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 15) >> 24 | SafeLoadAs(in + 4 * 16) << 8, - SafeLoadAs(in + 4 * 16), - SafeLoadAs(in + 4 * 16) >> 28 | SafeLoadAs(in + 4 * 17) << 4, - SafeLoadAs(in + 4 * 17), - }; - shifts = simd_batch{ 0, 10, 0, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - in += 18 * 4; - return in; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); } }; template<> -struct Simd128UnpackerForWidth { +struct Simd128UnpackerForWidth { - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; + using Dispatch = Kernel; - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x7ffff; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // 
extract 19-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 19 | SafeLoadAs(in + 4 * 1) << 13, - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 25 | SafeLoadAs(in + 4 * 2) << 7, - }; - shifts = simd_batch{ 0, 0, 6, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); + } +}; - // extract 19-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2) >> 31 | SafeLoadAs(in + 4 * 3) << 1, - SafeLoadAs(in + 4 * 3) >> 18 | SafeLoadAs(in + 4 * 4) << 14, - SafeLoadAs(in + 4 * 4), - }; - shifts = simd_batch{ 12, 0, 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; +template<> +struct Simd128UnpackerForWidth { - // extract 19-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 4) >> 24 | SafeLoadAs(in + 4 * 5) << 8, - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5) >> 30 | SafeLoadAs(in + 4 * 6) << 2, - SafeLoadAs(in + 4 * 6) >> 17 | SafeLoadAs(in + 4 * 7) << 15, - }; - shifts = simd_batch{ 0, 11, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + using Dispatch = Kernel; - // extract 19-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7) >> 23 | SafeLoadAs(in + 4 * 8) << 9, - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8) >> 29 | SafeLoadAs(in + 4 * 9) << 3, - }; - shifts = simd_batch{ 4, 0, 10, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - // extract 19-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 9) >> 16 | SafeLoadAs(in + 4 * 10) << 16, - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10) >> 22 | SafeLoadAs(in + 4 * 11) << 10, - SafeLoadAs(in + 4 * 11), - }; - shifts = simd_batch{ 0, 3, 
0, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); + } +}; - // extract 19-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 11) >> 28 | SafeLoadAs(in + 4 * 12) << 4, - SafeLoadAs(in + 4 * 12) >> 15 | SafeLoadAs(in + 4 * 13) << 17, - SafeLoadAs(in + 4 * 13), - SafeLoadAs(in + 4 * 13) >> 21 | SafeLoadAs(in + 4 * 14) << 11, - }; - shifts = simd_batch{ 0, 0, 2, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; +template<> +struct Simd128UnpackerForWidth { - // extract 19-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 14), - SafeLoadAs(in + 4 * 14) >> 27 | SafeLoadAs(in + 4 * 15) << 5, - SafeLoadAs(in + 4 * 15) >> 14 | SafeLoadAs(in + 4 * 16) << 18, - SafeLoadAs(in + 4 * 16), - }; - shifts = simd_batch{ 8, 0, 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + using Dispatch = Kernel; - // extract 19-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 16) >> 20 | SafeLoadAs(in + 4 * 17) << 12, - SafeLoadAs(in + 4 * 17), - SafeLoadAs(in + 4 * 17) >> 26 | SafeLoadAs(in + 4 * 18) << 6, - SafeLoadAs(in + 4 * 18), - }; - shifts = simd_batch{ 0, 7, 0, 13 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - in += 19 * 4; - return in; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); } }; template<> -struct Simd128UnpackerForWidth { +struct Simd128UnpackerForWidth { - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; + using Dispatch = Kernel; - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0xfffff; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - 
simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 20-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 20 | SafeLoadAs(in + 4 * 1) << 12, - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, - }; - shifts = simd_batch{ 0, 0, 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); + } +}; - // extract 20-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 2) >> 16 | SafeLoadAs(in + 4 * 3) << 16, - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 24 | SafeLoadAs(in + 4 * 4) << 8, - SafeLoadAs(in + 4 * 4), - }; - shifts = simd_batch{ 0, 4, 0, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; +template<> +struct Simd128UnpackerForWidth { - // extract 20-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5) >> 20 | SafeLoadAs(in + 4 * 6) << 12, - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6) >> 28 | SafeLoadAs(in + 4 * 7) << 4, - }; - shifts = simd_batch{ 0, 0, 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + using Dispatch = Kernel; - // extract 20-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 7) >> 16 | SafeLoadAs(in + 4 * 8) << 16, - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8) >> 24 | SafeLoadAs(in + 4 * 9) << 8, - SafeLoadAs(in + 4 * 9), - }; - shifts = simd_batch{ 0, 4, 0, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - // extract 20-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10) >> 20 | SafeLoadAs(in + 4 * 11) << 12, - SafeLoadAs(in + 4 * 11), - SafeLoadAs(in + 4 * 11) >> 28 | SafeLoadAs(in + 4 * 
12) << 4, - }; - shifts = simd_batch{ 0, 0, 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); + } +}; - // extract 20-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 12) >> 16 | SafeLoadAs(in + 4 * 13) << 16, - SafeLoadAs(in + 4 * 13), - SafeLoadAs(in + 4 * 13) >> 24 | SafeLoadAs(in + 4 * 14) << 8, - SafeLoadAs(in + 4 * 14), - }; - shifts = simd_batch{ 0, 4, 0, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; +template<> +struct Simd128UnpackerForWidth { - // extract 20-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 15), - SafeLoadAs(in + 4 * 15) >> 20 | SafeLoadAs(in + 4 * 16) << 12, - SafeLoadAs(in + 4 * 16), - SafeLoadAs(in + 4 * 16) >> 28 | SafeLoadAs(in + 4 * 17) << 4, - }; - shifts = simd_batch{ 0, 0, 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + using Dispatch = Kernel; - // extract 20-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 17) >> 16 | SafeLoadAs(in + 4 * 18) << 16, - SafeLoadAs(in + 4 * 18), - SafeLoadAs(in + 4 * 18) >> 24 | SafeLoadAs(in + 4 * 19) << 8, - SafeLoadAs(in + 4 * 19), - }; - shifts = simd_batch{ 0, 4, 0, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - in += 20 * 4; - return in; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); } }; template<> -struct Simd128UnpackerForWidth { +struct Simd128UnpackerForWidth { - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; + using Dispatch = Kernel; - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x1fffff; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; 
- simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 21-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 21 | SafeLoadAs(in + 4 * 1) << 11, - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 31 | SafeLoadAs(in + 4 * 2) << 1, - }; - shifts = simd_batch{ 0, 0, 10, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); + } +}; - // extract 21-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 2) >> 20 | SafeLoadAs(in + 4 * 3) << 12, - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 30 | SafeLoadAs(in + 4 * 4) << 2, - SafeLoadAs(in + 4 * 4) >> 19 | SafeLoadAs(in + 4 * 5) << 13, - }; - shifts = simd_batch{ 0, 9, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; +template<> +struct Simd128UnpackerForWidth { - // extract 21-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5) >> 29 | SafeLoadAs(in + 4 * 6) << 3, - SafeLoadAs(in + 4 * 6) >> 18 | SafeLoadAs(in + 4 * 7) << 14, - SafeLoadAs(in + 4 * 7), - }; - shifts = simd_batch{ 8, 0, 0, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + using Dispatch = Kernel; - // extract 21-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 7) >> 28 | SafeLoadAs(in + 4 * 8) << 4, - SafeLoadAs(in + 4 * 8) >> 17 | SafeLoadAs(in + 4 * 9) << 15, - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9) >> 27 | SafeLoadAs(in + 4 * 10) << 5, - }; - shifts = simd_batch{ 0, 0, 6, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - // extract 21-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 10) >> 16 | SafeLoadAs(in + 4 * 11) << 16, - SafeLoadAs(in + 4 * 11), - 
SafeLoadAs(in + 4 * 11) >> 26 | SafeLoadAs(in + 4 * 12) << 6, - SafeLoadAs(in + 4 * 12) >> 15 | SafeLoadAs(in + 4 * 13) << 17, - }; - shifts = simd_batch{ 0, 5, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); + } +}; - // extract 21-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 13), - SafeLoadAs(in + 4 * 13) >> 25 | SafeLoadAs(in + 4 * 14) << 7, - SafeLoadAs(in + 4 * 14) >> 14 | SafeLoadAs(in + 4 * 15) << 18, - SafeLoadAs(in + 4 * 15), - }; - shifts = simd_batch{ 4, 0, 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; +template<> +struct Simd128UnpackerForWidth { - // extract 21-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 15) >> 24 | SafeLoadAs(in + 4 * 16) << 8, - SafeLoadAs(in + 4 * 16) >> 13 | SafeLoadAs(in + 4 * 17) << 19, - SafeLoadAs(in + 4 * 17), - SafeLoadAs(in + 4 * 17) >> 23 | SafeLoadAs(in + 4 * 18) << 9, - }; - shifts = simd_batch{ 0, 0, 2, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + using Dispatch = Kernel; - // extract 21-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 18) >> 12 | SafeLoadAs(in + 4 * 19) << 20, - SafeLoadAs(in + 4 * 19), - SafeLoadAs(in + 4 * 19) >> 22 | SafeLoadAs(in + 4 * 20) << 10, - SafeLoadAs(in + 4 * 20), - }; - shifts = simd_batch{ 0, 1, 0, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - in += 21 * 4; - return in; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); } }; template<> -struct Simd128UnpackerForWidth { +struct Simd128UnpackerForWidth { - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; + using Dispatch = Kernel; - static const 
uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x3fffff; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 22-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 22 | SafeLoadAs(in + 4 * 1) << 10, - SafeLoadAs(in + 4 * 1) >> 12 | SafeLoadAs(in + 4 * 2) << 20, - SafeLoadAs(in + 4 * 2), - }; - shifts = simd_batch{ 0, 0, 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); + } +}; - // extract 22-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 2) >> 24 | SafeLoadAs(in + 4 * 3) << 8, - SafeLoadAs(in + 4 * 3) >> 14 | SafeLoadAs(in + 4 * 4) << 18, - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4) >> 26 | SafeLoadAs(in + 4 * 5) << 6, - }; - shifts = simd_batch{ 0, 0, 4, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; +template<> +struct Simd128UnpackerForWidth { - // extract 22-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 5) >> 16 | SafeLoadAs(in + 4 * 6) << 16, - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6) >> 28 | SafeLoadAs(in + 4 * 7) << 4, - SafeLoadAs(in + 4 * 7) >> 18 | SafeLoadAs(in + 4 * 8) << 14, - }; - shifts = simd_batch{ 0, 6, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + using Dispatch = Kernel; - // extract 22-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8) >> 30 | SafeLoadAs(in + 4 * 9) << 2, - SafeLoadAs(in + 4 * 9) >> 20 | SafeLoadAs(in + 4 * 10) << 12, - SafeLoadAs(in + 4 * 10), - }; - shifts = simd_batch{ 8, 0, 0, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static constexpr int kValuesUnpacked = 
Dispatch::kValuesUnpacked; - // extract 22-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 11), - SafeLoadAs(in + 4 * 11) >> 22 | SafeLoadAs(in + 4 * 12) << 10, - SafeLoadAs(in + 4 * 12) >> 12 | SafeLoadAs(in + 4 * 13) << 20, - SafeLoadAs(in + 4 * 13), - }; - shifts = simd_batch{ 0, 0, 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); + } +}; - // extract 22-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 13) >> 24 | SafeLoadAs(in + 4 * 14) << 8, - SafeLoadAs(in + 4 * 14) >> 14 | SafeLoadAs(in + 4 * 15) << 18, - SafeLoadAs(in + 4 * 15), - SafeLoadAs(in + 4 * 15) >> 26 | SafeLoadAs(in + 4 * 16) << 6, - }; - shifts = simd_batch{ 0, 0, 4, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; +template<> +struct Simd128UnpackerForWidth { - // extract 22-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 16) >> 16 | SafeLoadAs(in + 4 * 17) << 16, - SafeLoadAs(in + 4 * 17), - SafeLoadAs(in + 4 * 17) >> 28 | SafeLoadAs(in + 4 * 18) << 4, - SafeLoadAs(in + 4 * 18) >> 18 | SafeLoadAs(in + 4 * 19) << 14, - }; - shifts = simd_batch{ 0, 6, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + using Dispatch = Kernel; - // extract 22-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 19), - SafeLoadAs(in + 4 * 19) >> 30 | SafeLoadAs(in + 4 * 20) << 2, - SafeLoadAs(in + 4 * 20) >> 20 | SafeLoadAs(in + 4 * 21) << 12, - SafeLoadAs(in + 4 * 21), - }; - shifts = simd_batch{ 8, 0, 0, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - in += 22 * 4; - return in; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); } }; template<> -struct 
Simd128UnpackerForWidth { +struct Simd128UnpackerForWidth { - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; + using Dispatch = Kernel; - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x7fffff; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 23-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 23 | SafeLoadAs(in + 4 * 1) << 9, - SafeLoadAs(in + 4 * 1) >> 14 | SafeLoadAs(in + 4 * 2) << 18, - SafeLoadAs(in + 4 * 2), - }; - shifts = simd_batch{ 0, 0, 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); + } +}; - // extract 23-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 2) >> 28 | SafeLoadAs(in + 4 * 3) << 4, - SafeLoadAs(in + 4 * 3) >> 19 | SafeLoadAs(in + 4 * 4) << 13, - SafeLoadAs(in + 4 * 4) >> 10 | SafeLoadAs(in + 4 * 5) << 22, - SafeLoadAs(in + 4 * 5), - }; - shifts = simd_batch{ 0, 0, 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; +template<> +struct Simd128UnpackerForWidth { - // extract 23-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 5) >> 24 | SafeLoadAs(in + 4 * 6) << 8, - SafeLoadAs(in + 4 * 6) >> 15 | SafeLoadAs(in + 4 * 7) << 17, - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7) >> 29 | SafeLoadAs(in + 4 * 8) << 3, - }; - shifts = simd_batch{ 0, 0, 6, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + using Dispatch = Kernel; - // extract 23-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 8) >> 20 | SafeLoadAs(in + 4 * 9) << 12, - SafeLoadAs(in + 4 * 9) >> 11 | SafeLoadAs(in + 4 * 10) << 21, - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 
* 10) >> 25 | SafeLoadAs(in + 4 * 11) << 7, - }; - shifts = simd_batch{ 0, 0, 2, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - // extract 23-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 11) >> 16 | SafeLoadAs(in + 4 * 12) << 16, - SafeLoadAs(in + 4 * 12), - SafeLoadAs(in + 4 * 12) >> 30 | SafeLoadAs(in + 4 * 13) << 2, - SafeLoadAs(in + 4 * 13) >> 21 | SafeLoadAs(in + 4 * 14) << 11, - }; - shifts = simd_batch{ 0, 7, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); + } +}; - // extract 23-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 14) >> 12 | SafeLoadAs(in + 4 * 15) << 20, - SafeLoadAs(in + 4 * 15), - SafeLoadAs(in + 4 * 15) >> 26 | SafeLoadAs(in + 4 * 16) << 6, - SafeLoadAs(in + 4 * 16) >> 17 | SafeLoadAs(in + 4 * 17) << 15, - }; - shifts = simd_batch{ 0, 3, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; +template<> +struct Simd128UnpackerForWidth { - // extract 23-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 17), - SafeLoadAs(in + 4 * 17) >> 31 | SafeLoadAs(in + 4 * 18) << 1, - SafeLoadAs(in + 4 * 18) >> 22 | SafeLoadAs(in + 4 * 19) << 10, - SafeLoadAs(in + 4 * 19) >> 13 | SafeLoadAs(in + 4 * 20) << 19, - }; - shifts = simd_batch{ 8, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + using Dispatch = Kernel; - // extract 23-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 20), - SafeLoadAs(in + 4 * 20) >> 27 | SafeLoadAs(in + 4 * 21) << 5, - SafeLoadAs(in + 4 * 21) >> 18 | SafeLoadAs(in + 4 * 22) << 14, - SafeLoadAs(in + 4 * 22), - }; - shifts = simd_batch{ 4, 0, 0, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 
4; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - in += 23 * 4; - return in; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); } }; template<> -struct Simd128UnpackerForWidth { +struct Simd128UnpackerForWidth { - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; + using Dispatch = Kernel; - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0xffffff; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 24-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 24 | SafeLoadAs(in + 4 * 1) << 8, - SafeLoadAs(in + 4 * 1) >> 16 | SafeLoadAs(in + 4 * 2) << 16, - SafeLoadAs(in + 4 * 2), - }; - shifts = simd_batch{ 0, 0, 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 24-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 24 | SafeLoadAs(in + 4 * 4) << 8, - SafeLoadAs(in + 4 * 4) >> 16 | SafeLoadAs(in + 4 * 5) << 16, - SafeLoadAs(in + 4 * 5), - }; - shifts = simd_batch{ 0, 0, 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 24-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6) >> 24 | SafeLoadAs(in + 4 * 7) << 8, - SafeLoadAs(in + 4 * 7) >> 16 | SafeLoadAs(in + 4 * 8) << 16, - SafeLoadAs(in + 4 * 8), - }; - shifts = simd_batch{ 0, 0, 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 24-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9) >> 24 | SafeLoadAs(in + 4 * 10) << 8, - SafeLoadAs(in + 4 * 10) >> 16 | SafeLoadAs(in + 4 * 11) << 16, - SafeLoadAs(in + 4 * 11), - }; - shifts = 
simd_batch{ 0, 0, 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 24-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 12), - SafeLoadAs(in + 4 * 12) >> 24 | SafeLoadAs(in + 4 * 13) << 8, - SafeLoadAs(in + 4 * 13) >> 16 | SafeLoadAs(in + 4 * 14) << 16, - SafeLoadAs(in + 4 * 14), - }; - shifts = simd_batch{ 0, 0, 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); + } +}; - // extract 24-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 15), - SafeLoadAs(in + 4 * 15) >> 24 | SafeLoadAs(in + 4 * 16) << 8, - SafeLoadAs(in + 4 * 16) >> 16 | SafeLoadAs(in + 4 * 17) << 16, - SafeLoadAs(in + 4 * 17), - }; - shifts = simd_batch{ 0, 0, 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; +template<> +struct Simd128UnpackerForWidth { - // extract 24-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 18), - SafeLoadAs(in + 4 * 18) >> 24 | SafeLoadAs(in + 4 * 19) << 8, - SafeLoadAs(in + 4 * 19) >> 16 | SafeLoadAs(in + 4 * 20) << 16, - SafeLoadAs(in + 4 * 20), - }; - shifts = simd_batch{ 0, 0, 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + using Dispatch = Kernel; - // extract 24-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 21), - SafeLoadAs(in + 4 * 21) >> 24 | SafeLoadAs(in + 4 * 22) << 8, - SafeLoadAs(in + 4 * 22) >> 16 | SafeLoadAs(in + 4 * 23) << 16, - SafeLoadAs(in + 4 * 23), - }; - shifts = simd_batch{ 0, 0, 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - in += 24 * 4; - return in; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); } }; template<> -struct 
Simd128UnpackerForWidth { +struct Simd128UnpackerForWidth { - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; + using Dispatch = Kernel; - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x1ffffff; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 25-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 25 | SafeLoadAs(in + 4 * 1) << 7, - SafeLoadAs(in + 4 * 1) >> 18 | SafeLoadAs(in + 4 * 2) << 14, - SafeLoadAs(in + 4 * 2) >> 11 | SafeLoadAs(in + 4 * 3) << 21, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); + } +}; - // extract 25-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 29 | SafeLoadAs(in + 4 * 4) << 3, - SafeLoadAs(in + 4 * 4) >> 22 | SafeLoadAs(in + 4 * 5) << 10, - SafeLoadAs(in + 4 * 5) >> 15 | SafeLoadAs(in + 4 * 6) << 17, - }; - shifts = simd_batch{ 4, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; +template<> +struct Simd128UnpackerForWidth { - // extract 25-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 6) >> 8 | SafeLoadAs(in + 4 * 7) << 24, - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7) >> 26 | SafeLoadAs(in + 4 * 8) << 6, - SafeLoadAs(in + 4 * 8) >> 19 | SafeLoadAs(in + 4 * 9) << 13, - }; - shifts = simd_batch{ 0, 1, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + using Dispatch = Kernel; - // extract 25-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 9) >> 12 | SafeLoadAs(in + 4 * 10) << 20, - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10) >> 30 | 
SafeLoadAs(in + 4 * 11) << 2, - SafeLoadAs(in + 4 * 11) >> 23 | SafeLoadAs(in + 4 * 12) << 9, - }; - shifts = simd_batch{ 0, 5, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - // extract 25-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 12) >> 16 | SafeLoadAs(in + 4 * 13) << 16, - SafeLoadAs(in + 4 * 13) >> 9 | SafeLoadAs(in + 4 * 14) << 23, - SafeLoadAs(in + 4 * 14), - SafeLoadAs(in + 4 * 14) >> 27 | SafeLoadAs(in + 4 * 15) << 5, - }; - shifts = simd_batch{ 0, 0, 2, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); + } +}; - // extract 25-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 15) >> 20 | SafeLoadAs(in + 4 * 16) << 12, - SafeLoadAs(in + 4 * 16) >> 13 | SafeLoadAs(in + 4 * 17) << 19, - SafeLoadAs(in + 4 * 17), - SafeLoadAs(in + 4 * 17) >> 31 | SafeLoadAs(in + 4 * 18) << 1, - }; - shifts = simd_batch{ 0, 0, 6, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; +template<> +struct Simd128UnpackerForWidth { - // extract 25-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 18) >> 24 | SafeLoadAs(in + 4 * 19) << 8, - SafeLoadAs(in + 4 * 19) >> 17 | SafeLoadAs(in + 4 * 20) << 15, - SafeLoadAs(in + 4 * 20) >> 10 | SafeLoadAs(in + 4 * 21) << 22, - SafeLoadAs(in + 4 * 21), - }; - shifts = simd_batch{ 0, 0, 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + using Dispatch = Kernel; - // extract 25-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 21) >> 28 | SafeLoadAs(in + 4 * 22) << 4, - SafeLoadAs(in + 4 * 22) >> 21 | SafeLoadAs(in + 4 * 23) << 11, - SafeLoadAs(in + 4 * 23) >> 14 | SafeLoadAs(in + 4 * 24) << 18, - SafeLoadAs(in + 4 * 24), - }; - shifts = simd_batch{ 0, 0, 
0, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - in += 25 * 4; - return in; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); } }; template<> -struct Simd128UnpackerForWidth { +struct Simd128UnpackerForWidth { - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; + using Dispatch = Kernel; - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x3ffffff; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 26-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 26 | SafeLoadAs(in + 4 * 1) << 6, - SafeLoadAs(in + 4 * 1) >> 20 | SafeLoadAs(in + 4 * 2) << 12, - SafeLoadAs(in + 4 * 2) >> 14 | SafeLoadAs(in + 4 * 3) << 18, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); + } +}; - // extract 26-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 3) >> 8 | SafeLoadAs(in + 4 * 4) << 24, - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4) >> 28 | SafeLoadAs(in + 4 * 5) << 4, - SafeLoadAs(in + 4 * 5) >> 22 | SafeLoadAs(in + 4 * 6) << 10, - }; - shifts = simd_batch{ 0, 2, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; +template<> +struct Simd128UnpackerForWidth { - // extract 26-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 6) >> 16 | SafeLoadAs(in + 4 * 7) << 16, - SafeLoadAs(in + 4 * 7) >> 10 | SafeLoadAs(in + 4 * 8) << 22, - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8) >> 30 | SafeLoadAs(in + 4 * 9) << 2, - }; - shifts = simd_batch{ 0, 
0, 4, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + using Dispatch = Kernel; - // extract 26-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 9) >> 24 | SafeLoadAs(in + 4 * 10) << 8, - SafeLoadAs(in + 4 * 10) >> 18 | SafeLoadAs(in + 4 * 11) << 14, - SafeLoadAs(in + 4 * 11) >> 12 | SafeLoadAs(in + 4 * 12) << 20, - SafeLoadAs(in + 4 * 12), - }; - shifts = simd_batch{ 0, 0, 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - // extract 26-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 13), - SafeLoadAs(in + 4 * 13) >> 26 | SafeLoadAs(in + 4 * 14) << 6, - SafeLoadAs(in + 4 * 14) >> 20 | SafeLoadAs(in + 4 * 15) << 12, - SafeLoadAs(in + 4 * 15) >> 14 | SafeLoadAs(in + 4 * 16) << 18, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); + } +}; - // extract 26-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 16) >> 8 | SafeLoadAs(in + 4 * 17) << 24, - SafeLoadAs(in + 4 * 17), - SafeLoadAs(in + 4 * 17) >> 28 | SafeLoadAs(in + 4 * 18) << 4, - SafeLoadAs(in + 4 * 18) >> 22 | SafeLoadAs(in + 4 * 19) << 10, - }; - shifts = simd_batch{ 0, 2, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; +template<> +struct Simd128UnpackerForWidth { - // extract 26-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 19) >> 16 | SafeLoadAs(in + 4 * 20) << 16, - SafeLoadAs(in + 4 * 20) >> 10 | SafeLoadAs(in + 4 * 21) << 22, - SafeLoadAs(in + 4 * 21), - SafeLoadAs(in + 4 * 21) >> 30 | SafeLoadAs(in + 4 * 22) << 2, - }; - shifts = simd_batch{ 0, 0, 4, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + using Dispatch = Kernel; - // 
extract 26-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 22) >> 24 | SafeLoadAs(in + 4 * 23) << 8, - SafeLoadAs(in + 4 * 23) >> 18 | SafeLoadAs(in + 4 * 24) << 14, - SafeLoadAs(in + 4 * 24) >> 12 | SafeLoadAs(in + 4 * 25) << 20, - SafeLoadAs(in + 4 * 25), - }; - shifts = simd_batch{ 0, 0, 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - in += 26 * 4; - return in; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); } }; template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x7ffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 27-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 27 | SafeLoadAs(in + 4 * 1) << 5, - SafeLoadAs(in + 4 * 1) >> 22 | SafeLoadAs(in + 4 * 2) << 10, - SafeLoadAs(in + 4 * 2) >> 17 | SafeLoadAs(in + 4 * 3) << 15, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; +struct Simd128UnpackerForWidth { - // extract 27-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 3) >> 12 | SafeLoadAs(in + 4 * 4) << 20, - SafeLoadAs(in + 4 * 4) >> 7 | SafeLoadAs(in + 4 * 5) << 25, - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5) >> 29 | SafeLoadAs(in + 4 * 6) << 3, - }; - shifts = simd_batch{ 0, 0, 2, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + using Dispatch = Kernel; - // extract 27-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 6) >> 24 | SafeLoadAs(in + 4 * 7) << 8, - SafeLoadAs(in + 4 * 7) >> 19 | SafeLoadAs(in + 4 * 8) << 13, - SafeLoadAs(in + 4 * 8) >> 14 
| SafeLoadAs(in + 4 * 9) << 18, - SafeLoadAs(in + 4 * 9) >> 9 | SafeLoadAs(in + 4 * 10) << 23, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - // extract 27-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10) >> 31 | SafeLoadAs(in + 4 * 11) << 1, - SafeLoadAs(in + 4 * 11) >> 26 | SafeLoadAs(in + 4 * 12) << 6, - SafeLoadAs(in + 4 * 12) >> 21 | SafeLoadAs(in + 4 * 13) << 11, - }; - shifts = simd_batch{ 4, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 27-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 13) >> 16 | SafeLoadAs(in + 4 * 14) << 16, - SafeLoadAs(in + 4 * 14) >> 11 | SafeLoadAs(in + 4 * 15) << 21, - SafeLoadAs(in + 4 * 15) >> 6 | SafeLoadAs(in + 4 * 16) << 26, - SafeLoadAs(in + 4 * 16), - }; - shifts = simd_batch{ 0, 0, 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 27-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 16) >> 28 | SafeLoadAs(in + 4 * 17) << 4, - SafeLoadAs(in + 4 * 17) >> 23 | SafeLoadAs(in + 4 * 18) << 9, - SafeLoadAs(in + 4 * 18) >> 18 | SafeLoadAs(in + 4 * 19) << 14, - SafeLoadAs(in + 4 * 19) >> 13 | SafeLoadAs(in + 4 * 20) << 19, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 27-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 20) >> 8 | SafeLoadAs(in + 4 * 21) << 24, - SafeLoadAs(in + 4 * 21), - SafeLoadAs(in + 4 * 21) >> 30 | SafeLoadAs(in + 4 * 22) << 2, - SafeLoadAs(in + 4 * 22) >> 25 | SafeLoadAs(in + 4 * 23) << 7, - }; - shifts = simd_batch{ 0, 3, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 27-bit bundles 28 to 31 - words = 
simd_batch{ - SafeLoadAs(in + 4 * 23) >> 20 | SafeLoadAs(in + 4 * 24) << 12, - SafeLoadAs(in + 4 * 24) >> 15 | SafeLoadAs(in + 4 * 25) << 17, - SafeLoadAs(in + 4 * 25) >> 10 | SafeLoadAs(in + 4 * 26) << 22, - SafeLoadAs(in + 4 * 26), - }; - shifts = simd_batch{ 0, 0, 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 27 * 4; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0xfffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 28-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 28 | SafeLoadAs(in + 4 * 1) << 4, - SafeLoadAs(in + 4 * 1) >> 24 | SafeLoadAs(in + 4 * 2) << 8, - SafeLoadAs(in + 4 * 2) >> 20 | SafeLoadAs(in + 4 * 3) << 12, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 28-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 3) >> 16 | SafeLoadAs(in + 4 * 4) << 16, - SafeLoadAs(in + 4 * 4) >> 12 | SafeLoadAs(in + 4 * 5) << 20, - SafeLoadAs(in + 4 * 5) >> 8 | SafeLoadAs(in + 4 * 6) << 24, - SafeLoadAs(in + 4 * 6), - }; - shifts = simd_batch{ 0, 0, 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 28-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7) >> 28 | SafeLoadAs(in + 4 * 8) << 4, - SafeLoadAs(in + 4 * 8) >> 24 | SafeLoadAs(in + 4 * 9) << 8, - SafeLoadAs(in + 4 * 9) >> 20 | SafeLoadAs(in + 4 * 10) << 12, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 28-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 
4 * 10) >> 16 | SafeLoadAs(in + 4 * 11) << 16, - SafeLoadAs(in + 4 * 11) >> 12 | SafeLoadAs(in + 4 * 12) << 20, - SafeLoadAs(in + 4 * 12) >> 8 | SafeLoadAs(in + 4 * 13) << 24, - SafeLoadAs(in + 4 * 13), - }; - shifts = simd_batch{ 0, 0, 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 28-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 14), - SafeLoadAs(in + 4 * 14) >> 28 | SafeLoadAs(in + 4 * 15) << 4, - SafeLoadAs(in + 4 * 15) >> 24 | SafeLoadAs(in + 4 * 16) << 8, - SafeLoadAs(in + 4 * 16) >> 20 | SafeLoadAs(in + 4 * 17) << 12, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 28-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 17) >> 16 | SafeLoadAs(in + 4 * 18) << 16, - SafeLoadAs(in + 4 * 18) >> 12 | SafeLoadAs(in + 4 * 19) << 20, - SafeLoadAs(in + 4 * 19) >> 8 | SafeLoadAs(in + 4 * 20) << 24, - SafeLoadAs(in + 4 * 20), - }; - shifts = simd_batch{ 0, 0, 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 28-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 21), - SafeLoadAs(in + 4 * 21) >> 28 | SafeLoadAs(in + 4 * 22) << 4, - SafeLoadAs(in + 4 * 22) >> 24 | SafeLoadAs(in + 4 * 23) << 8, - SafeLoadAs(in + 4 * 23) >> 20 | SafeLoadAs(in + 4 * 24) << 12, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 28-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 24) >> 16 | SafeLoadAs(in + 4 * 25) << 16, - SafeLoadAs(in + 4 * 25) >> 12 | SafeLoadAs(in + 4 * 26) << 20, - SafeLoadAs(in + 4 * 26) >> 8 | SafeLoadAs(in + 4 * 27) << 24, - SafeLoadAs(in + 4 * 27), - }; - shifts = simd_batch{ 0, 0, 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 28 * 4; - return in; - } -}; - 
-template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x1fffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 29-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 29 | SafeLoadAs(in + 4 * 1) << 3, - SafeLoadAs(in + 4 * 1) >> 26 | SafeLoadAs(in + 4 * 2) << 6, - SafeLoadAs(in + 4 * 2) >> 23 | SafeLoadAs(in + 4 * 3) << 9, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 29-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 3) >> 20 | SafeLoadAs(in + 4 * 4) << 12, - SafeLoadAs(in + 4 * 4) >> 17 | SafeLoadAs(in + 4 * 5) << 15, - SafeLoadAs(in + 4 * 5) >> 14 | SafeLoadAs(in + 4 * 6) << 18, - SafeLoadAs(in + 4 * 6) >> 11 | SafeLoadAs(in + 4 * 7) << 21, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 29-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 7) >> 8 | SafeLoadAs(in + 4 * 8) << 24, - SafeLoadAs(in + 4 * 8) >> 5 | SafeLoadAs(in + 4 * 9) << 27, - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9) >> 31 | SafeLoadAs(in + 4 * 10) << 1, - }; - shifts = simd_batch{ 0, 0, 2, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 29-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 10) >> 28 | SafeLoadAs(in + 4 * 11) << 4, - SafeLoadAs(in + 4 * 11) >> 25 | SafeLoadAs(in + 4 * 12) << 7, - SafeLoadAs(in + 4 * 12) >> 22 | SafeLoadAs(in + 4 * 13) << 10, - SafeLoadAs(in + 4 * 13) >> 19 | SafeLoadAs(in + 4 * 14) << 13, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 
4; - - // extract 29-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 14) >> 16 | SafeLoadAs(in + 4 * 15) << 16, - SafeLoadAs(in + 4 * 15) >> 13 | SafeLoadAs(in + 4 * 16) << 19, - SafeLoadAs(in + 4 * 16) >> 10 | SafeLoadAs(in + 4 * 17) << 22, - SafeLoadAs(in + 4 * 17) >> 7 | SafeLoadAs(in + 4 * 18) << 25, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 29-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 18) >> 4 | SafeLoadAs(in + 4 * 19) << 28, - SafeLoadAs(in + 4 * 19), - SafeLoadAs(in + 4 * 19) >> 30 | SafeLoadAs(in + 4 * 20) << 2, - SafeLoadAs(in + 4 * 20) >> 27 | SafeLoadAs(in + 4 * 21) << 5, - }; - shifts = simd_batch{ 0, 1, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 29-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 21) >> 24 | SafeLoadAs(in + 4 * 22) << 8, - SafeLoadAs(in + 4 * 22) >> 21 | SafeLoadAs(in + 4 * 23) << 11, - SafeLoadAs(in + 4 * 23) >> 18 | SafeLoadAs(in + 4 * 24) << 14, - SafeLoadAs(in + 4 * 24) >> 15 | SafeLoadAs(in + 4 * 25) << 17, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 29-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 25) >> 12 | SafeLoadAs(in + 4 * 26) << 20, - SafeLoadAs(in + 4 * 26) >> 9 | SafeLoadAs(in + 4 * 27) << 23, - SafeLoadAs(in + 4 * 27) >> 6 | SafeLoadAs(in + 4 * 28) << 26, - SafeLoadAs(in + 4 * 28), - }; - shifts = simd_batch{ 0, 0, 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 29 * 4; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x3fffffff; - - 
simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 30-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 30 | SafeLoadAs(in + 4 * 1) << 2, - SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, - SafeLoadAs(in + 4 * 2) >> 26 | SafeLoadAs(in + 4 * 3) << 6, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 30-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 3) >> 24 | SafeLoadAs(in + 4 * 4) << 8, - SafeLoadAs(in + 4 * 4) >> 22 | SafeLoadAs(in + 4 * 5) << 10, - SafeLoadAs(in + 4 * 5) >> 20 | SafeLoadAs(in + 4 * 6) << 12, - SafeLoadAs(in + 4 * 6) >> 18 | SafeLoadAs(in + 4 * 7) << 14, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 30-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 7) >> 16 | SafeLoadAs(in + 4 * 8) << 16, - SafeLoadAs(in + 4 * 8) >> 14 | SafeLoadAs(in + 4 * 9) << 18, - SafeLoadAs(in + 4 * 9) >> 12 | SafeLoadAs(in + 4 * 10) << 20, - SafeLoadAs(in + 4 * 10) >> 10 | SafeLoadAs(in + 4 * 11) << 22, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 30-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 11) >> 8 | SafeLoadAs(in + 4 * 12) << 24, - SafeLoadAs(in + 4 * 12) >> 6 | SafeLoadAs(in + 4 * 13) << 26, - SafeLoadAs(in + 4 * 13) >> 4 | SafeLoadAs(in + 4 * 14) << 28, - SafeLoadAs(in + 4 * 14), - }; - shifts = simd_batch{ 0, 0, 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 30-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 15), - SafeLoadAs(in + 4 * 15) >> 30 | SafeLoadAs(in + 4 * 16) << 2, - SafeLoadAs(in + 4 * 16) >> 28 | SafeLoadAs(in + 4 * 17) << 4, - SafeLoadAs(in + 4 * 17) >> 26 | 
SafeLoadAs(in + 4 * 18) << 6, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 30-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 18) >> 24 | SafeLoadAs(in + 4 * 19) << 8, - SafeLoadAs(in + 4 * 19) >> 22 | SafeLoadAs(in + 4 * 20) << 10, - SafeLoadAs(in + 4 * 20) >> 20 | SafeLoadAs(in + 4 * 21) << 12, - SafeLoadAs(in + 4 * 21) >> 18 | SafeLoadAs(in + 4 * 22) << 14, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 30-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 22) >> 16 | SafeLoadAs(in + 4 * 23) << 16, - SafeLoadAs(in + 4 * 23) >> 14 | SafeLoadAs(in + 4 * 24) << 18, - SafeLoadAs(in + 4 * 24) >> 12 | SafeLoadAs(in + 4 * 25) << 20, - SafeLoadAs(in + 4 * 25) >> 10 | SafeLoadAs(in + 4 * 26) << 22, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 30-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 26) >> 8 | SafeLoadAs(in + 4 * 27) << 24, - SafeLoadAs(in + 4 * 27) >> 6 | SafeLoadAs(in + 4 * 28) << 26, - SafeLoadAs(in + 4 * 28) >> 4 | SafeLoadAs(in + 4 * 29) << 28, - SafeLoadAs(in + 4 * 29), - }; - shifts = simd_batch{ 0, 0, 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 30 * 4; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x7fffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 31-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 31 | SafeLoadAs(in + 4 * 1) << 1, - SafeLoadAs(in + 4 * 1) >> 30 | 
SafeLoadAs(in + 4 * 2) << 2, - SafeLoadAs(in + 4 * 2) >> 29 | SafeLoadAs(in + 4 * 3) << 3, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 31-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 3) >> 28 | SafeLoadAs(in + 4 * 4) << 4, - SafeLoadAs(in + 4 * 4) >> 27 | SafeLoadAs(in + 4 * 5) << 5, - SafeLoadAs(in + 4 * 5) >> 26 | SafeLoadAs(in + 4 * 6) << 6, - SafeLoadAs(in + 4 * 6) >> 25 | SafeLoadAs(in + 4 * 7) << 7, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 31-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 7) >> 24 | SafeLoadAs(in + 4 * 8) << 8, - SafeLoadAs(in + 4 * 8) >> 23 | SafeLoadAs(in + 4 * 9) << 9, - SafeLoadAs(in + 4 * 9) >> 22 | SafeLoadAs(in + 4 * 10) << 10, - SafeLoadAs(in + 4 * 10) >> 21 | SafeLoadAs(in + 4 * 11) << 11, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 31-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 11) >> 20 | SafeLoadAs(in + 4 * 12) << 12, - SafeLoadAs(in + 4 * 12) >> 19 | SafeLoadAs(in + 4 * 13) << 13, - SafeLoadAs(in + 4 * 13) >> 18 | SafeLoadAs(in + 4 * 14) << 14, - SafeLoadAs(in + 4 * 14) >> 17 | SafeLoadAs(in + 4 * 15) << 15, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 31-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 15) >> 16 | SafeLoadAs(in + 4 * 16) << 16, - SafeLoadAs(in + 4 * 16) >> 15 | SafeLoadAs(in + 4 * 17) << 17, - SafeLoadAs(in + 4 * 17) >> 14 | SafeLoadAs(in + 4 * 18) << 18, - SafeLoadAs(in + 4 * 18) >> 13 | SafeLoadAs(in + 4 * 19) << 19, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 31-bit 
bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 19) >> 12 | SafeLoadAs(in + 4 * 20) << 20, - SafeLoadAs(in + 4 * 20) >> 11 | SafeLoadAs(in + 4 * 21) << 21, - SafeLoadAs(in + 4 * 21) >> 10 | SafeLoadAs(in + 4 * 22) << 22, - SafeLoadAs(in + 4 * 22) >> 9 | SafeLoadAs(in + 4 * 23) << 23, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 31-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 23) >> 8 | SafeLoadAs(in + 4 * 24) << 24, - SafeLoadAs(in + 4 * 24) >> 7 | SafeLoadAs(in + 4 * 25) << 25, - SafeLoadAs(in + 4 * 25) >> 6 | SafeLoadAs(in + 4 * 26) << 26, - SafeLoadAs(in + 4 * 26) >> 5 | SafeLoadAs(in + 4 * 27) << 27, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 31-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 27) >> 4 | SafeLoadAs(in + 4 * 28) << 28, - SafeLoadAs(in + 4 * 28) >> 3 | SafeLoadAs(in + 4 * 29) << 29, - SafeLoadAs(in + 4 * 29) >> 2 | SafeLoadAs(in + 4 * 30) << 30, - SafeLoadAs(in + 4 * 30), - }; - shifts = simd_batch{ 0, 0, 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 31 * 4; - return in; - } -}; - - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x1; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 1-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 1-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = 
simd_batch{ 2, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 1-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 4, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 1-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 6, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 1-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 8, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 1-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 10, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 1-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 12, 13 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 1-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 14, 15 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 1-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 16, 17 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 1-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 18, 19 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 1-bit bundles 20 to 21 - 
words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 20, 21 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 1-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 22, 23 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 1-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 24, 25 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 1-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 26, 27 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 1-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 28, 29 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 1-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 30, 31 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 1-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 32, 33 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 1-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 34, 35 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 1-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 36, 37 }; - results = (words >> 
shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 1-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 38, 39 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 1-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 40, 41 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 1-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 42, 43 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 1-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 44, 45 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 1-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 46, 47 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 1-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 48, 49 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 1-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 50, 51 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 1-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 52, 53 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 1-bit bundles 54 to 55 - words = simd_batch{ - 
SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 54, 55 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 1-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 56, 57 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 1-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 58, 59 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 1-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 60, 61 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 1-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 62, 63 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 1 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x3; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 2-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 2-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 4, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 2-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 
* 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 8, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 2-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 12, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 2-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 16, 18 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 2-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 20, 22 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 2-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 24, 26 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 2-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 28, 30 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 2-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 32, 34 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 2-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 36, 38 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 2-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 40, 42 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); 
- out += 2; - - // extract 2-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 44, 46 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 2-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 48, 50 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 2-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 52, 54 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 2-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 56, 58 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 2-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 60, 62 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 2-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 2-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 4, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 2-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 8, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 2-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = 
simd_batch{ 12, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 2-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 16, 18 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 2-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 20, 22 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 2-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 24, 26 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 2-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 28, 30 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 2-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 32, 34 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 2-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 36, 38 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 2-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 40, 42 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 2-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 44, 46 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 2-bit bundles 
56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 48, 50 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 2-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 52, 54 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 2-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 56, 58 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 2-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 60, 62 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 2 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x7; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 3-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 3-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 6, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 3-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 12, 15 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 3-bit bundles 6 to 7 - words = 
simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 18, 21 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 3-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 24, 27 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 3-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 30, 33 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 3-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 36, 39 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 3-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 42, 45 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 3-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 48, 51 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 3-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 54, 57 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 3-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 63 | SafeLoadAs(in + 8 * 1) << 1, - }; - shifts = simd_batch{ 60, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 3-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 2, 5 }; - 
results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 3-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 8, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 3-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 14, 17 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 3-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 20, 23 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 3-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 26, 29 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 3-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 32, 35 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 3-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 38, 41 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 3-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 44, 47 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 3-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 50, 53 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 3-bit bundles 40 to 41 - words = 
simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 56, 59 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 3-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 1) >> 62 | SafeLoadAs(in + 8 * 2) << 2, - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 3-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 4, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 3-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 10, 13 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 3-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 16, 19 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 3-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 22, 25 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 3-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 28, 31 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 3-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 34, 37 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 3-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 40, 43 }; - 
results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 3-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 46, 49 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 3-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 52, 55 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 3-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 58, 61 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 3 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0xf; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 4-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 4-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 8, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 4-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 16, 20 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 4-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 24, 28 }; - results = (words 
>> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 4-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 32, 36 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 4-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 40, 44 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 4-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 48, 52 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 4-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 56, 60 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 4-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 4-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 8, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 4-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 16, 20 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 4-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 24, 28 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 4-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in 
+ 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 32, 36 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 4-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 40, 44 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 4-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 48, 52 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 4-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 56, 60 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 4-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 4-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 8, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 4-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 16, 20 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 4-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 24, 28 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 4-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 32, 36 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 2; - - // extract 4-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 40, 44 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 4-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 48, 52 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 4-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 56, 60 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 4-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 4-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 8, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 4-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 16, 20 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 4-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 24, 28 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 4-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 32, 36 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 4-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - 
SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 40, 44 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 4-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 48, 52 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 4-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 56, 60 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 4 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x1f; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 5-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 5-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 10, 15 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 5-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 20, 25 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 5-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 30, 35 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 5-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 
* 0), - }; - shifts = simd_batch{ 40, 45 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 5-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 50, 55 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 5-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 0) >> 60 | SafeLoadAs(in + 8 * 1) << 4, - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 5-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 6, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 5-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 16, 21 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 5-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 26, 31 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 5-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 36, 41 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 5-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 46, 51 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 5-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1) >> 61 | SafeLoadAs(in + 8 * 2) << 3, - }; - shifts = simd_batch{ 56, 0 }; - results = (words >> 
shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 5-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 2, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 5-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 12, 17 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 5-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 22, 27 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 5-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 32, 37 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 5-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 42, 47 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 5-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 52, 57 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 5-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 2) >> 62 | SafeLoadAs(in + 8 * 3) << 2, - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 5-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 8, 13 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 5-bit bundles 42 to 43 - 
words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 18, 23 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 5-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 28, 33 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 5-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 38, 43 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 5-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 48, 53 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 5-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3) >> 63 | SafeLoadAs(in + 8 * 4) << 1, - }; - shifts = simd_batch{ 58, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 5-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 4, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 5-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 14, 19 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 5-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 24, 29 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 5-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 
34, 39 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 5-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 44, 49 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 5-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 54, 59 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 5 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x3f; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 6-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 6-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 12, 18 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 6-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 24, 30 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 6-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 36, 42 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 6-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 48, 54 }; - 
results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 6-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 0) >> 60 | SafeLoadAs(in + 8 * 1) << 4, - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 6-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 8, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 6-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 20, 26 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 6-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 32, 38 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 6-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 44, 50 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 6-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1) >> 62 | SafeLoadAs(in + 8 * 2) << 2, - }; - shifts = simd_batch{ 56, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 6-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 4, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 6-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 16, 22 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); 
- out += 2; - - // extract 6-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 28, 34 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 6-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 40, 46 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 6-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 52, 58 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 6-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 6-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 12, 18 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 6-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 24, 30 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 6-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 36, 42 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 6-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 48, 54 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 6-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 3) >> 60 | SafeLoadAs(in + 8 * 4) << 4, - 
SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 6-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 8, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 6-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 20, 26 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 6-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 32, 38 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 6-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 44, 50 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 6-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4) >> 62 | SafeLoadAs(in + 8 * 5) << 2, - }; - shifts = simd_batch{ 56, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 6-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 4, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 6-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 16, 22 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 6-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 28, 34 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 2; - - // extract 6-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 40, 46 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 6-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 52, 58 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 6 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x7f; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 7-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 0, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 7-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 14, 21 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 7-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 28, 35 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 7-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 42, 49 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 7-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 63 | SafeLoadAs(in + 8 * 1) << 1, - }; - shifts = simd_batch{ 56, 0 }; - results = (words >> 
shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 7-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 6, 13 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 7-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 20, 27 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 7-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 34, 41 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 7-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 48, 55 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 7-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 1) >> 62 | SafeLoadAs(in + 8 * 2) << 2, - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 7-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 12, 19 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 7-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 26, 33 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 7-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 40, 47 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 7-bit bundles 26 to 27 - 
words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2) >> 61 | SafeLoadAs(in + 8 * 3) << 3, - }; - shifts = simd_batch{ 54, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 7-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 4, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 7-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 18, 25 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 7-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 32, 39 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 7-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 46, 53 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 7-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 3) >> 60 | SafeLoadAs(in + 8 * 4) << 4, - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 7-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 10, 17 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 7-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 24, 31 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 7-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 
4), - }; - shifts = simd_batch{ 38, 45 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 7-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4) >> 59 | SafeLoadAs(in + 8 * 5) << 5, - }; - shifts = simd_batch{ 52, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 7-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 2, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 7-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 16, 23 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 7-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 30, 37 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 7-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 44, 51 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 7-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 5) >> 58 | SafeLoadAs(in + 8 * 6) << 6, - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 7-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 8, 15 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 7-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 22, 29 }; - results = (words >> 
shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 7-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 36, 43 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 7-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 50, 57 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 7 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0xff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 8-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 8-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 16, 24 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 8-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 32, 40 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 8-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 48, 56 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 8-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 0, 8 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 2; - - // extract 8-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 16, 24 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 8-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 32, 40 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 8-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 48, 56 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 8-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 8-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 16, 24 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 8-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 32, 40 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 8-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 48, 56 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 8-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 8-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - 
SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 16, 24 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 8-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 32, 40 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 8-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 48, 56 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 8-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 8-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 16, 24 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 8-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 32, 40 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 8-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 48, 56 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 8-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 8-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 16, 24 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out 
+= 2; - - // extract 8-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 32, 40 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 8-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 48, 56 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 8-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 8-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 16, 24 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 8-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 32, 40 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 8-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 48, 56 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 8-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 8-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 16, 24 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 8-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - }; - shifts = 
simd_batch{ 32, 40 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 8-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 48, 56 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 8 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x1ff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 9-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 0, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 9-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 18, 27 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 9-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 36, 45 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 9-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 63 | SafeLoadAs(in + 8 * 1) << 1, - }; - shifts = simd_batch{ 54, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 9-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 8, 17 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 9-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), 
- }; - shifts = simd_batch{ 26, 35 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 9-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 44, 53 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 9-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 1) >> 62 | SafeLoadAs(in + 8 * 2) << 2, - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 0, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 9-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 16, 25 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 9-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 34, 43 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 9-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2) >> 61 | SafeLoadAs(in + 8 * 3) << 3, - }; - shifts = simd_batch{ 52, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 9-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 6, 15 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 9-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 24, 33 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 9-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 42, 51 }; - results = (words >> shifts) 
& masks; - results.store_unaligned(out); - out += 2; - - // extract 9-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 3) >> 60 | SafeLoadAs(in + 8 * 4) << 4, - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 9-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 14, 23 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 9-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 32, 41 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 9-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4) >> 59 | SafeLoadAs(in + 8 * 5) << 5, - }; - shifts = simd_batch{ 50, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 9-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 4, 13 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 9-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 22, 31 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 9-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 40, 49 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 9-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 5) >> 58 | SafeLoadAs(in + 8 * 6) << 6, - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 0, 3 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 2; - - // extract 9-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 12, 21 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 9-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 30, 39 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 9-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6) >> 57 | SafeLoadAs(in + 8 * 7) << 7, - }; - shifts = simd_batch{ 48, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 9-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 2, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 9-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 20, 29 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 9-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 38, 47 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 9-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 7) >> 56 | SafeLoadAs(in + 8 * 8) << 8, - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 9-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 10, 19 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 9-bit 
bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 28, 37 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 9-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 46, 55 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 9 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x3ff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 10-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 0, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 10-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 20, 30 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 10-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 40, 50 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 10-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 0) >> 60 | SafeLoadAs(in + 8 * 1) << 4, - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 10-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 16, 26 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 
2; - - // extract 10-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 36, 46 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 10-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 1) >> 56 | SafeLoadAs(in + 8 * 2) << 8, - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 10-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 12, 22 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 10-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 32, 42 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 10-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2) >> 62 | SafeLoadAs(in + 8 * 3) << 2, - }; - shifts = simd_batch{ 52, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 10-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 8, 18 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 10-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 28, 38 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 10-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3) >> 58 | SafeLoadAs(in + 8 * 4) << 6, - }; - shifts = simd_batch{ 48, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 10-bit 
bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 4, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 10-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 24, 34 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 10-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 44, 54 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 10-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 0, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 10-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 20, 30 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 10-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 40, 50 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 10-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 5) >> 60 | SafeLoadAs(in + 8 * 6) << 4, - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 10-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 16, 26 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 10-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - }; 
- shifts = simd_batch{ 36, 46 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 10-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 6) >> 56 | SafeLoadAs(in + 8 * 7) << 8, - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 10-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 12, 22 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 10-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 32, 42 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 10-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7) >> 62 | SafeLoadAs(in + 8 * 8) << 2, - }; - shifts = simd_batch{ 52, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 10-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 8, 18 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 10-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 28, 38 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 10-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8) >> 58 | SafeLoadAs(in + 8 * 9) << 6, - }; - shifts = simd_batch{ 48, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 10-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 
4, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 10-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 24, 34 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 10-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 44, 54 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 10 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x7ff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 11-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 0, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 11-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 22, 33 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 11-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 55 | SafeLoadAs(in + 8 * 1) << 9, - }; - shifts = simd_batch{ 44, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 11-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 2, 13 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 11-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - 
}; - shifts = simd_batch{ 24, 35 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 11-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1) >> 57 | SafeLoadAs(in + 8 * 2) << 7, - }; - shifts = simd_batch{ 46, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 11-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 4, 15 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 11-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 26, 37 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 11-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2) >> 59 | SafeLoadAs(in + 8 * 3) << 5, - }; - shifts = simd_batch{ 48, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 11-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 6, 17 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 11-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 28, 39 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 11-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3) >> 61 | SafeLoadAs(in + 8 * 4) << 3, - }; - shifts = simd_batch{ 50, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 11-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = 
simd_batch{ 8, 19 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 11-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 30, 41 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 11-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4) >> 63 | SafeLoadAs(in + 8 * 5) << 1, - }; - shifts = simd_batch{ 52, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 11-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 10, 21 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 11-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 32, 43 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 11-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 5) >> 54 | SafeLoadAs(in + 8 * 6) << 10, - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 11-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 12, 23 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 11-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 34, 45 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 11-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 6) >> 56 | SafeLoadAs(in + 8 * 7) << 8, - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 0, 3 }; - 
results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 11-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 14, 25 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 11-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 36, 47 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 11-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 7) >> 58 | SafeLoadAs(in + 8 * 8) << 6, - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 11-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 16, 27 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 11-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 38, 49 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 11-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 8) >> 60 | SafeLoadAs(in + 8 * 9) << 4, - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 0, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 11-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 18, 29 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 11-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 40, 51 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 2; - - // extract 11-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 9) >> 62 | SafeLoadAs(in + 8 * 10) << 2, - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 0, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 11-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 20, 31 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 11-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 42, 53 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 11 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0xfff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 12-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 0, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 12-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 24, 36 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 12-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 60 | SafeLoadAs(in + 8 * 1) << 4, - }; - shifts = simd_batch{ 48, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 12-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - 
shifts = simd_batch{ 8, 20 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 12-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 32, 44 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 12-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 1) >> 56 | SafeLoadAs(in + 8 * 2) << 8, - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 12-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 16, 28 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 12-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 40, 52 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 12-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 0, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 12-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 24, 36 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 12-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3) >> 60 | SafeLoadAs(in + 8 * 4) << 4, - }; - shifts = simd_batch{ 48, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 12-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 8, 20 }; - results = (words >> shifts) & 
masks; - results.store_unaligned(out); - out += 2; - - // extract 12-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 32, 44 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 12-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 4) >> 56 | SafeLoadAs(in + 8 * 5) << 8, - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 12-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 16, 28 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 12-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 40, 52 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 12-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 0, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 12-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 24, 36 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 12-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6) >> 60 | SafeLoadAs(in + 8 * 7) << 4, - }; - shifts = simd_batch{ 48, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 12-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 8, 20 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // 
extract 12-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 32, 44 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 12-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 7) >> 56 | SafeLoadAs(in + 8 * 8) << 8, - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 12-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 16, 28 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 12-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 40, 52 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 12-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 0, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 12-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 24, 36 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 12-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9) >> 60 | SafeLoadAs(in + 8 * 10) << 4, - }; - shifts = simd_batch{ 48, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 12-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 8, 20 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 12-bit bundles 56 to 57 - words = simd_batch{ - 
SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 32, 44 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 12-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 10) >> 56 | SafeLoadAs(in + 8 * 11) << 8, - SafeLoadAs(in + 8 * 11), - }; - shifts = simd_batch{ 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 12-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11), - }; - shifts = simd_batch{ 16, 28 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 12-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11), - }; - shifts = simd_batch{ 40, 52 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 12 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x1fff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 13-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 0, 13 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 13-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 26, 39 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 13-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 0) >> 52 | SafeLoadAs(in + 8 * 1) << 12, - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); 
- out += 2; - - // extract 13-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 14, 27 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 13-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1) >> 53 | SafeLoadAs(in + 8 * 2) << 11, - }; - shifts = simd_batch{ 40, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 13-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 2, 15 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 13-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 28, 41 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 13-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 2) >> 54 | SafeLoadAs(in + 8 * 3) << 10, - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 13-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 16, 29 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 13-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3) >> 55 | SafeLoadAs(in + 8 * 4) << 9, - }; - shifts = simd_batch{ 42, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 13-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 4, 17 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 
13-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 30, 43 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 13-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 4) >> 56 | SafeLoadAs(in + 8 * 5) << 8, - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 13-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 18, 31 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 13-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5) >> 57 | SafeLoadAs(in + 8 * 6) << 7, - }; - shifts = simd_batch{ 44, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 13-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 6, 19 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 13-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 32, 45 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 13-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 6) >> 58 | SafeLoadAs(in + 8 * 7) << 6, - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 0, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 13-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 20, 33 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 13-bit bundles 38 to 39 - 
words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7) >> 59 | SafeLoadAs(in + 8 * 8) << 5, - }; - shifts = simd_batch{ 46, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 13-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 8, 21 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 13-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 34, 47 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 13-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 8) >> 60 | SafeLoadAs(in + 8 * 9) << 4, - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 0, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 13-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 22, 35 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 13-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9) >> 61 | SafeLoadAs(in + 8 * 10) << 3, - }; - shifts = simd_batch{ 48, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 13-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 10, 23 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 13-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 36, 49 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 13-bit bundles 54 to 55 - words = simd_batch{ 
- SafeLoadAs(in + 8 * 10) >> 62 | SafeLoadAs(in + 8 * 11) << 2, - SafeLoadAs(in + 8 * 11), - }; - shifts = simd_batch{ 0, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 13-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11), - }; - shifts = simd_batch{ 24, 37 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 13-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11) >> 63 | SafeLoadAs(in + 8 * 12) << 1, - }; - shifts = simd_batch{ 50, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 13-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12), - }; - shifts = simd_batch{ 12, 25 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 13-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12), - }; - shifts = simd_batch{ 38, 51 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 13 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x3fff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 14-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 0, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 14-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 28, 42 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 2; - - // extract 14-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 0) >> 56 | SafeLoadAs(in + 8 * 1) << 8, - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 14-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 20, 34 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 14-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1) >> 62 | SafeLoadAs(in + 8 * 2) << 2, - }; - shifts = simd_batch{ 48, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 14-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 12, 26 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 14-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2) >> 54 | SafeLoadAs(in + 8 * 3) << 10, - }; - shifts = simd_batch{ 40, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 14-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 4, 18 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 14-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 32, 46 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 14-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 3) >> 60 | SafeLoadAs(in + 8 * 4) << 4, - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 0, 10 }; - results = (words >> shifts) & 
masks; - results.store_unaligned(out); - out += 2; - - // extract 14-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 24, 38 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 14-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 4) >> 52 | SafeLoadAs(in + 8 * 5) << 12, - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 14-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 16, 30 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 14-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5) >> 58 | SafeLoadAs(in + 8 * 6) << 6, - }; - shifts = simd_batch{ 44, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 14-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 8, 22 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 14-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 36, 50 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 14-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 0, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 14-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 28, 42 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // 
extract 14-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 7) >> 56 | SafeLoadAs(in + 8 * 8) << 8, - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 14-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 20, 34 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 14-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8) >> 62 | SafeLoadAs(in + 8 * 9) << 2, - }; - shifts = simd_batch{ 48, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 14-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 12, 26 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 14-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9) >> 54 | SafeLoadAs(in + 8 * 10) << 10, - }; - shifts = simd_batch{ 40, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 14-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 4, 18 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 14-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 32, 46 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 14-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 10) >> 60 | SafeLoadAs(in + 8 * 11) << 4, - SafeLoadAs(in + 8 * 11), - }; - shifts = simd_batch{ 0, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); 
- out += 2; - - // extract 14-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11), - }; - shifts = simd_batch{ 24, 38 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 14-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 11) >> 52 | SafeLoadAs(in + 8 * 12) << 12, - SafeLoadAs(in + 8 * 12), - }; - shifts = simd_batch{ 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 14-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12), - }; - shifts = simd_batch{ 16, 30 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 14-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12) >> 58 | SafeLoadAs(in + 8 * 13) << 6, - }; - shifts = simd_batch{ 44, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 14-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13), - }; - shifts = simd_batch{ 8, 22 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 14-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13), - }; - shifts = simd_batch{ 36, 50 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 14 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x7fff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 15-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = 
simd_batch{ 0, 15 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 15-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 30, 45 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 15-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 0) >> 60 | SafeLoadAs(in + 8 * 1) << 4, - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 0, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 15-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 26, 41 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 15-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 1) >> 56 | SafeLoadAs(in + 8 * 2) << 8, - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 0, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 15-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 22, 37 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 15-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 2) >> 52 | SafeLoadAs(in + 8 * 3) << 12, - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 15-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 18, 33 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 15-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3) >> 63 | SafeLoadAs(in + 8 * 4) << 1, - }; - 
shifts = simd_batch{ 48, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 15-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 14, 29 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 15-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4) >> 59 | SafeLoadAs(in + 8 * 5) << 5, - }; - shifts = simd_batch{ 44, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 15-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 10, 25 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 15-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5) >> 55 | SafeLoadAs(in + 8 * 6) << 9, - }; - shifts = simd_batch{ 40, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 15-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 6, 21 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 15-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6) >> 51 | SafeLoadAs(in + 8 * 7) << 13, - }; - shifts = simd_batch{ 36, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 15-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 2, 17 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 15-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 32, 
47 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 15-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 7) >> 62 | SafeLoadAs(in + 8 * 8) << 2, - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 0, 13 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 15-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 28, 43 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 15-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 8) >> 58 | SafeLoadAs(in + 8 * 9) << 6, - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 0, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 15-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 24, 39 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 15-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 9) >> 54 | SafeLoadAs(in + 8 * 10) << 10, - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 15-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 20, 35 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 15-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 10) >> 50 | SafeLoadAs(in + 8 * 11) << 14, - SafeLoadAs(in + 8 * 11), - }; - shifts = simd_batch{ 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 15-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11), - }; - 
shifts = simd_batch{ 16, 31 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 15-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11) >> 61 | SafeLoadAs(in + 8 * 12) << 3, - }; - shifts = simd_batch{ 46, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 15-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12), - }; - shifts = simd_batch{ 12, 27 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 15-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12) >> 57 | SafeLoadAs(in + 8 * 13) << 7, - }; - shifts = simd_batch{ 42, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 15-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13), - }; - shifts = simd_batch{ 8, 23 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 15-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13) >> 53 | SafeLoadAs(in + 8 * 14) << 11, - }; - shifts = simd_batch{ 38, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 15-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14), - }; - shifts = simd_batch{ 4, 19 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 15-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14), - }; - shifts = simd_batch{ 34, 49 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 15 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = 
xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0xffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 16-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 16-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 32, 48 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 16-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 16-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 32, 48 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 16-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 16-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 32, 48 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 16-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 16-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - 
shifts = simd_batch{ 32, 48 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 16-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 16-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 32, 48 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 16-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 16-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 32, 48 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 16-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 16-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 32, 48 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 16-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 16-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 32, 48 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 
16-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 16-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 32, 48 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 16-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 16-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 32, 48 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 16-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 16-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 32, 48 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 16-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11), - }; - shifts = simd_batch{ 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 16-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11), - }; - shifts = simd_batch{ 32, 48 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 16-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12), - }; - shifts = 
simd_batch{ 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 16-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12), - }; - shifts = simd_batch{ 32, 48 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 16-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13), - }; - shifts = simd_batch{ 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 16-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13), - }; - shifts = simd_batch{ 32, 48 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 16-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14), - }; - shifts = simd_batch{ 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 16-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14), - }; - shifts = simd_batch{ 32, 48 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 16-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15), - }; - shifts = simd_batch{ 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 16-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15), - }; - shifts = simd_batch{ 32, 48 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 16 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* 
out) { - constexpr uint64_t kMask = 0x1ffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 17-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 0, 17 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 17-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 51 | SafeLoadAs(in + 8 * 1) << 13, - }; - shifts = simd_batch{ 34, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 17-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 4, 21 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 17-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1) >> 55 | SafeLoadAs(in + 8 * 2) << 9, - }; - shifts = simd_batch{ 38, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 17-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 8, 25 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 17-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2) >> 59 | SafeLoadAs(in + 8 * 3) << 5, - }; - shifts = simd_batch{ 42, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 17-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 12, 29 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 17-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3) >> 63 | SafeLoadAs(in + 8 * 4) << 
1, - }; - shifts = simd_batch{ 46, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 17-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 16, 33 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 17-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 4) >> 50 | SafeLoadAs(in + 8 * 5) << 14, - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 17-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 20, 37 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 17-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 5) >> 54 | SafeLoadAs(in + 8 * 6) << 10, - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 0, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 17-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 24, 41 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 17-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 6) >> 58 | SafeLoadAs(in + 8 * 7) << 6, - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 0, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 17-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 28, 45 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 17-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 7) >> 62 | SafeLoadAs(in + 8 * 8) << 2, - 
SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 0, 15 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 17-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8) >> 49 | SafeLoadAs(in + 8 * 9) << 15, - }; - shifts = simd_batch{ 32, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 17-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 2, 19 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 17-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9) >> 53 | SafeLoadAs(in + 8 * 10) << 11, - }; - shifts = simd_batch{ 36, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 17-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 6, 23 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 17-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10) >> 57 | SafeLoadAs(in + 8 * 11) << 7, - }; - shifts = simd_batch{ 40, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 17-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11), - }; - shifts = simd_batch{ 10, 27 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 17-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11) >> 61 | SafeLoadAs(in + 8 * 12) << 3, - }; - shifts = simd_batch{ 44, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 17-bit bundles 46 to 47 - words = simd_batch{ - 
SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12), - }; - shifts = simd_batch{ 14, 31 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 17-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 12) >> 48 | SafeLoadAs(in + 8 * 13) << 16, - SafeLoadAs(in + 8 * 13), - }; - shifts = simd_batch{ 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 17-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13), - }; - shifts = simd_batch{ 18, 35 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 17-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 13) >> 52 | SafeLoadAs(in + 8 * 14) << 12, - SafeLoadAs(in + 8 * 14), - }; - shifts = simd_batch{ 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 17-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14), - }; - shifts = simd_batch{ 22, 39 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 17-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 14) >> 56 | SafeLoadAs(in + 8 * 15) << 8, - SafeLoadAs(in + 8 * 15), - }; - shifts = simd_batch{ 0, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 17-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15), - }; - shifts = simd_batch{ 26, 43 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 17-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 15) >> 60 | SafeLoadAs(in + 8 * 16) << 4, - SafeLoadAs(in + 8 * 16), - }; - shifts = simd_batch{ 0, 13 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 17-bit bundles 
62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 16), - }; - shifts = simd_batch{ 30, 47 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 17 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x3ffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 18-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 0, 18 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 18-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 54 | SafeLoadAs(in + 8 * 1) << 10, - }; - shifts = simd_batch{ 36, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 18-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 8, 26 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 18-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1) >> 62 | SafeLoadAs(in + 8 * 2) << 2, - }; - shifts = simd_batch{ 44, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 18-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 16, 34 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 18-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 2) >> 52 | SafeLoadAs(in + 8 * 3) << 12, - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 0, 6 }; - results = 
(words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 18-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 24, 42 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 18-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 3) >> 60 | SafeLoadAs(in + 8 * 4) << 4, - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 0, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 18-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4) >> 50 | SafeLoadAs(in + 8 * 5) << 14, - }; - shifts = simd_batch{ 32, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 18-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 4, 22 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 18-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5) >> 58 | SafeLoadAs(in + 8 * 6) << 6, - }; - shifts = simd_batch{ 40, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 18-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 12, 30 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 18-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 6) >> 48 | SafeLoadAs(in + 8 * 7) << 16, - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 18-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 20, 38 }; 
- results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 18-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 7) >> 56 | SafeLoadAs(in + 8 * 8) << 8, - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 0, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 18-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 28, 46 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 18-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 0, 18 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 18-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9) >> 54 | SafeLoadAs(in + 8 * 10) << 10, - }; - shifts = simd_batch{ 36, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 18-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 8, 26 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 18-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10) >> 62 | SafeLoadAs(in + 8 * 11) << 2, - }; - shifts = simd_batch{ 44, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 18-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11), - }; - shifts = simd_batch{ 16, 34 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 18-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 11) >> 52 | SafeLoadAs(in + 8 * 12) << 12, - SafeLoadAs(in + 8 * 12), - }; - shifts 
= simd_batch{ 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 18-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12), - }; - shifts = simd_batch{ 24, 42 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 18-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 12) >> 60 | SafeLoadAs(in + 8 * 13) << 4, - SafeLoadAs(in + 8 * 13), - }; - shifts = simd_batch{ 0, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 18-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13) >> 50 | SafeLoadAs(in + 8 * 14) << 14, - }; - shifts = simd_batch{ 32, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 18-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14), - }; - shifts = simd_batch{ 4, 22 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 18-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14) >> 58 | SafeLoadAs(in + 8 * 15) << 6, - }; - shifts = simd_batch{ 40, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 18-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15), - }; - shifts = simd_batch{ 12, 30 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 18-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 15) >> 48 | SafeLoadAs(in + 8 * 16) << 16, - SafeLoadAs(in + 8 * 16), - }; - shifts = simd_batch{ 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 18-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 16), - 
SafeLoadAs(in + 8 * 16), - }; - shifts = simd_batch{ 20, 38 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 18-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 16) >> 56 | SafeLoadAs(in + 8 * 17) << 8, - SafeLoadAs(in + 8 * 17), - }; - shifts = simd_batch{ 0, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 18-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 17), - SafeLoadAs(in + 8 * 17), - }; - shifts = simd_batch{ 28, 46 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 18 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x7ffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 19-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 0, 19 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 19-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 57 | SafeLoadAs(in + 8 * 1) << 7, - }; - shifts = simd_batch{ 38, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 19-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 12, 31 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 19-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 1) >> 50 | SafeLoadAs(in + 8 * 2) << 14, - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 0, 5 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 2; - - // extract 19-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 24, 43 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 19-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 2) >> 62 | SafeLoadAs(in + 8 * 3) << 2, - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 0, 17 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 19-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3) >> 55 | SafeLoadAs(in + 8 * 4) << 9, - }; - shifts = simd_batch{ 36, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 19-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 10, 29 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 19-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 4) >> 48 | SafeLoadAs(in + 8 * 5) << 16, - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 19-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 22, 41 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 19-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 5) >> 60 | SafeLoadAs(in + 8 * 6) << 4, - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 0, 15 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 19-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6) >> 53 | SafeLoadAs(in + 8 * 7) << 11, - }; - shifts = simd_batch{ 34, 
0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 19-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 8, 27 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 19-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 7) >> 46 | SafeLoadAs(in + 8 * 8) << 18, - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 19-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 20, 39 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 19-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 8) >> 58 | SafeLoadAs(in + 8 * 9) << 6, - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 0, 13 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 19-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9) >> 51 | SafeLoadAs(in + 8 * 10) << 13, - }; - shifts = simd_batch{ 32, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 19-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 6, 25 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 19-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10) >> 63 | SafeLoadAs(in + 8 * 11) << 1, - }; - shifts = simd_batch{ 44, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 19-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11), - }; - shifts 
= simd_batch{ 18, 37 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 19-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 11) >> 56 | SafeLoadAs(in + 8 * 12) << 8, - SafeLoadAs(in + 8 * 12), - }; - shifts = simd_batch{ 0, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 19-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12) >> 49 | SafeLoadAs(in + 8 * 13) << 15, - }; - shifts = simd_batch{ 30, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 19-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13), - }; - shifts = simd_batch{ 4, 23 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 19-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13) >> 61 | SafeLoadAs(in + 8 * 14) << 3, - }; - shifts = simd_batch{ 42, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 19-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14), - }; - shifts = simd_batch{ 16, 35 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 19-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 14) >> 54 | SafeLoadAs(in + 8 * 15) << 10, - SafeLoadAs(in + 8 * 15), - }; - shifts = simd_batch{ 0, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 19-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15) >> 47 | SafeLoadAs(in + 8 * 16) << 17, - }; - shifts = simd_batch{ 28, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 19-bit bundles 54 to 55 - words = 
simd_batch{ - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 16), - }; - shifts = simd_batch{ 2, 21 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 19-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 16) >> 59 | SafeLoadAs(in + 8 * 17) << 5, - }; - shifts = simd_batch{ 40, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 19-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 17), - SafeLoadAs(in + 8 * 17), - }; - shifts = simd_batch{ 14, 33 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 19-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 17) >> 52 | SafeLoadAs(in + 8 * 18) << 12, - SafeLoadAs(in + 8 * 18), - }; - shifts = simd_batch{ 0, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 19-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 18), - SafeLoadAs(in + 8 * 18), - }; - shifts = simd_batch{ 26, 45 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 19 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0xfffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 20-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 0, 20 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 20-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 60 | SafeLoadAs(in + 8 * 1) << 4, - }; - shifts = simd_batch{ 40, 0 }; - results = 
(words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 20-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 16, 36 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 20-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 1) >> 56 | SafeLoadAs(in + 8 * 2) << 8, - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 0, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 20-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2) >> 52 | SafeLoadAs(in + 8 * 3) << 12, - }; - shifts = simd_batch{ 32, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 20-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 8, 28 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 20-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 3) >> 48 | SafeLoadAs(in + 8 * 4) << 16, - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 20-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 24, 44 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 20-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 0, 20 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 20-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5) >> 60 | SafeLoadAs(in + 8 * 6) << 4, - }; - shifts = simd_batch{ 40, 0 }; - 
results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 20-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 16, 36 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 20-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 6) >> 56 | SafeLoadAs(in + 8 * 7) << 8, - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 0, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 20-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7) >> 52 | SafeLoadAs(in + 8 * 8) << 12, - }; - shifts = simd_batch{ 32, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 20-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 8, 28 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 20-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 8) >> 48 | SafeLoadAs(in + 8 * 9) << 16, - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 20-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 24, 44 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 20-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 0, 20 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 20-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10) >> 60 | SafeLoadAs(in + 8 * 11) << 4, - }; - shifts = 
simd_batch{ 40, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 20-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11), - }; - shifts = simd_batch{ 16, 36 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 20-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 11) >> 56 | SafeLoadAs(in + 8 * 12) << 8, - SafeLoadAs(in + 8 * 12), - }; - shifts = simd_batch{ 0, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 20-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12) >> 52 | SafeLoadAs(in + 8 * 13) << 12, - }; - shifts = simd_batch{ 32, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 20-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13), - }; - shifts = simd_batch{ 8, 28 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 20-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 13) >> 48 | SafeLoadAs(in + 8 * 14) << 16, - SafeLoadAs(in + 8 * 14), - }; - shifts = simd_batch{ 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 20-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14), - }; - shifts = simd_batch{ 24, 44 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 20-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15), - }; - shifts = simd_batch{ 0, 20 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 20-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15) >> 60 | SafeLoadAs(in 
+ 8 * 16) << 4, - }; - shifts = simd_batch{ 40, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 20-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 16), - }; - shifts = simd_batch{ 16, 36 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 20-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 16) >> 56 | SafeLoadAs(in + 8 * 17) << 8, - SafeLoadAs(in + 8 * 17), - }; - shifts = simd_batch{ 0, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 20-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 17), - SafeLoadAs(in + 8 * 17) >> 52 | SafeLoadAs(in + 8 * 18) << 12, - }; - shifts = simd_batch{ 32, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 20-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 18), - SafeLoadAs(in + 8 * 18), - }; - shifts = simd_batch{ 8, 28 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 20-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 18) >> 48 | SafeLoadAs(in + 8 * 19) << 16, - SafeLoadAs(in + 8 * 19), - }; - shifts = simd_batch{ 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 20-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 19), - SafeLoadAs(in + 8 * 19), - }; - shifts = simd_batch{ 24, 44 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 20 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x1fffff; - - simd_batch masks(kMask); - simd_batch words, 
shifts; - simd_batch results; - // extract 21-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 0, 21 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 21-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 63 | SafeLoadAs(in + 8 * 1) << 1, - }; - shifts = simd_batch{ 42, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 21-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 20, 41 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 21-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 1) >> 62 | SafeLoadAs(in + 8 * 2) << 2, - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 0, 19 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 21-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2) >> 61 | SafeLoadAs(in + 8 * 3) << 3, - }; - shifts = simd_batch{ 40, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 21-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 18, 39 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 21-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 3) >> 60 | SafeLoadAs(in + 8 * 4) << 4, - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 0, 17 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 21-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4) >> 59 | SafeLoadAs(in + 8 * 5) << 5, - }; - shifts = simd_batch{ 38, 0 }; - results = (words 
>> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 21-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 16, 37 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 21-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 5) >> 58 | SafeLoadAs(in + 8 * 6) << 6, - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 0, 15 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 21-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6) >> 57 | SafeLoadAs(in + 8 * 7) << 7, - }; - shifts = simd_batch{ 36, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 21-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 14, 35 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 21-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 7) >> 56 | SafeLoadAs(in + 8 * 8) << 8, - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 0, 13 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 21-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8) >> 55 | SafeLoadAs(in + 8 * 9) << 9, - }; - shifts = simd_batch{ 34, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 21-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 12, 33 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 21-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 9) >> 54 | SafeLoadAs(in + 8 * 10) << 10, - SafeLoadAs(in + 8 * 10), - }; 
- shifts = simd_batch{ 0, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 21-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10) >> 53 | SafeLoadAs(in + 8 * 11) << 11, - }; - shifts = simd_batch{ 32, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 21-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11), - }; - shifts = simd_batch{ 10, 31 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 21-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 11) >> 52 | SafeLoadAs(in + 8 * 12) << 12, - SafeLoadAs(in + 8 * 12), - }; - shifts = simd_batch{ 0, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 21-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12) >> 51 | SafeLoadAs(in + 8 * 13) << 13, - }; - shifts = simd_batch{ 30, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 21-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13), - }; - shifts = simd_batch{ 8, 29 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 21-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 13) >> 50 | SafeLoadAs(in + 8 * 14) << 14, - SafeLoadAs(in + 8 * 14), - }; - shifts = simd_batch{ 0, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 21-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14) >> 49 | SafeLoadAs(in + 8 * 15) << 15, - }; - shifts = simd_batch{ 28, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 21-bit bundles 46 to 47 - words = 
simd_batch{ - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15), - }; - shifts = simd_batch{ 6, 27 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 21-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 15) >> 48 | SafeLoadAs(in + 8 * 16) << 16, - SafeLoadAs(in + 8 * 16), - }; - shifts = simd_batch{ 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 21-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 16) >> 47 | SafeLoadAs(in + 8 * 17) << 17, - }; - shifts = simd_batch{ 26, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 21-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 17), - SafeLoadAs(in + 8 * 17), - }; - shifts = simd_batch{ 4, 25 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 21-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 17) >> 46 | SafeLoadAs(in + 8 * 18) << 18, - SafeLoadAs(in + 8 * 18), - }; - shifts = simd_batch{ 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 21-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 18), - SafeLoadAs(in + 8 * 18) >> 45 | SafeLoadAs(in + 8 * 19) << 19, - }; - shifts = simd_batch{ 24, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 21-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 19), - SafeLoadAs(in + 8 * 19), - }; - shifts = simd_batch{ 2, 23 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 21-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 19) >> 44 | SafeLoadAs(in + 8 * 20) << 20, - SafeLoadAs(in + 8 * 20), - }; - shifts = simd_batch{ 0, 1 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 2; - - // extract 21-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 20), - SafeLoadAs(in + 8 * 20), - }; - shifts = simd_batch{ 22, 43 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 21 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x3fffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 22-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 0, 22 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 22-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0) >> 44 | SafeLoadAs(in + 8 * 1) << 20, - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 22-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1) >> 46 | SafeLoadAs(in + 8 * 2) << 18, - }; - shifts = simd_batch{ 24, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 22-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 4, 26 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 22-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 2) >> 48 | SafeLoadAs(in + 8 * 3) << 16, - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 22-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - 
SafeLoadAs(in + 8 * 3) >> 50 | SafeLoadAs(in + 8 * 4) << 14, - }; - shifts = simd_batch{ 28, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 22-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 8, 30 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 22-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 4) >> 52 | SafeLoadAs(in + 8 * 5) << 12, - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 0, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 22-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5) >> 54 | SafeLoadAs(in + 8 * 6) << 10, - }; - shifts = simd_batch{ 32, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 22-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 12, 34 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 22-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 6) >> 56 | SafeLoadAs(in + 8 * 7) << 8, - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 0, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 22-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7) >> 58 | SafeLoadAs(in + 8 * 8) << 6, - }; - shifts = simd_batch{ 36, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 22-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 16, 38 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 22-bit bundles 26 to 27 
- words = simd_batch{ - SafeLoadAs(in + 8 * 8) >> 60 | SafeLoadAs(in + 8 * 9) << 4, - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 0, 18 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 22-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9) >> 62 | SafeLoadAs(in + 8 * 10) << 2, - }; - shifts = simd_batch{ 40, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 22-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 20, 42 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 22-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11), - }; - shifts = simd_batch{ 0, 22 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 22-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 11) >> 44 | SafeLoadAs(in + 8 * 12) << 20, - SafeLoadAs(in + 8 * 12), - }; - shifts = simd_batch{ 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 22-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12) >> 46 | SafeLoadAs(in + 8 * 13) << 18, - }; - shifts = simd_batch{ 24, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 22-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13), - }; - shifts = simd_batch{ 4, 26 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 22-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 13) >> 48 | SafeLoadAs(in + 8 * 14) << 16, - SafeLoadAs(in + 8 * 14), - }; - shifts = simd_batch{ 0, 6 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 2; - - // extract 22-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14) >> 50 | SafeLoadAs(in + 8 * 15) << 14, - }; - shifts = simd_batch{ 28, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 22-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15), - }; - shifts = simd_batch{ 8, 30 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 22-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 15) >> 52 | SafeLoadAs(in + 8 * 16) << 12, - SafeLoadAs(in + 8 * 16), - }; - shifts = simd_batch{ 0, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 22-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 16) >> 54 | SafeLoadAs(in + 8 * 17) << 10, - }; - shifts = simd_batch{ 32, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 22-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 17), - SafeLoadAs(in + 8 * 17), - }; - shifts = simd_batch{ 12, 34 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 22-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 17) >> 56 | SafeLoadAs(in + 8 * 18) << 8, - SafeLoadAs(in + 8 * 18), - }; - shifts = simd_batch{ 0, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 22-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 18), - SafeLoadAs(in + 8 * 18) >> 58 | SafeLoadAs(in + 8 * 19) << 6, - }; - shifts = simd_batch{ 36, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 22-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 19), - SafeLoadAs(in + 8 * 19), - }; - 
shifts = simd_batch{ 16, 38 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 22-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 19) >> 60 | SafeLoadAs(in + 8 * 20) << 4, - SafeLoadAs(in + 8 * 20), - }; - shifts = simd_batch{ 0, 18 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 22-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 20), - SafeLoadAs(in + 8 * 20) >> 62 | SafeLoadAs(in + 8 * 21) << 2, - }; - shifts = simd_batch{ 40, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 22-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 21), - SafeLoadAs(in + 8 * 21), - }; - shifts = simd_batch{ 20, 42 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 22 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x7fffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 23-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 0, 23 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 23-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0) >> 46 | SafeLoadAs(in + 8 * 1) << 18, - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 23-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1) >> 51 | SafeLoadAs(in + 8 * 2) << 13, - }; - shifts = simd_batch{ 28, 0 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 2; - - // extract 23-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 10, 33 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 23-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 2) >> 56 | SafeLoadAs(in + 8 * 3) << 8, - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 0, 15 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 23-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3) >> 61 | SafeLoadAs(in + 8 * 4) << 3, - }; - shifts = simd_batch{ 38, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 23-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4) >> 43 | SafeLoadAs(in + 8 * 5) << 21, - }; - shifts = simd_batch{ 20, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 23-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 2, 25 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 23-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 5) >> 48 | SafeLoadAs(in + 8 * 6) << 16, - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 0, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 23-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6) >> 53 | SafeLoadAs(in + 8 * 7) << 11, - }; - shifts = simd_batch{ 30, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 23-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 12, 35 
}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 23-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 7) >> 58 | SafeLoadAs(in + 8 * 8) << 6, - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 0, 17 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 23-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8) >> 63 | SafeLoadAs(in + 8 * 9) << 1, - }; - shifts = simd_batch{ 40, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 23-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9) >> 45 | SafeLoadAs(in + 8 * 10) << 19, - }; - shifts = simd_batch{ 22, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 23-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 4, 27 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 23-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 10) >> 50 | SafeLoadAs(in + 8 * 11) << 14, - SafeLoadAs(in + 8 * 11), - }; - shifts = simd_batch{ 0, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 23-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11) >> 55 | SafeLoadAs(in + 8 * 12) << 9, - }; - shifts = simd_batch{ 32, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 23-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12), - }; - shifts = simd_batch{ 14, 37 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 23-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 12) >> 
60 | SafeLoadAs(in + 8 * 13) << 4, - SafeLoadAs(in + 8 * 13), - }; - shifts = simd_batch{ 0, 19 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 23-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 13) >> 42 | SafeLoadAs(in + 8 * 14) << 22, - SafeLoadAs(in + 8 * 14), - }; - shifts = simd_batch{ 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 23-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14) >> 47 | SafeLoadAs(in + 8 * 15) << 17, - }; - shifts = simd_batch{ 24, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 23-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15), - }; - shifts = simd_batch{ 6, 29 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 23-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 15) >> 52 | SafeLoadAs(in + 8 * 16) << 12, - SafeLoadAs(in + 8 * 16), - }; - shifts = simd_batch{ 0, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 23-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 16) >> 57 | SafeLoadAs(in + 8 * 17) << 7, - }; - shifts = simd_batch{ 34, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 23-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 17), - SafeLoadAs(in + 8 * 17), - }; - shifts = simd_batch{ 16, 39 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 23-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 17) >> 62 | SafeLoadAs(in + 8 * 18) << 2, - SafeLoadAs(in + 8 * 18), - }; - shifts = simd_batch{ 0, 21 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 2; - - // extract 23-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 18) >> 44 | SafeLoadAs(in + 8 * 19) << 20, - SafeLoadAs(in + 8 * 19), - }; - shifts = simd_batch{ 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 23-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 19), - SafeLoadAs(in + 8 * 19) >> 49 | SafeLoadAs(in + 8 * 20) << 15, - }; - shifts = simd_batch{ 26, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 23-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 20), - SafeLoadAs(in + 8 * 20), - }; - shifts = simd_batch{ 8, 31 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 23-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 20) >> 54 | SafeLoadAs(in + 8 * 21) << 10, - SafeLoadAs(in + 8 * 21), - }; - shifts = simd_batch{ 0, 13 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 23-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 21), - SafeLoadAs(in + 8 * 21) >> 59 | SafeLoadAs(in + 8 * 22) << 5, - }; - shifts = simd_batch{ 36, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 23-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 22), - SafeLoadAs(in + 8 * 22), - }; - shifts = simd_batch{ 18, 41 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 23 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0xffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 24-bit bundles 
0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 0, 24 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 24-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0) >> 48 | SafeLoadAs(in + 8 * 1) << 16, - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 24-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1) >> 56 | SafeLoadAs(in + 8 * 2) << 8, - }; - shifts = simd_batch{ 32, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 24-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 16, 40 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 24-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 0, 24 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 24-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 3) >> 48 | SafeLoadAs(in + 8 * 4) << 16, - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 24-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4) >> 56 | SafeLoadAs(in + 8 * 5) << 8, - }; - shifts = simd_batch{ 32, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 24-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 16, 40 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 24-bit bundles 
16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 0, 24 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 24-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 6) >> 48 | SafeLoadAs(in + 8 * 7) << 16, - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 24-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7) >> 56 | SafeLoadAs(in + 8 * 8) << 8, - }; - shifts = simd_batch{ 32, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 24-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 16, 40 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 24-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 0, 24 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 24-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 9) >> 48 | SafeLoadAs(in + 8 * 10) << 16, - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 24-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10) >> 56 | SafeLoadAs(in + 8 * 11) << 8, - }; - shifts = simd_batch{ 32, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 24-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11), - }; - shifts = simd_batch{ 16, 40 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // 
extract 24-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12), - }; - shifts = simd_batch{ 0, 24 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 24-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 12) >> 48 | SafeLoadAs(in + 8 * 13) << 16, - SafeLoadAs(in + 8 * 13), - }; - shifts = simd_batch{ 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 24-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13) >> 56 | SafeLoadAs(in + 8 * 14) << 8, - }; - shifts = simd_batch{ 32, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 24-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14), - }; - shifts = simd_batch{ 16, 40 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 24-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15), - }; - shifts = simd_batch{ 0, 24 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 24-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 15) >> 48 | SafeLoadAs(in + 8 * 16) << 16, - SafeLoadAs(in + 8 * 16), - }; - shifts = simd_batch{ 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 24-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 16) >> 56 | SafeLoadAs(in + 8 * 17) << 8, - }; - shifts = simd_batch{ 32, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 24-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 17), - SafeLoadAs(in + 8 * 17), - }; - shifts = simd_batch{ 16, 40 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 2; - - // extract 24-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 18), - SafeLoadAs(in + 8 * 18), - }; - shifts = simd_batch{ 0, 24 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 24-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 18) >> 48 | SafeLoadAs(in + 8 * 19) << 16, - SafeLoadAs(in + 8 * 19), - }; - shifts = simd_batch{ 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 24-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 19), - SafeLoadAs(in + 8 * 19) >> 56 | SafeLoadAs(in + 8 * 20) << 8, - }; - shifts = simd_batch{ 32, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 24-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 20), - SafeLoadAs(in + 8 * 20), - }; - shifts = simd_batch{ 16, 40 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 24-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 21), - SafeLoadAs(in + 8 * 21), - }; - shifts = simd_batch{ 0, 24 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 24-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 21) >> 48 | SafeLoadAs(in + 8 * 22) << 16, - SafeLoadAs(in + 8 * 22), - }; - shifts = simd_batch{ 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 24-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 22), - SafeLoadAs(in + 8 * 22) >> 56 | SafeLoadAs(in + 8 * 23) << 8, - }; - shifts = simd_batch{ 32, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 24-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 23), - SafeLoadAs(in + 8 * 23), - }; - shifts = simd_batch{ 16, 40 }; - results 
= (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 24 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x1ffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 25-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 0, 25 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 25-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0) >> 50 | SafeLoadAs(in + 8 * 1) << 14, - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 0, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 25-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1) >> 61 | SafeLoadAs(in + 8 * 2) << 3, - }; - shifts = simd_batch{ 36, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 25-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2) >> 47 | SafeLoadAs(in + 8 * 3) << 17, - }; - shifts = simd_batch{ 22, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 25-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 8, 33 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 25-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 3) >> 58 | SafeLoadAs(in + 8 * 4) << 6, - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 0, 19 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 25-bit 
bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 4) >> 44 | SafeLoadAs(in + 8 * 5) << 20, - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 25-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5) >> 55 | SafeLoadAs(in + 8 * 6) << 9, - }; - shifts = simd_batch{ 30, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 25-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6) >> 41 | SafeLoadAs(in + 8 * 7) << 23, - }; - shifts = simd_batch{ 16, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 25-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 2, 27 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 25-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 7) >> 52 | SafeLoadAs(in + 8 * 8) << 12, - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 0, 13 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 25-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8) >> 63 | SafeLoadAs(in + 8 * 9) << 1, - }; - shifts = simd_batch{ 38, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 25-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9) >> 49 | SafeLoadAs(in + 8 * 10) << 15, - }; - shifts = simd_batch{ 24, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 25-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 10, 35 }; - results = 
(words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 25-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 10) >> 60 | SafeLoadAs(in + 8 * 11) << 4, - SafeLoadAs(in + 8 * 11), - }; - shifts = simd_batch{ 0, 21 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 25-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 11) >> 46 | SafeLoadAs(in + 8 * 12) << 18, - SafeLoadAs(in + 8 * 12), - }; - shifts = simd_batch{ 0, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 25-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12) >> 57 | SafeLoadAs(in + 8 * 13) << 7, - }; - shifts = simd_batch{ 32, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 25-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13) >> 43 | SafeLoadAs(in + 8 * 14) << 21, - }; - shifts = simd_batch{ 18, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 25-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14), - }; - shifts = simd_batch{ 4, 29 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 25-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 14) >> 54 | SafeLoadAs(in + 8 * 15) << 10, - SafeLoadAs(in + 8 * 15), - }; - shifts = simd_batch{ 0, 15 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 25-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 15) >> 40 | SafeLoadAs(in + 8 * 16) << 24, - SafeLoadAs(in + 8 * 16), - }; - shifts = simd_batch{ 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 25-bit bundles 42 to 43 - words = 
simd_batch{ - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 16) >> 51 | SafeLoadAs(in + 8 * 17) << 13, - }; - shifts = simd_batch{ 26, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 25-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 17), - SafeLoadAs(in + 8 * 17), - }; - shifts = simd_batch{ 12, 37 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 25-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 17) >> 62 | SafeLoadAs(in + 8 * 18) << 2, - SafeLoadAs(in + 8 * 18), - }; - shifts = simd_batch{ 0, 23 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 25-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 18) >> 48 | SafeLoadAs(in + 8 * 19) << 16, - SafeLoadAs(in + 8 * 19), - }; - shifts = simd_batch{ 0, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 25-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 19), - SafeLoadAs(in + 8 * 19) >> 59 | SafeLoadAs(in + 8 * 20) << 5, - }; - shifts = simd_batch{ 34, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 25-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 20), - SafeLoadAs(in + 8 * 20) >> 45 | SafeLoadAs(in + 8 * 21) << 19, - }; - shifts = simd_batch{ 20, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 25-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 21), - SafeLoadAs(in + 8 * 21), - }; - shifts = simd_batch{ 6, 31 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 25-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 21) >> 56 | SafeLoadAs(in + 8 * 22) << 8, - SafeLoadAs(in + 8 * 22), - }; - shifts = simd_batch{ 0, 17 }; - results = (words >> 
shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 25-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 22) >> 42 | SafeLoadAs(in + 8 * 23) << 22, - SafeLoadAs(in + 8 * 23), - }; - shifts = simd_batch{ 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 25-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 23), - SafeLoadAs(in + 8 * 23) >> 53 | SafeLoadAs(in + 8 * 24) << 11, - }; - shifts = simd_batch{ 28, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 25-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 24), - SafeLoadAs(in + 8 * 24), - }; - shifts = simd_batch{ 14, 39 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 25 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x3ffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 26-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 0, 26 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 26-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0) >> 52 | SafeLoadAs(in + 8 * 1) << 12, - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 0, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 26-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 1) >> 40 | SafeLoadAs(in + 8 * 2) << 24, - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 26-bit 
bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2) >> 54 | SafeLoadAs(in + 8 * 3) << 10, - }; - shifts = simd_batch{ 28, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 26-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3) >> 42 | SafeLoadAs(in + 8 * 4) << 22, - }; - shifts = simd_batch{ 16, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 26-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 4, 30 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 26-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 4) >> 56 | SafeLoadAs(in + 8 * 5) << 8, - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 0, 18 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 26-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 5) >> 44 | SafeLoadAs(in + 8 * 6) << 20, - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 26-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6) >> 58 | SafeLoadAs(in + 8 * 7) << 6, - }; - shifts = simd_batch{ 32, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 26-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7) >> 46 | SafeLoadAs(in + 8 * 8) << 18, - }; - shifts = simd_batch{ 20, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 26-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 8, 34 }; - results = (words >> 
shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 26-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 8) >> 60 | SafeLoadAs(in + 8 * 9) << 4, - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 0, 22 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 26-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 9) >> 48 | SafeLoadAs(in + 8 * 10) << 16, - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 0, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 26-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10) >> 62 | SafeLoadAs(in + 8 * 11) << 2, - }; - shifts = simd_batch{ 36, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 26-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11) >> 50 | SafeLoadAs(in + 8 * 12) << 14, - }; - shifts = simd_batch{ 24, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 26-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12), - }; - shifts = simd_batch{ 12, 38 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 26-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13), - }; - shifts = simd_batch{ 0, 26 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 26-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 13) >> 52 | SafeLoadAs(in + 8 * 14) << 12, - SafeLoadAs(in + 8 * 14), - }; - shifts = simd_batch{ 0, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 26-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 14) >> 40 | SafeLoadAs(in 
+ 8 * 15) << 24, - SafeLoadAs(in + 8 * 15), - }; - shifts = simd_batch{ 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 26-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15) >> 54 | SafeLoadAs(in + 8 * 16) << 10, - }; - shifts = simd_batch{ 28, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 26-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 16) >> 42 | SafeLoadAs(in + 8 * 17) << 22, - }; - shifts = simd_batch{ 16, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 26-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 17), - SafeLoadAs(in + 8 * 17), - }; - shifts = simd_batch{ 4, 30 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 26-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 17) >> 56 | SafeLoadAs(in + 8 * 18) << 8, - SafeLoadAs(in + 8 * 18), - }; - shifts = simd_batch{ 0, 18 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 26-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 18) >> 44 | SafeLoadAs(in + 8 * 19) << 20, - SafeLoadAs(in + 8 * 19), - }; - shifts = simd_batch{ 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 26-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 19), - SafeLoadAs(in + 8 * 19) >> 58 | SafeLoadAs(in + 8 * 20) << 6, - }; - shifts = simd_batch{ 32, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 26-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 20), - SafeLoadAs(in + 8 * 20) >> 46 | SafeLoadAs(in + 8 * 21) << 18, - }; - shifts = simd_batch{ 20, 0 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 2; - - // extract 26-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 21), - SafeLoadAs(in + 8 * 21), - }; - shifts = simd_batch{ 8, 34 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 26-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 21) >> 60 | SafeLoadAs(in + 8 * 22) << 4, - SafeLoadAs(in + 8 * 22), - }; - shifts = simd_batch{ 0, 22 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 26-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 22) >> 48 | SafeLoadAs(in + 8 * 23) << 16, - SafeLoadAs(in + 8 * 23), - }; - shifts = simd_batch{ 0, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 26-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 23), - SafeLoadAs(in + 8 * 23) >> 62 | SafeLoadAs(in + 8 * 24) << 2, - }; - shifts = simd_batch{ 36, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 26-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 24), - SafeLoadAs(in + 8 * 24) >> 50 | SafeLoadAs(in + 8 * 25) << 14, - }; - shifts = simd_batch{ 24, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 26-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 25), - SafeLoadAs(in + 8 * 25), - }; - shifts = simd_batch{ 12, 38 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 26 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x7ffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 27-bit 
bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 0, 27 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 27-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0) >> 54 | SafeLoadAs(in + 8 * 1) << 10, - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 0, 17 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 27-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 1) >> 44 | SafeLoadAs(in + 8 * 2) << 20, - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 0, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 27-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2) >> 61 | SafeLoadAs(in + 8 * 3) << 3, - }; - shifts = simd_batch{ 34, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 27-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3) >> 51 | SafeLoadAs(in + 8 * 4) << 13, - }; - shifts = simd_batch{ 24, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 27-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4) >> 41 | SafeLoadAs(in + 8 * 5) << 23, - }; - shifts = simd_batch{ 14, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 27-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 4, 31 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 27-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 5) >> 58 | SafeLoadAs(in + 8 * 6) << 6, - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 0, 21 }; - results = (words >> shifts) & 
masks; - results.store_unaligned(out); - out += 2; - - // extract 27-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 6) >> 48 | SafeLoadAs(in + 8 * 7) << 16, - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 0, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 27-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 7) >> 38 | SafeLoadAs(in + 8 * 8) << 26, - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 27-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8) >> 55 | SafeLoadAs(in + 8 * 9) << 9, - }; - shifts = simd_batch{ 28, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 27-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9) >> 45 | SafeLoadAs(in + 8 * 10) << 19, - }; - shifts = simd_batch{ 18, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 27-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 8, 35 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 27-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 10) >> 62 | SafeLoadAs(in + 8 * 11) << 2, - SafeLoadAs(in + 8 * 11), - }; - shifts = simd_batch{ 0, 25 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 27-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 11) >> 52 | SafeLoadAs(in + 8 * 12) << 12, - SafeLoadAs(in + 8 * 12), - }; - shifts = simd_batch{ 0, 15 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 27-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 12) >> 
42 | SafeLoadAs(in + 8 * 13) << 22, - SafeLoadAs(in + 8 * 13), - }; - shifts = simd_batch{ 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 27-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13) >> 59 | SafeLoadAs(in + 8 * 14) << 5, - }; - shifts = simd_batch{ 32, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 27-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14) >> 49 | SafeLoadAs(in + 8 * 15) << 15, - }; - shifts = simd_batch{ 22, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 27-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15) >> 39 | SafeLoadAs(in + 8 * 16) << 25, - }; - shifts = simd_batch{ 12, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 27-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 16), - }; - shifts = simd_batch{ 2, 29 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 27-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 16) >> 56 | SafeLoadAs(in + 8 * 17) << 8, - SafeLoadAs(in + 8 * 17), - }; - shifts = simd_batch{ 0, 19 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 27-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 17) >> 46 | SafeLoadAs(in + 8 * 18) << 18, - SafeLoadAs(in + 8 * 18), - }; - shifts = simd_batch{ 0, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 27-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 18), - SafeLoadAs(in + 8 * 18) >> 63 | SafeLoadAs(in + 8 * 19) << 1, - }; - shifts = simd_batch{ 36, 0 }; - results = (words >> shifts) & 
masks; - results.store_unaligned(out); - out += 2; - - // extract 27-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 19), - SafeLoadAs(in + 8 * 19) >> 53 | SafeLoadAs(in + 8 * 20) << 11, - }; - shifts = simd_batch{ 26, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 27-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 20), - SafeLoadAs(in + 8 * 20) >> 43 | SafeLoadAs(in + 8 * 21) << 21, - }; - shifts = simd_batch{ 16, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 27-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 21), - SafeLoadAs(in + 8 * 21), - }; - shifts = simd_batch{ 6, 33 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 27-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 21) >> 60 | SafeLoadAs(in + 8 * 22) << 4, - SafeLoadAs(in + 8 * 22), - }; - shifts = simd_batch{ 0, 23 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 27-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 22) >> 50 | SafeLoadAs(in + 8 * 23) << 14, - SafeLoadAs(in + 8 * 23), - }; - shifts = simd_batch{ 0, 13 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 27-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 23) >> 40 | SafeLoadAs(in + 8 * 24) << 24, - SafeLoadAs(in + 8 * 24), - }; - shifts = simd_batch{ 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 27-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 24), - SafeLoadAs(in + 8 * 24) >> 57 | SafeLoadAs(in + 8 * 25) << 7, - }; - shifts = simd_batch{ 30, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 27-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 
8 * 25), - SafeLoadAs(in + 8 * 25) >> 47 | SafeLoadAs(in + 8 * 26) << 17, - }; - shifts = simd_batch{ 20, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 27-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 26), - SafeLoadAs(in + 8 * 26), - }; - shifts = simd_batch{ 10, 37 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 27 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0xfffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 28-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 0, 28 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 28-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0) >> 56 | SafeLoadAs(in + 8 * 1) << 8, - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 0, 20 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 28-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 1) >> 48 | SafeLoadAs(in + 8 * 2) << 16, - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 0, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 28-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 2) >> 40 | SafeLoadAs(in + 8 * 3) << 24, - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 28-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3) >> 60 | SafeLoadAs(in + 8 * 4) << 4, - }; - shifts = 
simd_batch{ 32, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 28-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4) >> 52 | SafeLoadAs(in + 8 * 5) << 12, - }; - shifts = simd_batch{ 24, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 28-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5) >> 44 | SafeLoadAs(in + 8 * 6) << 20, - }; - shifts = simd_batch{ 16, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 28-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 8, 36 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 28-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 0, 28 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 28-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 7) >> 56 | SafeLoadAs(in + 8 * 8) << 8, - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 0, 20 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 28-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 8) >> 48 | SafeLoadAs(in + 8 * 9) << 16, - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 0, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 28-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 9) >> 40 | SafeLoadAs(in + 8 * 10) << 24, - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 28-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 
8 * 10), - SafeLoadAs(in + 8 * 10) >> 60 | SafeLoadAs(in + 8 * 11) << 4, - }; - shifts = simd_batch{ 32, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 28-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11) >> 52 | SafeLoadAs(in + 8 * 12) << 12, - }; - shifts = simd_batch{ 24, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 28-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12) >> 44 | SafeLoadAs(in + 8 * 13) << 20, - }; - shifts = simd_batch{ 16, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 28-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13), - }; - shifts = simd_batch{ 8, 36 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 28-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14), - }; - shifts = simd_batch{ 0, 28 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 28-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 14) >> 56 | SafeLoadAs(in + 8 * 15) << 8, - SafeLoadAs(in + 8 * 15), - }; - shifts = simd_batch{ 0, 20 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 28-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 15) >> 48 | SafeLoadAs(in + 8 * 16) << 16, - SafeLoadAs(in + 8 * 16), - }; - shifts = simd_batch{ 0, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 28-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 16) >> 40 | SafeLoadAs(in + 8 * 17) << 24, - SafeLoadAs(in + 8 * 17), - }; - shifts = simd_batch{ 0, 4 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 2; - - // extract 28-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 17), - SafeLoadAs(in + 8 * 17) >> 60 | SafeLoadAs(in + 8 * 18) << 4, - }; - shifts = simd_batch{ 32, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 28-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 18), - SafeLoadAs(in + 8 * 18) >> 52 | SafeLoadAs(in + 8 * 19) << 12, - }; - shifts = simd_batch{ 24, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 28-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 19), - SafeLoadAs(in + 8 * 19) >> 44 | SafeLoadAs(in + 8 * 20) << 20, - }; - shifts = simd_batch{ 16, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 28-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 20), - SafeLoadAs(in + 8 * 20), - }; - shifts = simd_batch{ 8, 36 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 28-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 21), - SafeLoadAs(in + 8 * 21), - }; - shifts = simd_batch{ 0, 28 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 28-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 21) >> 56 | SafeLoadAs(in + 8 * 22) << 8, - SafeLoadAs(in + 8 * 22), - }; - shifts = simd_batch{ 0, 20 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 28-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 22) >> 48 | SafeLoadAs(in + 8 * 23) << 16, - SafeLoadAs(in + 8 * 23), - }; - shifts = simd_batch{ 0, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 28-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 23) >> 40 | SafeLoadAs(in + 8 * 24) << 
24, - SafeLoadAs(in + 8 * 24), - }; - shifts = simd_batch{ 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 28-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 24), - SafeLoadAs(in + 8 * 24) >> 60 | SafeLoadAs(in + 8 * 25) << 4, - }; - shifts = simd_batch{ 32, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 28-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 25), - SafeLoadAs(in + 8 * 25) >> 52 | SafeLoadAs(in + 8 * 26) << 12, - }; - shifts = simd_batch{ 24, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 28-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 26), - SafeLoadAs(in + 8 * 26) >> 44 | SafeLoadAs(in + 8 * 27) << 20, - }; - shifts = simd_batch{ 16, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 28-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 27), - SafeLoadAs(in + 8 * 27), - }; - shifts = simd_batch{ 8, 36 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 28 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x1fffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 29-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 0, 29 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 29-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0) >> 58 | SafeLoadAs(in + 8 * 1) << 6, - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 0, 23 }; - results = 
(words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 29-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 1) >> 52 | SafeLoadAs(in + 8 * 2) << 12, - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 0, 17 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 29-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 2) >> 46 | SafeLoadAs(in + 8 * 3) << 18, - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 0, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 29-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 3) >> 40 | SafeLoadAs(in + 8 * 4) << 24, - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 29-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4) >> 63 | SafeLoadAs(in + 8 * 5) << 1, - }; - shifts = simd_batch{ 34, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 29-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5) >> 57 | SafeLoadAs(in + 8 * 6) << 7, - }; - shifts = simd_batch{ 28, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 29-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6) >> 51 | SafeLoadAs(in + 8 * 7) << 13, - }; - shifts = simd_batch{ 22, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 29-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7) >> 45 | SafeLoadAs(in + 8 * 8) << 19, - }; - shifts = simd_batch{ 16, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 29-bit bundles 18 to 19 - words 
= simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8) >> 39 | SafeLoadAs(in + 8 * 9) << 25, - }; - shifts = simd_batch{ 10, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 29-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 4, 33 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 29-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 9) >> 62 | SafeLoadAs(in + 8 * 10) << 2, - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 0, 27 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 29-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 10) >> 56 | SafeLoadAs(in + 8 * 11) << 8, - SafeLoadAs(in + 8 * 11), - }; - shifts = simd_batch{ 0, 21 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 29-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 11) >> 50 | SafeLoadAs(in + 8 * 12) << 14, - SafeLoadAs(in + 8 * 12), - }; - shifts = simd_batch{ 0, 15 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 29-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 12) >> 44 | SafeLoadAs(in + 8 * 13) << 20, - SafeLoadAs(in + 8 * 13), - }; - shifts = simd_batch{ 0, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 29-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 13) >> 38 | SafeLoadAs(in + 8 * 14) << 26, - SafeLoadAs(in + 8 * 14), - }; - shifts = simd_batch{ 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 29-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14) >> 61 | SafeLoadAs(in + 8 * 15) << 3, - }; - shifts = simd_batch{ 32, 
0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 29-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15) >> 55 | SafeLoadAs(in + 8 * 16) << 9, - }; - shifts = simd_batch{ 26, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 29-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 16) >> 49 | SafeLoadAs(in + 8 * 17) << 15, - }; - shifts = simd_batch{ 20, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 29-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 17), - SafeLoadAs(in + 8 * 17) >> 43 | SafeLoadAs(in + 8 * 18) << 21, - }; - shifts = simd_batch{ 14, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 29-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 18), - SafeLoadAs(in + 8 * 18) >> 37 | SafeLoadAs(in + 8 * 19) << 27, - }; - shifts = simd_batch{ 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 29-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 19), - SafeLoadAs(in + 8 * 19), - }; - shifts = simd_batch{ 2, 31 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 29-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 19) >> 60 | SafeLoadAs(in + 8 * 20) << 4, - SafeLoadAs(in + 8 * 20), - }; - shifts = simd_batch{ 0, 25 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 29-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 20) >> 54 | SafeLoadAs(in + 8 * 21) << 10, - SafeLoadAs(in + 8 * 21), - }; - shifts = simd_batch{ 0, 19 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 29-bit bundles 48 to 49 - 
words = simd_batch{ - SafeLoadAs(in + 8 * 21) >> 48 | SafeLoadAs(in + 8 * 22) << 16, - SafeLoadAs(in + 8 * 22), - }; - shifts = simd_batch{ 0, 13 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 29-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 22) >> 42 | SafeLoadAs(in + 8 * 23) << 22, - SafeLoadAs(in + 8 * 23), - }; - shifts = simd_batch{ 0, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 29-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 23) >> 36 | SafeLoadAs(in + 8 * 24) << 28, - SafeLoadAs(in + 8 * 24), - }; - shifts = simd_batch{ 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 29-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 24), - SafeLoadAs(in + 8 * 24) >> 59 | SafeLoadAs(in + 8 * 25) << 5, - }; - shifts = simd_batch{ 30, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 29-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 25), - SafeLoadAs(in + 8 * 25) >> 53 | SafeLoadAs(in + 8 * 26) << 11, - }; - shifts = simd_batch{ 24, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 29-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 26), - SafeLoadAs(in + 8 * 26) >> 47 | SafeLoadAs(in + 8 * 27) << 17, - }; - shifts = simd_batch{ 18, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 29-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 27), - SafeLoadAs(in + 8 * 27) >> 41 | SafeLoadAs(in + 8 * 28) << 23, - }; - shifts = simd_batch{ 12, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 29-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 28), - SafeLoadAs(in + 8 * 28), - }; - shifts = 
simd_batch{ 6, 35 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 29 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x3fffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 30-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 0, 30 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 30-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0) >> 60 | SafeLoadAs(in + 8 * 1) << 4, - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 0, 26 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 30-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 1) >> 56 | SafeLoadAs(in + 8 * 2) << 8, - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 0, 22 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 30-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 2) >> 52 | SafeLoadAs(in + 8 * 3) << 12, - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 0, 18 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 30-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 3) >> 48 | SafeLoadAs(in + 8 * 4) << 16, - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 0, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 30-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 4) >> 44 | SafeLoadAs(in + 8 * 5) << 20, - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 0, 10 }; - results = (words >> shifts) & masks; 
- results.store_unaligned(out); - out += 2; - - // extract 30-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 5) >> 40 | SafeLoadAs(in + 8 * 6) << 24, - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 30-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 6) >> 36 | SafeLoadAs(in + 8 * 7) << 28, - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 30-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7) >> 62 | SafeLoadAs(in + 8 * 8) << 2, - }; - shifts = simd_batch{ 32, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 30-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8) >> 58 | SafeLoadAs(in + 8 * 9) << 6, - }; - shifts = simd_batch{ 28, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 30-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9) >> 54 | SafeLoadAs(in + 8 * 10) << 10, - }; - shifts = simd_batch{ 24, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 30-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10) >> 50 | SafeLoadAs(in + 8 * 11) << 14, - }; - shifts = simd_batch{ 20, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 30-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11) >> 46 | SafeLoadAs(in + 8 * 12) << 18, - }; - shifts = simd_batch{ 16, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 30-bit bundles 26 to 27 - words = simd_batch{ - 
SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12) >> 42 | SafeLoadAs(in + 8 * 13) << 22, - }; - shifts = simd_batch{ 12, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 30-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13) >> 38 | SafeLoadAs(in + 8 * 14) << 26, - }; - shifts = simd_batch{ 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 30-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14), - }; - shifts = simd_batch{ 4, 34 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 30-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15), - }; - shifts = simd_batch{ 0, 30 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 30-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 15) >> 60 | SafeLoadAs(in + 8 * 16) << 4, - SafeLoadAs(in + 8 * 16), - }; - shifts = simd_batch{ 0, 26 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 30-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 16) >> 56 | SafeLoadAs(in + 8 * 17) << 8, - SafeLoadAs(in + 8 * 17), - }; - shifts = simd_batch{ 0, 22 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 30-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 17) >> 52 | SafeLoadAs(in + 8 * 18) << 12, - SafeLoadAs(in + 8 * 18), - }; - shifts = simd_batch{ 0, 18 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 30-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 18) >> 48 | SafeLoadAs(in + 8 * 19) << 16, - SafeLoadAs(in + 8 * 19), - }; - shifts = simd_batch{ 0, 14 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 2; - - // extract 30-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 19) >> 44 | SafeLoadAs(in + 8 * 20) << 20, - SafeLoadAs(in + 8 * 20), - }; - shifts = simd_batch{ 0, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 30-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 20) >> 40 | SafeLoadAs(in + 8 * 21) << 24, - SafeLoadAs(in + 8 * 21), - }; - shifts = simd_batch{ 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 30-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 21) >> 36 | SafeLoadAs(in + 8 * 22) << 28, - SafeLoadAs(in + 8 * 22), - }; - shifts = simd_batch{ 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 30-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 22), - SafeLoadAs(in + 8 * 22) >> 62 | SafeLoadAs(in + 8 * 23) << 2, - }; - shifts = simd_batch{ 32, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 30-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 23), - SafeLoadAs(in + 8 * 23) >> 58 | SafeLoadAs(in + 8 * 24) << 6, - }; - shifts = simd_batch{ 28, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 30-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 24), - SafeLoadAs(in + 8 * 24) >> 54 | SafeLoadAs(in + 8 * 25) << 10, - }; - shifts = simd_batch{ 24, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 30-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 25), - SafeLoadAs(in + 8 * 25) >> 50 | SafeLoadAs(in + 8 * 26) << 14, - }; - shifts = simd_batch{ 20, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 30-bit bundles 56 to 57 - words = 
simd_batch{ - SafeLoadAs(in + 8 * 26), - SafeLoadAs(in + 8 * 26) >> 46 | SafeLoadAs(in + 8 * 27) << 18, - }; - shifts = simd_batch{ 16, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 30-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 27), - SafeLoadAs(in + 8 * 27) >> 42 | SafeLoadAs(in + 8 * 28) << 22, - }; - shifts = simd_batch{ 12, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 30-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 28), - SafeLoadAs(in + 8 * 28) >> 38 | SafeLoadAs(in + 8 * 29) << 26, - }; - shifts = simd_batch{ 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 30-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 29), - SafeLoadAs(in + 8 * 29), - }; - shifts = simd_batch{ 4, 34 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 30 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x7fffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 31-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 0, 31 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 31-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0) >> 62 | SafeLoadAs(in + 8 * 1) << 2, - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 0, 29 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 31-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 1) >> 60 | SafeLoadAs(in + 8 * 2) << 4, - 
SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 0, 27 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 31-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 2) >> 58 | SafeLoadAs(in + 8 * 3) << 6, - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 0, 25 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 31-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 3) >> 56 | SafeLoadAs(in + 8 * 4) << 8, - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 0, 23 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 31-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 4) >> 54 | SafeLoadAs(in + 8 * 5) << 10, - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 0, 21 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 31-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 5) >> 52 | SafeLoadAs(in + 8 * 6) << 12, - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 0, 19 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 31-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 6) >> 50 | SafeLoadAs(in + 8 * 7) << 14, - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 0, 17 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 31-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 7) >> 48 | SafeLoadAs(in + 8 * 8) << 16, - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 0, 15 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 31-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 8) >> 46 | SafeLoadAs(in + 8 * 9) << 18, - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 0, 13 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 2; - - // extract 31-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 9) >> 44 | SafeLoadAs(in + 8 * 10) << 20, - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 0, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 31-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 10) >> 42 | SafeLoadAs(in + 8 * 11) << 22, - SafeLoadAs(in + 8 * 11), - }; - shifts = simd_batch{ 0, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 31-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 11) >> 40 | SafeLoadAs(in + 8 * 12) << 24, - SafeLoadAs(in + 8 * 12), - }; - shifts = simd_batch{ 0, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 31-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 12) >> 38 | SafeLoadAs(in + 8 * 13) << 26, - SafeLoadAs(in + 8 * 13), - }; - shifts = simd_batch{ 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 31-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 13) >> 36 | SafeLoadAs(in + 8 * 14) << 28, - SafeLoadAs(in + 8 * 14), - }; - shifts = simd_batch{ 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 31-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 14) >> 34 | SafeLoadAs(in + 8 * 15) << 30, - SafeLoadAs(in + 8 * 15), - }; - shifts = simd_batch{ 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 31-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15) >> 63 | SafeLoadAs(in + 8 * 16) << 1, - }; - shifts = simd_batch{ 32, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 31-bit bundles 34 to 35 - words = 
simd_batch{ - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 16) >> 61 | SafeLoadAs(in + 8 * 17) << 3, - }; - shifts = simd_batch{ 30, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 31-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 17), - SafeLoadAs(in + 8 * 17) >> 59 | SafeLoadAs(in + 8 * 18) << 5, - }; - shifts = simd_batch{ 28, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 31-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 18), - SafeLoadAs(in + 8 * 18) >> 57 | SafeLoadAs(in + 8 * 19) << 7, - }; - shifts = simd_batch{ 26, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 31-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 19), - SafeLoadAs(in + 8 * 19) >> 55 | SafeLoadAs(in + 8 * 20) << 9, - }; - shifts = simd_batch{ 24, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 31-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 20), - SafeLoadAs(in + 8 * 20) >> 53 | SafeLoadAs(in + 8 * 21) << 11, - }; - shifts = simd_batch{ 22, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 31-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 21), - SafeLoadAs(in + 8 * 21) >> 51 | SafeLoadAs(in + 8 * 22) << 13, - }; - shifts = simd_batch{ 20, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 31-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 22), - SafeLoadAs(in + 8 * 22) >> 49 | SafeLoadAs(in + 8 * 23) << 15, - }; - shifts = simd_batch{ 18, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 31-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 23), - SafeLoadAs(in + 8 * 23) >> 47 | SafeLoadAs(in + 8 * 
24) << 17, - }; - shifts = simd_batch{ 16, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 31-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 24), - SafeLoadAs(in + 8 * 24) >> 45 | SafeLoadAs(in + 8 * 25) << 19, - }; - shifts = simd_batch{ 14, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 31-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 25), - SafeLoadAs(in + 8 * 25) >> 43 | SafeLoadAs(in + 8 * 26) << 21, - }; - shifts = simd_batch{ 12, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 31-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 26), - SafeLoadAs(in + 8 * 26) >> 41 | SafeLoadAs(in + 8 * 27) << 23, - }; - shifts = simd_batch{ 10, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 31-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 27), - SafeLoadAs(in + 8 * 27) >> 39 | SafeLoadAs(in + 8 * 28) << 25, - }; - shifts = simd_batch{ 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 31-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 28), - SafeLoadAs(in + 8 * 28) >> 37 | SafeLoadAs(in + 8 * 29) << 27, - }; - shifts = simd_batch{ 6, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 31-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 29), - SafeLoadAs(in + 8 * 29) >> 35 | SafeLoadAs(in + 8 * 30) << 29, - }; - shifts = simd_batch{ 4, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 31-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 30), - SafeLoadAs(in + 8 * 30), - }; - shifts = simd_batch{ 2, 33 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 
2; - - in += 31 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0xffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 32-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 0, 32 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 32-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 0, 32 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 32-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 0, 32 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 32-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 0, 32 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 32-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 0, 32 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 32-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 0, 32 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 32-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 0, 32 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // 
extract 32-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 0, 32 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 32-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 0, 32 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 32-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 0, 32 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 32-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 0, 32 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 32-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11), - }; - shifts = simd_batch{ 0, 32 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 32-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12), - }; - shifts = simd_batch{ 0, 32 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 32-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13), - }; - shifts = simd_batch{ 0, 32 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 32-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14), - }; - shifts = simd_batch{ 0, 32 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 32-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15), - }; - shifts = 
simd_batch{ 0, 32 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 32-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 16), - }; - shifts = simd_batch{ 0, 32 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 32-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 17), - SafeLoadAs(in + 8 * 17), - }; - shifts = simd_batch{ 0, 32 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 32-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 18), - SafeLoadAs(in + 8 * 18), - }; - shifts = simd_batch{ 0, 32 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 32-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 19), - SafeLoadAs(in + 8 * 19), - }; - shifts = simd_batch{ 0, 32 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 32-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 20), - SafeLoadAs(in + 8 * 20), - }; - shifts = simd_batch{ 0, 32 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 32-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 21), - SafeLoadAs(in + 8 * 21), - }; - shifts = simd_batch{ 0, 32 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 32-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 22), - SafeLoadAs(in + 8 * 22), - }; - shifts = simd_batch{ 0, 32 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 32-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 23), - SafeLoadAs(in + 8 * 23), - }; - shifts = simd_batch{ 0, 32 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 
32-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 24), - SafeLoadAs(in + 8 * 24), - }; - shifts = simd_batch{ 0, 32 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 32-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 25), - SafeLoadAs(in + 8 * 25), - }; - shifts = simd_batch{ 0, 32 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 32-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 26), - SafeLoadAs(in + 8 * 26), - }; - shifts = simd_batch{ 0, 32 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 32-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 27), - SafeLoadAs(in + 8 * 27), - }; - shifts = simd_batch{ 0, 32 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 32-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 28), - SafeLoadAs(in + 8 * 28), - }; - shifts = simd_batch{ 0, 32 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 32-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 29), - SafeLoadAs(in + 8 * 29), - }; - shifts = simd_batch{ 0, 32 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 32-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 30), - SafeLoadAs(in + 8 * 30), - }; - shifts = simd_batch{ 0, 32 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 32-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 31), - SafeLoadAs(in + 8 * 31), - }; - shifts = simd_batch{ 0, 32 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 32 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = 
xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x1ffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 33-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 33 | SafeLoadAs(in + 8 * 1) << 31, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 33-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1) >> 35 | SafeLoadAs(in + 8 * 2) << 29, - }; - shifts = simd_batch{ 2, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 33-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2) >> 37 | SafeLoadAs(in + 8 * 3) << 27, - }; - shifts = simd_batch{ 4, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 33-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3) >> 39 | SafeLoadAs(in + 8 * 4) << 25, - }; - shifts = simd_batch{ 6, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 33-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4) >> 41 | SafeLoadAs(in + 8 * 5) << 23, - }; - shifts = simd_batch{ 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 33-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5) >> 43 | SafeLoadAs(in + 8 * 6) << 21, - }; - shifts = simd_batch{ 10, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 33-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6) >> 45 | SafeLoadAs(in 
+ 8 * 7) << 19, - }; - shifts = simd_batch{ 12, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 33-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7) >> 47 | SafeLoadAs(in + 8 * 8) << 17, - }; - shifts = simd_batch{ 14, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 33-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8) >> 49 | SafeLoadAs(in + 8 * 9) << 15, - }; - shifts = simd_batch{ 16, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 33-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9) >> 51 | SafeLoadAs(in + 8 * 10) << 13, - }; - shifts = simd_batch{ 18, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 33-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10) >> 53 | SafeLoadAs(in + 8 * 11) << 11, - }; - shifts = simd_batch{ 20, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 33-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11) >> 55 | SafeLoadAs(in + 8 * 12) << 9, - }; - shifts = simd_batch{ 22, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 33-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12) >> 57 | SafeLoadAs(in + 8 * 13) << 7, - }; - shifts = simd_batch{ 24, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 33-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13) >> 59 | SafeLoadAs(in + 8 * 14) << 5, - }; - shifts = simd_batch{ 26, 0 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 2; - - // extract 33-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14) >> 61 | SafeLoadAs(in + 8 * 15) << 3, - }; - shifts = simd_batch{ 28, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 33-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15) >> 63 | SafeLoadAs(in + 8 * 16) << 1, - }; - shifts = simd_batch{ 30, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 33-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 16) >> 32 | SafeLoadAs(in + 8 * 17) << 32, - SafeLoadAs(in + 8 * 17), - }; - shifts = simd_batch{ 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 33-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 17) >> 34 | SafeLoadAs(in + 8 * 18) << 30, - SafeLoadAs(in + 8 * 18), - }; - shifts = simd_batch{ 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 33-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 18) >> 36 | SafeLoadAs(in + 8 * 19) << 28, - SafeLoadAs(in + 8 * 19), - }; - shifts = simd_batch{ 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 33-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 19) >> 38 | SafeLoadAs(in + 8 * 20) << 26, - SafeLoadAs(in + 8 * 20), - }; - shifts = simd_batch{ 0, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 33-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 20) >> 40 | SafeLoadAs(in + 8 * 21) << 24, - SafeLoadAs(in + 8 * 21), - }; - shifts = simd_batch{ 0, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 33-bit bundles 42 to 43 - words = 
simd_batch{ - SafeLoadAs(in + 8 * 21) >> 42 | SafeLoadAs(in + 8 * 22) << 22, - SafeLoadAs(in + 8 * 22), - }; - shifts = simd_batch{ 0, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 33-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 22) >> 44 | SafeLoadAs(in + 8 * 23) << 20, - SafeLoadAs(in + 8 * 23), - }; - shifts = simd_batch{ 0, 13 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 33-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 23) >> 46 | SafeLoadAs(in + 8 * 24) << 18, - SafeLoadAs(in + 8 * 24), - }; - shifts = simd_batch{ 0, 15 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 33-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 24) >> 48 | SafeLoadAs(in + 8 * 25) << 16, - SafeLoadAs(in + 8 * 25), - }; - shifts = simd_batch{ 0, 17 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 33-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 25) >> 50 | SafeLoadAs(in + 8 * 26) << 14, - SafeLoadAs(in + 8 * 26), - }; - shifts = simd_batch{ 0, 19 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 33-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 26) >> 52 | SafeLoadAs(in + 8 * 27) << 12, - SafeLoadAs(in + 8 * 27), - }; - shifts = simd_batch{ 0, 21 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 33-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 27) >> 54 | SafeLoadAs(in + 8 * 28) << 10, - SafeLoadAs(in + 8 * 28), - }; - shifts = simd_batch{ 0, 23 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 33-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 28) >> 56 | SafeLoadAs(in + 8 * 29) << 8, - 
SafeLoadAs(in + 8 * 29), - }; - shifts = simd_batch{ 0, 25 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 33-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 29) >> 58 | SafeLoadAs(in + 8 * 30) << 6, - SafeLoadAs(in + 8 * 30), - }; - shifts = simd_batch{ 0, 27 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 33-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 30) >> 60 | SafeLoadAs(in + 8 * 31) << 4, - SafeLoadAs(in + 8 * 31), - }; - shifts = simd_batch{ 0, 29 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 33-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 31) >> 62 | SafeLoadAs(in + 8 * 32) << 2, - SafeLoadAs(in + 8 * 32), - }; - shifts = simd_batch{ 0, 31 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 33 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x3ffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 34-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 34 | SafeLoadAs(in + 8 * 1) << 30, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 34-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1) >> 38 | SafeLoadAs(in + 8 * 2) << 26, - }; - shifts = simd_batch{ 4, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 34-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2) >> 42 | SafeLoadAs(in + 
8 * 3) << 22, - }; - shifts = simd_batch{ 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 34-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3) >> 46 | SafeLoadAs(in + 8 * 4) << 18, - }; - shifts = simd_batch{ 12, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 34-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4) >> 50 | SafeLoadAs(in + 8 * 5) << 14, - }; - shifts = simd_batch{ 16, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 34-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5) >> 54 | SafeLoadAs(in + 8 * 6) << 10, - }; - shifts = simd_batch{ 20, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 34-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6) >> 58 | SafeLoadAs(in + 8 * 7) << 6, - }; - shifts = simd_batch{ 24, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 34-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7) >> 62 | SafeLoadAs(in + 8 * 8) << 2, - }; - shifts = simd_batch{ 28, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 34-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 8) >> 32 | SafeLoadAs(in + 8 * 9) << 32, - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 34-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 9) >> 36 | SafeLoadAs(in + 8 * 10) << 28, - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 0, 6 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 2; - - // extract 34-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 10) >> 40 | SafeLoadAs(in + 8 * 11) << 24, - SafeLoadAs(in + 8 * 11), - }; - shifts = simd_batch{ 0, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 34-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 11) >> 44 | SafeLoadAs(in + 8 * 12) << 20, - SafeLoadAs(in + 8 * 12), - }; - shifts = simd_batch{ 0, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 34-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 12) >> 48 | SafeLoadAs(in + 8 * 13) << 16, - SafeLoadAs(in + 8 * 13), - }; - shifts = simd_batch{ 0, 18 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 34-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 13) >> 52 | SafeLoadAs(in + 8 * 14) << 12, - SafeLoadAs(in + 8 * 14), - }; - shifts = simd_batch{ 0, 22 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 34-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 14) >> 56 | SafeLoadAs(in + 8 * 15) << 8, - SafeLoadAs(in + 8 * 15), - }; - shifts = simd_batch{ 0, 26 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 34-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 15) >> 60 | SafeLoadAs(in + 8 * 16) << 4, - SafeLoadAs(in + 8 * 16), - }; - shifts = simd_batch{ 0, 30 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 34-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 17), - SafeLoadAs(in + 8 * 17) >> 34 | SafeLoadAs(in + 8 * 18) << 30, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 34-bit bundles 34 to 35 - words = 
simd_batch{ - SafeLoadAs(in + 8 * 18), - SafeLoadAs(in + 8 * 18) >> 38 | SafeLoadAs(in + 8 * 19) << 26, - }; - shifts = simd_batch{ 4, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 34-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 19), - SafeLoadAs(in + 8 * 19) >> 42 | SafeLoadAs(in + 8 * 20) << 22, - }; - shifts = simd_batch{ 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 34-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 20), - SafeLoadAs(in + 8 * 20) >> 46 | SafeLoadAs(in + 8 * 21) << 18, - }; - shifts = simd_batch{ 12, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 34-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 21), - SafeLoadAs(in + 8 * 21) >> 50 | SafeLoadAs(in + 8 * 22) << 14, - }; - shifts = simd_batch{ 16, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 34-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 22), - SafeLoadAs(in + 8 * 22) >> 54 | SafeLoadAs(in + 8 * 23) << 10, - }; - shifts = simd_batch{ 20, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 34-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 23), - SafeLoadAs(in + 8 * 23) >> 58 | SafeLoadAs(in + 8 * 24) << 6, - }; - shifts = simd_batch{ 24, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 34-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 24), - SafeLoadAs(in + 8 * 24) >> 62 | SafeLoadAs(in + 8 * 25) << 2, - }; - shifts = simd_batch{ 28, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 34-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 25) >> 32 | SafeLoadAs(in + 8 * 26) << 32, - SafeLoadAs(in 
+ 8 * 26), - }; - shifts = simd_batch{ 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 34-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 26) >> 36 | SafeLoadAs(in + 8 * 27) << 28, - SafeLoadAs(in + 8 * 27), - }; - shifts = simd_batch{ 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 34-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 27) >> 40 | SafeLoadAs(in + 8 * 28) << 24, - SafeLoadAs(in + 8 * 28), - }; - shifts = simd_batch{ 0, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 34-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 28) >> 44 | SafeLoadAs(in + 8 * 29) << 20, - SafeLoadAs(in + 8 * 29), - }; - shifts = simd_batch{ 0, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 34-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 29) >> 48 | SafeLoadAs(in + 8 * 30) << 16, - SafeLoadAs(in + 8 * 30), - }; - shifts = simd_batch{ 0, 18 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 34-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 30) >> 52 | SafeLoadAs(in + 8 * 31) << 12, - SafeLoadAs(in + 8 * 31), - }; - shifts = simd_batch{ 0, 22 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 34-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 31) >> 56 | SafeLoadAs(in + 8 * 32) << 8, - SafeLoadAs(in + 8 * 32), - }; - shifts = simd_batch{ 0, 26 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 34-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 32) >> 60 | SafeLoadAs(in + 8 * 33) << 4, - SafeLoadAs(in + 8 * 33), - }; - shifts = simd_batch{ 0, 30 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 2; - - in += 34 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x7ffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 35-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 35 | SafeLoadAs(in + 8 * 1) << 29, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 35-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1) >> 41 | SafeLoadAs(in + 8 * 2) << 23, - }; - shifts = simd_batch{ 6, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 35-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2) >> 47 | SafeLoadAs(in + 8 * 3) << 17, - }; - shifts = simd_batch{ 12, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 35-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3) >> 53 | SafeLoadAs(in + 8 * 4) << 11, - }; - shifts = simd_batch{ 18, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 35-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4) >> 59 | SafeLoadAs(in + 8 * 5) << 5, - }; - shifts = simd_batch{ 24, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 35-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 5) >> 30 | SafeLoadAs(in + 8 * 6) << 34, - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 0, 1 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 2; - - // extract 35-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 6) >> 36 | SafeLoadAs(in + 8 * 7) << 28, - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 0, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 35-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 7) >> 42 | SafeLoadAs(in + 8 * 8) << 22, - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 0, 13 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 35-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 8) >> 48 | SafeLoadAs(in + 8 * 9) << 16, - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 0, 19 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 35-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 9) >> 54 | SafeLoadAs(in + 8 * 10) << 10, - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 0, 25 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 35-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 10) >> 60 | SafeLoadAs(in + 8 * 11) << 4, - SafeLoadAs(in + 8 * 11) >> 31 | SafeLoadAs(in + 8 * 12) << 33, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 35-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12) >> 37 | SafeLoadAs(in + 8 * 13) << 27, - }; - shifts = simd_batch{ 2, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 35-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13) >> 43 | SafeLoadAs(in + 8 * 14) << 21, - }; - shifts = simd_batch{ 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 35-bit 
bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14) >> 49 | SafeLoadAs(in + 8 * 15) << 15, - }; - shifts = simd_batch{ 14, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 35-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15) >> 55 | SafeLoadAs(in + 8 * 16) << 9, - }; - shifts = simd_batch{ 20, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 35-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 16) >> 61 | SafeLoadAs(in + 8 * 17) << 3, - }; - shifts = simd_batch{ 26, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 35-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 17) >> 32 | SafeLoadAs(in + 8 * 18) << 32, - SafeLoadAs(in + 8 * 18), - }; - shifts = simd_batch{ 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 35-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 18) >> 38 | SafeLoadAs(in + 8 * 19) << 26, - SafeLoadAs(in + 8 * 19), - }; - shifts = simd_batch{ 0, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 35-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 19) >> 44 | SafeLoadAs(in + 8 * 20) << 20, - SafeLoadAs(in + 8 * 20), - }; - shifts = simd_batch{ 0, 15 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 35-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 20) >> 50 | SafeLoadAs(in + 8 * 21) << 14, - SafeLoadAs(in + 8 * 21), - }; - shifts = simd_batch{ 0, 21 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 35-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 21) >> 56 | SafeLoadAs(in + 8 * 
22) << 8, - SafeLoadAs(in + 8 * 22), - }; - shifts = simd_batch{ 0, 27 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 35-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 22) >> 62 | SafeLoadAs(in + 8 * 23) << 2, - SafeLoadAs(in + 8 * 23) >> 33 | SafeLoadAs(in + 8 * 24) << 31, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 35-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 24), - SafeLoadAs(in + 8 * 24) >> 39 | SafeLoadAs(in + 8 * 25) << 25, - }; - shifts = simd_batch{ 4, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 35-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 25), - SafeLoadAs(in + 8 * 25) >> 45 | SafeLoadAs(in + 8 * 26) << 19, - }; - shifts = simd_batch{ 10, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 35-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 26), - SafeLoadAs(in + 8 * 26) >> 51 | SafeLoadAs(in + 8 * 27) << 13, - }; - shifts = simd_batch{ 16, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 35-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 27), - SafeLoadAs(in + 8 * 27) >> 57 | SafeLoadAs(in + 8 * 28) << 7, - }; - shifts = simd_batch{ 22, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 35-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 28), - SafeLoadAs(in + 8 * 28) >> 63 | SafeLoadAs(in + 8 * 29) << 1, - }; - shifts = simd_batch{ 28, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 35-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 29) >> 34 | SafeLoadAs(in + 8 * 30) << 30, - SafeLoadAs(in + 8 * 30), - }; - shifts = 
simd_batch{ 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 35-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 30) >> 40 | SafeLoadAs(in + 8 * 31) << 24, - SafeLoadAs(in + 8 * 31), - }; - shifts = simd_batch{ 0, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 35-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 31) >> 46 | SafeLoadAs(in + 8 * 32) << 18, - SafeLoadAs(in + 8 * 32), - }; - shifts = simd_batch{ 0, 17 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 35-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 32) >> 52 | SafeLoadAs(in + 8 * 33) << 12, - SafeLoadAs(in + 8 * 33), - }; - shifts = simd_batch{ 0, 23 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 35-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 33) >> 58 | SafeLoadAs(in + 8 * 34) << 6, - SafeLoadAs(in + 8 * 34), - }; - shifts = simd_batch{ 0, 29 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 35 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0xfffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 36-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 36 | SafeLoadAs(in + 8 * 1) << 28, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 36-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1) >> 44 | SafeLoadAs(in + 8 * 2) << 20, - }; - shifts = 
simd_batch{ 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 36-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2) >> 52 | SafeLoadAs(in + 8 * 3) << 12, - }; - shifts = simd_batch{ 16, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 36-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3) >> 60 | SafeLoadAs(in + 8 * 4) << 4, - }; - shifts = simd_batch{ 24, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 36-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 4) >> 32 | SafeLoadAs(in + 8 * 5) << 32, - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 36-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 5) >> 40 | SafeLoadAs(in + 8 * 6) << 24, - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 0, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 36-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 6) >> 48 | SafeLoadAs(in + 8 * 7) << 16, - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 0, 20 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 36-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 7) >> 56 | SafeLoadAs(in + 8 * 8) << 8, - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 0, 28 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 36-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9) >> 36 | SafeLoadAs(in + 8 * 10) << 28, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 
36-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10) >> 44 | SafeLoadAs(in + 8 * 11) << 20, - }; - shifts = simd_batch{ 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 36-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11) >> 52 | SafeLoadAs(in + 8 * 12) << 12, - }; - shifts = simd_batch{ 16, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 36-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12) >> 60 | SafeLoadAs(in + 8 * 13) << 4, - }; - shifts = simd_batch{ 24, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 36-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 13) >> 32 | SafeLoadAs(in + 8 * 14) << 32, - SafeLoadAs(in + 8 * 14), - }; - shifts = simd_batch{ 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 36-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 14) >> 40 | SafeLoadAs(in + 8 * 15) << 24, - SafeLoadAs(in + 8 * 15), - }; - shifts = simd_batch{ 0, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 36-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 15) >> 48 | SafeLoadAs(in + 8 * 16) << 16, - SafeLoadAs(in + 8 * 16), - }; - shifts = simd_batch{ 0, 20 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 36-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 16) >> 56 | SafeLoadAs(in + 8 * 17) << 8, - SafeLoadAs(in + 8 * 17), - }; - shifts = simd_batch{ 0, 28 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 36-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 18), - SafeLoadAs(in + 8 
* 18) >> 36 | SafeLoadAs(in + 8 * 19) << 28, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 36-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 19), - SafeLoadAs(in + 8 * 19) >> 44 | SafeLoadAs(in + 8 * 20) << 20, - }; - shifts = simd_batch{ 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 36-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 20), - SafeLoadAs(in + 8 * 20) >> 52 | SafeLoadAs(in + 8 * 21) << 12, - }; - shifts = simd_batch{ 16, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 36-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 21), - SafeLoadAs(in + 8 * 21) >> 60 | SafeLoadAs(in + 8 * 22) << 4, - }; - shifts = simd_batch{ 24, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 36-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 22) >> 32 | SafeLoadAs(in + 8 * 23) << 32, - SafeLoadAs(in + 8 * 23), - }; - shifts = simd_batch{ 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 36-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 23) >> 40 | SafeLoadAs(in + 8 * 24) << 24, - SafeLoadAs(in + 8 * 24), - }; - shifts = simd_batch{ 0, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 36-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 24) >> 48 | SafeLoadAs(in + 8 * 25) << 16, - SafeLoadAs(in + 8 * 25), - }; - shifts = simd_batch{ 0, 20 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 36-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 25) >> 56 | SafeLoadAs(in + 8 * 26) << 8, - SafeLoadAs(in + 8 * 26), - }; - shifts = simd_batch{ 0, 28 }; - results = 
(words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 36-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 27), - SafeLoadAs(in + 8 * 27) >> 36 | SafeLoadAs(in + 8 * 28) << 28, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 36-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 28), - SafeLoadAs(in + 8 * 28) >> 44 | SafeLoadAs(in + 8 * 29) << 20, - }; - shifts = simd_batch{ 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 36-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 29), - SafeLoadAs(in + 8 * 29) >> 52 | SafeLoadAs(in + 8 * 30) << 12, - }; - shifts = simd_batch{ 16, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 36-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 30), - SafeLoadAs(in + 8 * 30) >> 60 | SafeLoadAs(in + 8 * 31) << 4, - }; - shifts = simd_batch{ 24, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 36-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 31) >> 32 | SafeLoadAs(in + 8 * 32) << 32, - SafeLoadAs(in + 8 * 32), - }; - shifts = simd_batch{ 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 36-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 32) >> 40 | SafeLoadAs(in + 8 * 33) << 24, - SafeLoadAs(in + 8 * 33), - }; - shifts = simd_batch{ 0, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 36-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 33) >> 48 | SafeLoadAs(in + 8 * 34) << 16, - SafeLoadAs(in + 8 * 34), - }; - shifts = simd_batch{ 0, 20 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 36-bit 
bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 34) >> 56 | SafeLoadAs(in + 8 * 35) << 8, - SafeLoadAs(in + 8 * 35), - }; - shifts = simd_batch{ 0, 28 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 36 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x1fffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 37-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 37 | SafeLoadAs(in + 8 * 1) << 27, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 37-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1) >> 47 | SafeLoadAs(in + 8 * 2) << 17, - }; - shifts = simd_batch{ 10, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 37-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2) >> 57 | SafeLoadAs(in + 8 * 3) << 7, - }; - shifts = simd_batch{ 20, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 37-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 3) >> 30 | SafeLoadAs(in + 8 * 4) << 34, - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 37-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 4) >> 40 | SafeLoadAs(in + 8 * 5) << 24, - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 0, 13 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 37-bit bundles 10 to 11 - 
words = simd_batch{ - SafeLoadAs(in + 8 * 5) >> 50 | SafeLoadAs(in + 8 * 6) << 14, - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 0, 23 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 37-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 6) >> 60 | SafeLoadAs(in + 8 * 7) << 4, - SafeLoadAs(in + 8 * 7) >> 33 | SafeLoadAs(in + 8 * 8) << 31, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 37-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8) >> 43 | SafeLoadAs(in + 8 * 9) << 21, - }; - shifts = simd_batch{ 6, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 37-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9) >> 53 | SafeLoadAs(in + 8 * 10) << 11, - }; - shifts = simd_batch{ 16, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 37-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10) >> 63 | SafeLoadAs(in + 8 * 11) << 1, - }; - shifts = simd_batch{ 26, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 37-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 11) >> 36 | SafeLoadAs(in + 8 * 12) << 28, - SafeLoadAs(in + 8 * 12), - }; - shifts = simd_batch{ 0, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 37-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 12) >> 46 | SafeLoadAs(in + 8 * 13) << 18, - SafeLoadAs(in + 8 * 13), - }; - shifts = simd_batch{ 0, 19 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 37-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 13) >> 56 | SafeLoadAs(in 
+ 8 * 14) << 8, - SafeLoadAs(in + 8 * 14) >> 29 | SafeLoadAs(in + 8 * 15) << 35, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 37-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15) >> 39 | SafeLoadAs(in + 8 * 16) << 25, - }; - shifts = simd_batch{ 2, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 37-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 16) >> 49 | SafeLoadAs(in + 8 * 17) << 15, - }; - shifts = simd_batch{ 12, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 37-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 17), - SafeLoadAs(in + 8 * 17) >> 59 | SafeLoadAs(in + 8 * 18) << 5, - }; - shifts = simd_batch{ 22, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 37-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 18) >> 32 | SafeLoadAs(in + 8 * 19) << 32, - SafeLoadAs(in + 8 * 19), - }; - shifts = simd_batch{ 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 37-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 19) >> 42 | SafeLoadAs(in + 8 * 20) << 22, - SafeLoadAs(in + 8 * 20), - }; - shifts = simd_batch{ 0, 15 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 37-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 20) >> 52 | SafeLoadAs(in + 8 * 21) << 12, - SafeLoadAs(in + 8 * 21), - }; - shifts = simd_batch{ 0, 25 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 37-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 21) >> 62 | SafeLoadAs(in + 8 * 22) << 2, - SafeLoadAs(in + 8 * 22) >> 35 | 
SafeLoadAs(in + 8 * 23) << 29, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 37-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 23), - SafeLoadAs(in + 8 * 23) >> 45 | SafeLoadAs(in + 8 * 24) << 19, - }; - shifts = simd_batch{ 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 37-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 24), - SafeLoadAs(in + 8 * 24) >> 55 | SafeLoadAs(in + 8 * 25) << 9, - }; - shifts = simd_batch{ 18, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 37-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 25) >> 28 | SafeLoadAs(in + 8 * 26) << 36, - SafeLoadAs(in + 8 * 26), - }; - shifts = simd_batch{ 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 37-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 26) >> 38 | SafeLoadAs(in + 8 * 27) << 26, - SafeLoadAs(in + 8 * 27), - }; - shifts = simd_batch{ 0, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 37-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 27) >> 48 | SafeLoadAs(in + 8 * 28) << 16, - SafeLoadAs(in + 8 * 28), - }; - shifts = simd_batch{ 0, 21 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 37-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 28) >> 58 | SafeLoadAs(in + 8 * 29) << 6, - SafeLoadAs(in + 8 * 29) >> 31 | SafeLoadAs(in + 8 * 30) << 33, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 37-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 30), - SafeLoadAs(in + 8 * 30) >> 41 | SafeLoadAs(in + 8 * 31) << 23, - }; - shifts = 
simd_batch{ 4, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 37-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 31), - SafeLoadAs(in + 8 * 31) >> 51 | SafeLoadAs(in + 8 * 32) << 13, - }; - shifts = simd_batch{ 14, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 37-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 32), - SafeLoadAs(in + 8 * 32) >> 61 | SafeLoadAs(in + 8 * 33) << 3, - }; - shifts = simd_batch{ 24, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 37-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 33) >> 34 | SafeLoadAs(in + 8 * 34) << 30, - SafeLoadAs(in + 8 * 34), - }; - shifts = simd_batch{ 0, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 37-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 34) >> 44 | SafeLoadAs(in + 8 * 35) << 20, - SafeLoadAs(in + 8 * 35), - }; - shifts = simd_batch{ 0, 17 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 37-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 35) >> 54 | SafeLoadAs(in + 8 * 36) << 10, - SafeLoadAs(in + 8 * 36), - }; - shifts = simd_batch{ 0, 27 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 37 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x3fffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 38-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 38 | SafeLoadAs(in + 8 * 1) << 26, - }; - shifts 
= simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 38-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1) >> 50 | SafeLoadAs(in + 8 * 2) << 14, - }; - shifts = simd_batch{ 12, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 38-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2) >> 62 | SafeLoadAs(in + 8 * 3) << 2, - }; - shifts = simd_batch{ 24, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 38-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 3) >> 36 | SafeLoadAs(in + 8 * 4) << 28, - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 0, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 38-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 4) >> 48 | SafeLoadAs(in + 8 * 5) << 16, - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 0, 22 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 38-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 5) >> 60 | SafeLoadAs(in + 8 * 6) << 4, - SafeLoadAs(in + 8 * 6) >> 34 | SafeLoadAs(in + 8 * 7) << 30, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 38-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7) >> 46 | SafeLoadAs(in + 8 * 8) << 18, - }; - shifts = simd_batch{ 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 38-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8) >> 58 | SafeLoadAs(in + 8 * 9) << 6, - }; - shifts = simd_batch{ 20, 0 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 2; - - // extract 38-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 9) >> 32 | SafeLoadAs(in + 8 * 10) << 32, - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 38-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 10) >> 44 | SafeLoadAs(in + 8 * 11) << 20, - SafeLoadAs(in + 8 * 11), - }; - shifts = simd_batch{ 0, 18 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 38-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 11) >> 56 | SafeLoadAs(in + 8 * 12) << 8, - SafeLoadAs(in + 8 * 12) >> 30 | SafeLoadAs(in + 8 * 13) << 34, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 38-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13) >> 42 | SafeLoadAs(in + 8 * 14) << 22, - }; - shifts = simd_batch{ 4, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 38-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14) >> 54 | SafeLoadAs(in + 8 * 15) << 10, - }; - shifts = simd_batch{ 16, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 38-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 15) >> 28 | SafeLoadAs(in + 8 * 16) << 36, - SafeLoadAs(in + 8 * 16), - }; - shifts = simd_batch{ 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 38-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 16) >> 40 | SafeLoadAs(in + 8 * 17) << 24, - SafeLoadAs(in + 8 * 17), - }; - shifts = simd_batch{ 0, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 
38-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 17) >> 52 | SafeLoadAs(in + 8 * 18) << 12, - SafeLoadAs(in + 8 * 18), - }; - shifts = simd_batch{ 0, 26 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 38-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 19), - SafeLoadAs(in + 8 * 19) >> 38 | SafeLoadAs(in + 8 * 20) << 26, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 38-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 20), - SafeLoadAs(in + 8 * 20) >> 50 | SafeLoadAs(in + 8 * 21) << 14, - }; - shifts = simd_batch{ 12, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 38-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 21), - SafeLoadAs(in + 8 * 21) >> 62 | SafeLoadAs(in + 8 * 22) << 2, - }; - shifts = simd_batch{ 24, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 38-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 22) >> 36 | SafeLoadAs(in + 8 * 23) << 28, - SafeLoadAs(in + 8 * 23), - }; - shifts = simd_batch{ 0, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 38-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 23) >> 48 | SafeLoadAs(in + 8 * 24) << 16, - SafeLoadAs(in + 8 * 24), - }; - shifts = simd_batch{ 0, 22 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 38-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 24) >> 60 | SafeLoadAs(in + 8 * 25) << 4, - SafeLoadAs(in + 8 * 25) >> 34 | SafeLoadAs(in + 8 * 26) << 30, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 38-bit bundles 44 to 45 - words = simd_batch{ - 
SafeLoadAs(in + 8 * 26), - SafeLoadAs(in + 8 * 26) >> 46 | SafeLoadAs(in + 8 * 27) << 18, - }; - shifts = simd_batch{ 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 38-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 27), - SafeLoadAs(in + 8 * 27) >> 58 | SafeLoadAs(in + 8 * 28) << 6, - }; - shifts = simd_batch{ 20, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 38-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 28) >> 32 | SafeLoadAs(in + 8 * 29) << 32, - SafeLoadAs(in + 8 * 29), - }; - shifts = simd_batch{ 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 38-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 29) >> 44 | SafeLoadAs(in + 8 * 30) << 20, - SafeLoadAs(in + 8 * 30), - }; - shifts = simd_batch{ 0, 18 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 38-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 30) >> 56 | SafeLoadAs(in + 8 * 31) << 8, - SafeLoadAs(in + 8 * 31) >> 30 | SafeLoadAs(in + 8 * 32) << 34, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 38-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 32), - SafeLoadAs(in + 8 * 32) >> 42 | SafeLoadAs(in + 8 * 33) << 22, - }; - shifts = simd_batch{ 4, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 38-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 33), - SafeLoadAs(in + 8 * 33) >> 54 | SafeLoadAs(in + 8 * 34) << 10, - }; - shifts = simd_batch{ 16, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 38-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 34) >> 28 | SafeLoadAs(in + 8 * 35) 
<< 36, - SafeLoadAs(in + 8 * 35), - }; - shifts = simd_batch{ 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 38-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 35) >> 40 | SafeLoadAs(in + 8 * 36) << 24, - SafeLoadAs(in + 8 * 36), - }; - shifts = simd_batch{ 0, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 38-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 36) >> 52 | SafeLoadAs(in + 8 * 37) << 12, - SafeLoadAs(in + 8 * 37), - }; - shifts = simd_batch{ 0, 26 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 38 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x7fffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 39-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 39 | SafeLoadAs(in + 8 * 1) << 25, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 39-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1) >> 53 | SafeLoadAs(in + 8 * 2) << 11, - }; - shifts = simd_batch{ 14, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 39-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 2) >> 28 | SafeLoadAs(in + 8 * 3) << 36, - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 39-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 3) >> 42 | SafeLoadAs(in + 8 * 4) << 22, - 
SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 0, 17 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 39-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 4) >> 56 | SafeLoadAs(in + 8 * 5) << 8, - SafeLoadAs(in + 8 * 5) >> 31 | SafeLoadAs(in + 8 * 6) << 33, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 39-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6) >> 45 | SafeLoadAs(in + 8 * 7) << 19, - }; - shifts = simd_batch{ 6, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 39-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7) >> 59 | SafeLoadAs(in + 8 * 8) << 5, - }; - shifts = simd_batch{ 20, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 39-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 8) >> 34 | SafeLoadAs(in + 8 * 9) << 30, - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 0, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 39-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 9) >> 48 | SafeLoadAs(in + 8 * 10) << 16, - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 0, 23 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 39-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 10) >> 62 | SafeLoadAs(in + 8 * 11) << 2, - SafeLoadAs(in + 8 * 11) >> 37 | SafeLoadAs(in + 8 * 12) << 27, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 39-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12) >> 51 | SafeLoadAs(in + 8 * 13) << 13, - }; - 
shifts = simd_batch{ 12, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 39-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 13) >> 26 | SafeLoadAs(in + 8 * 14) << 38, - SafeLoadAs(in + 8 * 14), - }; - shifts = simd_batch{ 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 39-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 14) >> 40 | SafeLoadAs(in + 8 * 15) << 24, - SafeLoadAs(in + 8 * 15), - }; - shifts = simd_batch{ 0, 15 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 39-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 15) >> 54 | SafeLoadAs(in + 8 * 16) << 10, - SafeLoadAs(in + 8 * 16) >> 29 | SafeLoadAs(in + 8 * 17) << 35, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 39-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 17), - SafeLoadAs(in + 8 * 17) >> 43 | SafeLoadAs(in + 8 * 18) << 21, - }; - shifts = simd_batch{ 4, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 39-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 18), - SafeLoadAs(in + 8 * 18) >> 57 | SafeLoadAs(in + 8 * 19) << 7, - }; - shifts = simd_batch{ 18, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 39-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 19) >> 32 | SafeLoadAs(in + 8 * 20) << 32, - SafeLoadAs(in + 8 * 20), - }; - shifts = simd_batch{ 0, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 39-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 20) >> 46 | SafeLoadAs(in + 8 * 21) << 18, - SafeLoadAs(in + 8 * 21), - }; - shifts = simd_batch{ 0, 21 }; - results = (words >> 
shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 39-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 21) >> 60 | SafeLoadAs(in + 8 * 22) << 4, - SafeLoadAs(in + 8 * 22) >> 35 | SafeLoadAs(in + 8 * 23) << 29, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 39-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 23), - SafeLoadAs(in + 8 * 23) >> 49 | SafeLoadAs(in + 8 * 24) << 15, - }; - shifts = simd_batch{ 10, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 39-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 24), - SafeLoadAs(in + 8 * 24) >> 63 | SafeLoadAs(in + 8 * 25) << 1, - }; - shifts = simd_batch{ 24, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 39-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 25) >> 38 | SafeLoadAs(in + 8 * 26) << 26, - SafeLoadAs(in + 8 * 26), - }; - shifts = simd_batch{ 0, 13 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 39-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 26) >> 52 | SafeLoadAs(in + 8 * 27) << 12, - SafeLoadAs(in + 8 * 27) >> 27 | SafeLoadAs(in + 8 * 28) << 37, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 39-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 28), - SafeLoadAs(in + 8 * 28) >> 41 | SafeLoadAs(in + 8 * 29) << 23, - }; - shifts = simd_batch{ 2, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 39-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 29), - SafeLoadAs(in + 8 * 29) >> 55 | SafeLoadAs(in + 8 * 30) << 9, - }; - shifts = simd_batch{ 16, 0 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 2; - - // extract 39-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 30) >> 30 | SafeLoadAs(in + 8 * 31) << 34, - SafeLoadAs(in + 8 * 31), - }; - shifts = simd_batch{ 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 39-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 31) >> 44 | SafeLoadAs(in + 8 * 32) << 20, - SafeLoadAs(in + 8 * 32), - }; - shifts = simd_batch{ 0, 19 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 39-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 32) >> 58 | SafeLoadAs(in + 8 * 33) << 6, - SafeLoadAs(in + 8 * 33) >> 33 | SafeLoadAs(in + 8 * 34) << 31, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 39-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 34), - SafeLoadAs(in + 8 * 34) >> 47 | SafeLoadAs(in + 8 * 35) << 17, - }; - shifts = simd_batch{ 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 39-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 35), - SafeLoadAs(in + 8 * 35) >> 61 | SafeLoadAs(in + 8 * 36) << 3, - }; - shifts = simd_batch{ 22, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 39-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 36) >> 36 | SafeLoadAs(in + 8 * 37) << 28, - SafeLoadAs(in + 8 * 37), - }; - shifts = simd_batch{ 0, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 39-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 37) >> 50 | SafeLoadAs(in + 8 * 38) << 14, - SafeLoadAs(in + 8 * 38), - }; - shifts = simd_batch{ 0, 25 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 39 
* 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0xffffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 40-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 40 | SafeLoadAs(in + 8 * 1) << 24, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 40-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1) >> 56 | SafeLoadAs(in + 8 * 2) << 8, - }; - shifts = simd_batch{ 16, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 40-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 2) >> 32 | SafeLoadAs(in + 8 * 3) << 32, - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 40-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 3) >> 48 | SafeLoadAs(in + 8 * 4) << 16, - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 0, 24 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 40-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5) >> 40 | SafeLoadAs(in + 8 * 6) << 24, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 40-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6) >> 56 | SafeLoadAs(in + 8 * 7) << 8, - }; - shifts = simd_batch{ 16, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 40-bit bundles 12 to 
13 - words = simd_batch{ - SafeLoadAs(in + 8 * 7) >> 32 | SafeLoadAs(in + 8 * 8) << 32, - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 40-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 8) >> 48 | SafeLoadAs(in + 8 * 9) << 16, - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 0, 24 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 40-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10) >> 40 | SafeLoadAs(in + 8 * 11) << 24, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 40-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11) >> 56 | SafeLoadAs(in + 8 * 12) << 8, - }; - shifts = simd_batch{ 16, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 40-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 12) >> 32 | SafeLoadAs(in + 8 * 13) << 32, - SafeLoadAs(in + 8 * 13), - }; - shifts = simd_batch{ 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 40-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 13) >> 48 | SafeLoadAs(in + 8 * 14) << 16, - SafeLoadAs(in + 8 * 14), - }; - shifts = simd_batch{ 0, 24 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 40-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15) >> 40 | SafeLoadAs(in + 8 * 16) << 24, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 40-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 16) >> 56 | SafeLoadAs(in 
+ 8 * 17) << 8, - }; - shifts = simd_batch{ 16, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 40-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 17) >> 32 | SafeLoadAs(in + 8 * 18) << 32, - SafeLoadAs(in + 8 * 18), - }; - shifts = simd_batch{ 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 40-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 18) >> 48 | SafeLoadAs(in + 8 * 19) << 16, - SafeLoadAs(in + 8 * 19), - }; - shifts = simd_batch{ 0, 24 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 40-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 20), - SafeLoadAs(in + 8 * 20) >> 40 | SafeLoadAs(in + 8 * 21) << 24, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 40-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 21), - SafeLoadAs(in + 8 * 21) >> 56 | SafeLoadAs(in + 8 * 22) << 8, - }; - shifts = simd_batch{ 16, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 40-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 22) >> 32 | SafeLoadAs(in + 8 * 23) << 32, - SafeLoadAs(in + 8 * 23), - }; - shifts = simd_batch{ 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 40-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 23) >> 48 | SafeLoadAs(in + 8 * 24) << 16, - SafeLoadAs(in + 8 * 24), - }; - shifts = simd_batch{ 0, 24 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 40-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 25), - SafeLoadAs(in + 8 * 25) >> 40 | SafeLoadAs(in + 8 * 26) << 24, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 2; - - // extract 40-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 26), - SafeLoadAs(in + 8 * 26) >> 56 | SafeLoadAs(in + 8 * 27) << 8, - }; - shifts = simd_batch{ 16, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 40-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 27) >> 32 | SafeLoadAs(in + 8 * 28) << 32, - SafeLoadAs(in + 8 * 28), - }; - shifts = simd_batch{ 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 40-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 28) >> 48 | SafeLoadAs(in + 8 * 29) << 16, - SafeLoadAs(in + 8 * 29), - }; - shifts = simd_batch{ 0, 24 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 40-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 30), - SafeLoadAs(in + 8 * 30) >> 40 | SafeLoadAs(in + 8 * 31) << 24, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 40-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 31), - SafeLoadAs(in + 8 * 31) >> 56 | SafeLoadAs(in + 8 * 32) << 8, - }; - shifts = simd_batch{ 16, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 40-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 32) >> 32 | SafeLoadAs(in + 8 * 33) << 32, - SafeLoadAs(in + 8 * 33), - }; - shifts = simd_batch{ 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 40-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 33) >> 48 | SafeLoadAs(in + 8 * 34) << 16, - SafeLoadAs(in + 8 * 34), - }; - shifts = simd_batch{ 0, 24 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 40-bit bundles 56 to 57 - words = 
simd_batch{ - SafeLoadAs(in + 8 * 35), - SafeLoadAs(in + 8 * 35) >> 40 | SafeLoadAs(in + 8 * 36) << 24, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 40-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 36), - SafeLoadAs(in + 8 * 36) >> 56 | SafeLoadAs(in + 8 * 37) << 8, - }; - shifts = simd_batch{ 16, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 40-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 37) >> 32 | SafeLoadAs(in + 8 * 38) << 32, - SafeLoadAs(in + 8 * 38), - }; - shifts = simd_batch{ 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 40-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 38) >> 48 | SafeLoadAs(in + 8 * 39) << 16, - SafeLoadAs(in + 8 * 39), - }; - shifts = simd_batch{ 0, 24 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 40 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x1ffffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 41-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 41 | SafeLoadAs(in + 8 * 1) << 23, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 41-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1) >> 59 | SafeLoadAs(in + 8 * 2) << 5, - }; - shifts = simd_batch{ 18, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 41-bit bundles 4 to 5 - words = 
simd_batch{ - SafeLoadAs(in + 8 * 2) >> 36 | SafeLoadAs(in + 8 * 3) << 28, - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 0, 13 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 41-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 3) >> 54 | SafeLoadAs(in + 8 * 4) << 10, - SafeLoadAs(in + 8 * 4) >> 31 | SafeLoadAs(in + 8 * 5) << 33, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 41-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5) >> 49 | SafeLoadAs(in + 8 * 6) << 15, - }; - shifts = simd_batch{ 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 41-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 6) >> 26 | SafeLoadAs(in + 8 * 7) << 38, - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 41-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 7) >> 44 | SafeLoadAs(in + 8 * 8) << 20, - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 0, 21 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 41-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 8) >> 62 | SafeLoadAs(in + 8 * 9) << 2, - SafeLoadAs(in + 8 * 9) >> 39 | SafeLoadAs(in + 8 * 10) << 25, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 41-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10) >> 57 | SafeLoadAs(in + 8 * 11) << 7, - }; - shifts = simd_batch{ 16, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 41-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 11) >> 
34 | SafeLoadAs(in + 8 * 12) << 30, - SafeLoadAs(in + 8 * 12), - }; - shifts = simd_batch{ 0, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 41-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 12) >> 52 | SafeLoadAs(in + 8 * 13) << 12, - SafeLoadAs(in + 8 * 13) >> 29 | SafeLoadAs(in + 8 * 14) << 35, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 41-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14) >> 47 | SafeLoadAs(in + 8 * 15) << 17, - }; - shifts = simd_batch{ 6, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 41-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 15) >> 24 | SafeLoadAs(in + 8 * 16) << 40, - SafeLoadAs(in + 8 * 16), - }; - shifts = simd_batch{ 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 41-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 16) >> 42 | SafeLoadAs(in + 8 * 17) << 22, - SafeLoadAs(in + 8 * 17), - }; - shifts = simd_batch{ 0, 19 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 41-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 17) >> 60 | SafeLoadAs(in + 8 * 18) << 4, - SafeLoadAs(in + 8 * 18) >> 37 | SafeLoadAs(in + 8 * 19) << 27, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 41-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 19), - SafeLoadAs(in + 8 * 19) >> 55 | SafeLoadAs(in + 8 * 20) << 9, - }; - shifts = simd_batch{ 14, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 41-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 20) >> 32 | SafeLoadAs(in 
+ 8 * 21) << 32, - SafeLoadAs(in + 8 * 21), - }; - shifts = simd_batch{ 0, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 41-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 21) >> 50 | SafeLoadAs(in + 8 * 22) << 14, - SafeLoadAs(in + 8 * 22) >> 27 | SafeLoadAs(in + 8 * 23) << 37, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 41-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 23), - SafeLoadAs(in + 8 * 23) >> 45 | SafeLoadAs(in + 8 * 24) << 19, - }; - shifts = simd_batch{ 4, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 41-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 24), - SafeLoadAs(in + 8 * 24) >> 63 | SafeLoadAs(in + 8 * 25) << 1, - }; - shifts = simd_batch{ 22, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 41-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 25) >> 40 | SafeLoadAs(in + 8 * 26) << 24, - SafeLoadAs(in + 8 * 26), - }; - shifts = simd_batch{ 0, 17 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 41-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 26) >> 58 | SafeLoadAs(in + 8 * 27) << 6, - SafeLoadAs(in + 8 * 27) >> 35 | SafeLoadAs(in + 8 * 28) << 29, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 41-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 28), - SafeLoadAs(in + 8 * 28) >> 53 | SafeLoadAs(in + 8 * 29) << 11, - }; - shifts = simd_batch{ 12, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 41-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 29) >> 30 | SafeLoadAs(in + 8 * 30) << 34, - 
SafeLoadAs(in + 8 * 30), - }; - shifts = simd_batch{ 0, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 41-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 30) >> 48 | SafeLoadAs(in + 8 * 31) << 16, - SafeLoadAs(in + 8 * 31) >> 25 | SafeLoadAs(in + 8 * 32) << 39, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 41-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 32), - SafeLoadAs(in + 8 * 32) >> 43 | SafeLoadAs(in + 8 * 33) << 21, - }; - shifts = simd_batch{ 2, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 41-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 33), - SafeLoadAs(in + 8 * 33) >> 61 | SafeLoadAs(in + 8 * 34) << 3, - }; - shifts = simd_batch{ 20, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 41-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 34) >> 38 | SafeLoadAs(in + 8 * 35) << 26, - SafeLoadAs(in + 8 * 35), - }; - shifts = simd_batch{ 0, 15 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 41-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 35) >> 56 | SafeLoadAs(in + 8 * 36) << 8, - SafeLoadAs(in + 8 * 36) >> 33 | SafeLoadAs(in + 8 * 37) << 31, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 41-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 37), - SafeLoadAs(in + 8 * 37) >> 51 | SafeLoadAs(in + 8 * 38) << 13, - }; - shifts = simd_batch{ 10, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 41-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 38) >> 28 | SafeLoadAs(in + 8 * 39) << 36, - SafeLoadAs(in + 8 
* 39), - }; - shifts = simd_batch{ 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 41-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 39) >> 46 | SafeLoadAs(in + 8 * 40) << 18, - SafeLoadAs(in + 8 * 40), - }; - shifts = simd_batch{ 0, 23 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 41 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x3ffffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 42-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 42 | SafeLoadAs(in + 8 * 1) << 22, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 42-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1) >> 62 | SafeLoadAs(in + 8 * 2) << 2, - }; - shifts = simd_batch{ 20, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 42-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 2) >> 40 | SafeLoadAs(in + 8 * 3) << 24, - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 0, 18 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 42-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 3) >> 60 | SafeLoadAs(in + 8 * 4) << 4, - SafeLoadAs(in + 8 * 4) >> 38 | SafeLoadAs(in + 8 * 5) << 26, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 42-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5) >> 58 | 
SafeLoadAs(in + 8 * 6) << 6, - }; - shifts = simd_batch{ 16, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 42-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 6) >> 36 | SafeLoadAs(in + 8 * 7) << 28, - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 0, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 42-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 7) >> 56 | SafeLoadAs(in + 8 * 8) << 8, - SafeLoadAs(in + 8 * 8) >> 34 | SafeLoadAs(in + 8 * 9) << 30, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 42-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9) >> 54 | SafeLoadAs(in + 8 * 10) << 10, - }; - shifts = simd_batch{ 12, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 42-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 10) >> 32 | SafeLoadAs(in + 8 * 11) << 32, - SafeLoadAs(in + 8 * 11), - }; - shifts = simd_batch{ 0, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 42-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 11) >> 52 | SafeLoadAs(in + 8 * 12) << 12, - SafeLoadAs(in + 8 * 12) >> 30 | SafeLoadAs(in + 8 * 13) << 34, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 42-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13) >> 50 | SafeLoadAs(in + 8 * 14) << 14, - }; - shifts = simd_batch{ 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 42-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 14) >> 28 | SafeLoadAs(in + 8 * 15) << 36, - SafeLoadAs(in + 8 * 
15), - }; - shifts = simd_batch{ 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 42-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 15) >> 48 | SafeLoadAs(in + 8 * 16) << 16, - SafeLoadAs(in + 8 * 16) >> 26 | SafeLoadAs(in + 8 * 17) << 38, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 42-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 17), - SafeLoadAs(in + 8 * 17) >> 46 | SafeLoadAs(in + 8 * 18) << 18, - }; - shifts = simd_batch{ 4, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 42-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 18) >> 24 | SafeLoadAs(in + 8 * 19) << 40, - SafeLoadAs(in + 8 * 19), - }; - shifts = simd_batch{ 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 42-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 19) >> 44 | SafeLoadAs(in + 8 * 20) << 20, - SafeLoadAs(in + 8 * 20), - }; - shifts = simd_batch{ 0, 22 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 42-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 21), - SafeLoadAs(in + 8 * 21) >> 42 | SafeLoadAs(in + 8 * 22) << 22, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 42-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 22), - SafeLoadAs(in + 8 * 22) >> 62 | SafeLoadAs(in + 8 * 23) << 2, - }; - shifts = simd_batch{ 20, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 42-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 23) >> 40 | SafeLoadAs(in + 8 * 24) << 24, - SafeLoadAs(in + 8 * 24), - }; - shifts = simd_batch{ 0, 18 }; - results = 
(words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 42-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 24) >> 60 | SafeLoadAs(in + 8 * 25) << 4, - SafeLoadAs(in + 8 * 25) >> 38 | SafeLoadAs(in + 8 * 26) << 26, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 42-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 26), - SafeLoadAs(in + 8 * 26) >> 58 | SafeLoadAs(in + 8 * 27) << 6, - }; - shifts = simd_batch{ 16, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 42-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 27) >> 36 | SafeLoadAs(in + 8 * 28) << 28, - SafeLoadAs(in + 8 * 28), - }; - shifts = simd_batch{ 0, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 42-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 28) >> 56 | SafeLoadAs(in + 8 * 29) << 8, - SafeLoadAs(in + 8 * 29) >> 34 | SafeLoadAs(in + 8 * 30) << 30, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 42-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 30), - SafeLoadAs(in + 8 * 30) >> 54 | SafeLoadAs(in + 8 * 31) << 10, - }; - shifts = simd_batch{ 12, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 42-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 31) >> 32 | SafeLoadAs(in + 8 * 32) << 32, - SafeLoadAs(in + 8 * 32), - }; - shifts = simd_batch{ 0, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 42-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 32) >> 52 | SafeLoadAs(in + 8 * 33) << 12, - SafeLoadAs(in + 8 * 33) >> 30 | SafeLoadAs(in + 8 * 34) << 34, - }; - shifts = simd_batch{ 
0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 42-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 34), - SafeLoadAs(in + 8 * 34) >> 50 | SafeLoadAs(in + 8 * 35) << 14, - }; - shifts = simd_batch{ 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 42-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 35) >> 28 | SafeLoadAs(in + 8 * 36) << 36, - SafeLoadAs(in + 8 * 36), - }; - shifts = simd_batch{ 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 42-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 36) >> 48 | SafeLoadAs(in + 8 * 37) << 16, - SafeLoadAs(in + 8 * 37) >> 26 | SafeLoadAs(in + 8 * 38) << 38, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 42-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 38), - SafeLoadAs(in + 8 * 38) >> 46 | SafeLoadAs(in + 8 * 39) << 18, - }; - shifts = simd_batch{ 4, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 42-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 39) >> 24 | SafeLoadAs(in + 8 * 40) << 40, - SafeLoadAs(in + 8 * 40), - }; - shifts = simd_batch{ 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 42-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 40) >> 44 | SafeLoadAs(in + 8 * 41) << 20, - SafeLoadAs(in + 8 * 41), - }; - shifts = simd_batch{ 0, 22 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 42 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, 
uint64_t* out) { - constexpr uint64_t kMask = 0x7ffffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 43-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 43 | SafeLoadAs(in + 8 * 1) << 21, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 43-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 1) >> 22 | SafeLoadAs(in + 8 * 2) << 42, - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 43-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 2) >> 44 | SafeLoadAs(in + 8 * 3) << 20, - SafeLoadAs(in + 8 * 3) >> 23 | SafeLoadAs(in + 8 * 4) << 41, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 43-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4) >> 45 | SafeLoadAs(in + 8 * 5) << 19, - }; - shifts = simd_batch{ 2, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 43-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 5) >> 24 | SafeLoadAs(in + 8 * 6) << 40, - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 43-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 6) >> 46 | SafeLoadAs(in + 8 * 7) << 18, - SafeLoadAs(in + 8 * 7) >> 25 | SafeLoadAs(in + 8 * 8) << 39, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 43-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8) >> 47 | SafeLoadAs(in + 8 * 9) << 17, - }; - shifts = simd_batch{ 4, 0 
}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 43-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 9) >> 26 | SafeLoadAs(in + 8 * 10) << 38, - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 43-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 10) >> 48 | SafeLoadAs(in + 8 * 11) << 16, - SafeLoadAs(in + 8 * 11) >> 27 | SafeLoadAs(in + 8 * 12) << 37, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 43-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12) >> 49 | SafeLoadAs(in + 8 * 13) << 15, - }; - shifts = simd_batch{ 6, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 43-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 13) >> 28 | SafeLoadAs(in + 8 * 14) << 36, - SafeLoadAs(in + 8 * 14), - }; - shifts = simd_batch{ 0, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 43-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 14) >> 50 | SafeLoadAs(in + 8 * 15) << 14, - SafeLoadAs(in + 8 * 15) >> 29 | SafeLoadAs(in + 8 * 16) << 35, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 43-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 16) >> 51 | SafeLoadAs(in + 8 * 17) << 13, - }; - shifts = simd_batch{ 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 43-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 17) >> 30 | SafeLoadAs(in + 8 * 18) << 34, - SafeLoadAs(in + 8 * 18), - }; - shifts = simd_batch{ 0, 9 }; - results = 
(words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 43-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 18) >> 52 | SafeLoadAs(in + 8 * 19) << 12, - SafeLoadAs(in + 8 * 19) >> 31 | SafeLoadAs(in + 8 * 20) << 33, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 43-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 20), - SafeLoadAs(in + 8 * 20) >> 53 | SafeLoadAs(in + 8 * 21) << 11, - }; - shifts = simd_batch{ 10, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 43-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 21) >> 32 | SafeLoadAs(in + 8 * 22) << 32, - SafeLoadAs(in + 8 * 22), - }; - shifts = simd_batch{ 0, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 43-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 22) >> 54 | SafeLoadAs(in + 8 * 23) << 10, - SafeLoadAs(in + 8 * 23) >> 33 | SafeLoadAs(in + 8 * 24) << 31, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 43-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 24), - SafeLoadAs(in + 8 * 24) >> 55 | SafeLoadAs(in + 8 * 25) << 9, - }; - shifts = simd_batch{ 12, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 43-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 25) >> 34 | SafeLoadAs(in + 8 * 26) << 30, - SafeLoadAs(in + 8 * 26), - }; - shifts = simd_batch{ 0, 13 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 43-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 26) >> 56 | SafeLoadAs(in + 8 * 27) << 8, - SafeLoadAs(in + 8 * 27) >> 35 | SafeLoadAs(in + 8 * 28) << 29, - }; - shifts = simd_batch{ 
0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 43-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 28), - SafeLoadAs(in + 8 * 28) >> 57 | SafeLoadAs(in + 8 * 29) << 7, - }; - shifts = simd_batch{ 14, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 43-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 29) >> 36 | SafeLoadAs(in + 8 * 30) << 28, - SafeLoadAs(in + 8 * 30), - }; - shifts = simd_batch{ 0, 15 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 43-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 30) >> 58 | SafeLoadAs(in + 8 * 31) << 6, - SafeLoadAs(in + 8 * 31) >> 37 | SafeLoadAs(in + 8 * 32) << 27, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 43-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 32), - SafeLoadAs(in + 8 * 32) >> 59 | SafeLoadAs(in + 8 * 33) << 5, - }; - shifts = simd_batch{ 16, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 43-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 33) >> 38 | SafeLoadAs(in + 8 * 34) << 26, - SafeLoadAs(in + 8 * 34), - }; - shifts = simd_batch{ 0, 17 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 43-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 34) >> 60 | SafeLoadAs(in + 8 * 35) << 4, - SafeLoadAs(in + 8 * 35) >> 39 | SafeLoadAs(in + 8 * 36) << 25, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 43-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 36), - SafeLoadAs(in + 8 * 36) >> 61 | SafeLoadAs(in + 8 * 37) << 3, - }; - shifts = simd_batch{ 18, 0 }; - results = 
(words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 43-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 37) >> 40 | SafeLoadAs(in + 8 * 38) << 24, - SafeLoadAs(in + 8 * 38), - }; - shifts = simd_batch{ 0, 19 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 43-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 38) >> 62 | SafeLoadAs(in + 8 * 39) << 2, - SafeLoadAs(in + 8 * 39) >> 41 | SafeLoadAs(in + 8 * 40) << 23, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 43-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 40), - SafeLoadAs(in + 8 * 40) >> 63 | SafeLoadAs(in + 8 * 41) << 1, - }; - shifts = simd_batch{ 20, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 43-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 41) >> 42 | SafeLoadAs(in + 8 * 42) << 22, - SafeLoadAs(in + 8 * 42), - }; - shifts = simd_batch{ 0, 21 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 43 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0xfffffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 44-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 44 | SafeLoadAs(in + 8 * 1) << 20, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 44-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 1) >> 24 | SafeLoadAs(in + 8 * 2) << 40, - SafeLoadAs(in + 8 * 2), - }; - shifts 
= simd_batch{ 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 44-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 2) >> 48 | SafeLoadAs(in + 8 * 3) << 16, - SafeLoadAs(in + 8 * 3) >> 28 | SafeLoadAs(in + 8 * 4) << 36, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 44-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4) >> 52 | SafeLoadAs(in + 8 * 5) << 12, - }; - shifts = simd_batch{ 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 44-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 5) >> 32 | SafeLoadAs(in + 8 * 6) << 32, - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 0, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 44-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 6) >> 56 | SafeLoadAs(in + 8 * 7) << 8, - SafeLoadAs(in + 8 * 7) >> 36 | SafeLoadAs(in + 8 * 8) << 28, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 44-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8) >> 60 | SafeLoadAs(in + 8 * 9) << 4, - }; - shifts = simd_batch{ 16, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 44-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 9) >> 40 | SafeLoadAs(in + 8 * 10) << 24, - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 0, 20 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 44-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11) >> 44 | SafeLoadAs(in + 8 * 12) << 20, - }; - shifts = simd_batch{ 0, 0 }; - results = (words 
>> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 44-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 12) >> 24 | SafeLoadAs(in + 8 * 13) << 40, - SafeLoadAs(in + 8 * 13), - }; - shifts = simd_batch{ 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 44-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 13) >> 48 | SafeLoadAs(in + 8 * 14) << 16, - SafeLoadAs(in + 8 * 14) >> 28 | SafeLoadAs(in + 8 * 15) << 36, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 44-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15) >> 52 | SafeLoadAs(in + 8 * 16) << 12, - }; - shifts = simd_batch{ 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 44-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 16) >> 32 | SafeLoadAs(in + 8 * 17) << 32, - SafeLoadAs(in + 8 * 17), - }; - shifts = simd_batch{ 0, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 44-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 17) >> 56 | SafeLoadAs(in + 8 * 18) << 8, - SafeLoadAs(in + 8 * 18) >> 36 | SafeLoadAs(in + 8 * 19) << 28, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 44-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 19), - SafeLoadAs(in + 8 * 19) >> 60 | SafeLoadAs(in + 8 * 20) << 4, - }; - shifts = simd_batch{ 16, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 44-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 20) >> 40 | SafeLoadAs(in + 8 * 21) << 24, - SafeLoadAs(in + 8 * 21), - }; - shifts = simd_batch{ 0, 20 }; - results = (words >> shifts) & 
masks; - results.store_unaligned(out); - out += 2; - - // extract 44-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 22), - SafeLoadAs(in + 8 * 22) >> 44 | SafeLoadAs(in + 8 * 23) << 20, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 44-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 23) >> 24 | SafeLoadAs(in + 8 * 24) << 40, - SafeLoadAs(in + 8 * 24), - }; - shifts = simd_batch{ 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 44-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 24) >> 48 | SafeLoadAs(in + 8 * 25) << 16, - SafeLoadAs(in + 8 * 25) >> 28 | SafeLoadAs(in + 8 * 26) << 36, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 44-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 26), - SafeLoadAs(in + 8 * 26) >> 52 | SafeLoadAs(in + 8 * 27) << 12, - }; - shifts = simd_batch{ 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 44-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 27) >> 32 | SafeLoadAs(in + 8 * 28) << 32, - SafeLoadAs(in + 8 * 28), - }; - shifts = simd_batch{ 0, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 44-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 28) >> 56 | SafeLoadAs(in + 8 * 29) << 8, - SafeLoadAs(in + 8 * 29) >> 36 | SafeLoadAs(in + 8 * 30) << 28, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 44-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 30), - SafeLoadAs(in + 8 * 30) >> 60 | SafeLoadAs(in + 8 * 31) << 4, - }; - shifts = simd_batch{ 16, 0 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 2; - - // extract 44-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 31) >> 40 | SafeLoadAs(in + 8 * 32) << 24, - SafeLoadAs(in + 8 * 32), - }; - shifts = simd_batch{ 0, 20 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 44-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 33), - SafeLoadAs(in + 8 * 33) >> 44 | SafeLoadAs(in + 8 * 34) << 20, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 44-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 34) >> 24 | SafeLoadAs(in + 8 * 35) << 40, - SafeLoadAs(in + 8 * 35), - }; - shifts = simd_batch{ 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 44-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 35) >> 48 | SafeLoadAs(in + 8 * 36) << 16, - SafeLoadAs(in + 8 * 36) >> 28 | SafeLoadAs(in + 8 * 37) << 36, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 44-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 37), - SafeLoadAs(in + 8 * 37) >> 52 | SafeLoadAs(in + 8 * 38) << 12, - }; - shifts = simd_batch{ 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 44-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 38) >> 32 | SafeLoadAs(in + 8 * 39) << 32, - SafeLoadAs(in + 8 * 39), - }; - shifts = simd_batch{ 0, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 44-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 39) >> 56 | SafeLoadAs(in + 8 * 40) << 8, - SafeLoadAs(in + 8 * 40) >> 36 | SafeLoadAs(in + 8 * 41) << 28, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 2; - - // extract 44-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 41), - SafeLoadAs(in + 8 * 41) >> 60 | SafeLoadAs(in + 8 * 42) << 4, - }; - shifts = simd_batch{ 16, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 44-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 42) >> 40 | SafeLoadAs(in + 8 * 43) << 24, - SafeLoadAs(in + 8 * 43), - }; - shifts = simd_batch{ 0, 20 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 44 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x1fffffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 45-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 45 | SafeLoadAs(in + 8 * 1) << 19, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 45-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 1) >> 26 | SafeLoadAs(in + 8 * 2) << 38, - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 0, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 45-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 2) >> 52 | SafeLoadAs(in + 8 * 3) << 12, - SafeLoadAs(in + 8 * 3) >> 33 | SafeLoadAs(in + 8 * 4) << 31, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 45-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4) >> 59 | SafeLoadAs(in + 8 * 5) << 5, - }; - shifts = simd_batch{ 14, 0 }; - results = 
(words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 45-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 5) >> 40 | SafeLoadAs(in + 8 * 6) << 24, - SafeLoadAs(in + 8 * 6) >> 21 | SafeLoadAs(in + 8 * 7) << 43, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 45-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7) >> 47 | SafeLoadAs(in + 8 * 8) << 17, - }; - shifts = simd_batch{ 2, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 45-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 8) >> 28 | SafeLoadAs(in + 8 * 9) << 36, - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 0, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 45-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 9) >> 54 | SafeLoadAs(in + 8 * 10) << 10, - SafeLoadAs(in + 8 * 10) >> 35 | SafeLoadAs(in + 8 * 11) << 29, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 45-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11) >> 61 | SafeLoadAs(in + 8 * 12) << 3, - }; - shifts = simd_batch{ 16, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 45-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 12) >> 42 | SafeLoadAs(in + 8 * 13) << 22, - SafeLoadAs(in + 8 * 13) >> 23 | SafeLoadAs(in + 8 * 14) << 41, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 45-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14) >> 49 | SafeLoadAs(in + 8 * 15) << 15, - }; - shifts = simd_batch{ 4, 0 }; - 
results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 45-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 15) >> 30 | SafeLoadAs(in + 8 * 16) << 34, - SafeLoadAs(in + 8 * 16), - }; - shifts = simd_batch{ 0, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 45-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 16) >> 56 | SafeLoadAs(in + 8 * 17) << 8, - SafeLoadAs(in + 8 * 17) >> 37 | SafeLoadAs(in + 8 * 18) << 27, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 45-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 18), - SafeLoadAs(in + 8 * 18) >> 63 | SafeLoadAs(in + 8 * 19) << 1, - }; - shifts = simd_batch{ 18, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 45-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 19) >> 44 | SafeLoadAs(in + 8 * 20) << 20, - SafeLoadAs(in + 8 * 20) >> 25 | SafeLoadAs(in + 8 * 21) << 39, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 45-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 21), - SafeLoadAs(in + 8 * 21) >> 51 | SafeLoadAs(in + 8 * 22) << 13, - }; - shifts = simd_batch{ 6, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 45-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 22) >> 32 | SafeLoadAs(in + 8 * 23) << 32, - SafeLoadAs(in + 8 * 23), - }; - shifts = simd_batch{ 0, 13 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 45-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 23) >> 58 | SafeLoadAs(in + 8 * 24) << 6, - SafeLoadAs(in + 8 * 24) >> 39 | SafeLoadAs(in + 8 * 25) << 25, - }; - shifts = 
simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 45-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 25) >> 20 | SafeLoadAs(in + 8 * 26) << 44, - SafeLoadAs(in + 8 * 26), - }; - shifts = simd_batch{ 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 45-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 26) >> 46 | SafeLoadAs(in + 8 * 27) << 18, - SafeLoadAs(in + 8 * 27) >> 27 | SafeLoadAs(in + 8 * 28) << 37, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 45-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 28), - SafeLoadAs(in + 8 * 28) >> 53 | SafeLoadAs(in + 8 * 29) << 11, - }; - shifts = simd_batch{ 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 45-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 29) >> 34 | SafeLoadAs(in + 8 * 30) << 30, - SafeLoadAs(in + 8 * 30), - }; - shifts = simd_batch{ 0, 15 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 45-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 30) >> 60 | SafeLoadAs(in + 8 * 31) << 4, - SafeLoadAs(in + 8 * 31) >> 41 | SafeLoadAs(in + 8 * 32) << 23, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 45-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 32) >> 22 | SafeLoadAs(in + 8 * 33) << 42, - SafeLoadAs(in + 8 * 33), - }; - shifts = simd_batch{ 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 45-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 33) >> 48 | SafeLoadAs(in + 8 * 34) << 16, - SafeLoadAs(in + 8 * 34) >> 29 | SafeLoadAs(in + 8 * 35) << 
35, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 45-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 35), - SafeLoadAs(in + 8 * 35) >> 55 | SafeLoadAs(in + 8 * 36) << 9, - }; - shifts = simd_batch{ 10, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 45-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 36) >> 36 | SafeLoadAs(in + 8 * 37) << 28, - SafeLoadAs(in + 8 * 37), - }; - shifts = simd_batch{ 0, 17 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 45-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 37) >> 62 | SafeLoadAs(in + 8 * 38) << 2, - SafeLoadAs(in + 8 * 38) >> 43 | SafeLoadAs(in + 8 * 39) << 21, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 45-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 39) >> 24 | SafeLoadAs(in + 8 * 40) << 40, - SafeLoadAs(in + 8 * 40), - }; - shifts = simd_batch{ 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 45-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 40) >> 50 | SafeLoadAs(in + 8 * 41) << 14, - SafeLoadAs(in + 8 * 41) >> 31 | SafeLoadAs(in + 8 * 42) << 33, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 45-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 42), - SafeLoadAs(in + 8 * 42) >> 57 | SafeLoadAs(in + 8 * 43) << 7, - }; - shifts = simd_batch{ 12, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 45-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 43) >> 38 | SafeLoadAs(in + 8 * 44) << 26, - SafeLoadAs(in + 8 * 44), - }; - shifts = 
simd_batch{ 0, 19 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 45 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x3fffffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 46-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 46 | SafeLoadAs(in + 8 * 1) << 18, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 46-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 1) >> 28 | SafeLoadAs(in + 8 * 2) << 36, - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 0, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 46-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 2) >> 56 | SafeLoadAs(in + 8 * 3) << 8, - SafeLoadAs(in + 8 * 3) >> 38 | SafeLoadAs(in + 8 * 4) << 26, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 46-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 4) >> 20 | SafeLoadAs(in + 8 * 5) << 44, - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 46-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 5) >> 48 | SafeLoadAs(in + 8 * 6) << 16, - SafeLoadAs(in + 8 * 6) >> 30 | SafeLoadAs(in + 8 * 7) << 34, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 46-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7) 
>> 58 | SafeLoadAs(in + 8 * 8) << 6, - }; - shifts = simd_batch{ 12, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 46-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 8) >> 40 | SafeLoadAs(in + 8 * 9) << 24, - SafeLoadAs(in + 8 * 9) >> 22 | SafeLoadAs(in + 8 * 10) << 42, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 46-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10) >> 50 | SafeLoadAs(in + 8 * 11) << 14, - }; - shifts = simd_batch{ 4, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 46-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 11) >> 32 | SafeLoadAs(in + 8 * 12) << 32, - SafeLoadAs(in + 8 * 12), - }; - shifts = simd_batch{ 0, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 46-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 12) >> 60 | SafeLoadAs(in + 8 * 13) << 4, - SafeLoadAs(in + 8 * 13) >> 42 | SafeLoadAs(in + 8 * 14) << 22, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 46-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 14) >> 24 | SafeLoadAs(in + 8 * 15) << 40, - SafeLoadAs(in + 8 * 15), - }; - shifts = simd_batch{ 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 46-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 15) >> 52 | SafeLoadAs(in + 8 * 16) << 12, - SafeLoadAs(in + 8 * 16) >> 34 | SafeLoadAs(in + 8 * 17) << 30, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 46-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 17), - 
SafeLoadAs(in + 8 * 17) >> 62 | SafeLoadAs(in + 8 * 18) << 2, - }; - shifts = simd_batch{ 16, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 46-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 18) >> 44 | SafeLoadAs(in + 8 * 19) << 20, - SafeLoadAs(in + 8 * 19) >> 26 | SafeLoadAs(in + 8 * 20) << 38, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 46-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 20), - SafeLoadAs(in + 8 * 20) >> 54 | SafeLoadAs(in + 8 * 21) << 10, - }; - shifts = simd_batch{ 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 46-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 21) >> 36 | SafeLoadAs(in + 8 * 22) << 28, - SafeLoadAs(in + 8 * 22), - }; - shifts = simd_batch{ 0, 18 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 46-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 23), - SafeLoadAs(in + 8 * 23) >> 46 | SafeLoadAs(in + 8 * 24) << 18, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 46-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 24) >> 28 | SafeLoadAs(in + 8 * 25) << 36, - SafeLoadAs(in + 8 * 25), - }; - shifts = simd_batch{ 0, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 46-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 25) >> 56 | SafeLoadAs(in + 8 * 26) << 8, - SafeLoadAs(in + 8 * 26) >> 38 | SafeLoadAs(in + 8 * 27) << 26, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 46-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 27) >> 20 | SafeLoadAs(in 
+ 8 * 28) << 44, - SafeLoadAs(in + 8 * 28), - }; - shifts = simd_batch{ 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 46-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 28) >> 48 | SafeLoadAs(in + 8 * 29) << 16, - SafeLoadAs(in + 8 * 29) >> 30 | SafeLoadAs(in + 8 * 30) << 34, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 46-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 30), - SafeLoadAs(in + 8 * 30) >> 58 | SafeLoadAs(in + 8 * 31) << 6, - }; - shifts = simd_batch{ 12, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 46-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 31) >> 40 | SafeLoadAs(in + 8 * 32) << 24, - SafeLoadAs(in + 8 * 32) >> 22 | SafeLoadAs(in + 8 * 33) << 42, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 46-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 33), - SafeLoadAs(in + 8 * 33) >> 50 | SafeLoadAs(in + 8 * 34) << 14, - }; - shifts = simd_batch{ 4, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 46-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 34) >> 32 | SafeLoadAs(in + 8 * 35) << 32, - SafeLoadAs(in + 8 * 35), - }; - shifts = simd_batch{ 0, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 46-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 35) >> 60 | SafeLoadAs(in + 8 * 36) << 4, - SafeLoadAs(in + 8 * 36) >> 42 | SafeLoadAs(in + 8 * 37) << 22, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 46-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 37) >> 
24 | SafeLoadAs(in + 8 * 38) << 40, - SafeLoadAs(in + 8 * 38), - }; - shifts = simd_batch{ 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 46-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 38) >> 52 | SafeLoadAs(in + 8 * 39) << 12, - SafeLoadAs(in + 8 * 39) >> 34 | SafeLoadAs(in + 8 * 40) << 30, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 46-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 40), - SafeLoadAs(in + 8 * 40) >> 62 | SafeLoadAs(in + 8 * 41) << 2, - }; - shifts = simd_batch{ 16, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 46-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 41) >> 44 | SafeLoadAs(in + 8 * 42) << 20, - SafeLoadAs(in + 8 * 42) >> 26 | SafeLoadAs(in + 8 * 43) << 38, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 46-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 43), - SafeLoadAs(in + 8 * 43) >> 54 | SafeLoadAs(in + 8 * 44) << 10, - }; - shifts = simd_batch{ 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 46-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 44) >> 36 | SafeLoadAs(in + 8 * 45) << 28, - SafeLoadAs(in + 8 * 45), - }; - shifts = simd_batch{ 0, 18 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 46 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x7fffffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // 
extract 47-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 47 | SafeLoadAs(in + 8 * 1) << 17, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 47-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 1) >> 30 | SafeLoadAs(in + 8 * 2) << 34, - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 0, 13 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 47-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 2) >> 60 | SafeLoadAs(in + 8 * 3) << 4, - SafeLoadAs(in + 8 * 3) >> 43 | SafeLoadAs(in + 8 * 4) << 21, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 47-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 4) >> 26 | SafeLoadAs(in + 8 * 5) << 38, - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 0, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 47-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 5) >> 56 | SafeLoadAs(in + 8 * 6) << 8, - SafeLoadAs(in + 8 * 6) >> 39 | SafeLoadAs(in + 8 * 7) << 25, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 47-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 7) >> 22 | SafeLoadAs(in + 8 * 8) << 42, - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 47-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 8) >> 52 | SafeLoadAs(in + 8 * 9) << 12, - SafeLoadAs(in + 8 * 9) >> 35 | SafeLoadAs(in + 8 * 10) << 29, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 47-bit 
bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 10) >> 18 | SafeLoadAs(in + 8 * 11) << 46, - SafeLoadAs(in + 8 * 11), - }; - shifts = simd_batch{ 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 47-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 11) >> 48 | SafeLoadAs(in + 8 * 12) << 16, - SafeLoadAs(in + 8 * 12) >> 31 | SafeLoadAs(in + 8 * 13) << 33, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 47-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13) >> 61 | SafeLoadAs(in + 8 * 14) << 3, - }; - shifts = simd_batch{ 14, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 47-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 14) >> 44 | SafeLoadAs(in + 8 * 15) << 20, - SafeLoadAs(in + 8 * 15) >> 27 | SafeLoadAs(in + 8 * 16) << 37, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 47-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 16) >> 57 | SafeLoadAs(in + 8 * 17) << 7, - }; - shifts = simd_batch{ 10, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 47-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 17) >> 40 | SafeLoadAs(in + 8 * 18) << 24, - SafeLoadAs(in + 8 * 18) >> 23 | SafeLoadAs(in + 8 * 19) << 41, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 47-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 19), - SafeLoadAs(in + 8 * 19) >> 53 | SafeLoadAs(in + 8 * 20) << 11, - }; - shifts = simd_batch{ 6, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - 
// extract 47-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 20) >> 36 | SafeLoadAs(in + 8 * 21) << 28, - SafeLoadAs(in + 8 * 21) >> 19 | SafeLoadAs(in + 8 * 22) << 45, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 47-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 22), - SafeLoadAs(in + 8 * 22) >> 49 | SafeLoadAs(in + 8 * 23) << 15, - }; - shifts = simd_batch{ 2, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 47-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 23) >> 32 | SafeLoadAs(in + 8 * 24) << 32, - SafeLoadAs(in + 8 * 24), - }; - shifts = simd_batch{ 0, 15 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 47-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 24) >> 62 | SafeLoadAs(in + 8 * 25) << 2, - SafeLoadAs(in + 8 * 25) >> 45 | SafeLoadAs(in + 8 * 26) << 19, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 47-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 26) >> 28 | SafeLoadAs(in + 8 * 27) << 36, - SafeLoadAs(in + 8 * 27), - }; - shifts = simd_batch{ 0, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 47-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 27) >> 58 | SafeLoadAs(in + 8 * 28) << 6, - SafeLoadAs(in + 8 * 28) >> 41 | SafeLoadAs(in + 8 * 29) << 23, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 47-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 29) >> 24 | SafeLoadAs(in + 8 * 30) << 40, - SafeLoadAs(in + 8 * 30), - }; - shifts = simd_batch{ 0, 7 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 2; - - // extract 47-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 30) >> 54 | SafeLoadAs(in + 8 * 31) << 10, - SafeLoadAs(in + 8 * 31) >> 37 | SafeLoadAs(in + 8 * 32) << 27, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 47-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 32) >> 20 | SafeLoadAs(in + 8 * 33) << 44, - SafeLoadAs(in + 8 * 33), - }; - shifts = simd_batch{ 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 47-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 33) >> 50 | SafeLoadAs(in + 8 * 34) << 14, - SafeLoadAs(in + 8 * 34) >> 33 | SafeLoadAs(in + 8 * 35) << 31, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 47-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 35), - SafeLoadAs(in + 8 * 35) >> 63 | SafeLoadAs(in + 8 * 36) << 1, - }; - shifts = simd_batch{ 16, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 47-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 36) >> 46 | SafeLoadAs(in + 8 * 37) << 18, - SafeLoadAs(in + 8 * 37) >> 29 | SafeLoadAs(in + 8 * 38) << 35, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 47-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 38), - SafeLoadAs(in + 8 * 38) >> 59 | SafeLoadAs(in + 8 * 39) << 5, - }; - shifts = simd_batch{ 12, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 47-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 39) >> 42 | SafeLoadAs(in + 8 * 40) << 22, - SafeLoadAs(in + 8 * 40) >> 25 | SafeLoadAs(in + 8 * 41) << 39, - }; - shifts = 
simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 47-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 41), - SafeLoadAs(in + 8 * 41) >> 55 | SafeLoadAs(in + 8 * 42) << 9, - }; - shifts = simd_batch{ 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 47-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 42) >> 38 | SafeLoadAs(in + 8 * 43) << 26, - SafeLoadAs(in + 8 * 43) >> 21 | SafeLoadAs(in + 8 * 44) << 43, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 47-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 44), - SafeLoadAs(in + 8 * 44) >> 51 | SafeLoadAs(in + 8 * 45) << 13, - }; - shifts = simd_batch{ 4, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 47-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 45) >> 34 | SafeLoadAs(in + 8 * 46) << 30, - SafeLoadAs(in + 8 * 46), - }; - shifts = simd_batch{ 0, 17 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 47 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0xffffffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 48-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 48 | SafeLoadAs(in + 8 * 1) << 16, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 48-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 1) >> 32 | SafeLoadAs(in + 8 * 2) << 32, - 
SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 48-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3) >> 48 | SafeLoadAs(in + 8 * 4) << 16, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 48-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 4) >> 32 | SafeLoadAs(in + 8 * 5) << 32, - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 48-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6) >> 48 | SafeLoadAs(in + 8 * 7) << 16, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 48-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 7) >> 32 | SafeLoadAs(in + 8 * 8) << 32, - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 48-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9) >> 48 | SafeLoadAs(in + 8 * 10) << 16, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 48-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 10) >> 32 | SafeLoadAs(in + 8 * 11) << 32, - SafeLoadAs(in + 8 * 11), - }; - shifts = simd_batch{ 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 48-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12) >> 48 | SafeLoadAs(in + 8 * 13) << 16, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 2; - - // extract 48-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 13) >> 32 | SafeLoadAs(in + 8 * 14) << 32, - SafeLoadAs(in + 8 * 14), - }; - shifts = simd_batch{ 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 48-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15) >> 48 | SafeLoadAs(in + 8 * 16) << 16, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 48-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 16) >> 32 | SafeLoadAs(in + 8 * 17) << 32, - SafeLoadAs(in + 8 * 17), - }; - shifts = simd_batch{ 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 48-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 18), - SafeLoadAs(in + 8 * 18) >> 48 | SafeLoadAs(in + 8 * 19) << 16, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 48-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 19) >> 32 | SafeLoadAs(in + 8 * 20) << 32, - SafeLoadAs(in + 8 * 20), - }; - shifts = simd_batch{ 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 48-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 21), - SafeLoadAs(in + 8 * 21) >> 48 | SafeLoadAs(in + 8 * 22) << 16, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 48-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 22) >> 32 | SafeLoadAs(in + 8 * 23) << 32, - SafeLoadAs(in + 8 * 23), - }; - shifts = simd_batch{ 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 48-bit bundles 32 to 33 - words = 
simd_batch{ - SafeLoadAs(in + 8 * 24), - SafeLoadAs(in + 8 * 24) >> 48 | SafeLoadAs(in + 8 * 25) << 16, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 48-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 25) >> 32 | SafeLoadAs(in + 8 * 26) << 32, - SafeLoadAs(in + 8 * 26), - }; - shifts = simd_batch{ 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 48-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 27), - SafeLoadAs(in + 8 * 27) >> 48 | SafeLoadAs(in + 8 * 28) << 16, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 48-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 28) >> 32 | SafeLoadAs(in + 8 * 29) << 32, - SafeLoadAs(in + 8 * 29), - }; - shifts = simd_batch{ 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 48-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 30), - SafeLoadAs(in + 8 * 30) >> 48 | SafeLoadAs(in + 8 * 31) << 16, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 48-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 31) >> 32 | SafeLoadAs(in + 8 * 32) << 32, - SafeLoadAs(in + 8 * 32), - }; - shifts = simd_batch{ 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 48-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 33), - SafeLoadAs(in + 8 * 33) >> 48 | SafeLoadAs(in + 8 * 34) << 16, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 48-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 34) >> 32 | SafeLoadAs(in + 8 * 35) << 32, - SafeLoadAs(in 
+ 8 * 35), - }; - shifts = simd_batch{ 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 48-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 36), - SafeLoadAs(in + 8 * 36) >> 48 | SafeLoadAs(in + 8 * 37) << 16, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 48-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 37) >> 32 | SafeLoadAs(in + 8 * 38) << 32, - SafeLoadAs(in + 8 * 38), - }; - shifts = simd_batch{ 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 48-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 39), - SafeLoadAs(in + 8 * 39) >> 48 | SafeLoadAs(in + 8 * 40) << 16, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 48-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 40) >> 32 | SafeLoadAs(in + 8 * 41) << 32, - SafeLoadAs(in + 8 * 41), - }; - shifts = simd_batch{ 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 48-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 42), - SafeLoadAs(in + 8 * 42) >> 48 | SafeLoadAs(in + 8 * 43) << 16, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 48-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 43) >> 32 | SafeLoadAs(in + 8 * 44) << 32, - SafeLoadAs(in + 8 * 44), - }; - shifts = simd_batch{ 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 48-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 45), - SafeLoadAs(in + 8 * 45) >> 48 | SafeLoadAs(in + 8 * 46) << 16, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 2; - - // extract 48-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 46) >> 32 | SafeLoadAs(in + 8 * 47) << 32, - SafeLoadAs(in + 8 * 47), - }; - shifts = simd_batch{ 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 48 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x1ffffffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 49-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 49 | SafeLoadAs(in + 8 * 1) << 15, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 49-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 1) >> 34 | SafeLoadAs(in + 8 * 2) << 30, - SafeLoadAs(in + 8 * 2) >> 19 | SafeLoadAs(in + 8 * 3) << 45, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 49-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3) >> 53 | SafeLoadAs(in + 8 * 4) << 11, - }; - shifts = simd_batch{ 4, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 49-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 4) >> 38 | SafeLoadAs(in + 8 * 5) << 26, - SafeLoadAs(in + 8 * 5) >> 23 | SafeLoadAs(in + 8 * 6) << 41, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 49-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6) >> 57 | SafeLoadAs(in + 8 * 7) << 7, - }; - shifts = 
simd_batch{ 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 49-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 7) >> 42 | SafeLoadAs(in + 8 * 8) << 22, - SafeLoadAs(in + 8 * 8) >> 27 | SafeLoadAs(in + 8 * 9) << 37, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 49-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9) >> 61 | SafeLoadAs(in + 8 * 10) << 3, - }; - shifts = simd_batch{ 12, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 49-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 10) >> 46 | SafeLoadAs(in + 8 * 11) << 18, - SafeLoadAs(in + 8 * 11) >> 31 | SafeLoadAs(in + 8 * 12) << 33, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 49-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 12) >> 16 | SafeLoadAs(in + 8 * 13) << 48, - SafeLoadAs(in + 8 * 13), - }; - shifts = simd_batch{ 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 49-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 13) >> 50 | SafeLoadAs(in + 8 * 14) << 14, - SafeLoadAs(in + 8 * 14) >> 35 | SafeLoadAs(in + 8 * 15) << 29, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 49-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 15) >> 20 | SafeLoadAs(in + 8 * 16) << 44, - SafeLoadAs(in + 8 * 16), - }; - shifts = simd_batch{ 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 49-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 16) >> 54 | SafeLoadAs(in + 8 * 17) << 10, - SafeLoadAs(in + 8 * 17) >> 
39 | SafeLoadAs(in + 8 * 18) << 25, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 49-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 18) >> 24 | SafeLoadAs(in + 8 * 19) << 40, - SafeLoadAs(in + 8 * 19), - }; - shifts = simd_batch{ 0, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 49-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 19) >> 58 | SafeLoadAs(in + 8 * 20) << 6, - SafeLoadAs(in + 8 * 20) >> 43 | SafeLoadAs(in + 8 * 21) << 21, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 49-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 21) >> 28 | SafeLoadAs(in + 8 * 22) << 36, - SafeLoadAs(in + 8 * 22), - }; - shifts = simd_batch{ 0, 13 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 49-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 22) >> 62 | SafeLoadAs(in + 8 * 23) << 2, - SafeLoadAs(in + 8 * 23) >> 47 | SafeLoadAs(in + 8 * 24) << 17, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 49-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 24) >> 32 | SafeLoadAs(in + 8 * 25) << 32, - SafeLoadAs(in + 8 * 25) >> 17 | SafeLoadAs(in + 8 * 26) << 47, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 49-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 26), - SafeLoadAs(in + 8 * 26) >> 51 | SafeLoadAs(in + 8 * 27) << 13, - }; - shifts = simd_batch{ 2, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 49-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 27) >> 36 | 
SafeLoadAs(in + 8 * 28) << 28, - SafeLoadAs(in + 8 * 28) >> 21 | SafeLoadAs(in + 8 * 29) << 43, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 49-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 29), - SafeLoadAs(in + 8 * 29) >> 55 | SafeLoadAs(in + 8 * 30) << 9, - }; - shifts = simd_batch{ 6, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 49-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 30) >> 40 | SafeLoadAs(in + 8 * 31) << 24, - SafeLoadAs(in + 8 * 31) >> 25 | SafeLoadAs(in + 8 * 32) << 39, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 49-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 32), - SafeLoadAs(in + 8 * 32) >> 59 | SafeLoadAs(in + 8 * 33) << 5, - }; - shifts = simd_batch{ 10, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 49-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 33) >> 44 | SafeLoadAs(in + 8 * 34) << 20, - SafeLoadAs(in + 8 * 34) >> 29 | SafeLoadAs(in + 8 * 35) << 35, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 49-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 35), - SafeLoadAs(in + 8 * 35) >> 63 | SafeLoadAs(in + 8 * 36) << 1, - }; - shifts = simd_batch{ 14, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 49-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 36) >> 48 | SafeLoadAs(in + 8 * 37) << 16, - SafeLoadAs(in + 8 * 37) >> 33 | SafeLoadAs(in + 8 * 38) << 31, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 49-bit bundles 50 to 51 
- words = simd_batch{ - SafeLoadAs(in + 8 * 38) >> 18 | SafeLoadAs(in + 8 * 39) << 46, - SafeLoadAs(in + 8 * 39), - }; - shifts = simd_batch{ 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 49-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 39) >> 52 | SafeLoadAs(in + 8 * 40) << 12, - SafeLoadAs(in + 8 * 40) >> 37 | SafeLoadAs(in + 8 * 41) << 27, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 49-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 41) >> 22 | SafeLoadAs(in + 8 * 42) << 42, - SafeLoadAs(in + 8 * 42), - }; - shifts = simd_batch{ 0, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 49-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 42) >> 56 | SafeLoadAs(in + 8 * 43) << 8, - SafeLoadAs(in + 8 * 43) >> 41 | SafeLoadAs(in + 8 * 44) << 23, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 49-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 44) >> 26 | SafeLoadAs(in + 8 * 45) << 38, - SafeLoadAs(in + 8 * 45), - }; - shifts = simd_batch{ 0, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 49-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 45) >> 60 | SafeLoadAs(in + 8 * 46) << 4, - SafeLoadAs(in + 8 * 46) >> 45 | SafeLoadAs(in + 8 * 47) << 19, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 49-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 47) >> 30 | SafeLoadAs(in + 8 * 48) << 34, - SafeLoadAs(in + 8 * 48), - }; - shifts = simd_batch{ 0, 15 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 49 * 8; - 
return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x3ffffffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 50-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 50 | SafeLoadAs(in + 8 * 1) << 14, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 50-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 1) >> 36 | SafeLoadAs(in + 8 * 2) << 28, - SafeLoadAs(in + 8 * 2) >> 22 | SafeLoadAs(in + 8 * 3) << 42, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 50-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3) >> 58 | SafeLoadAs(in + 8 * 4) << 6, - }; - shifts = simd_batch{ 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 50-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 4) >> 44 | SafeLoadAs(in + 8 * 5) << 20, - SafeLoadAs(in + 8 * 5) >> 30 | SafeLoadAs(in + 8 * 6) << 34, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 50-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 6) >> 16 | SafeLoadAs(in + 8 * 7) << 48, - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 50-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 7) >> 52 | SafeLoadAs(in + 8 * 8) << 12, - SafeLoadAs(in + 8 * 8) >> 38 | SafeLoadAs(in + 8 * 9) << 26, - }; - shifts = simd_batch{ 0, 0 }; - results = 
(words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 50-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 9) >> 24 | SafeLoadAs(in + 8 * 10) << 40, - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 0, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 50-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 10) >> 60 | SafeLoadAs(in + 8 * 11) << 4, - SafeLoadAs(in + 8 * 11) >> 46 | SafeLoadAs(in + 8 * 12) << 18, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 50-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 12) >> 32 | SafeLoadAs(in + 8 * 13) << 32, - SafeLoadAs(in + 8 * 13) >> 18 | SafeLoadAs(in + 8 * 14) << 46, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 50-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14) >> 54 | SafeLoadAs(in + 8 * 15) << 10, - }; - shifts = simd_batch{ 4, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 50-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 15) >> 40 | SafeLoadAs(in + 8 * 16) << 24, - SafeLoadAs(in + 8 * 16) >> 26 | SafeLoadAs(in + 8 * 17) << 38, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 50-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 17), - SafeLoadAs(in + 8 * 17) >> 62 | SafeLoadAs(in + 8 * 18) << 2, - }; - shifts = simd_batch{ 12, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 50-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 18) >> 48 | SafeLoadAs(in + 8 * 19) << 16, - SafeLoadAs(in + 8 * 19) >> 34 | SafeLoadAs(in + 8 * 20) 
<< 30, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 50-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 20) >> 20 | SafeLoadAs(in + 8 * 21) << 44, - SafeLoadAs(in + 8 * 21), - }; - shifts = simd_batch{ 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 50-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 21) >> 56 | SafeLoadAs(in + 8 * 22) << 8, - SafeLoadAs(in + 8 * 22) >> 42 | SafeLoadAs(in + 8 * 23) << 22, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 50-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 23) >> 28 | SafeLoadAs(in + 8 * 24) << 36, - SafeLoadAs(in + 8 * 24), - }; - shifts = simd_batch{ 0, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 50-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 25), - SafeLoadAs(in + 8 * 25) >> 50 | SafeLoadAs(in + 8 * 26) << 14, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 50-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 26) >> 36 | SafeLoadAs(in + 8 * 27) << 28, - SafeLoadAs(in + 8 * 27) >> 22 | SafeLoadAs(in + 8 * 28) << 42, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 50-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 28), - SafeLoadAs(in + 8 * 28) >> 58 | SafeLoadAs(in + 8 * 29) << 6, - }; - shifts = simd_batch{ 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 50-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 29) >> 44 | SafeLoadAs(in + 8 * 30) << 20, - SafeLoadAs(in + 8 * 30) >> 30 | 
SafeLoadAs(in + 8 * 31) << 34, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 50-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 31) >> 16 | SafeLoadAs(in + 8 * 32) << 48, - SafeLoadAs(in + 8 * 32), - }; - shifts = simd_batch{ 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 50-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 32) >> 52 | SafeLoadAs(in + 8 * 33) << 12, - SafeLoadAs(in + 8 * 33) >> 38 | SafeLoadAs(in + 8 * 34) << 26, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 50-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 34) >> 24 | SafeLoadAs(in + 8 * 35) << 40, - SafeLoadAs(in + 8 * 35), - }; - shifts = simd_batch{ 0, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 50-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 35) >> 60 | SafeLoadAs(in + 8 * 36) << 4, - SafeLoadAs(in + 8 * 36) >> 46 | SafeLoadAs(in + 8 * 37) << 18, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 50-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 37) >> 32 | SafeLoadAs(in + 8 * 38) << 32, - SafeLoadAs(in + 8 * 38) >> 18 | SafeLoadAs(in + 8 * 39) << 46, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 50-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 39), - SafeLoadAs(in + 8 * 39) >> 54 | SafeLoadAs(in + 8 * 40) << 10, - }; - shifts = simd_batch{ 4, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 50-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 40) >> 40 | 
SafeLoadAs(in + 8 * 41) << 24, - SafeLoadAs(in + 8 * 41) >> 26 | SafeLoadAs(in + 8 * 42) << 38, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 50-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 42), - SafeLoadAs(in + 8 * 42) >> 62 | SafeLoadAs(in + 8 * 43) << 2, - }; - shifts = simd_batch{ 12, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 50-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 43) >> 48 | SafeLoadAs(in + 8 * 44) << 16, - SafeLoadAs(in + 8 * 44) >> 34 | SafeLoadAs(in + 8 * 45) << 30, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 50-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 45) >> 20 | SafeLoadAs(in + 8 * 46) << 44, - SafeLoadAs(in + 8 * 46), - }; - shifts = simd_batch{ 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 50-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 46) >> 56 | SafeLoadAs(in + 8 * 47) << 8, - SafeLoadAs(in + 8 * 47) >> 42 | SafeLoadAs(in + 8 * 48) << 22, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 50-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 48) >> 28 | SafeLoadAs(in + 8 * 49) << 36, - SafeLoadAs(in + 8 * 49), - }; - shifts = simd_batch{ 0, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 50 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x7ffffffffffff; - - simd_batch masks(kMask); - simd_batch words, 
shifts; - simd_batch results; - // extract 51-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 51 | SafeLoadAs(in + 8 * 1) << 13, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 51-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 1) >> 38 | SafeLoadAs(in + 8 * 2) << 26, - SafeLoadAs(in + 8 * 2) >> 25 | SafeLoadAs(in + 8 * 3) << 39, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 51-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3) >> 63 | SafeLoadAs(in + 8 * 4) << 1, - }; - shifts = simd_batch{ 12, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 51-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 4) >> 50 | SafeLoadAs(in + 8 * 5) << 14, - SafeLoadAs(in + 8 * 5) >> 37 | SafeLoadAs(in + 8 * 6) << 27, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 51-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 6) >> 24 | SafeLoadAs(in + 8 * 7) << 40, - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 0, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 51-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 7) >> 62 | SafeLoadAs(in + 8 * 8) << 2, - SafeLoadAs(in + 8 * 8) >> 49 | SafeLoadAs(in + 8 * 9) << 15, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 51-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 9) >> 36 | SafeLoadAs(in + 8 * 10) << 28, - SafeLoadAs(in + 8 * 10) >> 23 | SafeLoadAs(in + 8 * 11) << 41, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) 
& masks; - results.store_unaligned(out); - out += 2; - - // extract 51-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11) >> 61 | SafeLoadAs(in + 8 * 12) << 3, - }; - shifts = simd_batch{ 10, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 51-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 12) >> 48 | SafeLoadAs(in + 8 * 13) << 16, - SafeLoadAs(in + 8 * 13) >> 35 | SafeLoadAs(in + 8 * 14) << 29, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 51-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 14) >> 22 | SafeLoadAs(in + 8 * 15) << 42, - SafeLoadAs(in + 8 * 15), - }; - shifts = simd_batch{ 0, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 51-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 15) >> 60 | SafeLoadAs(in + 8 * 16) << 4, - SafeLoadAs(in + 8 * 16) >> 47 | SafeLoadAs(in + 8 * 17) << 17, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 51-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 17) >> 34 | SafeLoadAs(in + 8 * 18) << 30, - SafeLoadAs(in + 8 * 18) >> 21 | SafeLoadAs(in + 8 * 19) << 43, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 51-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 19), - SafeLoadAs(in + 8 * 19) >> 59 | SafeLoadAs(in + 8 * 20) << 5, - }; - shifts = simd_batch{ 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 51-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 20) >> 46 | SafeLoadAs(in + 8 * 21) << 18, - SafeLoadAs(in + 8 * 21) >> 33 | SafeLoadAs(in + 8 * 22) << 31, - }; - 
shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 51-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 22) >> 20 | SafeLoadAs(in + 8 * 23) << 44, - SafeLoadAs(in + 8 * 23), - }; - shifts = simd_batch{ 0, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 51-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 23) >> 58 | SafeLoadAs(in + 8 * 24) << 6, - SafeLoadAs(in + 8 * 24) >> 45 | SafeLoadAs(in + 8 * 25) << 19, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 51-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 25) >> 32 | SafeLoadAs(in + 8 * 26) << 32, - SafeLoadAs(in + 8 * 26) >> 19 | SafeLoadAs(in + 8 * 27) << 45, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 51-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 27), - SafeLoadAs(in + 8 * 27) >> 57 | SafeLoadAs(in + 8 * 28) << 7, - }; - shifts = simd_batch{ 6, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 51-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 28) >> 44 | SafeLoadAs(in + 8 * 29) << 20, - SafeLoadAs(in + 8 * 29) >> 31 | SafeLoadAs(in + 8 * 30) << 33, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 51-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 30) >> 18 | SafeLoadAs(in + 8 * 31) << 46, - SafeLoadAs(in + 8 * 31), - }; - shifts = simd_batch{ 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 51-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 31) >> 56 | SafeLoadAs(in + 8 * 32) << 8, - SafeLoadAs(in + 
8 * 32) >> 43 | SafeLoadAs(in + 8 * 33) << 21, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 51-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 33) >> 30 | SafeLoadAs(in + 8 * 34) << 34, - SafeLoadAs(in + 8 * 34) >> 17 | SafeLoadAs(in + 8 * 35) << 47, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 51-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 35), - SafeLoadAs(in + 8 * 35) >> 55 | SafeLoadAs(in + 8 * 36) << 9, - }; - shifts = simd_batch{ 4, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 51-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 36) >> 42 | SafeLoadAs(in + 8 * 37) << 22, - SafeLoadAs(in + 8 * 37) >> 29 | SafeLoadAs(in + 8 * 38) << 35, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 51-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 38) >> 16 | SafeLoadAs(in + 8 * 39) << 48, - SafeLoadAs(in + 8 * 39), - }; - shifts = simd_batch{ 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 51-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 39) >> 54 | SafeLoadAs(in + 8 * 40) << 10, - SafeLoadAs(in + 8 * 40) >> 41 | SafeLoadAs(in + 8 * 41) << 23, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 51-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 41) >> 28 | SafeLoadAs(in + 8 * 42) << 36, - SafeLoadAs(in + 8 * 42) >> 15 | SafeLoadAs(in + 8 * 43) << 49, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 51-bit bundles 54 to 55 - words = 
simd_batch{ - SafeLoadAs(in + 8 * 43), - SafeLoadAs(in + 8 * 43) >> 53 | SafeLoadAs(in + 8 * 44) << 11, - }; - shifts = simd_batch{ 2, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 51-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 44) >> 40 | SafeLoadAs(in + 8 * 45) << 24, - SafeLoadAs(in + 8 * 45) >> 27 | SafeLoadAs(in + 8 * 46) << 37, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 51-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 46) >> 14 | SafeLoadAs(in + 8 * 47) << 50, - SafeLoadAs(in + 8 * 47), - }; - shifts = simd_batch{ 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 51-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 47) >> 52 | SafeLoadAs(in + 8 * 48) << 12, - SafeLoadAs(in + 8 * 48) >> 39 | SafeLoadAs(in + 8 * 49) << 25, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 51-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 49) >> 26 | SafeLoadAs(in + 8 * 50) << 38, - SafeLoadAs(in + 8 * 50), - }; - shifts = simd_batch{ 0, 13 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 51 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0xfffffffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 52-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 52 | SafeLoadAs(in + 8 * 1) << 12, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 2; - - // extract 52-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 1) >> 40 | SafeLoadAs(in + 8 * 2) << 24, - SafeLoadAs(in + 8 * 2) >> 28 | SafeLoadAs(in + 8 * 3) << 36, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 52-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 3) >> 16 | SafeLoadAs(in + 8 * 4) << 48, - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 52-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 4) >> 56 | SafeLoadAs(in + 8 * 5) << 8, - SafeLoadAs(in + 8 * 5) >> 44 | SafeLoadAs(in + 8 * 6) << 20, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 52-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 6) >> 32 | SafeLoadAs(in + 8 * 7) << 32, - SafeLoadAs(in + 8 * 7) >> 20 | SafeLoadAs(in + 8 * 8) << 44, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 52-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8) >> 60 | SafeLoadAs(in + 8 * 9) << 4, - }; - shifts = simd_batch{ 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 52-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 9) >> 48 | SafeLoadAs(in + 8 * 10) << 16, - SafeLoadAs(in + 8 * 10) >> 36 | SafeLoadAs(in + 8 * 11) << 28, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 52-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 11) >> 24 | SafeLoadAs(in + 8 * 12) << 40, - SafeLoadAs(in + 8 * 12), - }; - shifts = simd_batch{ 0, 12 }; - results = 
(words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 52-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13) >> 52 | SafeLoadAs(in + 8 * 14) << 12, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 52-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 14) >> 40 | SafeLoadAs(in + 8 * 15) << 24, - SafeLoadAs(in + 8 * 15) >> 28 | SafeLoadAs(in + 8 * 16) << 36, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 52-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 16) >> 16 | SafeLoadAs(in + 8 * 17) << 48, - SafeLoadAs(in + 8 * 17), - }; - shifts = simd_batch{ 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 52-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 17) >> 56 | SafeLoadAs(in + 8 * 18) << 8, - SafeLoadAs(in + 8 * 18) >> 44 | SafeLoadAs(in + 8 * 19) << 20, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 52-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 19) >> 32 | SafeLoadAs(in + 8 * 20) << 32, - SafeLoadAs(in + 8 * 20) >> 20 | SafeLoadAs(in + 8 * 21) << 44, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 52-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 21), - SafeLoadAs(in + 8 * 21) >> 60 | SafeLoadAs(in + 8 * 22) << 4, - }; - shifts = simd_batch{ 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 52-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 22) >> 48 | SafeLoadAs(in + 8 * 23) << 16, - SafeLoadAs(in + 8 * 23) >> 36 | SafeLoadAs(in + 8 * 24) 
<< 28, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 52-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 24) >> 24 | SafeLoadAs(in + 8 * 25) << 40, - SafeLoadAs(in + 8 * 25), - }; - shifts = simd_batch{ 0, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 52-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 26), - SafeLoadAs(in + 8 * 26) >> 52 | SafeLoadAs(in + 8 * 27) << 12, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 52-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 27) >> 40 | SafeLoadAs(in + 8 * 28) << 24, - SafeLoadAs(in + 8 * 28) >> 28 | SafeLoadAs(in + 8 * 29) << 36, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 52-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 29) >> 16 | SafeLoadAs(in + 8 * 30) << 48, - SafeLoadAs(in + 8 * 30), - }; - shifts = simd_batch{ 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 52-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 30) >> 56 | SafeLoadAs(in + 8 * 31) << 8, - SafeLoadAs(in + 8 * 31) >> 44 | SafeLoadAs(in + 8 * 32) << 20, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 52-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 32) >> 32 | SafeLoadAs(in + 8 * 33) << 32, - SafeLoadAs(in + 8 * 33) >> 20 | SafeLoadAs(in + 8 * 34) << 44, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 52-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 34), - SafeLoadAs(in + 8 * 34) >> 60 | 
SafeLoadAs(in + 8 * 35) << 4, - }; - shifts = simd_batch{ 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 52-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 35) >> 48 | SafeLoadAs(in + 8 * 36) << 16, - SafeLoadAs(in + 8 * 36) >> 36 | SafeLoadAs(in + 8 * 37) << 28, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 52-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 37) >> 24 | SafeLoadAs(in + 8 * 38) << 40, - SafeLoadAs(in + 8 * 38), - }; - shifts = simd_batch{ 0, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 52-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 39), - SafeLoadAs(in + 8 * 39) >> 52 | SafeLoadAs(in + 8 * 40) << 12, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 52-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 40) >> 40 | SafeLoadAs(in + 8 * 41) << 24, - SafeLoadAs(in + 8 * 41) >> 28 | SafeLoadAs(in + 8 * 42) << 36, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 52-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 42) >> 16 | SafeLoadAs(in + 8 * 43) << 48, - SafeLoadAs(in + 8 * 43), - }; - shifts = simd_batch{ 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 52-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 43) >> 56 | SafeLoadAs(in + 8 * 44) << 8, - SafeLoadAs(in + 8 * 44) >> 44 | SafeLoadAs(in + 8 * 45) << 20, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 52-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 45) >> 32 | 
SafeLoadAs(in + 8 * 46) << 32, - SafeLoadAs(in + 8 * 46) >> 20 | SafeLoadAs(in + 8 * 47) << 44, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 52-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 47), - SafeLoadAs(in + 8 * 47) >> 60 | SafeLoadAs(in + 8 * 48) << 4, - }; - shifts = simd_batch{ 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 52-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 48) >> 48 | SafeLoadAs(in + 8 * 49) << 16, - SafeLoadAs(in + 8 * 49) >> 36 | SafeLoadAs(in + 8 * 50) << 28, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 52-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 50) >> 24 | SafeLoadAs(in + 8 * 51) << 40, - SafeLoadAs(in + 8 * 51), - }; - shifts = simd_batch{ 0, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 52 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x1fffffffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 53-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 53 | SafeLoadAs(in + 8 * 1) << 11, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 53-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 1) >> 42 | SafeLoadAs(in + 8 * 2) << 22, - SafeLoadAs(in + 8 * 2) >> 31 | SafeLoadAs(in + 8 * 3) << 33, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 2; - - // extract 53-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 3) >> 20 | SafeLoadAs(in + 8 * 4) << 44, - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 0, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 53-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 4) >> 62 | SafeLoadAs(in + 8 * 5) << 2, - SafeLoadAs(in + 8 * 5) >> 51 | SafeLoadAs(in + 8 * 6) << 13, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 53-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 6) >> 40 | SafeLoadAs(in + 8 * 7) << 24, - SafeLoadAs(in + 8 * 7) >> 29 | SafeLoadAs(in + 8 * 8) << 35, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 53-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 8) >> 18 | SafeLoadAs(in + 8 * 9) << 46, - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 0, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 53-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 9) >> 60 | SafeLoadAs(in + 8 * 10) << 4, - SafeLoadAs(in + 8 * 10) >> 49 | SafeLoadAs(in + 8 * 11) << 15, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 53-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 11) >> 38 | SafeLoadAs(in + 8 * 12) << 26, - SafeLoadAs(in + 8 * 12) >> 27 | SafeLoadAs(in + 8 * 13) << 37, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 53-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 13) >> 16 | SafeLoadAs(in + 8 * 14) << 48, - SafeLoadAs(in + 8 * 14), - }; - shifts = simd_batch{ 0, 5 }; - 
results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 53-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 14) >> 58 | SafeLoadAs(in + 8 * 15) << 6, - SafeLoadAs(in + 8 * 15) >> 47 | SafeLoadAs(in + 8 * 16) << 17, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 53-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 16) >> 36 | SafeLoadAs(in + 8 * 17) << 28, - SafeLoadAs(in + 8 * 17) >> 25 | SafeLoadAs(in + 8 * 18) << 39, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 53-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 18) >> 14 | SafeLoadAs(in + 8 * 19) << 50, - SafeLoadAs(in + 8 * 19), - }; - shifts = simd_batch{ 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 53-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 19) >> 56 | SafeLoadAs(in + 8 * 20) << 8, - SafeLoadAs(in + 8 * 20) >> 45 | SafeLoadAs(in + 8 * 21) << 19, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 53-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 21) >> 34 | SafeLoadAs(in + 8 * 22) << 30, - SafeLoadAs(in + 8 * 22) >> 23 | SafeLoadAs(in + 8 * 23) << 41, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 53-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 23) >> 12 | SafeLoadAs(in + 8 * 24) << 52, - SafeLoadAs(in + 8 * 24), - }; - shifts = simd_batch{ 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 53-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 24) >> 54 | SafeLoadAs(in + 8 * 25) << 10, - 
SafeLoadAs(in + 8 * 25) >> 43 | SafeLoadAs(in + 8 * 26) << 21, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 53-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 26) >> 32 | SafeLoadAs(in + 8 * 27) << 32, - SafeLoadAs(in + 8 * 27) >> 21 | SafeLoadAs(in + 8 * 28) << 43, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 53-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 28), - SafeLoadAs(in + 8 * 28) >> 63 | SafeLoadAs(in + 8 * 29) << 1, - }; - shifts = simd_batch{ 10, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 53-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 29) >> 52 | SafeLoadAs(in + 8 * 30) << 12, - SafeLoadAs(in + 8 * 30) >> 41 | SafeLoadAs(in + 8 * 31) << 23, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 53-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 31) >> 30 | SafeLoadAs(in + 8 * 32) << 34, - SafeLoadAs(in + 8 * 32) >> 19 | SafeLoadAs(in + 8 * 33) << 45, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 53-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 33), - SafeLoadAs(in + 8 * 33) >> 61 | SafeLoadAs(in + 8 * 34) << 3, - }; - shifts = simd_batch{ 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 53-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 34) >> 50 | SafeLoadAs(in + 8 * 35) << 14, - SafeLoadAs(in + 8 * 35) >> 39 | SafeLoadAs(in + 8 * 36) << 25, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 53-bit bundles 44 
to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 36) >> 28 | SafeLoadAs(in + 8 * 37) << 36, - SafeLoadAs(in + 8 * 37) >> 17 | SafeLoadAs(in + 8 * 38) << 47, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 53-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 38), - SafeLoadAs(in + 8 * 38) >> 59 | SafeLoadAs(in + 8 * 39) << 5, - }; - shifts = simd_batch{ 6, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 53-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 39) >> 48 | SafeLoadAs(in + 8 * 40) << 16, - SafeLoadAs(in + 8 * 40) >> 37 | SafeLoadAs(in + 8 * 41) << 27, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 53-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 41) >> 26 | SafeLoadAs(in + 8 * 42) << 38, - SafeLoadAs(in + 8 * 42) >> 15 | SafeLoadAs(in + 8 * 43) << 49, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 53-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 43), - SafeLoadAs(in + 8 * 43) >> 57 | SafeLoadAs(in + 8 * 44) << 7, - }; - shifts = simd_batch{ 4, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 53-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 44) >> 46 | SafeLoadAs(in + 8 * 45) << 18, - SafeLoadAs(in + 8 * 45) >> 35 | SafeLoadAs(in + 8 * 46) << 29, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 53-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 46) >> 24 | SafeLoadAs(in + 8 * 47) << 40, - SafeLoadAs(in + 8 * 47) >> 13 | SafeLoadAs(in + 8 * 48) << 51, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> 
shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 53-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 48), - SafeLoadAs(in + 8 * 48) >> 55 | SafeLoadAs(in + 8 * 49) << 9, - }; - shifts = simd_batch{ 2, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 53-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 49) >> 44 | SafeLoadAs(in + 8 * 50) << 20, - SafeLoadAs(in + 8 * 50) >> 33 | SafeLoadAs(in + 8 * 51) << 31, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 53-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 51) >> 22 | SafeLoadAs(in + 8 * 52) << 42, - SafeLoadAs(in + 8 * 52), - }; - shifts = simd_batch{ 0, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 53 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x3fffffffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 54-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 54 | SafeLoadAs(in + 8 * 1) << 10, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 54-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 1) >> 44 | SafeLoadAs(in + 8 * 2) << 20, - SafeLoadAs(in + 8 * 2) >> 34 | SafeLoadAs(in + 8 * 3) << 30, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 54-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 3) >> 24 | SafeLoadAs(in + 8 * 4) << 40, - SafeLoadAs(in 
+ 8 * 4) >> 14 | SafeLoadAs(in + 8 * 5) << 50, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 54-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5) >> 58 | SafeLoadAs(in + 8 * 6) << 6, - }; - shifts = simd_batch{ 4, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 54-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 6) >> 48 | SafeLoadAs(in + 8 * 7) << 16, - SafeLoadAs(in + 8 * 7) >> 38 | SafeLoadAs(in + 8 * 8) << 26, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 54-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 8) >> 28 | SafeLoadAs(in + 8 * 9) << 36, - SafeLoadAs(in + 8 * 9) >> 18 | SafeLoadAs(in + 8 * 10) << 46, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 54-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10) >> 62 | SafeLoadAs(in + 8 * 11) << 2, - }; - shifts = simd_batch{ 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 54-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 11) >> 52 | SafeLoadAs(in + 8 * 12) << 12, - SafeLoadAs(in + 8 * 12) >> 42 | SafeLoadAs(in + 8 * 13) << 22, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 54-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 13) >> 32 | SafeLoadAs(in + 8 * 14) << 32, - SafeLoadAs(in + 8 * 14) >> 22 | SafeLoadAs(in + 8 * 15) << 42, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 54-bit bundles 18 to 19 - words = simd_batch{ - 
SafeLoadAs(in + 8 * 15) >> 12 | SafeLoadAs(in + 8 * 16) << 52, - SafeLoadAs(in + 8 * 16), - }; - shifts = simd_batch{ 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 54-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 16) >> 56 | SafeLoadAs(in + 8 * 17) << 8, - SafeLoadAs(in + 8 * 17) >> 46 | SafeLoadAs(in + 8 * 18) << 18, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 54-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 18) >> 36 | SafeLoadAs(in + 8 * 19) << 28, - SafeLoadAs(in + 8 * 19) >> 26 | SafeLoadAs(in + 8 * 20) << 38, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 54-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 20) >> 16 | SafeLoadAs(in + 8 * 21) << 48, - SafeLoadAs(in + 8 * 21), - }; - shifts = simd_batch{ 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 54-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 21) >> 60 | SafeLoadAs(in + 8 * 22) << 4, - SafeLoadAs(in + 8 * 22) >> 50 | SafeLoadAs(in + 8 * 23) << 14, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 54-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 23) >> 40 | SafeLoadAs(in + 8 * 24) << 24, - SafeLoadAs(in + 8 * 24) >> 30 | SafeLoadAs(in + 8 * 25) << 34, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 54-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 25) >> 20 | SafeLoadAs(in + 8 * 26) << 44, - SafeLoadAs(in + 8 * 26), - }; - shifts = simd_batch{ 0, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // 
extract 54-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 27), - SafeLoadAs(in + 8 * 27) >> 54 | SafeLoadAs(in + 8 * 28) << 10, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 54-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 28) >> 44 | SafeLoadAs(in + 8 * 29) << 20, - SafeLoadAs(in + 8 * 29) >> 34 | SafeLoadAs(in + 8 * 30) << 30, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 54-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 30) >> 24 | SafeLoadAs(in + 8 * 31) << 40, - SafeLoadAs(in + 8 * 31) >> 14 | SafeLoadAs(in + 8 * 32) << 50, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 54-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 32), - SafeLoadAs(in + 8 * 32) >> 58 | SafeLoadAs(in + 8 * 33) << 6, - }; - shifts = simd_batch{ 4, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 54-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 33) >> 48 | SafeLoadAs(in + 8 * 34) << 16, - SafeLoadAs(in + 8 * 34) >> 38 | SafeLoadAs(in + 8 * 35) << 26, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 54-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 35) >> 28 | SafeLoadAs(in + 8 * 36) << 36, - SafeLoadAs(in + 8 * 36) >> 18 | SafeLoadAs(in + 8 * 37) << 46, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 54-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 37), - SafeLoadAs(in + 8 * 37) >> 62 | SafeLoadAs(in + 8 * 38) << 2, - }; - shifts = simd_batch{ 8, 0 }; - results = (words >> shifts) & 
masks; - results.store_unaligned(out); - out += 2; - - // extract 54-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 38) >> 52 | SafeLoadAs(in + 8 * 39) << 12, - SafeLoadAs(in + 8 * 39) >> 42 | SafeLoadAs(in + 8 * 40) << 22, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 54-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 40) >> 32 | SafeLoadAs(in + 8 * 41) << 32, - SafeLoadAs(in + 8 * 41) >> 22 | SafeLoadAs(in + 8 * 42) << 42, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 54-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 42) >> 12 | SafeLoadAs(in + 8 * 43) << 52, - SafeLoadAs(in + 8 * 43), - }; - shifts = simd_batch{ 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 54-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 43) >> 56 | SafeLoadAs(in + 8 * 44) << 8, - SafeLoadAs(in + 8 * 44) >> 46 | SafeLoadAs(in + 8 * 45) << 18, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 54-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 45) >> 36 | SafeLoadAs(in + 8 * 46) << 28, - SafeLoadAs(in + 8 * 46) >> 26 | SafeLoadAs(in + 8 * 47) << 38, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 54-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 47) >> 16 | SafeLoadAs(in + 8 * 48) << 48, - SafeLoadAs(in + 8 * 48), - }; - shifts = simd_batch{ 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 54-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 48) >> 60 | SafeLoadAs(in + 8 * 49) << 4, - SafeLoadAs(in + 8 * 49) >> 50 | 
SafeLoadAs(in + 8 * 50) << 14, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 54-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 50) >> 40 | SafeLoadAs(in + 8 * 51) << 24, - SafeLoadAs(in + 8 * 51) >> 30 | SafeLoadAs(in + 8 * 52) << 34, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 54-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 52) >> 20 | SafeLoadAs(in + 8 * 53) << 44, - SafeLoadAs(in + 8 * 53), - }; - shifts = simd_batch{ 0, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 54 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x7fffffffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 55-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 55 | SafeLoadAs(in + 8 * 1) << 9, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 55-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 1) >> 46 | SafeLoadAs(in + 8 * 2) << 18, - SafeLoadAs(in + 8 * 2) >> 37 | SafeLoadAs(in + 8 * 3) << 27, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 55-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 3) >> 28 | SafeLoadAs(in + 8 * 4) << 36, - SafeLoadAs(in + 8 * 4) >> 19 | SafeLoadAs(in + 8 * 5) << 45, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 
55-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 5) >> 10 | SafeLoadAs(in + 8 * 6) << 54, - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 55-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 6) >> 56 | SafeLoadAs(in + 8 * 7) << 8, - SafeLoadAs(in + 8 * 7) >> 47 | SafeLoadAs(in + 8 * 8) << 17, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 55-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 8) >> 38 | SafeLoadAs(in + 8 * 9) << 26, - SafeLoadAs(in + 8 * 9) >> 29 | SafeLoadAs(in + 8 * 10) << 35, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 55-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 10) >> 20 | SafeLoadAs(in + 8 * 11) << 44, - SafeLoadAs(in + 8 * 11) >> 11 | SafeLoadAs(in + 8 * 12) << 53, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 55-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12) >> 57 | SafeLoadAs(in + 8 * 13) << 7, - }; - shifts = simd_batch{ 2, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 55-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 13) >> 48 | SafeLoadAs(in + 8 * 14) << 16, - SafeLoadAs(in + 8 * 14) >> 39 | SafeLoadAs(in + 8 * 15) << 25, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 55-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 15) >> 30 | SafeLoadAs(in + 8 * 16) << 34, - SafeLoadAs(in + 8 * 16) >> 21 | SafeLoadAs(in + 8 * 17) << 43, - }; - shifts = simd_batch{ 0, 0 }; - results = (words 
>> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 55-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 17) >> 12 | SafeLoadAs(in + 8 * 18) << 52, - SafeLoadAs(in + 8 * 18), - }; - shifts = simd_batch{ 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 55-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 18) >> 58 | SafeLoadAs(in + 8 * 19) << 6, - SafeLoadAs(in + 8 * 19) >> 49 | SafeLoadAs(in + 8 * 20) << 15, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 55-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 20) >> 40 | SafeLoadAs(in + 8 * 21) << 24, - SafeLoadAs(in + 8 * 21) >> 31 | SafeLoadAs(in + 8 * 22) << 33, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 55-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 22) >> 22 | SafeLoadAs(in + 8 * 23) << 42, - SafeLoadAs(in + 8 * 23) >> 13 | SafeLoadAs(in + 8 * 24) << 51, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 55-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 24), - SafeLoadAs(in + 8 * 24) >> 59 | SafeLoadAs(in + 8 * 25) << 5, - }; - shifts = simd_batch{ 4, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 55-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 25) >> 50 | SafeLoadAs(in + 8 * 26) << 14, - SafeLoadAs(in + 8 * 26) >> 41 | SafeLoadAs(in + 8 * 27) << 23, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 55-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 27) >> 32 | SafeLoadAs(in + 8 * 28) << 32, - SafeLoadAs(in + 8 * 28) 
>> 23 | SafeLoadAs(in + 8 * 29) << 41, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 55-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 29) >> 14 | SafeLoadAs(in + 8 * 30) << 50, - SafeLoadAs(in + 8 * 30), - }; - shifts = simd_batch{ 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 55-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 30) >> 60 | SafeLoadAs(in + 8 * 31) << 4, - SafeLoadAs(in + 8 * 31) >> 51 | SafeLoadAs(in + 8 * 32) << 13, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 55-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 32) >> 42 | SafeLoadAs(in + 8 * 33) << 22, - SafeLoadAs(in + 8 * 33) >> 33 | SafeLoadAs(in + 8 * 34) << 31, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 55-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 34) >> 24 | SafeLoadAs(in + 8 * 35) << 40, - SafeLoadAs(in + 8 * 35) >> 15 | SafeLoadAs(in + 8 * 36) << 49, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 55-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 36), - SafeLoadAs(in + 8 * 36) >> 61 | SafeLoadAs(in + 8 * 37) << 3, - }; - shifts = simd_batch{ 6, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 55-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 37) >> 52 | SafeLoadAs(in + 8 * 38) << 12, - SafeLoadAs(in + 8 * 38) >> 43 | SafeLoadAs(in + 8 * 39) << 21, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 55-bit bundles 46 to 47 - words = 
simd_batch{ - SafeLoadAs(in + 8 * 39) >> 34 | SafeLoadAs(in + 8 * 40) << 30, - SafeLoadAs(in + 8 * 40) >> 25 | SafeLoadAs(in + 8 * 41) << 39, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 55-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 41) >> 16 | SafeLoadAs(in + 8 * 42) << 48, - SafeLoadAs(in + 8 * 42), - }; - shifts = simd_batch{ 0, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 55-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 42) >> 62 | SafeLoadAs(in + 8 * 43) << 2, - SafeLoadAs(in + 8 * 43) >> 53 | SafeLoadAs(in + 8 * 44) << 11, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 55-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 44) >> 44 | SafeLoadAs(in + 8 * 45) << 20, - SafeLoadAs(in + 8 * 45) >> 35 | SafeLoadAs(in + 8 * 46) << 29, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 55-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 46) >> 26 | SafeLoadAs(in + 8 * 47) << 38, - SafeLoadAs(in + 8 * 47) >> 17 | SafeLoadAs(in + 8 * 48) << 47, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 55-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 48), - SafeLoadAs(in + 8 * 48) >> 63 | SafeLoadAs(in + 8 * 49) << 1, - }; - shifts = simd_batch{ 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 55-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 49) >> 54 | SafeLoadAs(in + 8 * 50) << 10, - SafeLoadAs(in + 8 * 50) >> 45 | SafeLoadAs(in + 8 * 51) << 19, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; 
- results.store_unaligned(out); - out += 2; - - // extract 55-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 51) >> 36 | SafeLoadAs(in + 8 * 52) << 28, - SafeLoadAs(in + 8 * 52) >> 27 | SafeLoadAs(in + 8 * 53) << 37, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 55-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 53) >> 18 | SafeLoadAs(in + 8 * 54) << 46, - SafeLoadAs(in + 8 * 54), - }; - shifts = simd_batch{ 0, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 55 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0xffffffffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 56-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 56 | SafeLoadAs(in + 8 * 1) << 8, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 56-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 1) >> 48 | SafeLoadAs(in + 8 * 2) << 16, - SafeLoadAs(in + 8 * 2) >> 40 | SafeLoadAs(in + 8 * 3) << 24, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 56-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 3) >> 32 | SafeLoadAs(in + 8 * 4) << 32, - SafeLoadAs(in + 8 * 4) >> 24 | SafeLoadAs(in + 8 * 5) << 40, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 56-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 5) >> 16 | SafeLoadAs(in + 8 * 6) << 48, - 
SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 56-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7) >> 56 | SafeLoadAs(in + 8 * 8) << 8, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 56-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 8) >> 48 | SafeLoadAs(in + 8 * 9) << 16, - SafeLoadAs(in + 8 * 9) >> 40 | SafeLoadAs(in + 8 * 10) << 24, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 56-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 10) >> 32 | SafeLoadAs(in + 8 * 11) << 32, - SafeLoadAs(in + 8 * 11) >> 24 | SafeLoadAs(in + 8 * 12) << 40, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 56-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 12) >> 16 | SafeLoadAs(in + 8 * 13) << 48, - SafeLoadAs(in + 8 * 13), - }; - shifts = simd_batch{ 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 56-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14) >> 56 | SafeLoadAs(in + 8 * 15) << 8, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 56-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 15) >> 48 | SafeLoadAs(in + 8 * 16) << 16, - SafeLoadAs(in + 8 * 16) >> 40 | SafeLoadAs(in + 8 * 17) << 24, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 56-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 17) >> 32 | SafeLoadAs(in + 8 * 18) 
<< 32, - SafeLoadAs(in + 8 * 18) >> 24 | SafeLoadAs(in + 8 * 19) << 40, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 56-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 19) >> 16 | SafeLoadAs(in + 8 * 20) << 48, - SafeLoadAs(in + 8 * 20), - }; - shifts = simd_batch{ 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 56-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 21), - SafeLoadAs(in + 8 * 21) >> 56 | SafeLoadAs(in + 8 * 22) << 8, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 56-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 22) >> 48 | SafeLoadAs(in + 8 * 23) << 16, - SafeLoadAs(in + 8 * 23) >> 40 | SafeLoadAs(in + 8 * 24) << 24, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 56-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 24) >> 32 | SafeLoadAs(in + 8 * 25) << 32, - SafeLoadAs(in + 8 * 25) >> 24 | SafeLoadAs(in + 8 * 26) << 40, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 56-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 26) >> 16 | SafeLoadAs(in + 8 * 27) << 48, - SafeLoadAs(in + 8 * 27), - }; - shifts = simd_batch{ 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 56-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 28), - SafeLoadAs(in + 8 * 28) >> 56 | SafeLoadAs(in + 8 * 29) << 8, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 56-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 29) >> 48 | 
SafeLoadAs(in + 8 * 30) << 16, - SafeLoadAs(in + 8 * 30) >> 40 | SafeLoadAs(in + 8 * 31) << 24, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 56-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 31) >> 32 | SafeLoadAs(in + 8 * 32) << 32, - SafeLoadAs(in + 8 * 32) >> 24 | SafeLoadAs(in + 8 * 33) << 40, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 56-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 33) >> 16 | SafeLoadAs(in + 8 * 34) << 48, - SafeLoadAs(in + 8 * 34), - }; - shifts = simd_batch{ 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 56-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 35), - SafeLoadAs(in + 8 * 35) >> 56 | SafeLoadAs(in + 8 * 36) << 8, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 56-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 36) >> 48 | SafeLoadAs(in + 8 * 37) << 16, - SafeLoadAs(in + 8 * 37) >> 40 | SafeLoadAs(in + 8 * 38) << 24, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 56-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 38) >> 32 | SafeLoadAs(in + 8 * 39) << 32, - SafeLoadAs(in + 8 * 39) >> 24 | SafeLoadAs(in + 8 * 40) << 40, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 56-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 40) >> 16 | SafeLoadAs(in + 8 * 41) << 48, - SafeLoadAs(in + 8 * 41), - }; - shifts = simd_batch{ 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 56-bit bundles 48 to 49 
- words = simd_batch{ - SafeLoadAs(in + 8 * 42), - SafeLoadAs(in + 8 * 42) >> 56 | SafeLoadAs(in + 8 * 43) << 8, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 56-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 43) >> 48 | SafeLoadAs(in + 8 * 44) << 16, - SafeLoadAs(in + 8 * 44) >> 40 | SafeLoadAs(in + 8 * 45) << 24, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 56-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 45) >> 32 | SafeLoadAs(in + 8 * 46) << 32, - SafeLoadAs(in + 8 * 46) >> 24 | SafeLoadAs(in + 8 * 47) << 40, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 56-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 47) >> 16 | SafeLoadAs(in + 8 * 48) << 48, - SafeLoadAs(in + 8 * 48), - }; - shifts = simd_batch{ 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 56-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 49), - SafeLoadAs(in + 8 * 49) >> 56 | SafeLoadAs(in + 8 * 50) << 8, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 56-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 50) >> 48 | SafeLoadAs(in + 8 * 51) << 16, - SafeLoadAs(in + 8 * 51) >> 40 | SafeLoadAs(in + 8 * 52) << 24, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 56-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 52) >> 32 | SafeLoadAs(in + 8 * 53) << 32, - SafeLoadAs(in + 8 * 53) >> 24 | SafeLoadAs(in + 8 * 54) << 40, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 2; - - // extract 56-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 54) >> 16 | SafeLoadAs(in + 8 * 55) << 48, - SafeLoadAs(in + 8 * 55), - }; - shifts = simd_batch{ 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 56 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x1ffffffffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 57-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 57 | SafeLoadAs(in + 8 * 1) << 7, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 57-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 1) >> 50 | SafeLoadAs(in + 8 * 2) << 14, - SafeLoadAs(in + 8 * 2) >> 43 | SafeLoadAs(in + 8 * 3) << 21, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 57-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 3) >> 36 | SafeLoadAs(in + 8 * 4) << 28, - SafeLoadAs(in + 8 * 4) >> 29 | SafeLoadAs(in + 8 * 5) << 35, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 57-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 5) >> 22 | SafeLoadAs(in + 8 * 6) << 42, - SafeLoadAs(in + 8 * 6) >> 15 | SafeLoadAs(in + 8 * 7) << 49, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 57-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 7) >> 8 | SafeLoadAs(in + 8 * 8) << 56, - 
SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 57-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 8) >> 58 | SafeLoadAs(in + 8 * 9) << 6, - SafeLoadAs(in + 8 * 9) >> 51 | SafeLoadAs(in + 8 * 10) << 13, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 57-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 10) >> 44 | SafeLoadAs(in + 8 * 11) << 20, - SafeLoadAs(in + 8 * 11) >> 37 | SafeLoadAs(in + 8 * 12) << 27, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 57-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 12) >> 30 | SafeLoadAs(in + 8 * 13) << 34, - SafeLoadAs(in + 8 * 13) >> 23 | SafeLoadAs(in + 8 * 14) << 41, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 57-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 14) >> 16 | SafeLoadAs(in + 8 * 15) << 48, - SafeLoadAs(in + 8 * 15) >> 9 | SafeLoadAs(in + 8 * 16) << 55, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 57-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 16) >> 59 | SafeLoadAs(in + 8 * 17) << 5, - }; - shifts = simd_batch{ 2, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 57-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 17) >> 52 | SafeLoadAs(in + 8 * 18) << 12, - SafeLoadAs(in + 8 * 18) >> 45 | SafeLoadAs(in + 8 * 19) << 19, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 57-bit bundles 22 to 23 - 
words = simd_batch{ - SafeLoadAs(in + 8 * 19) >> 38 | SafeLoadAs(in + 8 * 20) << 26, - SafeLoadAs(in + 8 * 20) >> 31 | SafeLoadAs(in + 8 * 21) << 33, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 57-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 21) >> 24 | SafeLoadAs(in + 8 * 22) << 40, - SafeLoadAs(in + 8 * 22) >> 17 | SafeLoadAs(in + 8 * 23) << 47, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 57-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 23) >> 10 | SafeLoadAs(in + 8 * 24) << 54, - SafeLoadAs(in + 8 * 24), - }; - shifts = simd_batch{ 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 57-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 24) >> 60 | SafeLoadAs(in + 8 * 25) << 4, - SafeLoadAs(in + 8 * 25) >> 53 | SafeLoadAs(in + 8 * 26) << 11, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 57-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 26) >> 46 | SafeLoadAs(in + 8 * 27) << 18, - SafeLoadAs(in + 8 * 27) >> 39 | SafeLoadAs(in + 8 * 28) << 25, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 57-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 28) >> 32 | SafeLoadAs(in + 8 * 29) << 32, - SafeLoadAs(in + 8 * 29) >> 25 | SafeLoadAs(in + 8 * 30) << 39, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 57-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 30) >> 18 | SafeLoadAs(in + 8 * 31) << 46, - SafeLoadAs(in + 8 * 31) >> 11 | SafeLoadAs(in + 8 * 32) << 53, - }; - shifts = simd_batch{ 
0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 57-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 32), - SafeLoadAs(in + 8 * 32) >> 61 | SafeLoadAs(in + 8 * 33) << 3, - }; - shifts = simd_batch{ 4, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 57-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 33) >> 54 | SafeLoadAs(in + 8 * 34) << 10, - SafeLoadAs(in + 8 * 34) >> 47 | SafeLoadAs(in + 8 * 35) << 17, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 57-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 35) >> 40 | SafeLoadAs(in + 8 * 36) << 24, - SafeLoadAs(in + 8 * 36) >> 33 | SafeLoadAs(in + 8 * 37) << 31, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 57-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 37) >> 26 | SafeLoadAs(in + 8 * 38) << 38, - SafeLoadAs(in + 8 * 38) >> 19 | SafeLoadAs(in + 8 * 39) << 45, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 57-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 39) >> 12 | SafeLoadAs(in + 8 * 40) << 52, - SafeLoadAs(in + 8 * 40), - }; - shifts = simd_batch{ 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 57-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 40) >> 62 | SafeLoadAs(in + 8 * 41) << 2, - SafeLoadAs(in + 8 * 41) >> 55 | SafeLoadAs(in + 8 * 42) << 9, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 57-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 42) >> 48 | SafeLoadAs(in + 8 * 43) << 16, - 
SafeLoadAs(in + 8 * 43) >> 41 | SafeLoadAs(in + 8 * 44) << 23, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 57-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 44) >> 34 | SafeLoadAs(in + 8 * 45) << 30, - SafeLoadAs(in + 8 * 45) >> 27 | SafeLoadAs(in + 8 * 46) << 37, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 57-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 46) >> 20 | SafeLoadAs(in + 8 * 47) << 44, - SafeLoadAs(in + 8 * 47) >> 13 | SafeLoadAs(in + 8 * 48) << 51, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 57-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 48), - SafeLoadAs(in + 8 * 48) >> 63 | SafeLoadAs(in + 8 * 49) << 1, - }; - shifts = simd_batch{ 6, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 57-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 49) >> 56 | SafeLoadAs(in + 8 * 50) << 8, - SafeLoadAs(in + 8 * 50) >> 49 | SafeLoadAs(in + 8 * 51) << 15, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 57-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 51) >> 42 | SafeLoadAs(in + 8 * 52) << 22, - SafeLoadAs(in + 8 * 52) >> 35 | SafeLoadAs(in + 8 * 53) << 29, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 57-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 53) >> 28 | SafeLoadAs(in + 8 * 54) << 36, - SafeLoadAs(in + 8 * 54) >> 21 | SafeLoadAs(in + 8 * 55) << 43, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 
2; - - // extract 57-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 55) >> 14 | SafeLoadAs(in + 8 * 56) << 50, - SafeLoadAs(in + 8 * 56), - }; - shifts = simd_batch{ 0, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 57 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x3ffffffffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 58-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 58 | SafeLoadAs(in + 8 * 1) << 6, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 58-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 1) >> 52 | SafeLoadAs(in + 8 * 2) << 12, - SafeLoadAs(in + 8 * 2) >> 46 | SafeLoadAs(in + 8 * 3) << 18, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 58-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 3) >> 40 | SafeLoadAs(in + 8 * 4) << 24, - SafeLoadAs(in + 8 * 4) >> 34 | SafeLoadAs(in + 8 * 5) << 30, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 58-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 5) >> 28 | SafeLoadAs(in + 8 * 6) << 36, - SafeLoadAs(in + 8 * 6) >> 22 | SafeLoadAs(in + 8 * 7) << 42, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 58-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 7) >> 16 | SafeLoadAs(in + 8 * 8) << 48, - SafeLoadAs(in + 8 * 8) >> 10 | SafeLoadAs(in + 
8 * 9) << 54, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 58-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9) >> 62 | SafeLoadAs(in + 8 * 10) << 2, - }; - shifts = simd_batch{ 4, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 58-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 10) >> 56 | SafeLoadAs(in + 8 * 11) << 8, - SafeLoadAs(in + 8 * 11) >> 50 | SafeLoadAs(in + 8 * 12) << 14, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 58-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 12) >> 44 | SafeLoadAs(in + 8 * 13) << 20, - SafeLoadAs(in + 8 * 13) >> 38 | SafeLoadAs(in + 8 * 14) << 26, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 58-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 14) >> 32 | SafeLoadAs(in + 8 * 15) << 32, - SafeLoadAs(in + 8 * 15) >> 26 | SafeLoadAs(in + 8 * 16) << 38, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 58-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 16) >> 20 | SafeLoadAs(in + 8 * 17) << 44, - SafeLoadAs(in + 8 * 17) >> 14 | SafeLoadAs(in + 8 * 18) << 50, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 58-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 18) >> 8 | SafeLoadAs(in + 8 * 19) << 56, - SafeLoadAs(in + 8 * 19), - }; - shifts = simd_batch{ 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 58-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 19) 
>> 60 | SafeLoadAs(in + 8 * 20) << 4, - SafeLoadAs(in + 8 * 20) >> 54 | SafeLoadAs(in + 8 * 21) << 10, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 58-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 21) >> 48 | SafeLoadAs(in + 8 * 22) << 16, - SafeLoadAs(in + 8 * 22) >> 42 | SafeLoadAs(in + 8 * 23) << 22, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 58-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 23) >> 36 | SafeLoadAs(in + 8 * 24) << 28, - SafeLoadAs(in + 8 * 24) >> 30 | SafeLoadAs(in + 8 * 25) << 34, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 58-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 25) >> 24 | SafeLoadAs(in + 8 * 26) << 40, - SafeLoadAs(in + 8 * 26) >> 18 | SafeLoadAs(in + 8 * 27) << 46, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 58-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 27) >> 12 | SafeLoadAs(in + 8 * 28) << 52, - SafeLoadAs(in + 8 * 28), - }; - shifts = simd_batch{ 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 58-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 29), - SafeLoadAs(in + 8 * 29) >> 58 | SafeLoadAs(in + 8 * 30) << 6, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 58-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 30) >> 52 | SafeLoadAs(in + 8 * 31) << 12, - SafeLoadAs(in + 8 * 31) >> 46 | SafeLoadAs(in + 8 * 32) << 18, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out 
+= 2; - - // extract 58-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 32) >> 40 | SafeLoadAs(in + 8 * 33) << 24, - SafeLoadAs(in + 8 * 33) >> 34 | SafeLoadAs(in + 8 * 34) << 30, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 58-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 34) >> 28 | SafeLoadAs(in + 8 * 35) << 36, - SafeLoadAs(in + 8 * 35) >> 22 | SafeLoadAs(in + 8 * 36) << 42, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 58-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 36) >> 16 | SafeLoadAs(in + 8 * 37) << 48, - SafeLoadAs(in + 8 * 37) >> 10 | SafeLoadAs(in + 8 * 38) << 54, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 58-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 38), - SafeLoadAs(in + 8 * 38) >> 62 | SafeLoadAs(in + 8 * 39) << 2, - }; - shifts = simd_batch{ 4, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 58-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 39) >> 56 | SafeLoadAs(in + 8 * 40) << 8, - SafeLoadAs(in + 8 * 40) >> 50 | SafeLoadAs(in + 8 * 41) << 14, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 58-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 41) >> 44 | SafeLoadAs(in + 8 * 42) << 20, - SafeLoadAs(in + 8 * 42) >> 38 | SafeLoadAs(in + 8 * 43) << 26, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 58-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 43) >> 32 | SafeLoadAs(in + 8 * 44) << 32, - SafeLoadAs(in + 8 * 44) >> 26 | SafeLoadAs(in 
+ 8 * 45) << 38, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 58-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 45) >> 20 | SafeLoadAs(in + 8 * 46) << 44, - SafeLoadAs(in + 8 * 46) >> 14 | SafeLoadAs(in + 8 * 47) << 50, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 58-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 47) >> 8 | SafeLoadAs(in + 8 * 48) << 56, - SafeLoadAs(in + 8 * 48), - }; - shifts = simd_batch{ 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 58-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 48) >> 60 | SafeLoadAs(in + 8 * 49) << 4, - SafeLoadAs(in + 8 * 49) >> 54 | SafeLoadAs(in + 8 * 50) << 10, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 58-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 50) >> 48 | SafeLoadAs(in + 8 * 51) << 16, - SafeLoadAs(in + 8 * 51) >> 42 | SafeLoadAs(in + 8 * 52) << 22, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 58-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 52) >> 36 | SafeLoadAs(in + 8 * 53) << 28, - SafeLoadAs(in + 8 * 53) >> 30 | SafeLoadAs(in + 8 * 54) << 34, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); + } +}; - // extract 58-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 54) >> 24 | SafeLoadAs(in + 8 * 55) << 40, - SafeLoadAs(in + 8 * 55) >> 18 | SafeLoadAs(in + 8 * 56) << 46, - }; - shifts = simd_batch{ 0, 0 }; - results = 
(words >> shifts) & masks; - results.store_unaligned(out); - out += 2; +template<> +struct Simd128UnpackerForWidth { - // extract 58-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 56) >> 12 | SafeLoadAs(in + 8 * 57) << 52, - SafeLoadAs(in + 8 * 57), - }; - shifts = simd_batch{ 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; + using Dispatch = Kernel; - in += 58 * 8; - return in; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; + + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); } }; @@ -23471,337 +1895,12 @@ struct Simd128UnpackerForWidth { template<> struct Simd128UnpackerForWidth { - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0xfffffffffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 60-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 60 | SafeLoadAs(in + 8 * 1) << 4, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 60-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 1) >> 56 | SafeLoadAs(in + 8 * 2) << 8, - SafeLoadAs(in + 8 * 2) >> 52 | SafeLoadAs(in + 8 * 3) << 12, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 60-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 3) >> 48 | SafeLoadAs(in + 8 * 4) << 16, - SafeLoadAs(in + 8 * 4) >> 44 | SafeLoadAs(in + 8 * 5) << 20, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 60-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 5) >> 40 | SafeLoadAs(in + 8 * 6) 
<< 24, - SafeLoadAs(in + 8 * 6) >> 36 | SafeLoadAs(in + 8 * 7) << 28, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 60-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 7) >> 32 | SafeLoadAs(in + 8 * 8) << 32, - SafeLoadAs(in + 8 * 8) >> 28 | SafeLoadAs(in + 8 * 9) << 36, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 60-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 9) >> 24 | SafeLoadAs(in + 8 * 10) << 40, - SafeLoadAs(in + 8 * 10) >> 20 | SafeLoadAs(in + 8 * 11) << 44, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 60-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 11) >> 16 | SafeLoadAs(in + 8 * 12) << 48, - SafeLoadAs(in + 8 * 12) >> 12 | SafeLoadAs(in + 8 * 13) << 52, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 60-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 13) >> 8 | SafeLoadAs(in + 8 * 14) << 56, - SafeLoadAs(in + 8 * 14), - }; - shifts = simd_batch{ 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 60-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15) >> 60 | SafeLoadAs(in + 8 * 16) << 4, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 60-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 16) >> 56 | SafeLoadAs(in + 8 * 17) << 8, - SafeLoadAs(in + 8 * 17) >> 52 | SafeLoadAs(in + 8 * 18) << 12, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 60-bit bundles 20 to 
21 - words = simd_batch{ - SafeLoadAs(in + 8 * 18) >> 48 | SafeLoadAs(in + 8 * 19) << 16, - SafeLoadAs(in + 8 * 19) >> 44 | SafeLoadAs(in + 8 * 20) << 20, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 60-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 20) >> 40 | SafeLoadAs(in + 8 * 21) << 24, - SafeLoadAs(in + 8 * 21) >> 36 | SafeLoadAs(in + 8 * 22) << 28, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 60-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 22) >> 32 | SafeLoadAs(in + 8 * 23) << 32, - SafeLoadAs(in + 8 * 23) >> 28 | SafeLoadAs(in + 8 * 24) << 36, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 60-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 24) >> 24 | SafeLoadAs(in + 8 * 25) << 40, - SafeLoadAs(in + 8 * 25) >> 20 | SafeLoadAs(in + 8 * 26) << 44, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 60-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 26) >> 16 | SafeLoadAs(in + 8 * 27) << 48, - SafeLoadAs(in + 8 * 27) >> 12 | SafeLoadAs(in + 8 * 28) << 52, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 60-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 28) >> 8 | SafeLoadAs(in + 8 * 29) << 56, - SafeLoadAs(in + 8 * 29), - }; - shifts = simd_batch{ 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 60-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 30), - SafeLoadAs(in + 8 * 30) >> 60 | SafeLoadAs(in + 8 * 31) << 4, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> 
shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 60-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 31) >> 56 | SafeLoadAs(in + 8 * 32) << 8, - SafeLoadAs(in + 8 * 32) >> 52 | SafeLoadAs(in + 8 * 33) << 12, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 60-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 33) >> 48 | SafeLoadAs(in + 8 * 34) << 16, - SafeLoadAs(in + 8 * 34) >> 44 | SafeLoadAs(in + 8 * 35) << 20, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 60-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 35) >> 40 | SafeLoadAs(in + 8 * 36) << 24, - SafeLoadAs(in + 8 * 36) >> 36 | SafeLoadAs(in + 8 * 37) << 28, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 60-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 37) >> 32 | SafeLoadAs(in + 8 * 38) << 32, - SafeLoadAs(in + 8 * 38) >> 28 | SafeLoadAs(in + 8 * 39) << 36, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 60-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 39) >> 24 | SafeLoadAs(in + 8 * 40) << 40, - SafeLoadAs(in + 8 * 40) >> 20 | SafeLoadAs(in + 8 * 41) << 44, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 60-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 41) >> 16 | SafeLoadAs(in + 8 * 42) << 48, - SafeLoadAs(in + 8 * 42) >> 12 | SafeLoadAs(in + 8 * 43) << 52, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 60-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 
8 * 43) >> 8 | SafeLoadAs(in + 8 * 44) << 56, - SafeLoadAs(in + 8 * 44), - }; - shifts = simd_batch{ 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 60-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 45), - SafeLoadAs(in + 8 * 45) >> 60 | SafeLoadAs(in + 8 * 46) << 4, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 60-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 46) >> 56 | SafeLoadAs(in + 8 * 47) << 8, - SafeLoadAs(in + 8 * 47) >> 52 | SafeLoadAs(in + 8 * 48) << 12, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 60-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 48) >> 48 | SafeLoadAs(in + 8 * 49) << 16, - SafeLoadAs(in + 8 * 49) >> 44 | SafeLoadAs(in + 8 * 50) << 20, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 60-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 50) >> 40 | SafeLoadAs(in + 8 * 51) << 24, - SafeLoadAs(in + 8 * 51) >> 36 | SafeLoadAs(in + 8 * 52) << 28, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 60-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 52) >> 32 | SafeLoadAs(in + 8 * 53) << 32, - SafeLoadAs(in + 8 * 53) >> 28 | SafeLoadAs(in + 8 * 54) << 36, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 60-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 54) >> 24 | SafeLoadAs(in + 8 * 55) << 40, - SafeLoadAs(in + 8 * 55) >> 20 | SafeLoadAs(in + 8 * 56) << 44, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); 
- out += 2; + using Dispatch = Kernel; - // extract 60-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 56) >> 16 | SafeLoadAs(in + 8 * 57) << 48, - SafeLoadAs(in + 8 * 57) >> 12 | SafeLoadAs(in + 8 * 58) << 52, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 60-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 58) >> 8 | SafeLoadAs(in + 8 * 59) << 56, - SafeLoadAs(in + 8 * 59), - }; - shifts = simd_batch{ 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; + static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - in += 60 * 8; - return in; + static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { + return Dispatch::unpack(in, out); } }; diff --git a/cpp/src/arrow/util/bpacking_simd_codegen.py b/cpp/src/arrow/util/bpacking_simd_codegen.py index 3218fe24cb6..029472eb9fa 100755 --- a/cpp/src/arrow/util/bpacking_simd_codegen.py +++ b/cpp/src/arrow/util/bpacking_simd_codegen.py @@ -46,6 +46,20 @@ """ +def packed_max_spread_bytes(width: int, bit_offset: int = 0) -> int: + max_spread = (width + 7) // 8 + start = bit_offset + while True: + byte_start = start // 8 + byte_end = (start + width - 1) // 8 + spread = byte_end - byte_start + 1 + max_spread = max(spread, max_spread) + start += width + if start % 8 == bit_offset: + break + return max_spread + + @dataclasses.dataclass class UnpackStructGenerator: out_type: str @@ -160,13 +174,28 @@ def print_struct_k(self, bit: int): print("template<>") print(f"struct {self.struct_specialization(bit)} {{") print() - print( - " using simd_batch = xsimd::make_sized_batch_t<" - f"{self.out_type}, {self.simd_value_count}>;" - ) - print(f" static constexpr int kValuesUnpacked = {self.out_bit_width};") - print() - self.print_unpack_bit_func(bit) + + if packed_max_spread_bytes(bit) <= self.out_byte_width: + print( + " using Dispatch = Kernel<" + f"{self.out_type}, 
{bit}, {self.simd_bit_width}>;" + ) + print() + print(" static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked;") + print() + print( + f" static const uint8_t* unpack(const uint8_t* in, {self.out_type}* out) {{" + ) + print(" return Dispatch::unpack(in, out);") + print(" }") + else: + print( + " using simd_batch = xsimd::make_sized_batch_t<" + f"{self.out_type}, {self.simd_value_count}>;" + ) + print(f" static constexpr int kValuesUnpacked = {self.out_bit_width};") + print() + self.print_unpack_bit_func(bit) print("};") def print_uint32_fallback_struct(self): @@ -224,6 +253,7 @@ def print_headers(self): print("#include ") print() print('#include "arrow/util/ubsan.h"') + print('#include "arrow/util/bpacking_simd_impl_internal.h"') def print_file_top(self): print("#pragma once") From 288d744a01f0c7ae93ecc21e9f5c19628af43db2 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Mon, 27 Oct 2025 19:09:37 +0100 Subject: [PATCH 09/64] Refactor array to xsimd::batch_constant --- .../arrow/util/bpacking_simd_impl_internal.h | 38 +++++++++++-------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_simd_impl_internal.h b/cpp/src/arrow/util/bpacking_simd_impl_internal.h index 13b6600e5ef..56845072191 100644 --- a/cpp/src/arrow/util/bpacking_simd_impl_internal.h +++ b/cpp/src/arrow/util/bpacking_simd_impl_internal.h @@ -34,6 +34,8 @@ namespace arrow::internal { // - _mm_cvtepi8_epi32 // - no _mm_srlv_epi32 (128bit) in xsimd with AVX2 required arch // - no need for while loop (for up to 8 is sufficient) +// - upstream var lshift to xsimd +// - array to batch constant to xsimd // - Shifts per swizzle can be improved when self.packed_max_byte_spread == 1 and the // byte can be reused (when val_bit_width divides packed_max_byte_spread). 
@@ -143,7 +145,7 @@ struct KernelPlan { using ReadsPerKernel = std::array; - using Swizzle = std::array; + using Swizzle = std::array; using SwizzlesPerRead = std::array; using SwizzlesPerKernel = std::array; @@ -174,6 +176,7 @@ constexpr KernelPlan BuildPlan() { using Plan = KernelPlan; constexpr auto kShape = Plan::kShape; constexpr auto kPlanSize = Plan::kPlanSize; + static_assert(kShape.packed_max_spread_bytes() <= kShape.unpacked_byte_size()); Plan plan = {}; @@ -195,7 +198,8 @@ constexpr KernelPlan BuildPlan() { // Looping over the multiple bytes needed for current values for (int b = 0; b < kShape.packed_max_spread_bytes(); ++b) { - plan.swizzles.at(r).at(sw).at(sw_offset_byte + b) = packed_byte_in_read + b; + plan.swizzles.at(r).at(sw).at(sw_offset_byte + b) = + static_cast(packed_byte_in_read + b); } // Shift is a single value but many packed values may be swizzles to a sing // unpacked value @@ -223,6 +227,19 @@ constexpr T max_value(const std::array& arr) { return out; } +template +constexpr auto make_batch_constant_impl(std::index_sequence) { + using Array = std::decay_t; + using value_type = typename Array::value_type; + + return xsimd::batch_constant{}; +} + +template +constexpr auto make_batch_constant() { + return make_batch_constant_impl(std::make_index_sequence()); +} + template struct Kernel { static constexpr auto kPlan = BuildPlan(); @@ -267,12 +284,7 @@ struct Kernel { shifted = (words * kMults) >> kMaxRightShift; } else { - struct MakeRightShifts { - static constexpr unpacked_type get(int i, int n) { return kRightShiftsArr.at(i); } - }; - - constexpr auto kRightShifts = - xsimd::make_batch_constant(); + constexpr auto kRightShifts = make_batch_constant(); shifted = words >> kRightShifts; } @@ -289,14 +301,8 @@ struct Kernel { template static void unpack_one_swizzle_impl(const simd_bytes& bytes, unpacked_type* out, std::integer_sequence) { - struct MakeSwizzles { - static constexpr int get(int i, int n) { - return 
kPlan.swizzles.at(kReadIdx).at(kSwizzleIdx).at(i); - } - }; - - constexpr auto kSwizzles = - xsimd::make_batch_constant(); + static constexpr auto kSwizzlesArr = kPlan.swizzles.at(kReadIdx).at(kSwizzleIdx); + constexpr auto kSwizzles = make_batch_constant(); const auto swizzled = xsimd::swizzle(bytes, kSwizzles); const auto words = xsimd::bitwise_cast(swizzled); From b87ff0d33028954d59fda8231fce637894bde548 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Mon, 27 Oct 2025 20:02:17 +0100 Subject: [PATCH 10/64] Refactor right shift --- .../arrow/util/bpacking_simd_impl_internal.h | 73 +++++++++++-------- 1 file changed, 41 insertions(+), 32 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_simd_impl_internal.h b/cpp/src/arrow/util/bpacking_simd_impl_internal.h index 56845072191..8089672041c 100644 --- a/cpp/src/arrow/util/bpacking_simd_impl_internal.h +++ b/cpp/src/arrow/util/bpacking_simd_impl_internal.h @@ -240,6 +240,42 @@ constexpr auto make_batch_constant() { return make_batch_constant_impl(std::make_index_sequence()); } +// Intel x86-64 does not have variable right shifts before AVX2. +// +// When we know that the relevant bits will not overflow, we can instead shift left all +// values to align them with the one with the largest right shifts followed by a constant +// shift on all values. +// In doing so, we replace the variable left shift by a variable multiply with a power of +// two. +// +// This trick is borrowed from Daniel Lemire and Leonid Boytsov, Decoding billions of +// integers per second through vectorization, Software Practice & Experience 45 (1), 2015. 
+// http://arxiv.org/abs/1209.2137 +template +auto overflow_right_shift(const xsimd::batch& batch, + xsimd::batch_constant shifts) { + constexpr bool kHasSse2 = xsimd::supported_architectures::contains(); + constexpr bool kHasAvx2 = xsimd::supported_architectures::contains(); + + if constexpr (kHasSse2 && !kHasAvx2) { + static constexpr auto kShiftsArr = std::array{kShifts...}; + static constexpr Int kMaxRightShift = max_value(kShiftsArr); + + struct MakeMults { + static constexpr Int get(int i, int n) { + // Equivalent to left shift of kMaxRightShift - kRightShifts.at(i). + return Int{1} << (kMaxRightShift - kShiftsArr.at(i)); + } + }; + + constexpr auto kMults = xsimd::make_batch_constant(); + return (batch * kMults) >> kMaxRightShift; + + } else { + return batch >> shifts; + } +} + template struct Kernel { static constexpr auto kPlan = BuildPlan(); @@ -257,43 +293,16 @@ struct Kernel { static void unpack_one_shift_impl(const simd_batch& words, unpacked_type* out) { static constexpr auto kRightShiftsArr = kPlan.shifts.at(kReadIdx).at(kSwizzleIdx).at(kShiftIdx); - - constexpr bool kHasSse2 = xsimd::supported_architectures::contains(); - constexpr bool kHasAvx2 = xsimd::supported_architectures::contains(); - - // Intel x86-64 does not have variable right shifts before AVX2. - // Instead, since we know the packed value can safely be left shifted up to the - // maximum already in the batch, we use a multiplication to emulate a left shits, - // followed by a static right shift. - // Trick from Daniel Lemire and Leonid Boytsov, Decoding billions of integers per - // second through vectorization, Software Practice & Experience 45 (1), 2015. - // http://arxiv.org/abs/1209.2137 - simd_batch shifted; - if constexpr (kHasSse2 && !kHasAvx2) { - static constexpr unpacked_type kMaxRightShift = max_value(kRightShiftsArr); - - struct MakeMults { - static constexpr unpacked_type get(int i, int n) { - // Equivalent to left shift of kMaxRightShift - kRightShifts.at(i). 
- return unpacked_type{1} << (kMaxRightShift - kRightShiftsArr.at(i)); - } - }; - - constexpr auto kMults = - xsimd::make_batch_constant(); - - shifted = (words * kMults) >> kMaxRightShift; - } else { - constexpr auto kRightShifts = make_batch_constant(); - - shifted = words >> kRightShifts; - } - + constexpr auto kRightShifts = make_batch_constant(); constexpr auto kMask = kPlan.mask; constexpr auto kOutOffset = (kReadIdx * kPlan.unpacked_per_read() + kSwizzleIdx * kPlan.unpacked_per_swizzle() + kShiftIdx * kPlan.unpacked_per_shifts()); + // Intel x86-64 does not have variable right shifts before AVX2. + // We know the packed value can safely be left shifted up to the largest offset so we + // can use the fallback on these platforms. + const auto shifted = overflow_right_shift(words, kRightShifts); const auto vals = shifted & kMask; xsimd::store_unaligned(out + kOutOffset, vals); } From df9d6edc47ddcda21f09ca2fa652baa699742ab2 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Tue, 28 Oct 2025 10:04:17 +0100 Subject: [PATCH 11/64] Add oversized plan --- .../arrow/util/bpacking_simd_impl_internal.h | 70 +++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/cpp/src/arrow/util/bpacking_simd_impl_internal.h b/cpp/src/arrow/util/bpacking_simd_impl_internal.h index 8089672041c..a0832a42bd5 100644 --- a/cpp/src/arrow/util/bpacking_simd_impl_internal.h +++ b/cpp/src/arrow/util/bpacking_simd_impl_internal.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -38,6 +39,10 @@ namespace arrow::internal { // - array to batch constant to xsimd // - Shifts per swizzle can be improved when self.packed_max_byte_spread == 1 and the // byte can be reused (when val_bit_width divides packed_max_byte_spread). 
+// - Try for uint16_t and uint8_t +// - For Avx2: +// - Inspect how swizzle across lanes are handled +// - Investigate AVX2 with 128 bit register struct KernelShape { const int simd_bit_size_; @@ -340,4 +345,69 @@ struct Kernel { } }; +template +struct OversizedKernelPlan { + using Traits = KernelTraits; + static constexpr auto kShape = Traits::kShape; + + static constexpr int kUnpackedPerkernel = std::lcm(kShape.unpacked_per_simd(), 8); + static constexpr int kReadsPerKernel = static_cast(bit_util::CeilDiv( + kUnpackedPerkernel * kShape.packed_bit_size(), kShape.simd_bit_size())); + + using ReadsPerKernel = std::array; + + using Swizzle = std::array; + using SwizzlesPerKernel = std::array; + + using Shift = std::array; + using ShitsPerKernel = std::array; + + ReadsPerKernel reads; + SwizzlesPerKernel low_swizzles; + SwizzlesPerKernel high_swizzles; + ShitsPerKernel low_rshifts; + ShitsPerKernel high_lshifts; + UnpackedUint mask = bit_util::LeastSignificantBitMask(kPackedBitSize); +}; + +template +constexpr OversizedKernelPlan +BuildOversizedPlan() { + using Plan = OversizedKernelPlan; + constexpr auto kShape = Plan::kShape; + static_assert(kShape.unpacked_byte_size() < kShape.max_spread_bytes()); + constexpr int kOverBytes = + kShape.packed_max_spread_bytes() - kShape.unpacked_byte_size(); + + Plan plan = {}; + + int packed_start_bit = 0; + for (int r = 0; r < Plan::kReadsPerKernel; ++r) { + const int read_start_byte = packed_start_bit / 8; + plan.reads.at(r) = read_start_byte; + + for (int u = 0; u < kShape.unpacked_per_simd(); ++u) { + const int packed_start_byte = packed_start_bit / 8; + const int packed_byte_in_read = packed_start_byte - read_start_byte; + + // Looping over maximum number of bytes that can fit a value + // We fill more than necessary in the high swizzle because in the absence of + // variable right shifts, we will erase some bits from the low sizzled values. 
+ for (int b = 0; b < kShape.unpacked_byte_size(); ++b) { + const auto idx = u * kShape.unpacked_byte_size() + b; + plan.low_swizzles.at(r).at(idx) = packed_byte_in_read + b; + plan.high_swizzles.at(r).at(idx) = packed_byte_in_read + b + kOverBytes; + } + + // low and high swizzles need to be rshifted but the oversized bytes created a + // larger lshift for high values. + plan.low_rshifts.at(r).at(u) = packed_start_bit % 8; + plan.high_lshifts.at(r).at(u) = 8 * kOverBytes - (packed_start_bit % 8); + + packed_start_bit += kShape.packed_bit_size(); + } + } + + return plan; +} } // namespace arrow::internal From b55c888d2a6f6c8b672c1f868d77cfbee3dc0b49 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Tue, 28 Oct 2025 14:01:59 +0100 Subject: [PATCH 12/64] Add oversized kernel --- .../arrow/util/bpacking_simd_impl_internal.h | 130 +++++++++++++++++- 1 file changed, 124 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_simd_impl_internal.h b/cpp/src/arrow/util/bpacking_simd_impl_internal.h index a0832a42bd5..c310730f683 100644 --- a/cpp/src/arrow/util/bpacking_simd_impl_internal.h +++ b/cpp/src/arrow/util/bpacking_simd_impl_internal.h @@ -112,8 +112,8 @@ constexpr KernelPlanSize BuildPlanSize(const KernelShape& shape) { }; // If after a whole swizzle reading iteration we fall unaligned, the remaining - // iterations will start with an aligned first value, reducing the effective capacity of - // the SIMD batch. + // iterations will start with an unaligned first value, reducing the effective capacity + // of the SIMD batch. // We must check that our read iteration size still works with subsequent misalignment // by looping until aligned. // One may think that using such large reading iterations risks overshooting an aligned @@ -245,6 +245,34 @@ constexpr auto make_batch_constant() { return make_batch_constant_impl(std::make_index_sequence()); } +// Intel x86-64 does not have variable left shifts before AVX2. 
+// +// We replace the variable left shift by a variable multiply with a power of two. +// +// This trick is borrowed from Daniel Lemire and Leonid Boytsov, Decoding billions of +// integers per second through vectorization, Software Practice & Experience 45 (1), 2015. +// http://arxiv.org/abs/1209.2137 +template +auto left_shift(const xsimd::batch& batch, + xsimd::batch_constant shifts) { + constexpr bool kHasSse2 = xsimd::supported_architectures::contains(); + constexpr bool kHasAvx2 = xsimd::supported_architectures::contains(); + + if constexpr (kHasSse2 && !kHasAvx2) { + static constexpr auto kShiftsArr = std::array{kShifts...}; + + struct MakeMults { + static constexpr Int get(int i, int n) { return Int{1} << kShiftsArr.at(i); } + }; + + constexpr auto kMults = xsimd::make_batch_constant(); + return batch * kMults; + + } else { + return batch << shifts; + } +} + // Intel x86-64 does not have variable right shifts before AVX2. // // When we know that the relevant bits will not overflow, we can instead shift left all @@ -356,10 +384,10 @@ struct OversizedKernelPlan { using ReadsPerKernel = std::array; - using Swizzle = std::array; + using Swizzle = std::array; using SwizzlesPerKernel = std::array; - using Shift = std::array; + using Shift = std::array; using ShitsPerKernel = std::array; ReadsPerKernel reads; @@ -367,7 +395,8 @@ struct OversizedKernelPlan { SwizzlesPerKernel high_swizzles; ShitsPerKernel low_rshifts; ShitsPerKernel high_lshifts; - UnpackedUint mask = bit_util::LeastSignificantBitMask(kPackedBitSize); + UnpackedUint low_mask; + UnpackedUint high_mask; }; template @@ -375,7 +404,7 @@ constexpr OversizedKernelPlan BuildOversizedPlan() { using Plan = OversizedKernelPlan; constexpr auto kShape = Plan::kShape; - static_assert(kShape.unpacked_byte_size() < kShape.max_spread_bytes()); + static_assert(kShape.unpacked_byte_size() < kShape.packed_max_spread_bytes()); constexpr int kOverBytes = kShape.packed_max_spread_bytes() - kShape.unpacked_byte_size(); 
@@ -408,6 +437,95 @@ BuildOversizedPlan() { } } + constexpr auto mask = bit_util::LeastSignificantBitMask(kPackedBitSize); + constexpr auto half_low_bit_mask = + bit_util::LeastSignificantBitMask(kShape.unpacked_bit_size() / 2); + plan.low_mask = mask & half_low_bit_mask; + plan.high_mask = mask & (~half_low_bit_mask); + return plan; } + +template +struct OversizedKernel { + static constexpr auto kPlan = + BuildOversizedPlan(); + static constexpr auto kShape = kPlan.kShape; + using Traits = typename decltype(kPlan)::Traits; + using unpacked_type = typename Traits::unpacked_type; + using simd_batch = typename Traits::simd_batch; + using simd_bytes = typename Traits::simd_bytes; + using arch_type = typename Traits::arch_type; + + static constexpr int kValuesUnpacked = kPlan.kUnpackedPerkernel; + + template + static void unpack_one_shift_impl(const simd_batch& words, unpacked_type* out) { + static constexpr auto kRightShiftsArr = + kPlan.shifts.at(kReadIdx).at(kSwizzleIdx).at(kShiftIdx); + constexpr auto kRightShifts = make_batch_constant(); + constexpr auto kMask = kPlan.mask; + constexpr auto kOutOffset = (kReadIdx * kPlan.unpacked_per_read() + + kSwizzleIdx * kPlan.unpacked_per_swizzle() + + kShiftIdx * kPlan.unpacked_per_shifts()); + + // Intel x86-64 does not have variable right shifts before AVX2. + // We know the packed value can safely be left shifted up to the largest offset so we + // can use the fallback on these platforms. 
+ const auto shifted = overflow_right_shift(words, kRightShifts); + const auto vals = shifted & kMask; + xsimd::store_unaligned(out + kOutOffset, vals); + } + + template + static void unpack_one_swizzle_impl(const simd_bytes& bytes, unpacked_type* out, + std::integer_sequence) { + static constexpr auto kSwizzlesArr = kPlan.swizzles.at(kReadIdx).at(kSwizzleIdx); + constexpr auto kSwizzles = make_batch_constant(); + + const auto swizzled = xsimd::swizzle(bytes, kSwizzles); + const auto words = xsimd::bitwise_cast(swizzled); + (unpack_one_shift_impl(words, out), ...); + } + + template + static void unpack_one_read_impl(const uint8_t* in, unpacked_type* out) { + static constexpr auto kLowSwizzlesArr = kPlan.low_swizzles.at(kReadIdx); + constexpr auto kLowSwizzles = make_batch_constant(); + static constexpr auto kLowRShiftsArr = kPlan.low_rshifts.at(kReadIdx); + constexpr auto kLowRShifts = make_batch_constant(); + + static constexpr auto kHighSwizzlesArr = kPlan.high_swizzles.at(kReadIdx); + constexpr auto kHighSwizzles = make_batch_constant(); + static constexpr auto kHighLShiftsArr = kPlan.high_lshifts.at(kReadIdx); + constexpr auto kHighLShifts = make_batch_constant(); + + const auto bytes = simd_bytes::load_unaligned(in + kPlan.reads.at(kReadIdx)); + + const auto low_swizzled = xsimd::swizzle(bytes, kLowSwizzles); + const auto low_words = xsimd::bitwise_cast(low_swizzled); + const auto low_shifted = overflow_right_shift(low_words, kLowRShifts); + const auto low_half_vals = low_shifted & kPlan.low_mask; + + const auto high_swizzled = xsimd::swizzle(bytes, kHighSwizzles); + const auto high_words = xsimd::bitwise_cast(high_swizzled); + const auto high_shifted = left_shift(high_words, kHighLShifts); + const auto high_half_vals = high_shifted & kPlan.high_mask; + + const auto vals = low_half_vals | high_half_vals; + xsimd::store_unaligned(out + kReadIdx * kShape.unpacked_per_simd(), vals); + } + + template + static void unpack_all_impl(const uint8_t* in, unpacked_type* 
out, + std::integer_sequence) { + (unpack_one_read_impl(in, out), ...); + } + + static const uint8_t* unpack(const uint8_t* in, unpacked_type* out) { + using ReadSeq = std::make_integer_sequence; + unpack_all_impl(in, out, ReadSeq{}); + return in + (kPlan.kUnpackedPerkernel * kShape.packed_bit_size()) / 8; + } +}; } // namespace arrow::internal From f91630ab640238495e9b2c9f7d303983ec3422b2 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Tue, 28 Oct 2025 15:24:33 +0100 Subject: [PATCH 13/64] Rename kernels --- .../arrow/util/bpacking_simd_impl_internal.h | 60 ++++++++++++++----- 1 file changed, 44 insertions(+), 16 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_simd_impl_internal.h b/cpp/src/arrow/util/bpacking_simd_impl_internal.h index c310730f683..c48250d1c22 100644 --- a/cpp/src/arrow/util/bpacking_simd_impl_internal.h +++ b/cpp/src/arrow/util/bpacking_simd_impl_internal.h @@ -44,11 +44,39 @@ namespace arrow::internal { // - Inspect how swizzle across lanes are handled // - Investigate AVX2 with 128 bit register +constexpr bool PackedIsOversizedForSimd(int simd_bit_size, int unpacked_bit_size, + int packed_bit_size) { + const int unpacked_per_simd = simd_bit_size / unpacked_bit_size; + + const auto packed_per_read_for_offset = [&](int bit_offset) -> int { + return (simd_bit_size - bit_offset) / packed_bit_size; + }; + + int packed_start_bit = 0; + do { + int packed_per_read = packed_per_read_for_offset(packed_start_bit % 8); + if (packed_per_read < unpacked_per_simd) { + return true; + } + packed_start_bit += unpacked_per_simd * packed_bit_size; + } while (packed_start_bit % 8 != 0); + + return false; +} + struct KernelShape { const int simd_bit_size_; const int unpacked_bit_size_; const int packed_bit_size_; const int packed_max_spread_bytes_ = PackedMaxSpreadBytes(packed_bit_size_, 0); + const bool is_oversized_ = + PackedIsOversizedForSimd(simd_bit_size_, unpacked_bit_size_, packed_bit_size_); + + constexpr bool is_medium() const { + return 
packed_max_spread_bytes() <= unpacked_byte_size(); + } + constexpr bool is_large() const { return !is_medium() && !is_oversized(); } + constexpr bool is_oversized() const { return is_oversized_; } /// Properties of an SIMD batch constexpr int simd_bit_size() const { return simd_bit_size_; } @@ -78,7 +106,7 @@ struct KernelTraits { using arch_type = typename simd_batch::arch_type; }; -struct KernelPlanSize { +struct MediumKernelPlanSize { int reads_per_kernel_; int swizzles_per_read_; int shifts_per_swizzle_; @@ -99,7 +127,7 @@ struct KernelPlanSize { } }; -constexpr KernelPlanSize BuildPlanSize(const KernelShape& shape) { +constexpr MediumKernelPlanSize BuildMediumPlanSize(const KernelShape& shape) { const int shifts_per_swizzle = shape.unpacked_byte_size() / shape.packed_max_spread_bytes(); @@ -143,10 +171,10 @@ constexpr KernelPlanSize BuildPlanSize(const KernelShape& shape) { } template -struct KernelPlan { +struct MediumKernelPlan { using Traits = KernelTraits; static constexpr auto kShape = Traits::kShape; - static constexpr auto kPlanSize = BuildPlanSize(kShape); + static constexpr auto kPlanSize = BuildMediumPlanSize(kShape); using ReadsPerKernel = std::array; @@ -177,11 +205,11 @@ struct KernelPlan { }; template -constexpr KernelPlan BuildPlan() { - using Plan = KernelPlan; +constexpr MediumKernelPlan BuildMediumPlan() { + using Plan = MediumKernelPlan; constexpr auto kShape = Plan::kShape; constexpr auto kPlanSize = Plan::kPlanSize; - static_assert(kShape.packed_max_spread_bytes() <= kShape.unpacked_byte_size()); + static_assert(kShape.is_medium()); Plan plan = {}; @@ -310,8 +338,9 @@ auto overflow_right_shift(const xsimd::batch& batch, } template -struct Kernel { - static constexpr auto kPlan = BuildPlan(); +struct MediumKernel { + static constexpr auto kPlan = + BuildMediumPlan(); static constexpr auto kPlanSize = kPlan.kPlanSize; static constexpr auto kShape = kPlan.kShape; using Traits = typename decltype(kPlan)::Traits; @@ -374,7 +403,7 @@ struct 
Kernel { }; template -struct OversizedKernelPlan { +struct LargeKernelPlan { using Traits = KernelTraits; static constexpr auto kShape = Traits::kShape; @@ -400,11 +429,10 @@ struct OversizedKernelPlan { }; template -constexpr OversizedKernelPlan -BuildOversizedPlan() { - using Plan = OversizedKernelPlan; +constexpr LargeKernelPlan BuildLargePlan() { + using Plan = LargeKernelPlan; constexpr auto kShape = Plan::kShape; - static_assert(kShape.unpacked_byte_size() < kShape.packed_max_spread_bytes()); + static_assert(kShape.is_large()); constexpr int kOverBytes = kShape.packed_max_spread_bytes() - kShape.unpacked_byte_size(); @@ -447,9 +475,9 @@ BuildOversizedPlan() { } template -struct OversizedKernel { +struct LargeKernel { static constexpr auto kPlan = - BuildOversizedPlan(); + BuildLargePlan(); static constexpr auto kShape = kPlan.kShape; using Traits = typename decltype(kPlan)::Traits; using unpacked_type = typename Traits::unpacked_type; From b4d3281ab365a9e3977e83760b9bdb4dd38a3855 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Tue, 28 Oct 2025 15:47:48 +0100 Subject: [PATCH 14/64] Add simd kernel dispatch --- .../arrow/util/bpacking_dispatch_internal.h | 18 +++++----- .../arrow/util/bpacking_simd_impl_internal.h | 33 +++++++++++++++++++ 2 files changed, 43 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_dispatch_internal.h b/cpp/src/arrow/util/bpacking_dispatch_internal.h index eed5542808d..79026536a4d 100644 --- a/cpp/src/arrow/util/bpacking_dispatch_internal.h +++ b/cpp/src/arrow/util/bpacking_dispatch_internal.h @@ -190,18 +190,20 @@ void unpack_width(const uint8_t* in, UnpackedUInt* out, int batch_size, int bit_ using UnpackerForWidth = Unpacker; constexpr auto kValuesUnpacked = UnpackerForWidth::kValuesUnpacked; - // Running the optimized kernel for batch extraction - const int unpacker_iter_count = batch_size / kValuesUnpacked; - for (int i = 0; i < unpacker_iter_count; ++i) { - in = UnpackerForWidth::unpack(in, out); - out += 
kValuesUnpacked; + if constexpr (kValuesUnpacked > 0) { + // Running the optimized kernel for batch extraction + const int unpacker_iter_count = batch_size / kValuesUnpacked; + for (int i = 0; i < unpacker_iter_count; ++i) { + in = UnpackerForWidth::unpack(in, out); + out += kValuesUnpacked; + } + batch_size -= unpacker_iter_count * kValuesUnpacked; + ARROW_DCHECK_LT(batch_size, kValuesUnpacked); + ARROW_COMPILER_ASSUME(batch_size < kValuesUnpacked); } - batch_size -= unpacker_iter_count * kValuesUnpacked; // Running the epilog for the remaining values that don't fit in a kernel - ARROW_DCHECK_LT(batch_size, kValuesUnpacked); ARROW_DCHECK_GE(batch_size, 0); - ARROW_COMPILER_ASSUME(batch_size < kValuesUnpacked); ARROW_COMPILER_ASSUME(batch_size >= 0); unpack_exact(in, out, batch_size, /* bit_offset= */ 0); } diff --git a/cpp/src/arrow/util/bpacking_simd_impl_internal.h b/cpp/src/arrow/util/bpacking_simd_impl_internal.h index c48250d1c22..03d287c2855 100644 --- a/cpp/src/arrow/util/bpacking_simd_impl_internal.h +++ b/cpp/src/arrow/util/bpacking_simd_impl_internal.h @@ -556,4 +556,37 @@ struct LargeKernel { return in + (kPlan.kUnpackedPerkernel * kShape.packed_bit_size()) / 8; } }; + +template +struct OversizedKernel { + using unpacked_type = UnpackedUint; + + static constexpr int kValuesUnpacked = 0; + + static const uint8_t* unpack(const uint8_t* in, unpacked_type* out) { return in; } +}; + +template +constexpr auto DispatchKernel() { + using kTraits = KernelTraits; + if constexpr (kTraits::kShape.is_medium()) { + return MediumKernel{}; + } else if constexpr (kTraits::kShape.is_large()) { + return LargeKernel{}; + } else { + return OversizedKernel{}; + } +} + +template +using DispatchKernelType = + decltype(DispatchKernel()); + +template +struct Kernel : DispatchKernelType { + using Base = DispatchKernelType; + using Base::kValuesUnpacked; + using Base::unpack; +}; + } // namespace arrow::internal From 6b395d047772db6b976607b3b637791d89931fa7 Mon Sep 17 00:00:00 
2001 From: AntoinePrv Date: Tue, 28 Oct 2025 15:53:35 +0100 Subject: [PATCH 15/64] Call Simd kernel directly --- .../bpacking_simd128_generated_internal.h | 2920 ----------------- cpp/src/arrow/util/bpacking_simd_default.cc | 9 +- .../arrow/util/bpacking_simd_impl_internal.h | 40 +- 3 files changed, 43 insertions(+), 2926 deletions(-) delete mode 100644 cpp/src/arrow/util/bpacking_simd128_generated_internal.h diff --git a/cpp/src/arrow/util/bpacking_simd128_generated_internal.h b/cpp/src/arrow/util/bpacking_simd128_generated_internal.h deleted file mode 100644 index 6421bd30682..00000000000 --- a/cpp/src/arrow/util/bpacking_simd128_generated_internal.h +++ /dev/null @@ -1,2920 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// WARNING: this file is generated, DO NOT EDIT. 
-// Usage: -// python cpp/src/arrow/util/bpacking_simd_codegen.py 128 - -#pragma once - -#include -#include - -#include - -#include "arrow/util/ubsan.h" -#include "arrow/util/bpacking_simd_impl_internal.h" - -namespace arrow::internal { -namespace { - -using ::arrow::util::SafeLoadAs; - -template -struct Simd128UnpackerForWidth; - -template -struct Simd128UnpackerForWidth { - - static constexpr int kValuesUnpacked = Simd128UnpackerForWidth::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, bool* out) { - uint32_t buffer[kValuesUnpacked] = {}; - in = Simd128UnpackerForWidth::unpack(in, buffer); - for(int k = 0; k< kValuesUnpacked; ++k) { - out[k] = static_cast(buffer[k]); - } - return in; - } -}; - -template -struct Simd128UnpackerForWidth { - - static constexpr int kValuesUnpacked = Simd128UnpackerForWidth::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint8_t* out) { - uint32_t buffer[kValuesUnpacked] = {}; - in = Simd128UnpackerForWidth::unpack(in, buffer); - for(int k = 0; k< kValuesUnpacked; ++k) { - out[k] = static_cast(buffer[k]); - } - return in; - } -}; - -template -struct Simd128UnpackerForWidth { - - static constexpr int kValuesUnpacked = Simd128UnpackerForWidth::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint16_t* out) { - uint32_t buffer[kValuesUnpacked] = {}; - in = Simd128UnpackerForWidth::unpack(in, buffer); - for(int k = 0; k< kValuesUnpacked; ++k) { - out[k] = static_cast(buffer[k]); - } - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - 
return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = 
Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct 
Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, 
uint32_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x7ffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 27-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 27 | SafeLoadAs(in + 4 * 1) << 5, - SafeLoadAs(in + 4 * 1) >> 22 | SafeLoadAs(in + 4 * 2) << 10, - SafeLoadAs(in + 4 * 2) >> 17 | SafeLoadAs(in + 4 * 3) << 15, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 27-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 3) >> 12 | SafeLoadAs(in + 4 * 4) << 20, - SafeLoadAs(in + 4 * 4) >> 7 | SafeLoadAs(in + 4 * 5) << 25, - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5) >> 29 | SafeLoadAs(in + 4 * 6) << 3, - }; - shifts = simd_batch{ 0, 0, 2, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 27-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 6) >> 24 | SafeLoadAs(in + 4 * 7) << 8, - SafeLoadAs(in + 4 * 7) >> 19 | SafeLoadAs(in + 4 * 8) << 13, - SafeLoadAs(in + 4 * 8) >> 14 | SafeLoadAs(in + 4 * 9) << 18, - SafeLoadAs(in + 4 * 9) >> 9 | SafeLoadAs(in + 4 * 10) << 23, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 27-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10) >> 
31 | SafeLoadAs(in + 4 * 11) << 1, - SafeLoadAs(in + 4 * 11) >> 26 | SafeLoadAs(in + 4 * 12) << 6, - SafeLoadAs(in + 4 * 12) >> 21 | SafeLoadAs(in + 4 * 13) << 11, - }; - shifts = simd_batch{ 4, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 27-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 13) >> 16 | SafeLoadAs(in + 4 * 14) << 16, - SafeLoadAs(in + 4 * 14) >> 11 | SafeLoadAs(in + 4 * 15) << 21, - SafeLoadAs(in + 4 * 15) >> 6 | SafeLoadAs(in + 4 * 16) << 26, - SafeLoadAs(in + 4 * 16), - }; - shifts = simd_batch{ 0, 0, 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 27-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 16) >> 28 | SafeLoadAs(in + 4 * 17) << 4, - SafeLoadAs(in + 4 * 17) >> 23 | SafeLoadAs(in + 4 * 18) << 9, - SafeLoadAs(in + 4 * 18) >> 18 | SafeLoadAs(in + 4 * 19) << 14, - SafeLoadAs(in + 4 * 19) >> 13 | SafeLoadAs(in + 4 * 20) << 19, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 27-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 20) >> 8 | SafeLoadAs(in + 4 * 21) << 24, - SafeLoadAs(in + 4 * 21), - SafeLoadAs(in + 4 * 21) >> 30 | SafeLoadAs(in + 4 * 22) << 2, - SafeLoadAs(in + 4 * 22) >> 25 | SafeLoadAs(in + 4 * 23) << 7, - }; - shifts = simd_batch{ 0, 3, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 27-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 23) >> 20 | SafeLoadAs(in + 4 * 24) << 12, - SafeLoadAs(in + 4 * 24) >> 15 | SafeLoadAs(in + 4 * 25) << 17, - SafeLoadAs(in + 4 * 25) >> 10 | SafeLoadAs(in + 4 * 26) << 22, - SafeLoadAs(in + 4 * 26), - }; - shifts = simd_batch{ 0, 0, 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 27 * 4; - return in; - } -}; - 
-template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x1fffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 29-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 29 | SafeLoadAs(in + 4 * 1) << 3, - SafeLoadAs(in + 4 * 1) >> 26 | SafeLoadAs(in + 4 * 2) << 6, - SafeLoadAs(in + 4 * 2) >> 23 | SafeLoadAs(in + 4 * 3) << 9, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 29-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 3) >> 20 | SafeLoadAs(in + 4 * 4) << 12, - SafeLoadAs(in + 4 * 4) >> 17 | SafeLoadAs(in + 4 * 5) << 15, - SafeLoadAs(in + 4 * 5) >> 14 | SafeLoadAs(in + 4 * 6) << 18, - SafeLoadAs(in + 4 * 6) >> 11 | SafeLoadAs(in + 4 * 7) << 21, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 29-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 7) >> 8 | SafeLoadAs(in + 4 * 8) << 24, - SafeLoadAs(in + 4 * 8) >> 5 | SafeLoadAs(in + 4 * 9) << 27, - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9) >> 31 | SafeLoadAs(in + 4 * 10) << 1, - }; - shifts = simd_batch{ 0, 0, 2, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 29-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 10) >> 28 | SafeLoadAs(in + 4 * 11) << 4, - SafeLoadAs(in + 4 * 11) >> 25 | SafeLoadAs(in + 4 * 
12) << 7, - SafeLoadAs(in + 4 * 12) >> 22 | SafeLoadAs(in + 4 * 13) << 10, - SafeLoadAs(in + 4 * 13) >> 19 | SafeLoadAs(in + 4 * 14) << 13, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 29-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 14) >> 16 | SafeLoadAs(in + 4 * 15) << 16, - SafeLoadAs(in + 4 * 15) >> 13 | SafeLoadAs(in + 4 * 16) << 19, - SafeLoadAs(in + 4 * 16) >> 10 | SafeLoadAs(in + 4 * 17) << 22, - SafeLoadAs(in + 4 * 17) >> 7 | SafeLoadAs(in + 4 * 18) << 25, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 29-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 18) >> 4 | SafeLoadAs(in + 4 * 19) << 28, - SafeLoadAs(in + 4 * 19), - SafeLoadAs(in + 4 * 19) >> 30 | SafeLoadAs(in + 4 * 20) << 2, - SafeLoadAs(in + 4 * 20) >> 27 | SafeLoadAs(in + 4 * 21) << 5, - }; - shifts = simd_batch{ 0, 1, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 29-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 21) >> 24 | SafeLoadAs(in + 4 * 22) << 8, - SafeLoadAs(in + 4 * 22) >> 21 | SafeLoadAs(in + 4 * 23) << 11, - SafeLoadAs(in + 4 * 23) >> 18 | SafeLoadAs(in + 4 * 24) << 14, - SafeLoadAs(in + 4 * 24) >> 15 | SafeLoadAs(in + 4 * 25) << 17, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 29-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 25) >> 12 | SafeLoadAs(in + 4 * 26) << 20, - SafeLoadAs(in + 4 * 26) >> 9 | SafeLoadAs(in + 4 * 27) << 23, - SafeLoadAs(in + 4 * 27) >> 6 | SafeLoadAs(in + 4 * 28) << 26, - SafeLoadAs(in + 4 * 28), - }; - shifts = simd_batch{ 0, 0, 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 29 * 4; - return in; - 
} -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x3fffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 30-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 30 | SafeLoadAs(in + 4 * 1) << 2, - SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, - SafeLoadAs(in + 4 * 2) >> 26 | SafeLoadAs(in + 4 * 3) << 6, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 30-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 3) >> 24 | SafeLoadAs(in + 4 * 4) << 8, - SafeLoadAs(in + 4 * 4) >> 22 | SafeLoadAs(in + 4 * 5) << 10, - SafeLoadAs(in + 4 * 5) >> 20 | SafeLoadAs(in + 4 * 6) << 12, - SafeLoadAs(in + 4 * 6) >> 18 | SafeLoadAs(in + 4 * 7) << 14, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 30-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 7) >> 16 | SafeLoadAs(in + 4 * 8) << 16, - SafeLoadAs(in + 4 * 8) >> 14 | SafeLoadAs(in + 4 * 9) << 18, - SafeLoadAs(in + 4 * 9) >> 12 | SafeLoadAs(in + 4 * 10) << 20, - SafeLoadAs(in + 4 * 10) >> 10 | SafeLoadAs(in + 4 * 11) << 22, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 30-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 11) >> 8 | SafeLoadAs(in + 4 * 12) << 24, - SafeLoadAs(in + 4 * 12) >> 6 | SafeLoadAs(in + 4 * 13) << 26, - SafeLoadAs(in + 4 * 13) >> 4 | SafeLoadAs(in + 4 * 14) << 28, - SafeLoadAs(in + 4 * 14), - }; - shifts = simd_batch{ 0, 0, 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); 
- out += 4; - - // extract 30-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 15), - SafeLoadAs(in + 4 * 15) >> 30 | SafeLoadAs(in + 4 * 16) << 2, - SafeLoadAs(in + 4 * 16) >> 28 | SafeLoadAs(in + 4 * 17) << 4, - SafeLoadAs(in + 4 * 17) >> 26 | SafeLoadAs(in + 4 * 18) << 6, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 30-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 18) >> 24 | SafeLoadAs(in + 4 * 19) << 8, - SafeLoadAs(in + 4 * 19) >> 22 | SafeLoadAs(in + 4 * 20) << 10, - SafeLoadAs(in + 4 * 20) >> 20 | SafeLoadAs(in + 4 * 21) << 12, - SafeLoadAs(in + 4 * 21) >> 18 | SafeLoadAs(in + 4 * 22) << 14, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 30-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 22) >> 16 | SafeLoadAs(in + 4 * 23) << 16, - SafeLoadAs(in + 4 * 23) >> 14 | SafeLoadAs(in + 4 * 24) << 18, - SafeLoadAs(in + 4 * 24) >> 12 | SafeLoadAs(in + 4 * 25) << 20, - SafeLoadAs(in + 4 * 25) >> 10 | SafeLoadAs(in + 4 * 26) << 22, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 30-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 26) >> 8 | SafeLoadAs(in + 4 * 27) << 24, - SafeLoadAs(in + 4 * 27) >> 6 | SafeLoadAs(in + 4 * 28) << 26, - SafeLoadAs(in + 4 * 28) >> 4 | SafeLoadAs(in + 4 * 29) << 28, - SafeLoadAs(in + 4 * 29), - }; - shifts = simd_batch{ 0, 0, 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 30 * 4; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 
0x7fffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 31-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 31 | SafeLoadAs(in + 4 * 1) << 1, - SafeLoadAs(in + 4 * 1) >> 30 | SafeLoadAs(in + 4 * 2) << 2, - SafeLoadAs(in + 4 * 2) >> 29 | SafeLoadAs(in + 4 * 3) << 3, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 31-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 3) >> 28 | SafeLoadAs(in + 4 * 4) << 4, - SafeLoadAs(in + 4 * 4) >> 27 | SafeLoadAs(in + 4 * 5) << 5, - SafeLoadAs(in + 4 * 5) >> 26 | SafeLoadAs(in + 4 * 6) << 6, - SafeLoadAs(in + 4 * 6) >> 25 | SafeLoadAs(in + 4 * 7) << 7, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 31-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 7) >> 24 | SafeLoadAs(in + 4 * 8) << 8, - SafeLoadAs(in + 4 * 8) >> 23 | SafeLoadAs(in + 4 * 9) << 9, - SafeLoadAs(in + 4 * 9) >> 22 | SafeLoadAs(in + 4 * 10) << 10, - SafeLoadAs(in + 4 * 10) >> 21 | SafeLoadAs(in + 4 * 11) << 11, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 31-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 11) >> 20 | SafeLoadAs(in + 4 * 12) << 12, - SafeLoadAs(in + 4 * 12) >> 19 | SafeLoadAs(in + 4 * 13) << 13, - SafeLoadAs(in + 4 * 13) >> 18 | SafeLoadAs(in + 4 * 14) << 14, - SafeLoadAs(in + 4 * 14) >> 17 | SafeLoadAs(in + 4 * 15) << 15, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 31-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 15) >> 16 | SafeLoadAs(in + 4 * 16) << 16, - SafeLoadAs(in + 4 * 16) >> 15 | SafeLoadAs(in + 4 * 17) << 17, - 
SafeLoadAs(in + 4 * 17) >> 14 | SafeLoadAs(in + 4 * 18) << 18, - SafeLoadAs(in + 4 * 18) >> 13 | SafeLoadAs(in + 4 * 19) << 19, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 31-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 19) >> 12 | SafeLoadAs(in + 4 * 20) << 20, - SafeLoadAs(in + 4 * 20) >> 11 | SafeLoadAs(in + 4 * 21) << 21, - SafeLoadAs(in + 4 * 21) >> 10 | SafeLoadAs(in + 4 * 22) << 22, - SafeLoadAs(in + 4 * 22) >> 9 | SafeLoadAs(in + 4 * 23) << 23, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 31-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 23) >> 8 | SafeLoadAs(in + 4 * 24) << 24, - SafeLoadAs(in + 4 * 24) >> 7 | SafeLoadAs(in + 4 * 25) << 25, - SafeLoadAs(in + 4 * 25) >> 6 | SafeLoadAs(in + 4 * 26) << 26, - SafeLoadAs(in + 4 * 26) >> 5 | SafeLoadAs(in + 4 * 27) << 27, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 31-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 27) >> 4 | SafeLoadAs(in + 4 * 28) << 28, - SafeLoadAs(in + 4 * 28) >> 3 | SafeLoadAs(in + 4 * 29) << 29, - SafeLoadAs(in + 4 * 29) >> 2 | SafeLoadAs(in + 4 * 30) << 30, - SafeLoadAs(in + 4 * 30), - }; - shifts = simd_batch{ 0, 0, 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 31 * 4; - return in; - } -}; - - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - 
static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = 
Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, 
out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const 
uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - 
static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; 
- -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* 
unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static 
constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x7ffffffffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 59-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 59 | SafeLoadAs(in + 8 * 1) << 5, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 59-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 1) >> 54 | SafeLoadAs(in + 8 * 2) << 10, - SafeLoadAs(in + 8 * 2) >> 49 | SafeLoadAs(in + 8 * 3) << 15, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 59-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 3) >> 44 | SafeLoadAs(in + 8 * 4) << 20, - SafeLoadAs(in + 8 * 4) >> 39 | SafeLoadAs(in + 8 * 5) << 25, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 59-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 5) >> 34 | 
SafeLoadAs(in + 8 * 6) << 30, - SafeLoadAs(in + 8 * 6) >> 29 | SafeLoadAs(in + 8 * 7) << 35, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 59-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 7) >> 24 | SafeLoadAs(in + 8 * 8) << 40, - SafeLoadAs(in + 8 * 8) >> 19 | SafeLoadAs(in + 8 * 9) << 45, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 59-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 9) >> 14 | SafeLoadAs(in + 8 * 10) << 50, - SafeLoadAs(in + 8 * 10) >> 9 | SafeLoadAs(in + 8 * 11) << 55, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 59-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11) >> 63 | SafeLoadAs(in + 8 * 12) << 1, - }; - shifts = simd_batch{ 4, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 59-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 12) >> 58 | SafeLoadAs(in + 8 * 13) << 6, - SafeLoadAs(in + 8 * 13) >> 53 | SafeLoadAs(in + 8 * 14) << 11, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 59-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 14) >> 48 | SafeLoadAs(in + 8 * 15) << 16, - SafeLoadAs(in + 8 * 15) >> 43 | SafeLoadAs(in + 8 * 16) << 21, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 59-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 16) >> 38 | SafeLoadAs(in + 8 * 17) << 26, - SafeLoadAs(in + 8 * 17) >> 33 | SafeLoadAs(in + 8 * 18) << 31, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 2; - - // extract 59-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 18) >> 28 | SafeLoadAs(in + 8 * 19) << 36, - SafeLoadAs(in + 8 * 19) >> 23 | SafeLoadAs(in + 8 * 20) << 41, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 59-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 20) >> 18 | SafeLoadAs(in + 8 * 21) << 46, - SafeLoadAs(in + 8 * 21) >> 13 | SafeLoadAs(in + 8 * 22) << 51, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 59-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 22) >> 8 | SafeLoadAs(in + 8 * 23) << 56, - SafeLoadAs(in + 8 * 23), - }; - shifts = simd_batch{ 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 59-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 23) >> 62 | SafeLoadAs(in + 8 * 24) << 2, - SafeLoadAs(in + 8 * 24) >> 57 | SafeLoadAs(in + 8 * 25) << 7, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 59-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 25) >> 52 | SafeLoadAs(in + 8 * 26) << 12, - SafeLoadAs(in + 8 * 26) >> 47 | SafeLoadAs(in + 8 * 27) << 17, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 59-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 27) >> 42 | SafeLoadAs(in + 8 * 28) << 22, - SafeLoadAs(in + 8 * 28) >> 37 | SafeLoadAs(in + 8 * 29) << 27, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 59-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 29) >> 32 | SafeLoadAs(in + 8 * 30) << 32, - 
SafeLoadAs(in + 8 * 30) >> 27 | SafeLoadAs(in + 8 * 31) << 37, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 59-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 31) >> 22 | SafeLoadAs(in + 8 * 32) << 42, - SafeLoadAs(in + 8 * 32) >> 17 | SafeLoadAs(in + 8 * 33) << 47, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 59-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 33) >> 12 | SafeLoadAs(in + 8 * 34) << 52, - SafeLoadAs(in + 8 * 34) >> 7 | SafeLoadAs(in + 8 * 35) << 57, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 59-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 35), - SafeLoadAs(in + 8 * 35) >> 61 | SafeLoadAs(in + 8 * 36) << 3, - }; - shifts = simd_batch{ 2, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 59-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 36) >> 56 | SafeLoadAs(in + 8 * 37) << 8, - SafeLoadAs(in + 8 * 37) >> 51 | SafeLoadAs(in + 8 * 38) << 13, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 59-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 38) >> 46 | SafeLoadAs(in + 8 * 39) << 18, - SafeLoadAs(in + 8 * 39) >> 41 | SafeLoadAs(in + 8 * 40) << 23, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 59-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 40) >> 36 | SafeLoadAs(in + 8 * 41) << 28, - SafeLoadAs(in + 8 * 41) >> 31 | SafeLoadAs(in + 8 * 42) << 33, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 
2; - - // extract 59-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 42) >> 26 | SafeLoadAs(in + 8 * 43) << 38, - SafeLoadAs(in + 8 * 43) >> 21 | SafeLoadAs(in + 8 * 44) << 43, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 59-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 44) >> 16 | SafeLoadAs(in + 8 * 45) << 48, - SafeLoadAs(in + 8 * 45) >> 11 | SafeLoadAs(in + 8 * 46) << 53, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 59-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 46) >> 6 | SafeLoadAs(in + 8 * 47) << 58, - SafeLoadAs(in + 8 * 47), - }; - shifts = simd_batch{ 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 59-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 47) >> 60 | SafeLoadAs(in + 8 * 48) << 4, - SafeLoadAs(in + 8 * 48) >> 55 | SafeLoadAs(in + 8 * 49) << 9, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 59-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 49) >> 50 | SafeLoadAs(in + 8 * 50) << 14, - SafeLoadAs(in + 8 * 50) >> 45 | SafeLoadAs(in + 8 * 51) << 19, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 59-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 51) >> 40 | SafeLoadAs(in + 8 * 52) << 24, - SafeLoadAs(in + 8 * 52) >> 35 | SafeLoadAs(in + 8 * 53) << 29, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 59-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 53) >> 30 | SafeLoadAs(in + 8 * 54) << 34, - SafeLoadAs(in + 8 * 54) >> 25 | SafeLoadAs(in + 8 
* 55) << 39, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 59-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 55) >> 20 | SafeLoadAs(in + 8 * 56) << 44, - SafeLoadAs(in + 8 * 56) >> 15 | SafeLoadAs(in + 8 * 57) << 49, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 59-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 57) >> 10 | SafeLoadAs(in + 8 * 58) << 54, - SafeLoadAs(in + 8 * 58), - }; - shifts = simd_batch{ 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 59 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using Dispatch = Kernel; - - static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - return Dispatch::unpack(in, out); - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x1fffffffffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 61-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 61 | SafeLoadAs(in + 8 * 1) << 3, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 61-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 1) >> 58 | SafeLoadAs(in + 8 * 2) << 6, - SafeLoadAs(in + 8 * 2) >> 55 | SafeLoadAs(in + 8 * 3) << 9, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 61-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 
3) >> 52 | SafeLoadAs(in + 8 * 4) << 12, - SafeLoadAs(in + 8 * 4) >> 49 | SafeLoadAs(in + 8 * 5) << 15, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 61-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 5) >> 46 | SafeLoadAs(in + 8 * 6) << 18, - SafeLoadAs(in + 8 * 6) >> 43 | SafeLoadAs(in + 8 * 7) << 21, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 61-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 7) >> 40 | SafeLoadAs(in + 8 * 8) << 24, - SafeLoadAs(in + 8 * 8) >> 37 | SafeLoadAs(in + 8 * 9) << 27, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 61-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 9) >> 34 | SafeLoadAs(in + 8 * 10) << 30, - SafeLoadAs(in + 8 * 10) >> 31 | SafeLoadAs(in + 8 * 11) << 33, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 61-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 11) >> 28 | SafeLoadAs(in + 8 * 12) << 36, - SafeLoadAs(in + 8 * 12) >> 25 | SafeLoadAs(in + 8 * 13) << 39, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 61-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 13) >> 22 | SafeLoadAs(in + 8 * 14) << 42, - SafeLoadAs(in + 8 * 14) >> 19 | SafeLoadAs(in + 8 * 15) << 45, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 61-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 15) >> 16 | SafeLoadAs(in + 8 * 16) << 48, - SafeLoadAs(in + 8 * 16) >> 13 | SafeLoadAs(in + 8 * 17) << 51, - }; - shifts = simd_batch{ 0, 0 }; - results = 
(words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 61-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 17) >> 10 | SafeLoadAs(in + 8 * 18) << 54, - SafeLoadAs(in + 8 * 18) >> 7 | SafeLoadAs(in + 8 * 19) << 57, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 61-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 19) >> 4 | SafeLoadAs(in + 8 * 20) << 60, - SafeLoadAs(in + 8 * 20), - }; - shifts = simd_batch{ 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 61-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 20) >> 62 | SafeLoadAs(in + 8 * 21) << 2, - SafeLoadAs(in + 8 * 21) >> 59 | SafeLoadAs(in + 8 * 22) << 5, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 61-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 22) >> 56 | SafeLoadAs(in + 8 * 23) << 8, - SafeLoadAs(in + 8 * 23) >> 53 | SafeLoadAs(in + 8 * 24) << 11, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 61-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 24) >> 50 | SafeLoadAs(in + 8 * 25) << 14, - SafeLoadAs(in + 8 * 25) >> 47 | SafeLoadAs(in + 8 * 26) << 17, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 61-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 26) >> 44 | SafeLoadAs(in + 8 * 27) << 20, - SafeLoadAs(in + 8 * 27) >> 41 | SafeLoadAs(in + 8 * 28) << 23, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 61-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 28) >> 38 | SafeLoadAs(in + 
8 * 29) << 26, - SafeLoadAs(in + 8 * 29) >> 35 | SafeLoadAs(in + 8 * 30) << 29, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 61-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 30) >> 32 | SafeLoadAs(in + 8 * 31) << 32, - SafeLoadAs(in + 8 * 31) >> 29 | SafeLoadAs(in + 8 * 32) << 35, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 61-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 32) >> 26 | SafeLoadAs(in + 8 * 33) << 38, - SafeLoadAs(in + 8 * 33) >> 23 | SafeLoadAs(in + 8 * 34) << 41, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 61-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 34) >> 20 | SafeLoadAs(in + 8 * 35) << 44, - SafeLoadAs(in + 8 * 35) >> 17 | SafeLoadAs(in + 8 * 36) << 47, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 61-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 36) >> 14 | SafeLoadAs(in + 8 * 37) << 50, - SafeLoadAs(in + 8 * 37) >> 11 | SafeLoadAs(in + 8 * 38) << 53, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 61-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 38) >> 8 | SafeLoadAs(in + 8 * 39) << 56, - SafeLoadAs(in + 8 * 39) >> 5 | SafeLoadAs(in + 8 * 40) << 59, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 61-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 40), - SafeLoadAs(in + 8 * 40) >> 63 | SafeLoadAs(in + 8 * 41) << 1, - }; - shifts = simd_batch{ 2, 0 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 2; - - // extract 61-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 41) >> 60 | SafeLoadAs(in + 8 * 42) << 4, - SafeLoadAs(in + 8 * 42) >> 57 | SafeLoadAs(in + 8 * 43) << 7, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 61-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 43) >> 54 | SafeLoadAs(in + 8 * 44) << 10, - SafeLoadAs(in + 8 * 44) >> 51 | SafeLoadAs(in + 8 * 45) << 13, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 61-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 45) >> 48 | SafeLoadAs(in + 8 * 46) << 16, - SafeLoadAs(in + 8 * 46) >> 45 | SafeLoadAs(in + 8 * 47) << 19, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 61-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 47) >> 42 | SafeLoadAs(in + 8 * 48) << 22, - SafeLoadAs(in + 8 * 48) >> 39 | SafeLoadAs(in + 8 * 49) << 25, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 61-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 49) >> 36 | SafeLoadAs(in + 8 * 50) << 28, - SafeLoadAs(in + 8 * 50) >> 33 | SafeLoadAs(in + 8 * 51) << 31, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 61-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 51) >> 30 | SafeLoadAs(in + 8 * 52) << 34, - SafeLoadAs(in + 8 * 52) >> 27 | SafeLoadAs(in + 8 * 53) << 37, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 61-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 53) >> 24 | 
SafeLoadAs(in + 8 * 54) << 40, - SafeLoadAs(in + 8 * 54) >> 21 | SafeLoadAs(in + 8 * 55) << 43, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 61-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 55) >> 18 | SafeLoadAs(in + 8 * 56) << 46, - SafeLoadAs(in + 8 * 56) >> 15 | SafeLoadAs(in + 8 * 57) << 49, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 61-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 57) >> 12 | SafeLoadAs(in + 8 * 58) << 52, - SafeLoadAs(in + 8 * 58) >> 9 | SafeLoadAs(in + 8 * 59) << 55, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 61-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 59) >> 6 | SafeLoadAs(in + 8 * 60) << 58, - SafeLoadAs(in + 8 * 60), - }; - shifts = simd_batch{ 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 61 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x3fffffffffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 62-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 62 | SafeLoadAs(in + 8 * 1) << 2, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 62-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 1) >> 60 | SafeLoadAs(in + 8 * 2) << 4, - SafeLoadAs(in + 8 * 2) >> 58 | SafeLoadAs(in + 8 * 3) << 6, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & 
masks; - results.store_unaligned(out); - out += 2; - - // extract 62-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 3) >> 56 | SafeLoadAs(in + 8 * 4) << 8, - SafeLoadAs(in + 8 * 4) >> 54 | SafeLoadAs(in + 8 * 5) << 10, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 62-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 5) >> 52 | SafeLoadAs(in + 8 * 6) << 12, - SafeLoadAs(in + 8 * 6) >> 50 | SafeLoadAs(in + 8 * 7) << 14, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 62-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 7) >> 48 | SafeLoadAs(in + 8 * 8) << 16, - SafeLoadAs(in + 8 * 8) >> 46 | SafeLoadAs(in + 8 * 9) << 18, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 62-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 9) >> 44 | SafeLoadAs(in + 8 * 10) << 20, - SafeLoadAs(in + 8 * 10) >> 42 | SafeLoadAs(in + 8 * 11) << 22, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 62-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 11) >> 40 | SafeLoadAs(in + 8 * 12) << 24, - SafeLoadAs(in + 8 * 12) >> 38 | SafeLoadAs(in + 8 * 13) << 26, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 62-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 13) >> 36 | SafeLoadAs(in + 8 * 14) << 28, - SafeLoadAs(in + 8 * 14) >> 34 | SafeLoadAs(in + 8 * 15) << 30, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 62-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 15) >> 32 | SafeLoadAs(in 
+ 8 * 16) << 32, - SafeLoadAs(in + 8 * 16) >> 30 | SafeLoadAs(in + 8 * 17) << 34, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 62-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 17) >> 28 | SafeLoadAs(in + 8 * 18) << 36, - SafeLoadAs(in + 8 * 18) >> 26 | SafeLoadAs(in + 8 * 19) << 38, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 62-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 19) >> 24 | SafeLoadAs(in + 8 * 20) << 40, - SafeLoadAs(in + 8 * 20) >> 22 | SafeLoadAs(in + 8 * 21) << 42, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 62-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 21) >> 20 | SafeLoadAs(in + 8 * 22) << 44, - SafeLoadAs(in + 8 * 22) >> 18 | SafeLoadAs(in + 8 * 23) << 46, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 62-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 23) >> 16 | SafeLoadAs(in + 8 * 24) << 48, - SafeLoadAs(in + 8 * 24) >> 14 | SafeLoadAs(in + 8 * 25) << 50, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 62-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 25) >> 12 | SafeLoadAs(in + 8 * 26) << 52, - SafeLoadAs(in + 8 * 26) >> 10 | SafeLoadAs(in + 8 * 27) << 54, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 62-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 27) >> 8 | SafeLoadAs(in + 8 * 28) << 56, - SafeLoadAs(in + 8 * 28) >> 6 | SafeLoadAs(in + 8 * 29) << 58, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> 
shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 62-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 29) >> 4 | SafeLoadAs(in + 8 * 30) << 60, - SafeLoadAs(in + 8 * 30), - }; - shifts = simd_batch{ 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 62-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 31), - SafeLoadAs(in + 8 * 31) >> 62 | SafeLoadAs(in + 8 * 32) << 2, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 62-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 32) >> 60 | SafeLoadAs(in + 8 * 33) << 4, - SafeLoadAs(in + 8 * 33) >> 58 | SafeLoadAs(in + 8 * 34) << 6, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 62-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 34) >> 56 | SafeLoadAs(in + 8 * 35) << 8, - SafeLoadAs(in + 8 * 35) >> 54 | SafeLoadAs(in + 8 * 36) << 10, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 62-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 36) >> 52 | SafeLoadAs(in + 8 * 37) << 12, - SafeLoadAs(in + 8 * 37) >> 50 | SafeLoadAs(in + 8 * 38) << 14, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 62-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 38) >> 48 | SafeLoadAs(in + 8 * 39) << 16, - SafeLoadAs(in + 8 * 39) >> 46 | SafeLoadAs(in + 8 * 40) << 18, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 62-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 40) >> 44 | SafeLoadAs(in + 8 * 41) << 20, - SafeLoadAs(in + 8 * 41) >> 42 | 
SafeLoadAs(in + 8 * 42) << 22, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 62-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 42) >> 40 | SafeLoadAs(in + 8 * 43) << 24, - SafeLoadAs(in + 8 * 43) >> 38 | SafeLoadAs(in + 8 * 44) << 26, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 62-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 44) >> 36 | SafeLoadAs(in + 8 * 45) << 28, - SafeLoadAs(in + 8 * 45) >> 34 | SafeLoadAs(in + 8 * 46) << 30, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 62-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 46) >> 32 | SafeLoadAs(in + 8 * 47) << 32, - SafeLoadAs(in + 8 * 47) >> 30 | SafeLoadAs(in + 8 * 48) << 34, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 62-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 48) >> 28 | SafeLoadAs(in + 8 * 49) << 36, - SafeLoadAs(in + 8 * 49) >> 26 | SafeLoadAs(in + 8 * 50) << 38, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 62-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 50) >> 24 | SafeLoadAs(in + 8 * 51) << 40, - SafeLoadAs(in + 8 * 51) >> 22 | SafeLoadAs(in + 8 * 52) << 42, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 62-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 52) >> 20 | SafeLoadAs(in + 8 * 53) << 44, - SafeLoadAs(in + 8 * 53) >> 18 | SafeLoadAs(in + 8 * 54) << 46, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - 
out += 2; - - // extract 62-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 54) >> 16 | SafeLoadAs(in + 8 * 55) << 48, - SafeLoadAs(in + 8 * 55) >> 14 | SafeLoadAs(in + 8 * 56) << 50, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 62-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 56) >> 12 | SafeLoadAs(in + 8 * 57) << 52, - SafeLoadAs(in + 8 * 57) >> 10 | SafeLoadAs(in + 8 * 58) << 54, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 62-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 58) >> 8 | SafeLoadAs(in + 8 * 59) << 56, - SafeLoadAs(in + 8 * 59) >> 6 | SafeLoadAs(in + 8 * 60) << 58, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 62-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 60) >> 4 | SafeLoadAs(in + 8 * 61) << 60, - SafeLoadAs(in + 8 * 61), - }; - shifts = simd_batch{ 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 62 * 8; - return in; - } -}; - -template<> -struct Simd128UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x7fffffffffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 63-bit bundles 0 to 1 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 63 | SafeLoadAs(in + 8 * 1) << 1, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 63-bit bundles 2 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 1) >> 62 | SafeLoadAs(in + 8 * 2) << 2, - SafeLoadAs(in + 8 * 2) 
>> 61 | SafeLoadAs(in + 8 * 3) << 3, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 63-bit bundles 4 to 5 - words = simd_batch{ - SafeLoadAs(in + 8 * 3) >> 60 | SafeLoadAs(in + 8 * 4) << 4, - SafeLoadAs(in + 8 * 4) >> 59 | SafeLoadAs(in + 8 * 5) << 5, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 63-bit bundles 6 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 5) >> 58 | SafeLoadAs(in + 8 * 6) << 6, - SafeLoadAs(in + 8 * 6) >> 57 | SafeLoadAs(in + 8 * 7) << 7, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 63-bit bundles 8 to 9 - words = simd_batch{ - SafeLoadAs(in + 8 * 7) >> 56 | SafeLoadAs(in + 8 * 8) << 8, - SafeLoadAs(in + 8 * 8) >> 55 | SafeLoadAs(in + 8 * 9) << 9, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 63-bit bundles 10 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 9) >> 54 | SafeLoadAs(in + 8 * 10) << 10, - SafeLoadAs(in + 8 * 10) >> 53 | SafeLoadAs(in + 8 * 11) << 11, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 63-bit bundles 12 to 13 - words = simd_batch{ - SafeLoadAs(in + 8 * 11) >> 52 | SafeLoadAs(in + 8 * 12) << 12, - SafeLoadAs(in + 8 * 12) >> 51 | SafeLoadAs(in + 8 * 13) << 13, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 63-bit bundles 14 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 13) >> 50 | SafeLoadAs(in + 8 * 14) << 14, - SafeLoadAs(in + 8 * 14) >> 49 | SafeLoadAs(in + 8 * 15) << 15, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // 
extract 63-bit bundles 16 to 17 - words = simd_batch{ - SafeLoadAs(in + 8 * 15) >> 48 | SafeLoadAs(in + 8 * 16) << 16, - SafeLoadAs(in + 8 * 16) >> 47 | SafeLoadAs(in + 8 * 17) << 17, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 63-bit bundles 18 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 17) >> 46 | SafeLoadAs(in + 8 * 18) << 18, - SafeLoadAs(in + 8 * 18) >> 45 | SafeLoadAs(in + 8 * 19) << 19, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 63-bit bundles 20 to 21 - words = simd_batch{ - SafeLoadAs(in + 8 * 19) >> 44 | SafeLoadAs(in + 8 * 20) << 20, - SafeLoadAs(in + 8 * 20) >> 43 | SafeLoadAs(in + 8 * 21) << 21, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 63-bit bundles 22 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 21) >> 42 | SafeLoadAs(in + 8 * 22) << 22, - SafeLoadAs(in + 8 * 22) >> 41 | SafeLoadAs(in + 8 * 23) << 23, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 63-bit bundles 24 to 25 - words = simd_batch{ - SafeLoadAs(in + 8 * 23) >> 40 | SafeLoadAs(in + 8 * 24) << 24, - SafeLoadAs(in + 8 * 24) >> 39 | SafeLoadAs(in + 8 * 25) << 25, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 63-bit bundles 26 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 25) >> 38 | SafeLoadAs(in + 8 * 26) << 26, - SafeLoadAs(in + 8 * 26) >> 37 | SafeLoadAs(in + 8 * 27) << 27, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 63-bit bundles 28 to 29 - words = simd_batch{ - SafeLoadAs(in + 8 * 27) >> 36 | SafeLoadAs(in + 8 * 28) << 28, - SafeLoadAs(in + 8 
* 28) >> 35 | SafeLoadAs(in + 8 * 29) << 29, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 63-bit bundles 30 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 29) >> 34 | SafeLoadAs(in + 8 * 30) << 30, - SafeLoadAs(in + 8 * 30) >> 33 | SafeLoadAs(in + 8 * 31) << 31, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 63-bit bundles 32 to 33 - words = simd_batch{ - SafeLoadAs(in + 8 * 31) >> 32 | SafeLoadAs(in + 8 * 32) << 32, - SafeLoadAs(in + 8 * 32) >> 31 | SafeLoadAs(in + 8 * 33) << 33, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 63-bit bundles 34 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 33) >> 30 | SafeLoadAs(in + 8 * 34) << 34, - SafeLoadAs(in + 8 * 34) >> 29 | SafeLoadAs(in + 8 * 35) << 35, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 63-bit bundles 36 to 37 - words = simd_batch{ - SafeLoadAs(in + 8 * 35) >> 28 | SafeLoadAs(in + 8 * 36) << 36, - SafeLoadAs(in + 8 * 36) >> 27 | SafeLoadAs(in + 8 * 37) << 37, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 63-bit bundles 38 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 37) >> 26 | SafeLoadAs(in + 8 * 38) << 38, - SafeLoadAs(in + 8 * 38) >> 25 | SafeLoadAs(in + 8 * 39) << 39, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 63-bit bundles 40 to 41 - words = simd_batch{ - SafeLoadAs(in + 8 * 39) >> 24 | SafeLoadAs(in + 8 * 40) << 40, - SafeLoadAs(in + 8 * 40) >> 23 | SafeLoadAs(in + 8 * 41) << 41, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 2; - - // extract 63-bit bundles 42 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 41) >> 22 | SafeLoadAs(in + 8 * 42) << 42, - SafeLoadAs(in + 8 * 42) >> 21 | SafeLoadAs(in + 8 * 43) << 43, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 63-bit bundles 44 to 45 - words = simd_batch{ - SafeLoadAs(in + 8 * 43) >> 20 | SafeLoadAs(in + 8 * 44) << 44, - SafeLoadAs(in + 8 * 44) >> 19 | SafeLoadAs(in + 8 * 45) << 45, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 63-bit bundles 46 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 45) >> 18 | SafeLoadAs(in + 8 * 46) << 46, - SafeLoadAs(in + 8 * 46) >> 17 | SafeLoadAs(in + 8 * 47) << 47, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 63-bit bundles 48 to 49 - words = simd_batch{ - SafeLoadAs(in + 8 * 47) >> 16 | SafeLoadAs(in + 8 * 48) << 48, - SafeLoadAs(in + 8 * 48) >> 15 | SafeLoadAs(in + 8 * 49) << 49, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 63-bit bundles 50 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 49) >> 14 | SafeLoadAs(in + 8 * 50) << 50, - SafeLoadAs(in + 8 * 50) >> 13 | SafeLoadAs(in + 8 * 51) << 51, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 63-bit bundles 52 to 53 - words = simd_batch{ - SafeLoadAs(in + 8 * 51) >> 12 | SafeLoadAs(in + 8 * 52) << 52, - SafeLoadAs(in + 8 * 52) >> 11 | SafeLoadAs(in + 8 * 53) << 53, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 63-bit bundles 54 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 53) >> 10 | 
SafeLoadAs(in + 8 * 54) << 54, - SafeLoadAs(in + 8 * 54) >> 9 | SafeLoadAs(in + 8 * 55) << 55, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 63-bit bundles 56 to 57 - words = simd_batch{ - SafeLoadAs(in + 8 * 55) >> 8 | SafeLoadAs(in + 8 * 56) << 56, - SafeLoadAs(in + 8 * 56) >> 7 | SafeLoadAs(in + 8 * 57) << 57, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 63-bit bundles 58 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 57) >> 6 | SafeLoadAs(in + 8 * 58) << 58, - SafeLoadAs(in + 8 * 58) >> 5 | SafeLoadAs(in + 8 * 59) << 59, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 63-bit bundles 60 to 61 - words = simd_batch{ - SafeLoadAs(in + 8 * 59) >> 4 | SafeLoadAs(in + 8 * 60) << 60, - SafeLoadAs(in + 8 * 60) >> 3 | SafeLoadAs(in + 8 * 61) << 61, - }; - shifts = simd_batch{ 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - // extract 63-bit bundles 62 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 61) >> 2 | SafeLoadAs(in + 8 * 62) << 62, - SafeLoadAs(in + 8 * 62), - }; - shifts = simd_batch{ 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 2; - - in += 63 * 8; - return in; - } -}; - - -} // namespace -} // namespace arrow::internal diff --git a/cpp/src/arrow/util/bpacking_simd_default.cc b/cpp/src/arrow/util/bpacking_simd_default.cc index 691d43f783b..1041de78ae3 100644 --- a/cpp/src/arrow/util/bpacking_simd_default.cc +++ b/cpp/src/arrow/util/bpacking_simd_default.cc @@ -17,18 +17,21 @@ #include "arrow/util/bpacking_dispatch_internal.h" #if defined(ARROW_HAVE_NEON) || defined(ARROW_HAVE_SSE4_2) -# include "arrow/util/bpacking_simd128_generated_internal.h" +# include "arrow/util/bpacking_simd_impl_internal.h" #endif 
#include "arrow/util/bpacking_simd_internal.h" namespace arrow::internal { +template +using Simd128Kernel = Kernel; + #if defined(ARROW_HAVE_NEON) template void unpack_neon(const uint8_t* in, Uint* out, int batch_size, int num_bits, int bit_offset) { - return unpack_jump(in, out, batch_size, num_bits, bit_offset); + return unpack_jump(in, out, batch_size, num_bits, bit_offset); } template void unpack_neon(const uint8_t*, bool*, int, int, int); @@ -42,7 +45,7 @@ template void unpack_neon(const uint8_t*, uint64_t*, int, int, int); template void unpack_sse4_2(const uint8_t* in, Uint* out, int batch_size, int num_bits, int bit_offset) { - return unpack_jump(in, out, batch_size, num_bits, bit_offset); + return unpack_jump(in, out, batch_size, num_bits, bit_offset); } template void unpack_sse4_2(const uint8_t*, bool*, int, int, int); diff --git a/cpp/src/arrow/util/bpacking_simd_impl_internal.h b/cpp/src/arrow/util/bpacking_simd_impl_internal.h index 03d287c2855..800bf7d6104 100644 --- a/cpp/src/arrow/util/bpacking_simd_impl_internal.h +++ b/cpp/src/arrow/util/bpacking_simd_impl_internal.h @@ -34,14 +34,14 @@ namespace arrow::internal { // TODO // - _mm_cvtepi8_epi32 // - no _mm_srlv_epi32 (128bit) in xsimd with AVX2 required arch -// - no need for while loop (for up to 8 is sufficient) +// - no need for while loop (for up to 8 is sufficient) // - upstream var lshift to xsimd // - array to batch constant to xsimd // - Shifts per swizzle can be improved when self.packed_max_byte_spread == 1 and the // byte can be reused (when val_bit_width divides packed_max_byte_spread). -// - Try for uint16_t and uint8_t +// - Try for uint16_t and uint8_t and bool (currently copy) // - For Avx2: -// - Inspect how swizzle across lanes are handled +// - Inspect how swizzle across lanes are handled: _mm256_shuffle_epi8 not used? 
// - Investigate AVX2 with 128 bit register constexpr bool PackedIsOversizedForSimd(int simd_bit_size, int unpacked_bit_size, @@ -589,4 +589,38 @@ struct Kernel : DispatchKernelType { using Base::unpack; }; +template +struct Kernel + : Kernel { + using Base = DispatchKernelType; + using Base::kValuesUnpacked; + using unpacked_type = uint8_t; + + static const uint8_t* unpack(const uint8_t* in, unpacked_type* out) { + uint16_t buffer[kValuesUnpacked] = {}; + in = Base::unpack(in, buffer); + for (int k = 0; k < kValuesUnpacked; ++k) { + out[k] = static_cast(buffer[k]); + } + return in; + } +}; + +template +struct Kernel + : Kernel { + using Base = DispatchKernelType; + using Base::kValuesUnpacked; + using unpacked_type = bool; + + static const uint8_t* unpack(const uint8_t* in, unpacked_type* out) { + uint16_t buffer[kValuesUnpacked] = {}; + in = Base::unpack(in, buffer); + for (int k = 0; k < kValuesUnpacked; ++k) { + out[k] = static_cast(buffer[k]); + } + return in; + } +}; + } // namespace arrow::internal From a195cd21a9fc387402a3fdd362546bfbee65e48c Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Wed, 29 Oct 2025 10:45:20 +0100 Subject: [PATCH 16/64] Fix SIMD level None --- cpp/src/arrow/util/bpacking_simd_default.cc | 46 ++++++++------------- 1 file changed, 18 insertions(+), 28 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_simd_default.cc b/cpp/src/arrow/util/bpacking_simd_default.cc index 1041de78ae3..1835157f4e2 100644 --- a/cpp/src/arrow/util/bpacking_simd_default.cc +++ b/cpp/src/arrow/util/bpacking_simd_default.cc @@ -15,45 +15,35 @@ // specific language governing permissions and limitations // under the License. 
-#include "arrow/util/bpacking_dispatch_internal.h" -#if defined(ARROW_HAVE_NEON) || defined(ARROW_HAVE_SSE4_2) -# include "arrow/util/bpacking_simd_impl_internal.h" +#if defined(ARROW_HAVE_NEON) +# define UNPACK_PLATFORM unpack_neon +#elif defined(ARROW_HAVE_SSE4_2) +# define UNPACK_PLATFORM unpack_sse4_2 #endif -#include "arrow/util/bpacking_simd_internal.h" + +#if defined(UNPACK_PLATFORM) + +# include "arrow/util/bpacking_dispatch_internal.h" +# include "arrow/util/bpacking_simd_impl_internal.h" +# include "arrow/util/bpacking_simd_internal.h" namespace arrow::internal { template using Simd128Kernel = Kernel; -#if defined(ARROW_HAVE_NEON) - template -void unpack_neon(const uint8_t* in, Uint* out, int batch_size, int num_bits, - int bit_offset) { +void UNPACK_PLATFORM(const uint8_t* in, Uint* out, int batch_size, int num_bits, + int bit_offset) { return unpack_jump(in, out, batch_size, num_bits, bit_offset); } -template void unpack_neon(const uint8_t*, bool*, int, int, int); -template void unpack_neon(const uint8_t*, uint8_t*, int, int, int); -template void unpack_neon(const uint8_t*, uint16_t*, int, int, int); -template void unpack_neon(const uint8_t*, uint32_t*, int, int, int); -template void unpack_neon(const uint8_t*, uint64_t*, int, int, int); +template void UNPACK_PLATFORM(const uint8_t*, bool*, int, int, int); +template void UNPACK_PLATFORM(const uint8_t*, uint8_t*, int, int, int); +template void UNPACK_PLATFORM(const uint8_t*, uint16_t*, int, int, int); +template void UNPACK_PLATFORM(const uint8_t*, uint32_t*, int, int, int); +template void UNPACK_PLATFORM(const uint8_t*, uint64_t*, int, int, int); -#elif defined(ARROW_HAVE_SSE4_2) - -template -void unpack_sse4_2(const uint8_t* in, Uint* out, int batch_size, int num_bits, - int bit_offset) { - return unpack_jump(in, out, batch_size, num_bits, bit_offset); -} - -template void unpack_sse4_2(const uint8_t*, bool*, int, int, int); -template void unpack_sse4_2(const uint8_t*, uint8_t*, int, int, int); 
-template void unpack_sse4_2(const uint8_t*, uint16_t*, int, int, int); -template void unpack_sse4_2(const uint8_t*, uint32_t*, int, int, int); -template void unpack_sse4_2(const uint8_t*, uint64_t*, int, int, int); +} // namespace arrow::internal #endif - -} // namespace arrow::internal From 4aa76ebe45ff7f1fb7c37156ff8acfa95e565f46 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Wed, 29 Oct 2025 12:56:07 +0100 Subject: [PATCH 17/64] Initialize swizzles to -1 --- cpp/src/arrow/util/bpacking_simd_impl_internal.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/cpp/src/arrow/util/bpacking_simd_impl_internal.h b/cpp/src/arrow/util/bpacking_simd_impl_internal.h index 800bf7d6104..99d9e82f9de 100644 --- a/cpp/src/arrow/util/bpacking_simd_impl_internal.h +++ b/cpp/src/arrow/util/bpacking_simd_impl_internal.h @@ -44,6 +44,15 @@ namespace arrow::internal { // - Inspect how swizzle across lanes are handled: _mm256_shuffle_epi8 not used? // - Investigate AVX2 with 128 bit register +template +constexpr Arr BuildConstantArray(typename Arr::value_type val) { + Arr out = {}; + for (auto& v : out) { + v = val; + } + return out; +} + constexpr bool PackedIsOversizedForSimd(int simd_bit_size, int unpacked_bit_size, int packed_bit_size) { const int unpacked_per_simd = simd_bit_size / unpacked_bit_size; @@ -219,6 +228,8 @@ constexpr MediumKernelPlan BuildMedi plan.reads.at(r) = read_start_byte; for (int sw = 0; sw < kPlanSize.swizzles_per_read(); ++sw) { + constexpr int kUndefined = -1; + plan.swizzles.at(r).at(sw) = BuildConstantArray(kUndefined); for (int sh = 0; sh < kPlanSize.shifts_per_swizzle(); ++sh) { const int sh_offset_bytes = sh * kShape.packed_max_spread_bytes(); const int sh_offset_bits = 8 * sh_offset_bytes; @@ -443,6 +454,10 @@ constexpr LargeKernelPlan BuildLarge const int read_start_byte = packed_start_bit / 8; plan.reads.at(r) = read_start_byte; + constexpr int kUndefined = -1; + plan.low_swizzles.at(r) = BuildConstantArray(kUndefined); + 
plan.high_swizzles.at(r) = BuildConstantArray(kUndefined); + for (int u = 0; u < kShape.unpacked_per_simd(); ++u) { const int packed_start_byte = packed_start_bit / 8; const int packed_byte_in_read = packed_start_byte - read_start_byte; From 2bd7e93628ee5bf7270755e7c3c6bff095a61896 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Wed, 29 Oct 2025 16:49:42 +0100 Subject: [PATCH 18/64] Doc --- .../arrow/util/bpacking_simd_impl_internal.h | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_simd_impl_internal.h b/cpp/src/arrow/util/bpacking_simd_impl_internal.h index 99d9e82f9de..b50d5f039f7 100644 --- a/cpp/src/arrow/util/bpacking_simd_impl_internal.h +++ b/cpp/src/arrow/util/bpacking_simd_impl_internal.h @@ -40,6 +40,8 @@ namespace arrow::internal { // - Shifts per swizzle can be improved when self.packed_max_byte_spread == 1 and the // byte can be reused (when val_bit_width divides packed_max_byte_spread). // - Try for uint16_t and uint8_t and bool (currently copy) +// - Add unpack_exact to benchmarks +// - Reduce input size on small bit width using a broadcast. // - For Avx2: // - Inspect how swizzle across lanes are handled: _mm256_shuffle_epi8 not used? // - Investigate AVX2 with 128 bit register @@ -287,13 +289,14 @@ constexpr auto make_batch_constant() { // Intel x86-64 does not have variable left shifts before AVX2. // // We replace the variable left shift by a variable multiply with a power of two. +// The behaviour is the same as long as there is no overflow. // // This trick is borrowed from Daniel Lemire and Leonid Boytsov, Decoding billions of // integers per second through vectorization, Software Practice & Experience 45 (1), 2015. 
// http://arxiv.org/abs/1209.2137 template -auto left_shift(const xsimd::batch& batch, - xsimd::batch_constant shifts) { +auto left_shift_no_overflow(const xsimd::batch& batch, + xsimd::batch_constant shifts) { constexpr bool kHasSse2 = xsimd::supported_architectures::contains(); constexpr bool kHasAvx2 = xsimd::supported_architectures::contains(); @@ -324,8 +327,8 @@ auto left_shift(const xsimd::batch& batch, // integers per second through vectorization, Software Practice & Experience 45 (1), 2015. // http://arxiv.org/abs/1209.2137 template -auto overflow_right_shift(const xsimd::batch& batch, - xsimd::batch_constant shifts) { +auto right_shift_by_excess(const xsimd::batch& batch, + xsimd::batch_constant shifts) { constexpr bool kHasSse2 = xsimd::supported_architectures::contains(); constexpr bool kHasAvx2 = xsimd::supported_architectures::contains(); @@ -375,7 +378,7 @@ struct MediumKernel { // Intel x86-64 does not have variable right shifts before AVX2. // We know the packed value can safely be left shifted up to the largest offset so we // can use the fallback on these platforms. - const auto shifted = overflow_right_shift(words, kRightShifts); + const auto shifted = right_shift_by_excess(words, kRightShifts); const auto vals = shifted & kMask; xsimd::store_unaligned(out + kOutOffset, vals); } @@ -515,7 +518,7 @@ struct LargeKernel { // Intel x86-64 does not have variable right shifts before AVX2. // We know the packed value can safely be left shifted up to the largest offset so we // can use the fallback on these platforms. 
- const auto shifted = overflow_right_shift(words, kRightShifts); + const auto shifted = right_shift_by_excess(words, kRightShifts); const auto vals = shifted & kMask; xsimd::store_unaligned(out + kOutOffset, vals); } @@ -547,12 +550,12 @@ struct LargeKernel { const auto low_swizzled = xsimd::swizzle(bytes, kLowSwizzles); const auto low_words = xsimd::bitwise_cast(low_swizzled); - const auto low_shifted = overflow_right_shift(low_words, kLowRShifts); + const auto low_shifted = right_shift_by_excess(low_words, kLowRShifts); const auto low_half_vals = low_shifted & kPlan.low_mask; const auto high_swizzled = xsimd::swizzle(bytes, kHighSwizzles); const auto high_words = xsimd::bitwise_cast(high_swizzled); - const auto high_shifted = left_shift(high_words, kHighLShifts); + const auto high_shifted = left_shift_no_overflow(high_words, kHighLShifts); const auto high_half_vals = high_shifted & kPlan.high_mask; const auto vals = low_half_vals | high_half_vals; From 060c434482d2082e1a457e5881c02ac9c54216e8 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Wed, 29 Oct 2025 19:12:30 +0100 Subject: [PATCH 19/64] Improve test error message --- cpp/src/arrow/util/bpacking_test.cc | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_test.cc b/cpp/src/arrow/util/bpacking_test.cc index 7d487c35d15..b693e0da3bc 100644 --- a/cpp/src/arrow/util/bpacking_test.cc +++ b/cpp/src/arrow/util/bpacking_test.cc @@ -114,8 +114,20 @@ class TestUnpack : public ::testing::TestWithParam { const auto packed = PackValues(original, num_values, bit_width, bit_offset); const auto unpacked = UnpackValues(packed.data(), num_values, bit_width, bit_offset, unpack); - EXPECT_EQ(unpacked.size(), num_values); - EXPECT_EQ(original, unpacked); + + ASSERT_EQ(unpacked.size(), num_values); + const auto [iter_original, iter_unpacked] = + std::mismatch(original.cbegin(), original.cend(), unpacked.cbegin()); + Int val_original = 0; + Int val_unpacked = 0; + 
const auto mismatch_idx = static_cast(iter_original - original.cbegin()); + if (mismatch_idx < unpacked.size()) { + val_original = *iter_original; + val_unpacked = *iter_unpacked; + } + EXPECT_EQ(original, unpacked) << "At position " << mismatch_idx << "/" + << unpacked.size() << ", expected original value " + << val_original << " but unpacked " << val_unpacked; } template @@ -212,10 +224,10 @@ class TestUnpack : public ::testing::TestWithParam { if (testing::Test::HasFailure()) return; } - // Similarly, we test all epilogue sizes. That is extra values that could make it + // Similarly, we test all epilog sizes. That is extra values that could make it // fall outside of an SIMD register for (int epilogue_size = 0; epilogue_size <= kMaxBitWidth; ++epilogue_size) { - SCOPED_TRACE(::testing::Message() << "Testing epilogue_size=" << epilogue_size); + SCOPED_TRACE(::testing::Message() << "Testing epilog_size=" << epilogue_size); const int num_values = num_values_base + epilogue_size; From a70e085a73d97e169da597f1e006573f3d6c5d4d Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Tue, 28 Oct 2025 16:46:29 +0100 Subject: [PATCH 20/64] Use new kernel in avx2 --- cpp/src/arrow/util/bpacking_simd_avx2.cc | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_simd_avx2.cc b/cpp/src/arrow/util/bpacking_simd_avx2.cc index 8261cdadf25..c41f9fc3836 100644 --- a/cpp/src/arrow/util/bpacking_simd_avx2.cc +++ b/cpp/src/arrow/util/bpacking_simd_avx2.cc @@ -16,15 +16,17 @@ // under the License. 
#include "arrow/util/bpacking_dispatch_internal.h" -#include "arrow/util/bpacking_simd256_generated_internal.h" -#include "arrow/util/bpacking_simd_internal.h" +#include "arrow/util/bpacking_simd_impl_internal.h" namespace arrow::internal { +template +using Simd256Kernel = Kernel; + template void unpack_avx2(const uint8_t* in, Uint* out, int batch_size, int num_bits, int bit_offset) { - return unpack_jump(in, out, batch_size, num_bits, bit_offset); + return unpack_jump(in, out, batch_size, num_bits, bit_offset); } template void unpack_avx2(const uint8_t*, bool*, int, int, int); From 11fa65a55249af1738f674db88720e699d1d51a6 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Wed, 29 Oct 2025 18:05:11 +0100 Subject: [PATCH 21/64] AVX2 swizzle fallback --- .../arrow/util/bpacking_simd_impl_internal.h | 76 ++++++++++++++++++- 1 file changed, 72 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_simd_impl_internal.h b/cpp/src/arrow/util/bpacking_simd_impl_internal.h index b50d5f039f7..f7de2b109d1 100644 --- a/cpp/src/arrow/util/bpacking_simd_impl_internal.h +++ b/cpp/src/arrow/util/bpacking_simd_impl_internal.h @@ -286,6 +286,74 @@ constexpr auto make_batch_constant() { return make_batch_constant_impl(std::make_index_sequence()); } +template +struct SwizzleBiLaneGenericPlan { + using ByteSwizzle = std::array; + + ByteSwizzle self_lane; + ByteSwizzle cross_lane; +}; + +template +constexpr SwizzleBiLaneGenericPlan BuildSwizzleBiLaneGenericPlan( + std::array mask) { + constexpr T kAsZero = 0x80; // Most significant bit of the byte must be 1 + constexpr std::size_t kSize = N; + constexpr std::size_t kSizeHalf = kSize / 2; + + SwizzleBiLaneGenericPlan plan = {}; + + for (std::size_t k = 0; k < kSize; ++k) { + const bool is_defined = (0 <= mask[k]) && (mask[k] < kSize); + const bool is_first_lane_idx = k < kSizeHalf; + const bool is_first_lane_mask = mask[k] < kSizeHalf; + + if (!is_defined) { + plan.self_lane[k] = kAsZero; + plan.cross_lane[k] = kAsZero; + } 
else { + if (is_first_lane_idx) { + if (is_first_lane_mask) { + plan.self_lane[k] = mask[k]; + plan.cross_lane[k] = kAsZero; + } else { + plan.self_lane[k] = kAsZero; + plan.cross_lane[k] = mask[k] - kSizeHalf; + } + } else { + if (is_first_lane_mask) { + plan.self_lane[k] = kAsZero; + plan.cross_lane[k] = mask[k]; // Indices given within lane + } else { + plan.self_lane[k] = mask[k] - kSizeHalf; // Indices given within lane + plan.cross_lane[k] = kAsZero; + } + } + } + } + + return plan; +} + +template +auto swizzle_bytes(const xsimd::batch& batch, + xsimd::batch_constant mask) { + if constexpr (xsimd::supported_architectures::contains()) { + static constexpr auto kPlan = BuildSwizzleBiLaneGenericPlan(std::array{kIdx...}); + static constexpr auto kSelfSwizzleArr = kPlan.self_lane; + constexpr auto kSelfSwizzle = make_batch_constant(); + static constexpr auto kCrossSwizzleArr = kPlan.cross_lane; + constexpr auto kCrossSwizzle = make_batch_constant(); + + auto self = _mm256_shuffle_epi8(batch, kSelfSwizzle.as_batch()); + auto swapped = _mm256_permute2x128_si256(batch, batch, 0x01); + auto cross = _mm256_shuffle_epi8(swapped, kCrossSwizzle.as_batch()); + return xsimd::batch(_mm256_or_si256(self, cross)); + } else { + return xsimd::swizzle(batch, mask); + } +} + // Intel x86-64 does not have variable left shifts before AVX2. // // We replace the variable left shift by a variable multiply with a power of two. 
@@ -389,7 +457,7 @@ struct MediumKernel { static constexpr auto kSwizzlesArr = kPlan.swizzles.at(kReadIdx).at(kSwizzleIdx); constexpr auto kSwizzles = make_batch_constant(); - const auto swizzled = xsimd::swizzle(bytes, kSwizzles); + const auto swizzled = swizzle_bytes(bytes, kSwizzles); const auto words = xsimd::bitwise_cast(swizzled); (unpack_one_shift_impl(words, out), ...); } @@ -529,7 +597,7 @@ struct LargeKernel { static constexpr auto kSwizzlesArr = kPlan.swizzles.at(kReadIdx).at(kSwizzleIdx); constexpr auto kSwizzles = make_batch_constant(); - const auto swizzled = xsimd::swizzle(bytes, kSwizzles); + const auto swizzled = swizzle_bytes(bytes, kSwizzles); const auto words = xsimd::bitwise_cast(swizzled); (unpack_one_shift_impl(words, out), ...); } @@ -548,12 +616,12 @@ struct LargeKernel { const auto bytes = simd_bytes::load_unaligned(in + kPlan.reads.at(kReadIdx)); - const auto low_swizzled = xsimd::swizzle(bytes, kLowSwizzles); + const auto low_swizzled = swizzle_bytes(bytes, kLowSwizzles); const auto low_words = xsimd::bitwise_cast(low_swizzled); const auto low_shifted = right_shift_by_excess(low_words, kLowRShifts); const auto low_half_vals = low_shifted & kPlan.low_mask; - const auto high_swizzled = xsimd::swizzle(bytes, kHighSwizzles); + const auto high_swizzled = swizzle_bytes(bytes, kHighSwizzles); const auto high_words = xsimd::bitwise_cast(high_swizzled); const auto high_shifted = left_shift_no_overflow(high_words, kHighLShifts); const auto high_half_vals = high_shifted & kPlan.high_mask; From fa41ca74cba7dee5d8b7b013a4e1669ae1dc9781 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 30 Oct 2025 11:01:37 +0100 Subject: [PATCH 22/64] Remove dead code --- .../arrow/util/bpacking_simd_impl_internal.h | 31 +------------------ 1 file changed, 1 insertion(+), 30 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_simd_impl_internal.h b/cpp/src/arrow/util/bpacking_simd_impl_internal.h index f7de2b109d1..4871fe05641 100644 --- 
a/cpp/src/arrow/util/bpacking_simd_impl_internal.h +++ b/cpp/src/arrow/util/bpacking_simd_impl_internal.h @@ -35,7 +35,6 @@ namespace arrow::internal { // - _mm_cvtepi8_epi32 // - no _mm_srlv_epi32 (128bit) in xsimd with AVX2 required arch // - no need for while loop (for up to 8 is sufficient) -// - upstream var lshift to xsimd // - array to batch constant to xsimd // - Shifts per swizzle can be improved when self.packed_max_byte_spread == 1 and the // byte can be reused (when val_bit_width divides packed_max_byte_spread). @@ -45,6 +44,7 @@ namespace arrow::internal { // - For Avx2: // - Inspect how swizzle across lanes are handled: _mm256_shuffle_epi8 not used? // - Investigate AVX2 with 128 bit register +// - Fix overreading problem template constexpr Arr BuildConstantArray(typename Arr::value_type val) { @@ -573,35 +573,6 @@ struct LargeKernel { static constexpr int kValuesUnpacked = kPlan.kUnpackedPerkernel; - template - static void unpack_one_shift_impl(const simd_batch& words, unpacked_type* out) { - static constexpr auto kRightShiftsArr = - kPlan.shifts.at(kReadIdx).at(kSwizzleIdx).at(kShiftIdx); - constexpr auto kRightShifts = make_batch_constant(); - constexpr auto kMask = kPlan.mask; - constexpr auto kOutOffset = (kReadIdx * kPlan.unpacked_per_read() + - kSwizzleIdx * kPlan.unpacked_per_swizzle() + - kShiftIdx * kPlan.unpacked_per_shifts()); - - // Intel x86-64 does not have variable right shifts before AVX2. - // We know the packed value can safely be left shifted up to the largest offset so we - // can use the fallback on these platforms. 
- const auto shifted = right_shift_by_excess(words, kRightShifts); - const auto vals = shifted & kMask; - xsimd::store_unaligned(out + kOutOffset, vals); - } - - template - static void unpack_one_swizzle_impl(const simd_bytes& bytes, unpacked_type* out, - std::integer_sequence) { - static constexpr auto kSwizzlesArr = kPlan.swizzles.at(kReadIdx).at(kSwizzleIdx); - constexpr auto kSwizzles = make_batch_constant(); - - const auto swizzled = swizzle_bytes(bytes, kSwizzles); - const auto words = xsimd::bitwise_cast(swizzled); - (unpack_one_shift_impl(words, out), ...); - } - template static void unpack_one_read_impl(const uint8_t* in, unpacked_type* out) { static constexpr auto kLowSwizzlesArr = kPlan.low_swizzles.at(kReadIdx); From 9f24cf106c3f5d8c287f5a8aeeb0d9da05b4b263 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 30 Oct 2025 11:54:35 +0100 Subject: [PATCH 23/64] Simplify Large masks --- cpp/src/arrow/util/bpacking_simd_impl_internal.h | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_simd_impl_internal.h b/cpp/src/arrow/util/bpacking_simd_impl_internal.h index 4871fe05641..cc67bafec43 100644 --- a/cpp/src/arrow/util/bpacking_simd_impl_internal.h +++ b/cpp/src/arrow/util/bpacking_simd_impl_internal.h @@ -506,8 +506,7 @@ struct LargeKernelPlan { SwizzlesPerKernel high_swizzles; ShitsPerKernel low_rshifts; ShitsPerKernel high_lshifts; - UnpackedUint low_mask; - UnpackedUint high_mask; + UnpackedUint mask; }; template @@ -551,11 +550,7 @@ constexpr LargeKernelPlan BuildLarge } } - constexpr auto mask = bit_util::LeastSignificantBitMask(kPackedBitSize); - constexpr auto half_low_bit_mask = - bit_util::LeastSignificantBitMask(kShape.unpacked_bit_size() / 2); - plan.low_mask = mask & half_low_bit_mask; - plan.high_mask = mask & (~half_low_bit_mask); + plan.mask = bit_util::LeastSignificantBitMask(kPackedBitSize); return plan; } @@ -590,14 +585,14 @@ struct LargeKernel { const auto low_swizzled = 
swizzle_bytes(bytes, kLowSwizzles); const auto low_words = xsimd::bitwise_cast(low_swizzled); const auto low_shifted = right_shift_by_excess(low_words, kLowRShifts); - const auto low_half_vals = low_shifted & kPlan.low_mask; const auto high_swizzled = swizzle_bytes(bytes, kHighSwizzles); const auto high_words = xsimd::bitwise_cast(high_swizzled); const auto high_shifted = left_shift_no_overflow(high_words, kHighLShifts); - const auto high_half_vals = high_shifted & kPlan.high_mask; - const auto vals = low_half_vals | high_half_vals; + // We can have a single mask and apply it after OR because the shifts will ensure that + // there are zeros where the high/low values are incomplete. + const auto vals = (low_shifted | high_shifted) & kPlan.mask; xsimd::store_unaligned(out + kReadIdx * kShape.unpacked_per_simd(), vals); } From 76b2428b42291049371a7bfe08d4919fb0e58d7a Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 30 Oct 2025 12:10:58 +0100 Subject: [PATCH 24/64] Remove bpacking 256 generated file --- .../bpacking_simd256_generated_internal.h | 15765 ---------------- 1 file changed, 15765 deletions(-) delete mode 100644 cpp/src/arrow/util/bpacking_simd256_generated_internal.h diff --git a/cpp/src/arrow/util/bpacking_simd256_generated_internal.h b/cpp/src/arrow/util/bpacking_simd256_generated_internal.h deleted file mode 100644 index ef8c12c077c..00000000000 --- a/cpp/src/arrow/util/bpacking_simd256_generated_internal.h +++ /dev/null @@ -1,15765 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// WARNING: this file is generated, DO NOT EDIT. -// Usage: -// python cpp/src/arrow/util/bpacking_simd_codegen.py 256 - -#pragma once - -#include -#include - -#include - -#include "arrow/util/ubsan.h" - -namespace arrow::internal { -namespace { - -using ::arrow::util::SafeLoadAs; - -template -struct Simd256UnpackerForWidth; - -template -struct Simd256UnpackerForWidth { - - static constexpr int kValuesUnpacked = Simd256UnpackerForWidth::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, bool* out) { - uint32_t buffer[kValuesUnpacked] = {}; - in = Simd256UnpackerForWidth::unpack(in, buffer); - for(int k = 0; k< kValuesUnpacked; ++k) { - out[k] = static_cast(buffer[k]); - } - return in; - } -}; - -template -struct Simd256UnpackerForWidth { - - static constexpr int kValuesUnpacked = Simd256UnpackerForWidth::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint8_t* out) { - uint32_t buffer[kValuesUnpacked] = {}; - in = Simd256UnpackerForWidth::unpack(in, buffer); - for(int k = 0; k< kValuesUnpacked; ++k) { - out[k] = static_cast(buffer[k]); - } - return in; - } -}; - -template -struct Simd256UnpackerForWidth { - - static constexpr int kValuesUnpacked = Simd256UnpackerForWidth::kValuesUnpacked; - - static const uint8_t* unpack(const uint8_t* in, uint16_t* out) { - uint32_t buffer[kValuesUnpacked] = {}; - in = Simd256UnpackerForWidth::unpack(in, buffer); - for(int k = 0; k< kValuesUnpacked; ++k) { - out[k] = static_cast(buffer[k]); - } - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using 
simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x1; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 1-bit bundles 0 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{ 0, 1, 2, 3, 4, 5, 6, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 1-bit bundles 8 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{ 8, 9, 10, 11, 12, 13, 14, 15 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 1-bit bundles 16 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{ 16, 17, 18, 19, 20, 21, 22, 23 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 1-bit bundles 24 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{ 24, 25, 26, 27, 28, 29, 30, 31 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - in += 1 * 4; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using 
simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x3; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 2-bit bundles 0 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{ 0, 2, 4, 6, 8, 10, 12, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 2-bit bundles 8 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{ 16, 18, 20, 22, 24, 26, 28, 30 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 2-bit bundles 16 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - }; - shifts = simd_batch{ 0, 2, 4, 6, 8, 10, 12, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 2-bit bundles 24 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - }; - shifts = simd_batch{ 16, 18, 20, 22, 24, 26, 28, 30 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - in += 2 * 4; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using 
simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x7; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 3-bit bundles 0 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{ 0, 3, 6, 9, 12, 15, 18, 21 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 3-bit bundles 8 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 30 | SafeLoadAs(in + 4 * 1) << 2, - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - }; - shifts = simd_batch{ 24, 27, 0, 1, 4, 7, 10, 13 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 3-bit bundles 16 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 31 | SafeLoadAs(in + 4 * 2) << 1, - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - }; - shifts = simd_batch{ 16, 19, 22, 25, 28, 0, 2, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 3-bit bundles 24 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - }; - shifts = simd_batch{ 8, 11, 14, 17, 20, 23, 26, 29 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - in += 3 * 4; - return 
in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0xf; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 4-bit bundles 0 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{ 0, 4, 8, 12, 16, 20, 24, 28 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 4-bit bundles 8 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - }; - shifts = simd_batch{ 0, 4, 8, 12, 16, 20, 24, 28 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 4-bit bundles 16 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - }; - shifts = simd_batch{ 0, 4, 8, 12, 16, 20, 24, 28 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 4-bit bundles 24 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - }; - shifts = simd_batch{ 0, 4, 8, 12, 16, 20, 24, 28 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - in += 4 * 4; - return 
in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x1f; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 5-bit bundles 0 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 30 | SafeLoadAs(in + 4 * 1) << 2, - SafeLoadAs(in + 4 * 1), - }; - shifts = simd_batch{ 0, 5, 10, 15, 20, 25, 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 5-bit bundles 8 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - }; - shifts = simd_batch{ 8, 13, 18, 23, 0, 1, 6, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 5-bit bundles 16 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2) >> 31 | SafeLoadAs(in + 4 * 3) << 1, - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - }; - shifts = simd_batch{ 16, 21, 26, 0, 4, 9, 14, 19 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 5-bit bundles 24 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 29 | SafeLoadAs(in + 4 * 4) << 3, - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - }; - shifts = 
simd_batch{ 24, 0, 2, 7, 12, 17, 22, 27 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - in += 5 * 4; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x3f; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 6-bit bundles 0 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 30 | SafeLoadAs(in + 4 * 1) << 2, - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - }; - shifts = simd_batch{ 0, 6, 12, 18, 24, 0, 4, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 6-bit bundles 8 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - }; - shifts = simd_batch{ 16, 22, 0, 2, 8, 14, 20, 26 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 6-bit bundles 16 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 30 | SafeLoadAs(in + 4 * 4) << 2, - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - }; - shifts = simd_batch{ 0, 6, 12, 18, 24, 0, 4, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 6-bit bundles 24 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4) >> 28 | SafeLoadAs(in + 4 * 5) << 4, - 
SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), - }; - shifts = simd_batch{ 16, 22, 0, 2, 8, 14, 20, 26 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - in += 6 * 4; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x7f; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 7-bit bundles 0 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 28 | SafeLoadAs(in + 4 * 1) << 4, - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - }; - shifts = simd_batch{ 0, 7, 14, 21, 0, 3, 10, 17 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 7-bit bundles 8 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 31 | SafeLoadAs(in + 4 * 2) << 1, - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2) >> 27 | SafeLoadAs(in + 4 * 3) << 5, - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - }; - shifts = simd_batch{ 24, 0, 6, 13, 20, 0, 2, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 7-bit bundles 16 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 30 | SafeLoadAs(in + 4 * 4) << 2, - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4) >> 26 | SafeLoadAs(in + 4 * 5) << 6, - SafeLoadAs(in + 4 * 5), - }; - shifts = simd_batch{ 16, 23, 0, 5, 12, 19, 0, 1 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 8; - - // extract 7-bit bundles 24 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5) >> 29 | SafeLoadAs(in + 4 * 6) << 3, - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6), - }; - shifts = simd_batch{ 8, 15, 22, 0, 4, 11, 18, 25 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - in += 7 * 4; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0xff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 8-bit bundles 0 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - }; - shifts = simd_batch{ 0, 8, 16, 24, 0, 8, 16, 24 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 8-bit bundles 8 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - }; - shifts = simd_batch{ 0, 8, 16, 24, 0, 8, 16, 24 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 8-bit bundles 16 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), - }; - shifts = simd_batch{ 0, 8, 16, 24, 0, 8, 16, 24 }; - 
results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 8-bit bundles 24 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7), - }; - shifts = simd_batch{ 0, 8, 16, 24, 0, 8, 16, 24 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - in += 8 * 4; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x1ff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 9-bit bundles 0 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 27 | SafeLoadAs(in + 4 * 1) << 5, - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 31 | SafeLoadAs(in + 4 * 2) << 1, - }; - shifts = simd_batch{ 0, 9, 18, 0, 4, 13, 22, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 9-bit bundles 8 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2) >> 26 | SafeLoadAs(in + 4 * 3) << 6, - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 30 | SafeLoadAs(in + 4 * 4) << 2, - SafeLoadAs(in + 4 * 4), - }; - shifts = simd_batch{ 8, 17, 0, 3, 12, 21, 0, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 9-bit bundles 16 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4) >> 25 | SafeLoadAs(in + 4 * 5) << 7, - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in 
+ 4 * 5), - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5) >> 29 | SafeLoadAs(in + 4 * 6) << 3, - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6), - }; - shifts = simd_batch{ 16, 0, 2, 11, 20, 0, 6, 15 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 9-bit bundles 24 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 6) >> 24 | SafeLoadAs(in + 4 * 7) << 8, - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7) >> 28 | SafeLoadAs(in + 4 * 8) << 4, - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8), - }; - shifts = simd_batch{ 0, 1, 10, 19, 0, 5, 14, 23 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - in += 9 * 4; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x3ff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 10-bit bundles 0 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 30 | SafeLoadAs(in + 4 * 1) << 2, - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, - SafeLoadAs(in + 4 * 2), - }; - shifts = simd_batch{ 0, 10, 20, 0, 8, 18, 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 10-bit bundles 8 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2) >> 26 | SafeLoadAs(in + 4 * 3) << 6, - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 24 | SafeLoadAs(in + 4 * 4) << 8, - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - }; - shifts = simd_batch{ 16, 0, 4, 14, 0, 
2, 12, 22 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 10-bit bundles 16 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5) >> 30 | SafeLoadAs(in + 4 * 6) << 2, - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6) >> 28 | SafeLoadAs(in + 4 * 7) << 4, - SafeLoadAs(in + 4 * 7), - }; - shifts = simd_batch{ 0, 10, 20, 0, 8, 18, 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 10-bit bundles 24 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7) >> 26 | SafeLoadAs(in + 4 * 8) << 6, - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8) >> 24 | SafeLoadAs(in + 4 * 9) << 8, - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9), - }; - shifts = simd_batch{ 16, 0, 4, 14, 0, 2, 12, 22 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - in += 10 * 4; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x7ff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 11-bit bundles 0 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 22 | SafeLoadAs(in + 4 * 1) << 10, - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 23 | SafeLoadAs(in + 4 * 2) << 9, - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - }; - shifts = simd_batch{ 0, 11, 0, 1, 12, 0, 2, 13 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 11-bit bundles 8 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 2) >> 24 | 
SafeLoadAs(in + 4 * 3) << 8, - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 25 | SafeLoadAs(in + 4 * 4) << 7, - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4) >> 26 | SafeLoadAs(in + 4 * 5) << 6, - SafeLoadAs(in + 4 * 5), - }; - shifts = simd_batch{ 0, 3, 14, 0, 4, 15, 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 11-bit bundles 16 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5) >> 27 | SafeLoadAs(in + 4 * 6) << 5, - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6) >> 28 | SafeLoadAs(in + 4 * 7) << 4, - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7) >> 29 | SafeLoadAs(in + 4 * 8) << 3, - }; - shifts = simd_batch{ 16, 0, 6, 17, 0, 7, 18, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 11-bit bundles 24 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8) >> 30 | SafeLoadAs(in + 4 * 9) << 2, - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9) >> 31 | SafeLoadAs(in + 4 * 10) << 1, - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10), - }; - shifts = simd_batch{ 8, 19, 0, 9, 20, 0, 10, 21 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - in += 11 * 4; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0xfff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 12-bit bundles 0 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 24 | SafeLoadAs(in + 4 * 1) << 8, - SafeLoadAs(in + 4 * 1), - 
SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - }; - shifts = simd_batch{ 0, 12, 0, 4, 16, 0, 8, 20 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 12-bit bundles 8 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 24 | SafeLoadAs(in + 4 * 4) << 8, - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4) >> 28 | SafeLoadAs(in + 4 * 5) << 4, - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), - }; - shifts = simd_batch{ 0, 12, 0, 4, 16, 0, 8, 20 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 12-bit bundles 16 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6) >> 24 | SafeLoadAs(in + 4 * 7) << 8, - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7) >> 28 | SafeLoadAs(in + 4 * 8) << 4, - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8), - }; - shifts = simd_batch{ 0, 12, 0, 4, 16, 0, 8, 20 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 12-bit bundles 24 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9) >> 24 | SafeLoadAs(in + 4 * 10) << 8, - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10) >> 28 | SafeLoadAs(in + 4 * 11) << 4, - SafeLoadAs(in + 4 * 11), - SafeLoadAs(in + 4 * 11), - }; - shifts = simd_batch{ 0, 12, 0, 4, 16, 0, 8, 20 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - in += 12 * 4; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t 
kMask = 0x1fff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 13-bit bundles 0 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 26 | SafeLoadAs(in + 4 * 1) << 6, - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 20 | SafeLoadAs(in + 4 * 2) << 12, - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2) >> 27 | SafeLoadAs(in + 4 * 3) << 5, - }; - shifts = simd_batch{ 0, 13, 0, 7, 0, 1, 14, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 13-bit bundles 8 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 21 | SafeLoadAs(in + 4 * 4) << 11, - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4) >> 28 | SafeLoadAs(in + 4 * 5) << 4, - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5) >> 22 | SafeLoadAs(in + 4 * 6) << 10, - SafeLoadAs(in + 4 * 6), - }; - shifts = simd_batch{ 8, 0, 2, 15, 0, 9, 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 13-bit bundles 16 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6) >> 29 | SafeLoadAs(in + 4 * 7) << 3, - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7) >> 23 | SafeLoadAs(in + 4 * 8) << 9, - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8) >> 30 | SafeLoadAs(in + 4 * 9) << 2, - SafeLoadAs(in + 4 * 9), - }; - shifts = simd_batch{ 16, 0, 10, 0, 4, 17, 0, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 13-bit bundles 24 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 9) >> 24 | SafeLoadAs(in + 4 * 10) << 8, - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10) >> 31 | SafeLoadAs(in + 4 * 11) << 1, - SafeLoadAs(in + 4 * 11), - SafeLoadAs(in + 4 * 11) >> 25 | SafeLoadAs(in + 4 * 12) << 7, - SafeLoadAs(in + 4 * 
12), - SafeLoadAs(in + 4 * 12), - }; - shifts = simd_batch{ 0, 5, 18, 0, 12, 0, 6, 19 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - in += 13 * 4; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x3fff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 14-bit bundles 0 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 28 | SafeLoadAs(in + 4 * 1) << 4, - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 24 | SafeLoadAs(in + 4 * 2) << 8, - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2) >> 20 | SafeLoadAs(in + 4 * 3) << 12, - SafeLoadAs(in + 4 * 3), - }; - shifts = simd_batch{ 0, 14, 0, 10, 0, 6, 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 14-bit bundles 8 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 30 | SafeLoadAs(in + 4 * 4) << 2, - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4) >> 26 | SafeLoadAs(in + 4 * 5) << 6, - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5) >> 22 | SafeLoadAs(in + 4 * 6) << 10, - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6), - }; - shifts = simd_batch{ 16, 0, 12, 0, 8, 0, 4, 18 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 14-bit bundles 16 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7) >> 28 | SafeLoadAs(in + 4 * 8) << 4, - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8) >> 24 | SafeLoadAs(in + 4 * 9) << 8, - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9) >> 20 | SafeLoadAs(in + 4 * 10) << 12, - SafeLoadAs(in + 4 * 10), - }; - shifts = simd_batch{ 0, 14, 0, 10, 0, 6, 0, 
2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 14-bit bundles 24 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10) >> 30 | SafeLoadAs(in + 4 * 11) << 2, - SafeLoadAs(in + 4 * 11), - SafeLoadAs(in + 4 * 11) >> 26 | SafeLoadAs(in + 4 * 12) << 6, - SafeLoadAs(in + 4 * 12), - SafeLoadAs(in + 4 * 12) >> 22 | SafeLoadAs(in + 4 * 13) << 10, - SafeLoadAs(in + 4 * 13), - SafeLoadAs(in + 4 * 13), - }; - shifts = simd_batch{ 16, 0, 12, 0, 8, 0, 4, 18 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - in += 14 * 4; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x7fff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 15-bit bundles 0 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 30 | SafeLoadAs(in + 4 * 1) << 2, - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2) >> 26 | SafeLoadAs(in + 4 * 3) << 6, - SafeLoadAs(in + 4 * 3), - }; - shifts = simd_batch{ 0, 15, 0, 13, 0, 11, 0, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 15-bit bundles 8 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 3) >> 24 | SafeLoadAs(in + 4 * 4) << 8, - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4) >> 22 | SafeLoadAs(in + 4 * 5) << 10, - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5) >> 20 | SafeLoadAs(in + 4 * 6) << 12, - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6) >> 18 | SafeLoadAs(in + 4 * 7) << 14, - SafeLoadAs(in + 4 * 7), - }; - shifts = simd_batch{ 0, 7, 0, 5, 0, 3, 0, 1 }; - results = (words >> shifts) & 
masks; - results.store_unaligned(out); - out += 8; - - // extract 15-bit bundles 16 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7) >> 31 | SafeLoadAs(in + 4 * 8) << 1, - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8) >> 29 | SafeLoadAs(in + 4 * 9) << 3, - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9) >> 27 | SafeLoadAs(in + 4 * 10) << 5, - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10) >> 25 | SafeLoadAs(in + 4 * 11) << 7, - }; - shifts = simd_batch{ 16, 0, 14, 0, 12, 0, 10, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 15-bit bundles 24 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 11), - SafeLoadAs(in + 4 * 11) >> 23 | SafeLoadAs(in + 4 * 12) << 9, - SafeLoadAs(in + 4 * 12), - SafeLoadAs(in + 4 * 12) >> 21 | SafeLoadAs(in + 4 * 13) << 11, - SafeLoadAs(in + 4 * 13), - SafeLoadAs(in + 4 * 13) >> 19 | SafeLoadAs(in + 4 * 14) << 13, - SafeLoadAs(in + 4 * 14), - SafeLoadAs(in + 4 * 14), - }; - shifts = simd_batch{ 8, 0, 6, 0, 4, 0, 2, 17 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - in += 15 * 4; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0xffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 16-bit bundles 0 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - }; - shifts = simd_batch{ 0, 16, 0, 16, 0, 16, 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 16-bit bundles 8 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 4), - 
SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7), - }; - shifts = simd_batch{ 0, 16, 0, 16, 0, 16, 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 16-bit bundles 16 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 11), - SafeLoadAs(in + 4 * 11), - }; - shifts = simd_batch{ 0, 16, 0, 16, 0, 16, 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 16-bit bundles 24 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 12), - SafeLoadAs(in + 4 * 12), - SafeLoadAs(in + 4 * 13), - SafeLoadAs(in + 4 * 13), - SafeLoadAs(in + 4 * 14), - SafeLoadAs(in + 4 * 14), - SafeLoadAs(in + 4 * 15), - SafeLoadAs(in + 4 * 15), - }; - shifts = simd_batch{ 0, 16, 0, 16, 0, 16, 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - in += 16 * 4; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x1ffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 17-bit bundles 0 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 17 | SafeLoadAs(in + 4 * 1) << 15, - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 19 | SafeLoadAs(in + 4 * 2) << 13, - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2) >> 21 | SafeLoadAs(in + 4 * 3) << 11, - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 23 | SafeLoadAs(in + 4 * 4) << 9, - }; - shifts = simd_batch{ 0, 0, 2, 0, 4, 0, 6, 0 }; - results = 
(words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 17-bit bundles 8 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4) >> 25 | SafeLoadAs(in + 4 * 5) << 7, - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5) >> 27 | SafeLoadAs(in + 4 * 6) << 5, - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6) >> 29 | SafeLoadAs(in + 4 * 7) << 3, - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7) >> 31 | SafeLoadAs(in + 4 * 8) << 1, - }; - shifts = simd_batch{ 8, 0, 10, 0, 12, 0, 14, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 17-bit bundles 16 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 8) >> 16 | SafeLoadAs(in + 4 * 9) << 16, - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9) >> 18 | SafeLoadAs(in + 4 * 10) << 14, - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10) >> 20 | SafeLoadAs(in + 4 * 11) << 12, - SafeLoadAs(in + 4 * 11), - SafeLoadAs(in + 4 * 11) >> 22 | SafeLoadAs(in + 4 * 12) << 10, - SafeLoadAs(in + 4 * 12), - }; - shifts = simd_batch{ 0, 1, 0, 3, 0, 5, 0, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 17-bit bundles 24 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 12) >> 24 | SafeLoadAs(in + 4 * 13) << 8, - SafeLoadAs(in + 4 * 13), - SafeLoadAs(in + 4 * 13) >> 26 | SafeLoadAs(in + 4 * 14) << 6, - SafeLoadAs(in + 4 * 14), - SafeLoadAs(in + 4 * 14) >> 28 | SafeLoadAs(in + 4 * 15) << 4, - SafeLoadAs(in + 4 * 15), - SafeLoadAs(in + 4 * 15) >> 30 | SafeLoadAs(in + 4 * 16) << 2, - SafeLoadAs(in + 4 * 16), - }; - shifts = simd_batch{ 0, 9, 0, 11, 0, 13, 0, 15 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - in += 17 * 4; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) 
{ - constexpr uint32_t kMask = 0x3ffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 18-bit bundles 0 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 18 | SafeLoadAs(in + 4 * 1) << 14, - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 22 | SafeLoadAs(in + 4 * 2) << 10, - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2) >> 26 | SafeLoadAs(in + 4 * 3) << 6, - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 30 | SafeLoadAs(in + 4 * 4) << 2, - }; - shifts = simd_batch{ 0, 0, 4, 0, 8, 0, 12, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 18-bit bundles 8 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 4) >> 16 | SafeLoadAs(in + 4 * 5) << 16, - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5) >> 20 | SafeLoadAs(in + 4 * 6) << 12, - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6) >> 24 | SafeLoadAs(in + 4 * 7) << 8, - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7) >> 28 | SafeLoadAs(in + 4 * 8) << 4, - SafeLoadAs(in + 4 * 8), - }; - shifts = simd_batch{ 0, 2, 0, 6, 0, 10, 0, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 18-bit bundles 16 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9) >> 18 | SafeLoadAs(in + 4 * 10) << 14, - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10) >> 22 | SafeLoadAs(in + 4 * 11) << 10, - SafeLoadAs(in + 4 * 11), - SafeLoadAs(in + 4 * 11) >> 26 | SafeLoadAs(in + 4 * 12) << 6, - SafeLoadAs(in + 4 * 12), - SafeLoadAs(in + 4 * 12) >> 30 | SafeLoadAs(in + 4 * 13) << 2, - }; - shifts = simd_batch{ 0, 0, 4, 0, 8, 0, 12, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 18-bit bundles 24 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 13) >> 16 | SafeLoadAs(in + 4 * 14) << 16, - SafeLoadAs(in + 4 * 14), - SafeLoadAs(in + 4 * 14) >> 20 | SafeLoadAs(in + 4 * 15) << 
12, - SafeLoadAs(in + 4 * 15), - SafeLoadAs(in + 4 * 15) >> 24 | SafeLoadAs(in + 4 * 16) << 8, - SafeLoadAs(in + 4 * 16), - SafeLoadAs(in + 4 * 16) >> 28 | SafeLoadAs(in + 4 * 17) << 4, - SafeLoadAs(in + 4 * 17), - }; - shifts = simd_batch{ 0, 2, 0, 6, 0, 10, 0, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - in += 18 * 4; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x7ffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 19-bit bundles 0 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 19 | SafeLoadAs(in + 4 * 1) << 13, - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 25 | SafeLoadAs(in + 4 * 2) << 7, - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2) >> 31 | SafeLoadAs(in + 4 * 3) << 1, - SafeLoadAs(in + 4 * 3) >> 18 | SafeLoadAs(in + 4 * 4) << 14, - SafeLoadAs(in + 4 * 4), - }; - shifts = simd_batch{ 0, 0, 6, 0, 12, 0, 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 19-bit bundles 8 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 4) >> 24 | SafeLoadAs(in + 4 * 5) << 8, - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5) >> 30 | SafeLoadAs(in + 4 * 6) << 2, - SafeLoadAs(in + 4 * 6) >> 17 | SafeLoadAs(in + 4 * 7) << 15, - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7) >> 23 | SafeLoadAs(in + 4 * 8) << 9, - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8) >> 29 | SafeLoadAs(in + 4 * 9) << 3, - }; - shifts = simd_batch{ 0, 11, 0, 0, 4, 0, 10, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 19-bit bundles 16 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 9) >> 16 | SafeLoadAs(in + 4 * 10) << 16, - SafeLoadAs(in + 
4 * 10), - SafeLoadAs(in + 4 * 10) >> 22 | SafeLoadAs(in + 4 * 11) << 10, - SafeLoadAs(in + 4 * 11), - SafeLoadAs(in + 4 * 11) >> 28 | SafeLoadAs(in + 4 * 12) << 4, - SafeLoadAs(in + 4 * 12) >> 15 | SafeLoadAs(in + 4 * 13) << 17, - SafeLoadAs(in + 4 * 13), - SafeLoadAs(in + 4 * 13) >> 21 | SafeLoadAs(in + 4 * 14) << 11, - }; - shifts = simd_batch{ 0, 3, 0, 9, 0, 0, 2, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 19-bit bundles 24 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 14), - SafeLoadAs(in + 4 * 14) >> 27 | SafeLoadAs(in + 4 * 15) << 5, - SafeLoadAs(in + 4 * 15) >> 14 | SafeLoadAs(in + 4 * 16) << 18, - SafeLoadAs(in + 4 * 16), - SafeLoadAs(in + 4 * 16) >> 20 | SafeLoadAs(in + 4 * 17) << 12, - SafeLoadAs(in + 4 * 17), - SafeLoadAs(in + 4 * 17) >> 26 | SafeLoadAs(in + 4 * 18) << 6, - SafeLoadAs(in + 4 * 18), - }; - shifts = simd_batch{ 8, 0, 0, 1, 0, 7, 0, 13 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - in += 19 * 4; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0xfffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 20-bit bundles 0 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 20 | SafeLoadAs(in + 4 * 1) << 12, - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, - SafeLoadAs(in + 4 * 2) >> 16 | SafeLoadAs(in + 4 * 3) << 16, - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 24 | SafeLoadAs(in + 4 * 4) << 8, - SafeLoadAs(in + 4 * 4), - }; - shifts = simd_batch{ 0, 0, 8, 0, 0, 4, 0, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 20-bit bundles 8 to 15 - words = 
simd_batch{ - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5) >> 20 | SafeLoadAs(in + 4 * 6) << 12, - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6) >> 28 | SafeLoadAs(in + 4 * 7) << 4, - SafeLoadAs(in + 4 * 7) >> 16 | SafeLoadAs(in + 4 * 8) << 16, - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8) >> 24 | SafeLoadAs(in + 4 * 9) << 8, - SafeLoadAs(in + 4 * 9), - }; - shifts = simd_batch{ 0, 0, 8, 0, 0, 4, 0, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 20-bit bundles 16 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10) >> 20 | SafeLoadAs(in + 4 * 11) << 12, - SafeLoadAs(in + 4 * 11), - SafeLoadAs(in + 4 * 11) >> 28 | SafeLoadAs(in + 4 * 12) << 4, - SafeLoadAs(in + 4 * 12) >> 16 | SafeLoadAs(in + 4 * 13) << 16, - SafeLoadAs(in + 4 * 13), - SafeLoadAs(in + 4 * 13) >> 24 | SafeLoadAs(in + 4 * 14) << 8, - SafeLoadAs(in + 4 * 14), - }; - shifts = simd_batch{ 0, 0, 8, 0, 0, 4, 0, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 20-bit bundles 24 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 15), - SafeLoadAs(in + 4 * 15) >> 20 | SafeLoadAs(in + 4 * 16) << 12, - SafeLoadAs(in + 4 * 16), - SafeLoadAs(in + 4 * 16) >> 28 | SafeLoadAs(in + 4 * 17) << 4, - SafeLoadAs(in + 4 * 17) >> 16 | SafeLoadAs(in + 4 * 18) << 16, - SafeLoadAs(in + 4 * 18), - SafeLoadAs(in + 4 * 18) >> 24 | SafeLoadAs(in + 4 * 19) << 8, - SafeLoadAs(in + 4 * 19), - }; - shifts = simd_batch{ 0, 0, 8, 0, 0, 4, 0, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - in += 20 * 4; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x1fffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch 
results; - // extract 21-bit bundles 0 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 21 | SafeLoadAs(in + 4 * 1) << 11, - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 31 | SafeLoadAs(in + 4 * 2) << 1, - SafeLoadAs(in + 4 * 2) >> 20 | SafeLoadAs(in + 4 * 3) << 12, - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 30 | SafeLoadAs(in + 4 * 4) << 2, - SafeLoadAs(in + 4 * 4) >> 19 | SafeLoadAs(in + 4 * 5) << 13, - }; - shifts = simd_batch{ 0, 0, 10, 0, 0, 9, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 21-bit bundles 8 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5) >> 29 | SafeLoadAs(in + 4 * 6) << 3, - SafeLoadAs(in + 4 * 6) >> 18 | SafeLoadAs(in + 4 * 7) << 14, - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7) >> 28 | SafeLoadAs(in + 4 * 8) << 4, - SafeLoadAs(in + 4 * 8) >> 17 | SafeLoadAs(in + 4 * 9) << 15, - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9) >> 27 | SafeLoadAs(in + 4 * 10) << 5, - }; - shifts = simd_batch{ 8, 0, 0, 7, 0, 0, 6, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 21-bit bundles 16 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 10) >> 16 | SafeLoadAs(in + 4 * 11) << 16, - SafeLoadAs(in + 4 * 11), - SafeLoadAs(in + 4 * 11) >> 26 | SafeLoadAs(in + 4 * 12) << 6, - SafeLoadAs(in + 4 * 12) >> 15 | SafeLoadAs(in + 4 * 13) << 17, - SafeLoadAs(in + 4 * 13), - SafeLoadAs(in + 4 * 13) >> 25 | SafeLoadAs(in + 4 * 14) << 7, - SafeLoadAs(in + 4 * 14) >> 14 | SafeLoadAs(in + 4 * 15) << 18, - SafeLoadAs(in + 4 * 15), - }; - shifts = simd_batch{ 0, 5, 0, 0, 4, 0, 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 21-bit bundles 24 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 15) >> 24 | SafeLoadAs(in + 4 * 16) << 8, - SafeLoadAs(in + 4 * 16) >> 13 | SafeLoadAs(in + 4 * 17) << 19, - SafeLoadAs(in + 4 * 
17), - SafeLoadAs(in + 4 * 17) >> 23 | SafeLoadAs(in + 4 * 18) << 9, - SafeLoadAs(in + 4 * 18) >> 12 | SafeLoadAs(in + 4 * 19) << 20, - SafeLoadAs(in + 4 * 19), - SafeLoadAs(in + 4 * 19) >> 22 | SafeLoadAs(in + 4 * 20) << 10, - SafeLoadAs(in + 4 * 20), - }; - shifts = simd_batch{ 0, 0, 2, 0, 0, 1, 0, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - in += 21 * 4; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x3fffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 22-bit bundles 0 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 22 | SafeLoadAs(in + 4 * 1) << 10, - SafeLoadAs(in + 4 * 1) >> 12 | SafeLoadAs(in + 4 * 2) << 20, - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2) >> 24 | SafeLoadAs(in + 4 * 3) << 8, - SafeLoadAs(in + 4 * 3) >> 14 | SafeLoadAs(in + 4 * 4) << 18, - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4) >> 26 | SafeLoadAs(in + 4 * 5) << 6, - }; - shifts = simd_batch{ 0, 0, 0, 2, 0, 0, 4, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 22-bit bundles 8 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 5) >> 16 | SafeLoadAs(in + 4 * 6) << 16, - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6) >> 28 | SafeLoadAs(in + 4 * 7) << 4, - SafeLoadAs(in + 4 * 7) >> 18 | SafeLoadAs(in + 4 * 8) << 14, - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8) >> 30 | SafeLoadAs(in + 4 * 9) << 2, - SafeLoadAs(in + 4 * 9) >> 20 | SafeLoadAs(in + 4 * 10) << 12, - SafeLoadAs(in + 4 * 10), - }; - shifts = simd_batch{ 0, 6, 0, 0, 8, 0, 0, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 22-bit bundles 16 to 23 - words = simd_batch{ - 
SafeLoadAs(in + 4 * 11), - SafeLoadAs(in + 4 * 11) >> 22 | SafeLoadAs(in + 4 * 12) << 10, - SafeLoadAs(in + 4 * 12) >> 12 | SafeLoadAs(in + 4 * 13) << 20, - SafeLoadAs(in + 4 * 13), - SafeLoadAs(in + 4 * 13) >> 24 | SafeLoadAs(in + 4 * 14) << 8, - SafeLoadAs(in + 4 * 14) >> 14 | SafeLoadAs(in + 4 * 15) << 18, - SafeLoadAs(in + 4 * 15), - SafeLoadAs(in + 4 * 15) >> 26 | SafeLoadAs(in + 4 * 16) << 6, - }; - shifts = simd_batch{ 0, 0, 0, 2, 0, 0, 4, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 22-bit bundles 24 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 16) >> 16 | SafeLoadAs(in + 4 * 17) << 16, - SafeLoadAs(in + 4 * 17), - SafeLoadAs(in + 4 * 17) >> 28 | SafeLoadAs(in + 4 * 18) << 4, - SafeLoadAs(in + 4 * 18) >> 18 | SafeLoadAs(in + 4 * 19) << 14, - SafeLoadAs(in + 4 * 19), - SafeLoadAs(in + 4 * 19) >> 30 | SafeLoadAs(in + 4 * 20) << 2, - SafeLoadAs(in + 4 * 20) >> 20 | SafeLoadAs(in + 4 * 21) << 12, - SafeLoadAs(in + 4 * 21), - }; - shifts = simd_batch{ 0, 6, 0, 0, 8, 0, 0, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - in += 22 * 4; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x7fffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 23-bit bundles 0 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 23 | SafeLoadAs(in + 4 * 1) << 9, - SafeLoadAs(in + 4 * 1) >> 14 | SafeLoadAs(in + 4 * 2) << 18, - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2) >> 28 | SafeLoadAs(in + 4 * 3) << 4, - SafeLoadAs(in + 4 * 3) >> 19 | SafeLoadAs(in + 4 * 4) << 13, - SafeLoadAs(in + 4 * 4) >> 10 | SafeLoadAs(in + 4 * 5) << 22, - SafeLoadAs(in + 4 * 5), - }; - shifts = simd_batch{ 0, 0, 0, 5, 
0, 0, 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 23-bit bundles 8 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 5) >> 24 | SafeLoadAs(in + 4 * 6) << 8, - SafeLoadAs(in + 4 * 6) >> 15 | SafeLoadAs(in + 4 * 7) << 17, - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7) >> 29 | SafeLoadAs(in + 4 * 8) << 3, - SafeLoadAs(in + 4 * 8) >> 20 | SafeLoadAs(in + 4 * 9) << 12, - SafeLoadAs(in + 4 * 9) >> 11 | SafeLoadAs(in + 4 * 10) << 21, - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10) >> 25 | SafeLoadAs(in + 4 * 11) << 7, - }; - shifts = simd_batch{ 0, 0, 6, 0, 0, 0, 2, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 23-bit bundles 16 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 11) >> 16 | SafeLoadAs(in + 4 * 12) << 16, - SafeLoadAs(in + 4 * 12), - SafeLoadAs(in + 4 * 12) >> 30 | SafeLoadAs(in + 4 * 13) << 2, - SafeLoadAs(in + 4 * 13) >> 21 | SafeLoadAs(in + 4 * 14) << 11, - SafeLoadAs(in + 4 * 14) >> 12 | SafeLoadAs(in + 4 * 15) << 20, - SafeLoadAs(in + 4 * 15), - SafeLoadAs(in + 4 * 15) >> 26 | SafeLoadAs(in + 4 * 16) << 6, - SafeLoadAs(in + 4 * 16) >> 17 | SafeLoadAs(in + 4 * 17) << 15, - }; - shifts = simd_batch{ 0, 7, 0, 0, 0, 3, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 23-bit bundles 24 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 17), - SafeLoadAs(in + 4 * 17) >> 31 | SafeLoadAs(in + 4 * 18) << 1, - SafeLoadAs(in + 4 * 18) >> 22 | SafeLoadAs(in + 4 * 19) << 10, - SafeLoadAs(in + 4 * 19) >> 13 | SafeLoadAs(in + 4 * 20) << 19, - SafeLoadAs(in + 4 * 20), - SafeLoadAs(in + 4 * 20) >> 27 | SafeLoadAs(in + 4 * 21) << 5, - SafeLoadAs(in + 4 * 21) >> 18 | SafeLoadAs(in + 4 * 22) << 14, - SafeLoadAs(in + 4 * 22), - }; - shifts = simd_batch{ 8, 0, 0, 0, 4, 0, 0, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - in += 23 * 4; - return in; - 
} -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0xffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 24-bit bundles 0 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 24 | SafeLoadAs(in + 4 * 1) << 8, - SafeLoadAs(in + 4 * 1) >> 16 | SafeLoadAs(in + 4 * 2) << 16, - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 24 | SafeLoadAs(in + 4 * 4) << 8, - SafeLoadAs(in + 4 * 4) >> 16 | SafeLoadAs(in + 4 * 5) << 16, - SafeLoadAs(in + 4 * 5), - }; - shifts = simd_batch{ 0, 0, 0, 8, 0, 0, 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 24-bit bundles 8 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6) >> 24 | SafeLoadAs(in + 4 * 7) << 8, - SafeLoadAs(in + 4 * 7) >> 16 | SafeLoadAs(in + 4 * 8) << 16, - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9) >> 24 | SafeLoadAs(in + 4 * 10) << 8, - SafeLoadAs(in + 4 * 10) >> 16 | SafeLoadAs(in + 4 * 11) << 16, - SafeLoadAs(in + 4 * 11), - }; - shifts = simd_batch{ 0, 0, 0, 8, 0, 0, 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 24-bit bundles 16 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 12), - SafeLoadAs(in + 4 * 12) >> 24 | SafeLoadAs(in + 4 * 13) << 8, - SafeLoadAs(in + 4 * 13) >> 16 | SafeLoadAs(in + 4 * 14) << 16, - SafeLoadAs(in + 4 * 14), - SafeLoadAs(in + 4 * 15), - SafeLoadAs(in + 4 * 15) >> 24 | SafeLoadAs(in + 4 * 16) << 8, - SafeLoadAs(in + 4 * 16) >> 16 | SafeLoadAs(in + 4 * 17) << 16, - SafeLoadAs(in + 4 * 17), - }; - shifts = simd_batch{ 0, 0, 0, 8, 0, 0, 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 
8; - - // extract 24-bit bundles 24 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 18), - SafeLoadAs(in + 4 * 18) >> 24 | SafeLoadAs(in + 4 * 19) << 8, - SafeLoadAs(in + 4 * 19) >> 16 | SafeLoadAs(in + 4 * 20) << 16, - SafeLoadAs(in + 4 * 20), - SafeLoadAs(in + 4 * 21), - SafeLoadAs(in + 4 * 21) >> 24 | SafeLoadAs(in + 4 * 22) << 8, - SafeLoadAs(in + 4 * 22) >> 16 | SafeLoadAs(in + 4 * 23) << 16, - SafeLoadAs(in + 4 * 23), - }; - shifts = simd_batch{ 0, 0, 0, 8, 0, 0, 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - in += 24 * 4; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x1ffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 25-bit bundles 0 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 25 | SafeLoadAs(in + 4 * 1) << 7, - SafeLoadAs(in + 4 * 1) >> 18 | SafeLoadAs(in + 4 * 2) << 14, - SafeLoadAs(in + 4 * 2) >> 11 | SafeLoadAs(in + 4 * 3) << 21, - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 29 | SafeLoadAs(in + 4 * 4) << 3, - SafeLoadAs(in + 4 * 4) >> 22 | SafeLoadAs(in + 4 * 5) << 10, - SafeLoadAs(in + 4 * 5) >> 15 | SafeLoadAs(in + 4 * 6) << 17, - }; - shifts = simd_batch{ 0, 0, 0, 0, 4, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 25-bit bundles 8 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 6) >> 8 | SafeLoadAs(in + 4 * 7) << 24, - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7) >> 26 | SafeLoadAs(in + 4 * 8) << 6, - SafeLoadAs(in + 4 * 8) >> 19 | SafeLoadAs(in + 4 * 9) << 13, - SafeLoadAs(in + 4 * 9) >> 12 | SafeLoadAs(in + 4 * 10) << 20, - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10) >> 30 | SafeLoadAs(in + 4 * 11) << 2, - 
SafeLoadAs(in + 4 * 11) >> 23 | SafeLoadAs(in + 4 * 12) << 9, - }; - shifts = simd_batch{ 0, 1, 0, 0, 0, 5, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 25-bit bundles 16 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 12) >> 16 | SafeLoadAs(in + 4 * 13) << 16, - SafeLoadAs(in + 4 * 13) >> 9 | SafeLoadAs(in + 4 * 14) << 23, - SafeLoadAs(in + 4 * 14), - SafeLoadAs(in + 4 * 14) >> 27 | SafeLoadAs(in + 4 * 15) << 5, - SafeLoadAs(in + 4 * 15) >> 20 | SafeLoadAs(in + 4 * 16) << 12, - SafeLoadAs(in + 4 * 16) >> 13 | SafeLoadAs(in + 4 * 17) << 19, - SafeLoadAs(in + 4 * 17), - SafeLoadAs(in + 4 * 17) >> 31 | SafeLoadAs(in + 4 * 18) << 1, - }; - shifts = simd_batch{ 0, 0, 2, 0, 0, 0, 6, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 25-bit bundles 24 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 18) >> 24 | SafeLoadAs(in + 4 * 19) << 8, - SafeLoadAs(in + 4 * 19) >> 17 | SafeLoadAs(in + 4 * 20) << 15, - SafeLoadAs(in + 4 * 20) >> 10 | SafeLoadAs(in + 4 * 21) << 22, - SafeLoadAs(in + 4 * 21), - SafeLoadAs(in + 4 * 21) >> 28 | SafeLoadAs(in + 4 * 22) << 4, - SafeLoadAs(in + 4 * 22) >> 21 | SafeLoadAs(in + 4 * 23) << 11, - SafeLoadAs(in + 4 * 23) >> 14 | SafeLoadAs(in + 4 * 24) << 18, - SafeLoadAs(in + 4 * 24), - }; - shifts = simd_batch{ 0, 0, 0, 3, 0, 0, 0, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - in += 25 * 4; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x3ffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 26-bit bundles 0 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 26 | SafeLoadAs(in + 4 * 1) << 6, - 
SafeLoadAs(in + 4 * 1) >> 20 | SafeLoadAs(in + 4 * 2) << 12, - SafeLoadAs(in + 4 * 2) >> 14 | SafeLoadAs(in + 4 * 3) << 18, - SafeLoadAs(in + 4 * 3) >> 8 | SafeLoadAs(in + 4 * 4) << 24, - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4) >> 28 | SafeLoadAs(in + 4 * 5) << 4, - SafeLoadAs(in + 4 * 5) >> 22 | SafeLoadAs(in + 4 * 6) << 10, - }; - shifts = simd_batch{ 0, 0, 0, 0, 0, 2, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 26-bit bundles 8 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 6) >> 16 | SafeLoadAs(in + 4 * 7) << 16, - SafeLoadAs(in + 4 * 7) >> 10 | SafeLoadAs(in + 4 * 8) << 22, - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8) >> 30 | SafeLoadAs(in + 4 * 9) << 2, - SafeLoadAs(in + 4 * 9) >> 24 | SafeLoadAs(in + 4 * 10) << 8, - SafeLoadAs(in + 4 * 10) >> 18 | SafeLoadAs(in + 4 * 11) << 14, - SafeLoadAs(in + 4 * 11) >> 12 | SafeLoadAs(in + 4 * 12) << 20, - SafeLoadAs(in + 4 * 12), - }; - shifts = simd_batch{ 0, 0, 4, 0, 0, 0, 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 26-bit bundles 16 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 13), - SafeLoadAs(in + 4 * 13) >> 26 | SafeLoadAs(in + 4 * 14) << 6, - SafeLoadAs(in + 4 * 14) >> 20 | SafeLoadAs(in + 4 * 15) << 12, - SafeLoadAs(in + 4 * 15) >> 14 | SafeLoadAs(in + 4 * 16) << 18, - SafeLoadAs(in + 4 * 16) >> 8 | SafeLoadAs(in + 4 * 17) << 24, - SafeLoadAs(in + 4 * 17), - SafeLoadAs(in + 4 * 17) >> 28 | SafeLoadAs(in + 4 * 18) << 4, - SafeLoadAs(in + 4 * 18) >> 22 | SafeLoadAs(in + 4 * 19) << 10, - }; - shifts = simd_batch{ 0, 0, 0, 0, 0, 2, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 26-bit bundles 24 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 19) >> 16 | SafeLoadAs(in + 4 * 20) << 16, - SafeLoadAs(in + 4 * 20) >> 10 | SafeLoadAs(in + 4 * 21) << 22, - SafeLoadAs(in + 4 * 21), - SafeLoadAs(in + 4 * 21) >> 30 | 
SafeLoadAs(in + 4 * 22) << 2, - SafeLoadAs(in + 4 * 22) >> 24 | SafeLoadAs(in + 4 * 23) << 8, - SafeLoadAs(in + 4 * 23) >> 18 | SafeLoadAs(in + 4 * 24) << 14, - SafeLoadAs(in + 4 * 24) >> 12 | SafeLoadAs(in + 4 * 25) << 20, - SafeLoadAs(in + 4 * 25), - }; - shifts = simd_batch{ 0, 0, 4, 0, 0, 0, 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - in += 26 * 4; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x7ffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 27-bit bundles 0 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 27 | SafeLoadAs(in + 4 * 1) << 5, - SafeLoadAs(in + 4 * 1) >> 22 | SafeLoadAs(in + 4 * 2) << 10, - SafeLoadAs(in + 4 * 2) >> 17 | SafeLoadAs(in + 4 * 3) << 15, - SafeLoadAs(in + 4 * 3) >> 12 | SafeLoadAs(in + 4 * 4) << 20, - SafeLoadAs(in + 4 * 4) >> 7 | SafeLoadAs(in + 4 * 5) << 25, - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5) >> 29 | SafeLoadAs(in + 4 * 6) << 3, - }; - shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 2, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 27-bit bundles 8 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 6) >> 24 | SafeLoadAs(in + 4 * 7) << 8, - SafeLoadAs(in + 4 * 7) >> 19 | SafeLoadAs(in + 4 * 8) << 13, - SafeLoadAs(in + 4 * 8) >> 14 | SafeLoadAs(in + 4 * 9) << 18, - SafeLoadAs(in + 4 * 9) >> 9 | SafeLoadAs(in + 4 * 10) << 23, - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10) >> 31 | SafeLoadAs(in + 4 * 11) << 1, - SafeLoadAs(in + 4 * 11) >> 26 | SafeLoadAs(in + 4 * 12) << 6, - SafeLoadAs(in + 4 * 12) >> 21 | SafeLoadAs(in + 4 * 13) << 11, - }; - shifts = simd_batch{ 0, 0, 0, 0, 4, 0, 0, 0 }; - results = (words >> shifts) & 
masks; - results.store_unaligned(out); - out += 8; - - // extract 27-bit bundles 16 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 13) >> 16 | SafeLoadAs(in + 4 * 14) << 16, - SafeLoadAs(in + 4 * 14) >> 11 | SafeLoadAs(in + 4 * 15) << 21, - SafeLoadAs(in + 4 * 15) >> 6 | SafeLoadAs(in + 4 * 16) << 26, - SafeLoadAs(in + 4 * 16), - SafeLoadAs(in + 4 * 16) >> 28 | SafeLoadAs(in + 4 * 17) << 4, - SafeLoadAs(in + 4 * 17) >> 23 | SafeLoadAs(in + 4 * 18) << 9, - SafeLoadAs(in + 4 * 18) >> 18 | SafeLoadAs(in + 4 * 19) << 14, - SafeLoadAs(in + 4 * 19) >> 13 | SafeLoadAs(in + 4 * 20) << 19, - }; - shifts = simd_batch{ 0, 0, 0, 1, 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 27-bit bundles 24 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 20) >> 8 | SafeLoadAs(in + 4 * 21) << 24, - SafeLoadAs(in + 4 * 21), - SafeLoadAs(in + 4 * 21) >> 30 | SafeLoadAs(in + 4 * 22) << 2, - SafeLoadAs(in + 4 * 22) >> 25 | SafeLoadAs(in + 4 * 23) << 7, - SafeLoadAs(in + 4 * 23) >> 20 | SafeLoadAs(in + 4 * 24) << 12, - SafeLoadAs(in + 4 * 24) >> 15 | SafeLoadAs(in + 4 * 25) << 17, - SafeLoadAs(in + 4 * 25) >> 10 | SafeLoadAs(in + 4 * 26) << 22, - SafeLoadAs(in + 4 * 26), - }; - shifts = simd_batch{ 0, 3, 0, 0, 0, 0, 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - in += 27 * 4; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0xfffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 28-bit bundles 0 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 28 | SafeLoadAs(in + 4 * 1) << 4, - SafeLoadAs(in + 4 * 1) >> 24 | SafeLoadAs(in + 4 * 2) << 8, - SafeLoadAs(in + 4 * 2) >> 20 | SafeLoadAs(in + 4 * 3) 
<< 12, - SafeLoadAs(in + 4 * 3) >> 16 | SafeLoadAs(in + 4 * 4) << 16, - SafeLoadAs(in + 4 * 4) >> 12 | SafeLoadAs(in + 4 * 5) << 20, - SafeLoadAs(in + 4 * 5) >> 8 | SafeLoadAs(in + 4 * 6) << 24, - SafeLoadAs(in + 4 * 6), - }; - shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 28-bit bundles 8 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7) >> 28 | SafeLoadAs(in + 4 * 8) << 4, - SafeLoadAs(in + 4 * 8) >> 24 | SafeLoadAs(in + 4 * 9) << 8, - SafeLoadAs(in + 4 * 9) >> 20 | SafeLoadAs(in + 4 * 10) << 12, - SafeLoadAs(in + 4 * 10) >> 16 | SafeLoadAs(in + 4 * 11) << 16, - SafeLoadAs(in + 4 * 11) >> 12 | SafeLoadAs(in + 4 * 12) << 20, - SafeLoadAs(in + 4 * 12) >> 8 | SafeLoadAs(in + 4 * 13) << 24, - SafeLoadAs(in + 4 * 13), - }; - shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 28-bit bundles 16 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 14), - SafeLoadAs(in + 4 * 14) >> 28 | SafeLoadAs(in + 4 * 15) << 4, - SafeLoadAs(in + 4 * 15) >> 24 | SafeLoadAs(in + 4 * 16) << 8, - SafeLoadAs(in + 4 * 16) >> 20 | SafeLoadAs(in + 4 * 17) << 12, - SafeLoadAs(in + 4 * 17) >> 16 | SafeLoadAs(in + 4 * 18) << 16, - SafeLoadAs(in + 4 * 18) >> 12 | SafeLoadAs(in + 4 * 19) << 20, - SafeLoadAs(in + 4 * 19) >> 8 | SafeLoadAs(in + 4 * 20) << 24, - SafeLoadAs(in + 4 * 20), - }; - shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 28-bit bundles 24 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 21), - SafeLoadAs(in + 4 * 21) >> 28 | SafeLoadAs(in + 4 * 22) << 4, - SafeLoadAs(in + 4 * 22) >> 24 | SafeLoadAs(in + 4 * 23) << 8, - SafeLoadAs(in + 4 * 23) >> 20 | SafeLoadAs(in + 4 * 24) << 12, - SafeLoadAs(in + 4 * 24) >> 16 | SafeLoadAs(in + 4 * 25) << 16, - SafeLoadAs(in + 4 * 
25) >> 12 | SafeLoadAs(in + 4 * 26) << 20, - SafeLoadAs(in + 4 * 26) >> 8 | SafeLoadAs(in + 4 * 27) << 24, - SafeLoadAs(in + 4 * 27), - }; - shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - in += 28 * 4; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x1fffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 29-bit bundles 0 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 29 | SafeLoadAs(in + 4 * 1) << 3, - SafeLoadAs(in + 4 * 1) >> 26 | SafeLoadAs(in + 4 * 2) << 6, - SafeLoadAs(in + 4 * 2) >> 23 | SafeLoadAs(in + 4 * 3) << 9, - SafeLoadAs(in + 4 * 3) >> 20 | SafeLoadAs(in + 4 * 4) << 12, - SafeLoadAs(in + 4 * 4) >> 17 | SafeLoadAs(in + 4 * 5) << 15, - SafeLoadAs(in + 4 * 5) >> 14 | SafeLoadAs(in + 4 * 6) << 18, - SafeLoadAs(in + 4 * 6) >> 11 | SafeLoadAs(in + 4 * 7) << 21, - }; - shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 29-bit bundles 8 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 7) >> 8 | SafeLoadAs(in + 4 * 8) << 24, - SafeLoadAs(in + 4 * 8) >> 5 | SafeLoadAs(in + 4 * 9) << 27, - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9) >> 31 | SafeLoadAs(in + 4 * 10) << 1, - SafeLoadAs(in + 4 * 10) >> 28 | SafeLoadAs(in + 4 * 11) << 4, - SafeLoadAs(in + 4 * 11) >> 25 | SafeLoadAs(in + 4 * 12) << 7, - SafeLoadAs(in + 4 * 12) >> 22 | SafeLoadAs(in + 4 * 13) << 10, - SafeLoadAs(in + 4 * 13) >> 19 | SafeLoadAs(in + 4 * 14) << 13, - }; - shifts = simd_batch{ 0, 0, 2, 0, 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 29-bit bundles 
16 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 14) >> 16 | SafeLoadAs(in + 4 * 15) << 16, - SafeLoadAs(in + 4 * 15) >> 13 | SafeLoadAs(in + 4 * 16) << 19, - SafeLoadAs(in + 4 * 16) >> 10 | SafeLoadAs(in + 4 * 17) << 22, - SafeLoadAs(in + 4 * 17) >> 7 | SafeLoadAs(in + 4 * 18) << 25, - SafeLoadAs(in + 4 * 18) >> 4 | SafeLoadAs(in + 4 * 19) << 28, - SafeLoadAs(in + 4 * 19), - SafeLoadAs(in + 4 * 19) >> 30 | SafeLoadAs(in + 4 * 20) << 2, - SafeLoadAs(in + 4 * 20) >> 27 | SafeLoadAs(in + 4 * 21) << 5, - }; - shifts = simd_batch{ 0, 0, 0, 0, 0, 1, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 29-bit bundles 24 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 21) >> 24 | SafeLoadAs(in + 4 * 22) << 8, - SafeLoadAs(in + 4 * 22) >> 21 | SafeLoadAs(in + 4 * 23) << 11, - SafeLoadAs(in + 4 * 23) >> 18 | SafeLoadAs(in + 4 * 24) << 14, - SafeLoadAs(in + 4 * 24) >> 15 | SafeLoadAs(in + 4 * 25) << 17, - SafeLoadAs(in + 4 * 25) >> 12 | SafeLoadAs(in + 4 * 26) << 20, - SafeLoadAs(in + 4 * 26) >> 9 | SafeLoadAs(in + 4 * 27) << 23, - SafeLoadAs(in + 4 * 27) >> 6 | SafeLoadAs(in + 4 * 28) << 26, - SafeLoadAs(in + 4 * 28), - }; - shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - in += 29 * 4; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x3fffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 30-bit bundles 0 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 30 | SafeLoadAs(in + 4 * 1) << 2, - SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, - SafeLoadAs(in + 4 * 2) >> 26 | SafeLoadAs(in + 4 * 3) << 6, - SafeLoadAs(in + 4 * 3) >> 24 | 
SafeLoadAs(in + 4 * 4) << 8, - SafeLoadAs(in + 4 * 4) >> 22 | SafeLoadAs(in + 4 * 5) << 10, - SafeLoadAs(in + 4 * 5) >> 20 | SafeLoadAs(in + 4 * 6) << 12, - SafeLoadAs(in + 4 * 6) >> 18 | SafeLoadAs(in + 4 * 7) << 14, - }; - shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 30-bit bundles 8 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 7) >> 16 | SafeLoadAs(in + 4 * 8) << 16, - SafeLoadAs(in + 4 * 8) >> 14 | SafeLoadAs(in + 4 * 9) << 18, - SafeLoadAs(in + 4 * 9) >> 12 | SafeLoadAs(in + 4 * 10) << 20, - SafeLoadAs(in + 4 * 10) >> 10 | SafeLoadAs(in + 4 * 11) << 22, - SafeLoadAs(in + 4 * 11) >> 8 | SafeLoadAs(in + 4 * 12) << 24, - SafeLoadAs(in + 4 * 12) >> 6 | SafeLoadAs(in + 4 * 13) << 26, - SafeLoadAs(in + 4 * 13) >> 4 | SafeLoadAs(in + 4 * 14) << 28, - SafeLoadAs(in + 4 * 14), - }; - shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 30-bit bundles 16 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 15), - SafeLoadAs(in + 4 * 15) >> 30 | SafeLoadAs(in + 4 * 16) << 2, - SafeLoadAs(in + 4 * 16) >> 28 | SafeLoadAs(in + 4 * 17) << 4, - SafeLoadAs(in + 4 * 17) >> 26 | SafeLoadAs(in + 4 * 18) << 6, - SafeLoadAs(in + 4 * 18) >> 24 | SafeLoadAs(in + 4 * 19) << 8, - SafeLoadAs(in + 4 * 19) >> 22 | SafeLoadAs(in + 4 * 20) << 10, - SafeLoadAs(in + 4 * 20) >> 20 | SafeLoadAs(in + 4 * 21) << 12, - SafeLoadAs(in + 4 * 21) >> 18 | SafeLoadAs(in + 4 * 22) << 14, - }; - shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 30-bit bundles 24 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 22) >> 16 | SafeLoadAs(in + 4 * 23) << 16, - SafeLoadAs(in + 4 * 23) >> 14 | SafeLoadAs(in + 4 * 24) << 18, - SafeLoadAs(in + 4 * 24) >> 12 | SafeLoadAs(in + 4 * 25) << 20, - SafeLoadAs(in + 4 * 25) >> 10 | 
SafeLoadAs(in + 4 * 26) << 22, - SafeLoadAs(in + 4 * 26) >> 8 | SafeLoadAs(in + 4 * 27) << 24, - SafeLoadAs(in + 4 * 27) >> 6 | SafeLoadAs(in + 4 * 28) << 26, - SafeLoadAs(in + 4 * 28) >> 4 | SafeLoadAs(in + 4 * 29) << 28, - SafeLoadAs(in + 4 * 29), - }; - shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - in += 30 * 4; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 32; - - static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x7fffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 31-bit bundles 0 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 31 | SafeLoadAs(in + 4 * 1) << 1, - SafeLoadAs(in + 4 * 1) >> 30 | SafeLoadAs(in + 4 * 2) << 2, - SafeLoadAs(in + 4 * 2) >> 29 | SafeLoadAs(in + 4 * 3) << 3, - SafeLoadAs(in + 4 * 3) >> 28 | SafeLoadAs(in + 4 * 4) << 4, - SafeLoadAs(in + 4 * 4) >> 27 | SafeLoadAs(in + 4 * 5) << 5, - SafeLoadAs(in + 4 * 5) >> 26 | SafeLoadAs(in + 4 * 6) << 6, - SafeLoadAs(in + 4 * 6) >> 25 | SafeLoadAs(in + 4 * 7) << 7, - }; - shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 31-bit bundles 8 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 7) >> 24 | SafeLoadAs(in + 4 * 8) << 8, - SafeLoadAs(in + 4 * 8) >> 23 | SafeLoadAs(in + 4 * 9) << 9, - SafeLoadAs(in + 4 * 9) >> 22 | SafeLoadAs(in + 4 * 10) << 10, - SafeLoadAs(in + 4 * 10) >> 21 | SafeLoadAs(in + 4 * 11) << 11, - SafeLoadAs(in + 4 * 11) >> 20 | SafeLoadAs(in + 4 * 12) << 12, - SafeLoadAs(in + 4 * 12) >> 19 | SafeLoadAs(in + 4 * 13) << 13, - SafeLoadAs(in + 4 * 13) >> 18 | SafeLoadAs(in + 4 * 14) << 14, - SafeLoadAs(in + 4 * 14) >> 17 | SafeLoadAs(in + 4 * 15) << 15, - }; - shifts 
= simd_batch{ 0, 0, 0, 0, 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 31-bit bundles 16 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 15) >> 16 | SafeLoadAs(in + 4 * 16) << 16, - SafeLoadAs(in + 4 * 16) >> 15 | SafeLoadAs(in + 4 * 17) << 17, - SafeLoadAs(in + 4 * 17) >> 14 | SafeLoadAs(in + 4 * 18) << 18, - SafeLoadAs(in + 4 * 18) >> 13 | SafeLoadAs(in + 4 * 19) << 19, - SafeLoadAs(in + 4 * 19) >> 12 | SafeLoadAs(in + 4 * 20) << 20, - SafeLoadAs(in + 4 * 20) >> 11 | SafeLoadAs(in + 4 * 21) << 21, - SafeLoadAs(in + 4 * 21) >> 10 | SafeLoadAs(in + 4 * 22) << 22, - SafeLoadAs(in + 4 * 22) >> 9 | SafeLoadAs(in + 4 * 23) << 23, - }; - shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 31-bit bundles 24 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 23) >> 8 | SafeLoadAs(in + 4 * 24) << 24, - SafeLoadAs(in + 4 * 24) >> 7 | SafeLoadAs(in + 4 * 25) << 25, - SafeLoadAs(in + 4 * 25) >> 6 | SafeLoadAs(in + 4 * 26) << 26, - SafeLoadAs(in + 4 * 26) >> 5 | SafeLoadAs(in + 4 * 27) << 27, - SafeLoadAs(in + 4 * 27) >> 4 | SafeLoadAs(in + 4 * 28) << 28, - SafeLoadAs(in + 4 * 28) >> 3 | SafeLoadAs(in + 4 * 29) << 29, - SafeLoadAs(in + 4 * 29) >> 2 | SafeLoadAs(in + 4 * 30) << 30, - SafeLoadAs(in + 4 * 30), - }; - shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - in += 31 * 4; - return in; - } -}; - - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x1; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 1-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - 
SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 0, 1, 2, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 1-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 4, 5, 6, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 1-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 8, 9, 10, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 1-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 12, 13, 14, 15 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 1-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 16, 17, 18, 19 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 1-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 20, 21, 22, 23 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 1-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 24, 25, 26, 27 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 1-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in 
+ 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 28, 29, 30, 31 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 1-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 32, 33, 34, 35 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 1-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 36, 37, 38, 39 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 1-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 40, 41, 42, 43 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 1-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 44, 45, 46, 47 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 1-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 48, 49, 50, 51 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 1-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 52, 53, 54, 55 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 1-bit 
bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 56, 57, 58, 59 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 1-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 60, 61, 62, 63 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 1 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x3; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 2-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 0, 2, 4, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 2-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 8, 10, 12, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 2-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 16, 18, 20, 22 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 2-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 24, 26, 28, 
30 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 2-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 32, 34, 36, 38 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 2-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 40, 42, 44, 46 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 2-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 48, 50, 52, 54 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 2-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 56, 58, 60, 62 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 2-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 0, 2, 4, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 2-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 8, 10, 12, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 2-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 
* 1), - }; - shifts = simd_batch{ 16, 18, 20, 22 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 2-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 24, 26, 28, 30 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 2-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 32, 34, 36, 38 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 2-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 40, 42, 44, 46 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 2-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 48, 50, 52, 54 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 2-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 56, 58, 60, 62 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 2 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x7; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 3-bit bundles 0 to 3 - 
words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 0, 3, 6, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 3-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 12, 15, 18, 21 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 3-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 24, 27, 30, 33 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 3-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 36, 39, 42, 45 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 3-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 48, 51, 54, 57 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 3-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 63 | SafeLoadAs(in + 8 * 1) << 1, - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 60, 0, 2, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 3-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 8, 11, 14, 17 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 4; - - // extract 3-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 20, 23, 26, 29 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 3-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 32, 35, 38, 41 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 3-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 44, 47, 50, 53 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 3-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1) >> 62 | SafeLoadAs(in + 8 * 2) << 2, - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 56, 59, 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 3-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 4, 7, 10, 13 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 3-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 16, 19, 22, 25 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 3-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - 
}; - shifts = simd_batch{ 28, 31, 34, 37 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 3-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 40, 43, 46, 49 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 3-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 52, 55, 58, 61 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 3 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0xf; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 4-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 0, 4, 8, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 4-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 16, 20, 24, 28 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 4-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 32, 36, 40, 44 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 4-bit bundles 12 to 15 - words = 
simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 48, 52, 56, 60 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 4-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 0, 4, 8, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 4-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 16, 20, 24, 28 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 4-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 32, 36, 40, 44 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 4-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 48, 52, 56, 60 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 4-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 0, 4, 8, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 4-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 16, 20, 24, 28 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - 
- // extract 4-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 32, 36, 40, 44 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 4-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 48, 52, 56, 60 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 4-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 0, 4, 8, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 4-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 16, 20, 24, 28 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 4-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 32, 36, 40, 44 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 4-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 48, 52, 56, 60 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 4 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* 
out) { - constexpr uint64_t kMask = 0x1f; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 5-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 0, 5, 10, 15 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 5-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 20, 25, 30, 35 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 5-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 40, 45, 50, 55 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 5-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 0) >> 60 | SafeLoadAs(in + 8 * 1) << 4, - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 0, 1, 6, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 5-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 16, 21, 26, 31 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 5-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 36, 41, 46, 51 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 5-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - 
SafeLoadAs(in + 8 * 1) >> 61 | SafeLoadAs(in + 8 * 2) << 3, - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 56, 0, 2, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 5-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 12, 17, 22, 27 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 5-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 32, 37, 42, 47 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 5-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2) >> 62 | SafeLoadAs(in + 8 * 3) << 2, - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 52, 57, 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 5-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 8, 13, 18, 23 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 5-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 28, 33, 38, 43 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 5-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3) >> 63 | SafeLoadAs(in + 8 * 4) << 1, - }; - shifts = simd_batch{ 48, 53, 58, 0 }; - results = (words 
>> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 5-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 4, 9, 14, 19 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 5-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 24, 29, 34, 39 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 5-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 44, 49, 54, 59 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 5 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x3f; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 6-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 0, 6, 12, 18 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 6-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 24, 30, 36, 42 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 6-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - 
SafeLoadAs(in + 8 * 0) >> 60 | SafeLoadAs(in + 8 * 1) << 4, - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 48, 54, 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 6-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 8, 14, 20, 26 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 6-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 32, 38, 44, 50 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 6-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1) >> 62 | SafeLoadAs(in + 8 * 2) << 2, - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 56, 0, 4, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 6-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 16, 22, 28, 34 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 6-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 40, 46, 52, 58 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 6-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 0, 6, 12, 18 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 
4; - - // extract 6-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 24, 30, 36, 42 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 6-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3) >> 60 | SafeLoadAs(in + 8 * 4) << 4, - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 48, 54, 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 6-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 8, 14, 20, 26 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 6-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 32, 38, 44, 50 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 6-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4) >> 62 | SafeLoadAs(in + 8 * 5) << 2, - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 56, 0, 4, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 6-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 16, 22, 28, 34 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 6-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - 
shifts = simd_batch{ 40, 46, 52, 58 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 6 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x7f; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 7-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 0, 7, 14, 21 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 7-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 28, 35, 42, 49 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 7-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 63 | SafeLoadAs(in + 8 * 1) << 1, - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 56, 0, 6, 13 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 7-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 20, 27, 34, 41 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 7-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1) >> 62 | SafeLoadAs(in + 8 * 2) << 2, - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 48, 55, 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - 
out += 4; - - // extract 7-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 12, 19, 26, 33 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 7-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2) >> 61 | SafeLoadAs(in + 8 * 3) << 3, - }; - shifts = simd_batch{ 40, 47, 54, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 7-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 4, 11, 18, 25 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 7-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 32, 39, 46, 53 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 7-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 3) >> 60 | SafeLoadAs(in + 8 * 4) << 4, - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 0, 3, 10, 17 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 7-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 24, 31, 38, 45 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 7-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4) >> 59 | SafeLoadAs(in + 8 * 5) << 5, - SafeLoadAs(in 
+ 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 52, 0, 2, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 7-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 16, 23, 30, 37 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 7-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5) >> 58 | SafeLoadAs(in + 8 * 6) << 6, - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 44, 51, 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 7-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 8, 15, 22, 29 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 7-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 36, 43, 50, 57 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 7 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0xff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 8-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 0, 8, 16, 24 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 4; - - // extract 8-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 32, 40, 48, 56 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 8-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 0, 8, 16, 24 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 8-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 32, 40, 48, 56 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 8-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 0, 8, 16, 24 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 8-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 32, 40, 48, 56 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 8-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 0, 8, 16, 24 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 8-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 32, 40, 48, 56 
}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 8-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 0, 8, 16, 24 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 8-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 32, 40, 48, 56 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 8-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 0, 8, 16, 24 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 8-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 32, 40, 48, 56 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 8-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 0, 8, 16, 24 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 8-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 32, 40, 48, 56 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 8-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 
7), - }; - shifts = simd_batch{ 0, 8, 16, 24 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 8-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 32, 40, 48, 56 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 8 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x1ff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 9-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 0, 9, 18, 27 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 9-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 63 | SafeLoadAs(in + 8 * 1) << 1, - }; - shifts = simd_batch{ 36, 45, 54, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 9-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 8, 17, 26, 35 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 9-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1) >> 62 | SafeLoadAs(in + 8 * 2) << 2, - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 44, 53, 0, 7 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 4; - - // extract 9-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 16, 25, 34, 43 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 9-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2) >> 61 | SafeLoadAs(in + 8 * 3) << 3, - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 52, 0, 6, 15 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 9-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 24, 33, 42, 51 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 9-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 3) >> 60 | SafeLoadAs(in + 8 * 4) << 4, - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 0, 5, 14, 23 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 9-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4) >> 59 | SafeLoadAs(in + 8 * 5) << 5, - }; - shifts = simd_batch{ 32, 41, 50, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 9-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 4, 13, 22, 31 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 9-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - 
SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5) >> 58 | SafeLoadAs(in + 8 * 6) << 6, - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 40, 49, 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 9-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 12, 21, 30, 39 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 9-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6) >> 57 | SafeLoadAs(in + 8 * 7) << 7, - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 48, 0, 2, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 9-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 20, 29, 38, 47 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 9-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 7) >> 56 | SafeLoadAs(in + 8 * 8) << 8, - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 0, 1, 10, 19 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 9-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 28, 37, 46, 55 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 9 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const 
uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x3ff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 10-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 0, 10, 20, 30 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 10-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 60 | SafeLoadAs(in + 8 * 1) << 4, - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 40, 50, 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 10-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 16, 26, 36, 46 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 10-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 1) >> 56 | SafeLoadAs(in + 8 * 2) << 8, - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 0, 2, 12, 22 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 10-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2) >> 62 | SafeLoadAs(in + 8 * 3) << 2, - }; - shifts = simd_batch{ 32, 42, 52, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 10-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 8, 18, 28, 38 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 
4; - - // extract 10-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3) >> 58 | SafeLoadAs(in + 8 * 4) << 6, - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 48, 0, 4, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 10-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 24, 34, 44, 54 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 10-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 0, 10, 20, 30 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 10-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5) >> 60 | SafeLoadAs(in + 8 * 6) << 4, - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 40, 50, 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 10-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 16, 26, 36, 46 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 10-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 6) >> 56 | SafeLoadAs(in + 8 * 7) << 8, - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 0, 2, 12, 22 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 10-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 
8 * 7), - SafeLoadAs(in + 8 * 7) >> 62 | SafeLoadAs(in + 8 * 8) << 2, - }; - shifts = simd_batch{ 32, 42, 52, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 10-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 8, 18, 28, 38 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 10-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8) >> 58 | SafeLoadAs(in + 8 * 9) << 6, - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 48, 0, 4, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 10-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 24, 34, 44, 54 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 10 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x7ff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 11-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 0, 11, 22, 33 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 11-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 55 | SafeLoadAs(in + 8 * 1) << 9, - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 44, 
0, 2, 13 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 11-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1) >> 57 | SafeLoadAs(in + 8 * 2) << 7, - }; - shifts = simd_batch{ 24, 35, 46, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 11-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 4, 15, 26, 37 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 11-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2) >> 59 | SafeLoadAs(in + 8 * 3) << 5, - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 48, 0, 6, 17 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 11-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3) >> 61 | SafeLoadAs(in + 8 * 4) << 3, - }; - shifts = simd_batch{ 28, 39, 50, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 11-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 8, 19, 30, 41 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 11-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4) >> 63 | SafeLoadAs(in + 8 * 5) << 1, - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 52, 0, 10, 21 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - 
// extract 11-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5) >> 54 | SafeLoadAs(in + 8 * 6) << 10, - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 32, 43, 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 11-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 12, 23, 34, 45 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 11-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 6) >> 56 | SafeLoadAs(in + 8 * 7) << 8, - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 0, 3, 14, 25 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 11-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7) >> 58 | SafeLoadAs(in + 8 * 8) << 6, - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 36, 47, 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 11-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 16, 27, 38, 49 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 11-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 8) >> 60 | SafeLoadAs(in + 8 * 9) << 4, - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 0, 7, 18, 29 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 11-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in 
+ 8 * 9), - SafeLoadAs(in + 8 * 9) >> 62 | SafeLoadAs(in + 8 * 10) << 2, - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 40, 51, 0, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 11-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 20, 31, 42, 53 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 11 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0xfff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 12-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 0, 12, 24, 36 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 12-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 60 | SafeLoadAs(in + 8 * 1) << 4, - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 48, 0, 8, 20 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 12-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1) >> 56 | SafeLoadAs(in + 8 * 2) << 8, - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 32, 44, 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 12-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - 
}; - shifts = simd_batch{ 16, 28, 40, 52 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 12-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 0, 12, 24, 36 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 12-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3) >> 60 | SafeLoadAs(in + 8 * 4) << 4, - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 48, 0, 8, 20 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 12-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4) >> 56 | SafeLoadAs(in + 8 * 5) << 8, - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 32, 44, 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 12-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 16, 28, 40, 52 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 12-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 0, 12, 24, 36 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 12-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6) >> 60 | SafeLoadAs(in + 8 * 7) << 4, - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 48, 0, 8, 20 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // 
extract 12-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7) >> 56 | SafeLoadAs(in + 8 * 8) << 8, - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 32, 44, 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 12-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 16, 28, 40, 52 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 12-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 0, 12, 24, 36 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 12-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9) >> 60 | SafeLoadAs(in + 8 * 10) << 4, - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 48, 0, 8, 20 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 12-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10) >> 56 | SafeLoadAs(in + 8 * 11) << 8, - SafeLoadAs(in + 8 * 11), - }; - shifts = simd_batch{ 32, 44, 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 12-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11), - }; - shifts = simd_batch{ 16, 28, 40, 52 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 12 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = 
xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x1fff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 13-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 0, 13, 26, 39 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 13-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 0) >> 52 | SafeLoadAs(in + 8 * 1) << 12, - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 0, 1, 14, 27 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 13-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1) >> 53 | SafeLoadAs(in + 8 * 2) << 11, - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 40, 0, 2, 15 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 13-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2) >> 54 | SafeLoadAs(in + 8 * 3) << 10, - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 28, 41, 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 13-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3) >> 55 | SafeLoadAs(in + 8 * 4) << 9, - }; - shifts = simd_batch{ 16, 29, 42, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 13-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - 
SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 4, 17, 30, 43 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 13-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 4) >> 56 | SafeLoadAs(in + 8 * 5) << 8, - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 0, 5, 18, 31 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 13-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5) >> 57 | SafeLoadAs(in + 8 * 6) << 7, - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 44, 0, 6, 19 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 13-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6) >> 58 | SafeLoadAs(in + 8 * 7) << 6, - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 32, 45, 0, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 13-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7) >> 59 | SafeLoadAs(in + 8 * 8) << 5, - }; - shifts = simd_batch{ 20, 33, 46, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 13-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 8, 21, 34, 47 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 13-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 8) >> 60 | SafeLoadAs(in + 8 * 9) << 4, - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 0, 9, 22, 35 
}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 13-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9) >> 61 | SafeLoadAs(in + 8 * 10) << 3, - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 48, 0, 10, 23 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 13-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10) >> 62 | SafeLoadAs(in + 8 * 11) << 2, - SafeLoadAs(in + 8 * 11), - }; - shifts = simd_batch{ 36, 49, 0, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 13-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11) >> 63 | SafeLoadAs(in + 8 * 12) << 1, - }; - shifts = simd_batch{ 24, 37, 50, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 13-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12), - }; - shifts = simd_batch{ 12, 25, 38, 51 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 13 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x3fff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 14-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 0, 14, 28, 42 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 4; - - // extract 14-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 0) >> 56 | SafeLoadAs(in + 8 * 1) << 8, - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 0, 6, 20, 34 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 14-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1) >> 62 | SafeLoadAs(in + 8 * 2) << 2, - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 48, 0, 12, 26 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 14-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2) >> 54 | SafeLoadAs(in + 8 * 3) << 10, - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 40, 0, 4, 18 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 14-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3) >> 60 | SafeLoadAs(in + 8 * 4) << 4, - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 32, 46, 0, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 14-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4) >> 52 | SafeLoadAs(in + 8 * 5) << 12, - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 24, 38, 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 14-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5) >> 58 | SafeLoadAs(in + 8 * 6) << 6, - }; - shifts = simd_batch{ 16, 30, 44, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); 
- out += 4; - - // extract 14-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 8, 22, 36, 50 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 14-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 0, 14, 28, 42 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 14-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 7) >> 56 | SafeLoadAs(in + 8 * 8) << 8, - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 0, 6, 20, 34 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 14-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8) >> 62 | SafeLoadAs(in + 8 * 9) << 2, - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 48, 0, 12, 26 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 14-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9) >> 54 | SafeLoadAs(in + 8 * 10) << 10, - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 40, 0, 4, 18 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 14-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10) >> 60 | SafeLoadAs(in + 8 * 11) << 4, - SafeLoadAs(in + 8 * 11), - }; - shifts = simd_batch{ 32, 46, 0, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 14-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 
8 * 11), - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11) >> 52 | SafeLoadAs(in + 8 * 12) << 12, - SafeLoadAs(in + 8 * 12), - }; - shifts = simd_batch{ 24, 38, 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 14-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12) >> 58 | SafeLoadAs(in + 8 * 13) << 6, - }; - shifts = simd_batch{ 16, 30, 44, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 14-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13), - }; - shifts = simd_batch{ 8, 22, 36, 50 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 14 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x7fff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 15-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 0, 15, 30, 45 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 15-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 0) >> 60 | SafeLoadAs(in + 8 * 1) << 4, - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 0, 11, 26, 41 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 15-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 1) >> 56 | SafeLoadAs(in + 8 * 2) << 8, - 
SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 0, 7, 22, 37 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 15-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 2) >> 52 | SafeLoadAs(in + 8 * 3) << 12, - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 0, 3, 18, 33 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 15-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3) >> 63 | SafeLoadAs(in + 8 * 4) << 1, - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 48, 0, 14, 29 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 15-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4) >> 59 | SafeLoadAs(in + 8 * 5) << 5, - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 44, 0, 10, 25 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 15-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5) >> 55 | SafeLoadAs(in + 8 * 6) << 9, - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 40, 0, 6, 21 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 15-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6) >> 51 | SafeLoadAs(in + 8 * 7) << 13, - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 36, 0, 2, 17 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 15-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7) >> 62 | 
SafeLoadAs(in + 8 * 8) << 2, - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 32, 47, 0, 13 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 15-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8) >> 58 | SafeLoadAs(in + 8 * 9) << 6, - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 28, 43, 0, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 15-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9) >> 54 | SafeLoadAs(in + 8 * 10) << 10, - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 24, 39, 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 15-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10) >> 50 | SafeLoadAs(in + 8 * 11) << 14, - SafeLoadAs(in + 8 * 11), - }; - shifts = simd_batch{ 20, 35, 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 15-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11) >> 61 | SafeLoadAs(in + 8 * 12) << 3, - }; - shifts = simd_batch{ 16, 31, 46, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 15-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12) >> 57 | SafeLoadAs(in + 8 * 13) << 7, - }; - shifts = simd_batch{ 12, 27, 42, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 15-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13), - 
SafeLoadAs(in + 8 * 13) >> 53 | SafeLoadAs(in + 8 * 14) << 11, - }; - shifts = simd_batch{ 8, 23, 38, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 15-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14), - }; - shifts = simd_batch{ 4, 19, 34, 49 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 15 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0xffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 16-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - }; - shifts = simd_batch{ 0, 16, 32, 48 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 16-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 0, 16, 32, 48 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 16-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 0, 16, 32, 48 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 16-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 0, 16, 32, 48 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 4; - - // extract 16-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 0, 16, 32, 48 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 16-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 0, 16, 32, 48 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 16-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 0, 16, 32, 48 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 16-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 0, 16, 32, 48 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 16-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 0, 16, 32, 48 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 16-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 0, 16, 32, 48 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 16-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 0, 
16, 32, 48 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 16-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11), - }; - shifts = simd_batch{ 0, 16, 32, 48 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 16-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12), - }; - shifts = simd_batch{ 0, 16, 32, 48 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 16-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13), - }; - shifts = simd_batch{ 0, 16, 32, 48 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 16-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14), - }; - shifts = simd_batch{ 0, 16, 32, 48 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 16-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15), - }; - shifts = simd_batch{ 0, 16, 32, 48 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 16 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x1ffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 17-bit bundles 0 to 3 - words = 
simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 51 | SafeLoadAs(in + 8 * 1) << 13, - }; - shifts = simd_batch{ 0, 17, 34, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 17-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1) >> 55 | SafeLoadAs(in + 8 * 2) << 9, - }; - shifts = simd_batch{ 4, 21, 38, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 17-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2) >> 59 | SafeLoadAs(in + 8 * 3) << 5, - }; - shifts = simd_batch{ 8, 25, 42, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 17-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3) >> 63 | SafeLoadAs(in + 8 * 4) << 1, - }; - shifts = simd_batch{ 12, 29, 46, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 17-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4) >> 50 | SafeLoadAs(in + 8 * 5) << 14, - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 16, 33, 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 17-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5) >> 54 | SafeLoadAs(in + 8 * 6) << 10, - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 20, 37, 0, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 17-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 
8 * 6), - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6) >> 58 | SafeLoadAs(in + 8 * 7) << 6, - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 24, 41, 0, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 17-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7) >> 62 | SafeLoadAs(in + 8 * 8) << 2, - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 28, 45, 0, 15 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 17-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8) >> 49 | SafeLoadAs(in + 8 * 9) << 15, - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 32, 0, 2, 19 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 17-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9) >> 53 | SafeLoadAs(in + 8 * 10) << 11, - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 36, 0, 6, 23 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 17-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10) >> 57 | SafeLoadAs(in + 8 * 11) << 7, - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11), - }; - shifts = simd_batch{ 40, 0, 10, 27 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 17-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11) >> 61 | SafeLoadAs(in + 8 * 12) << 3, - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12), - }; - shifts = simd_batch{ 44, 0, 14, 31 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 17-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 12) >> 
48 | SafeLoadAs(in + 8 * 13) << 16, - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13), - }; - shifts = simd_batch{ 0, 1, 18, 35 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 17-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 13) >> 52 | SafeLoadAs(in + 8 * 14) << 12, - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14), - }; - shifts = simd_batch{ 0, 5, 22, 39 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 17-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 14) >> 56 | SafeLoadAs(in + 8 * 15) << 8, - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15), - }; - shifts = simd_batch{ 0, 9, 26, 43 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 17-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 15) >> 60 | SafeLoadAs(in + 8 * 16) << 4, - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 16), - }; - shifts = simd_batch{ 0, 13, 30, 47 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 17 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x3ffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 18-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 54 | SafeLoadAs(in + 8 * 1) << 10, - }; - shifts = simd_batch{ 0, 18, 36, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 18-bit bundles 4 to 7 - words = simd_batch{ - 
SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1) >> 62 | SafeLoadAs(in + 8 * 2) << 2, - }; - shifts = simd_batch{ 8, 26, 44, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 18-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2) >> 52 | SafeLoadAs(in + 8 * 3) << 12, - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 16, 34, 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 18-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3) >> 60 | SafeLoadAs(in + 8 * 4) << 4, - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 24, 42, 0, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 18-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4) >> 50 | SafeLoadAs(in + 8 * 5) << 14, - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 32, 0, 4, 22 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 18-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5) >> 58 | SafeLoadAs(in + 8 * 6) << 6, - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 40, 0, 12, 30 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 18-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 6) >> 48 | SafeLoadAs(in + 8 * 7) << 16, - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 0, 2, 20, 38 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 18-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 7) >> 
56 | SafeLoadAs(in + 8 * 8) << 8, - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 0, 10, 28, 46 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 18-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9) >> 54 | SafeLoadAs(in + 8 * 10) << 10, - }; - shifts = simd_batch{ 0, 18, 36, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 18-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10) >> 62 | SafeLoadAs(in + 8 * 11) << 2, - }; - shifts = simd_batch{ 8, 26, 44, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 18-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11) >> 52 | SafeLoadAs(in + 8 * 12) << 12, - SafeLoadAs(in + 8 * 12), - }; - shifts = simd_batch{ 16, 34, 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 18-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12) >> 60 | SafeLoadAs(in + 8 * 13) << 4, - SafeLoadAs(in + 8 * 13), - }; - shifts = simd_batch{ 24, 42, 0, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 18-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13) >> 50 | SafeLoadAs(in + 8 * 14) << 14, - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14), - }; - shifts = simd_batch{ 32, 0, 4, 22 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 18-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 14), - 
SafeLoadAs(in + 8 * 14) >> 58 | SafeLoadAs(in + 8 * 15) << 6, - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15), - }; - shifts = simd_batch{ 40, 0, 12, 30 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 18-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 15) >> 48 | SafeLoadAs(in + 8 * 16) << 16, - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 16), - }; - shifts = simd_batch{ 0, 2, 20, 38 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 18-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 16) >> 56 | SafeLoadAs(in + 8 * 17) << 8, - SafeLoadAs(in + 8 * 17), - SafeLoadAs(in + 8 * 17), - SafeLoadAs(in + 8 * 17), - }; - shifts = simd_batch{ 0, 10, 28, 46 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 18 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x7ffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 19-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 57 | SafeLoadAs(in + 8 * 1) << 7, - }; - shifts = simd_batch{ 0, 19, 38, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 19-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1) >> 50 | SafeLoadAs(in + 8 * 2) << 14, - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 12, 31, 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 19-bit bundles 8 to 11 - words = simd_batch{ - 
SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2) >> 62 | SafeLoadAs(in + 8 * 3) << 2, - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 24, 43, 0, 17 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 19-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3) >> 55 | SafeLoadAs(in + 8 * 4) << 9, - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 36, 0, 10, 29 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 19-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 4) >> 48 | SafeLoadAs(in + 8 * 5) << 16, - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 0, 3, 22, 41 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 19-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 5) >> 60 | SafeLoadAs(in + 8 * 6) << 4, - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6) >> 53 | SafeLoadAs(in + 8 * 7) << 11, - }; - shifts = simd_batch{ 0, 15, 34, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 19-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7) >> 46 | SafeLoadAs(in + 8 * 8) << 18, - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 8, 27, 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 19-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8) >> 58 | SafeLoadAs(in + 8 * 9) << 6, - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 20, 39, 0, 13 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 19-bit bundles 32 to 35 - words = 
simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9) >> 51 | SafeLoadAs(in + 8 * 10) << 13, - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 32, 0, 6, 25 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 19-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10) >> 63 | SafeLoadAs(in + 8 * 11) << 1, - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11), - }; - shifts = simd_batch{ 44, 0, 18, 37 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 19-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 11) >> 56 | SafeLoadAs(in + 8 * 12) << 8, - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12) >> 49 | SafeLoadAs(in + 8 * 13) << 15, - }; - shifts = simd_batch{ 0, 11, 30, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 19-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13) >> 61 | SafeLoadAs(in + 8 * 14) << 3, - }; - shifts = simd_batch{ 4, 23, 42, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 19-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14) >> 54 | SafeLoadAs(in + 8 * 15) << 10, - SafeLoadAs(in + 8 * 15), - }; - shifts = simd_batch{ 16, 35, 0, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 19-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15) >> 47 | SafeLoadAs(in + 8 * 16) << 17, - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 16), - }; - shifts = simd_batch{ 28, 0, 2, 21 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // 
extract 19-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 16) >> 59 | SafeLoadAs(in + 8 * 17) << 5, - SafeLoadAs(in + 8 * 17), - SafeLoadAs(in + 8 * 17), - }; - shifts = simd_batch{ 40, 0, 14, 33 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 19-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 17) >> 52 | SafeLoadAs(in + 8 * 18) << 12, - SafeLoadAs(in + 8 * 18), - SafeLoadAs(in + 8 * 18), - SafeLoadAs(in + 8 * 18), - }; - shifts = simd_batch{ 0, 7, 26, 45 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 19 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0xfffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 20-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 60 | SafeLoadAs(in + 8 * 1) << 4, - }; - shifts = simd_batch{ 0, 20, 40, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 20-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1) >> 56 | SafeLoadAs(in + 8 * 2) << 8, - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 16, 36, 0, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 20-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2) >> 52 | SafeLoadAs(in + 8 * 3) << 12, - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 32, 0, 8, 28 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - 
out += 4; - - // extract 20-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 3) >> 48 | SafeLoadAs(in + 8 * 4) << 16, - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 0, 4, 24, 44 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 20-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5) >> 60 | SafeLoadAs(in + 8 * 6) << 4, - }; - shifts = simd_batch{ 0, 20, 40, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 20-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6) >> 56 | SafeLoadAs(in + 8 * 7) << 8, - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 16, 36, 0, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 20-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7) >> 52 | SafeLoadAs(in + 8 * 8) << 12, - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 32, 0, 8, 28 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 20-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 8) >> 48 | SafeLoadAs(in + 8 * 9) << 16, - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 0, 4, 24, 44 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 20-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10) >> 60 | SafeLoadAs(in + 8 * 11) << 4, - }; - shifts = simd_batch{ 0, 20, 40, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // 
extract 20-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11) >> 56 | SafeLoadAs(in + 8 * 12) << 8, - SafeLoadAs(in + 8 * 12), - }; - shifts = simd_batch{ 16, 36, 0, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 20-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12) >> 52 | SafeLoadAs(in + 8 * 13) << 12, - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13), - }; - shifts = simd_batch{ 32, 0, 8, 28 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 20-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 13) >> 48 | SafeLoadAs(in + 8 * 14) << 16, - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14), - }; - shifts = simd_batch{ 0, 4, 24, 44 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 20-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15) >> 60 | SafeLoadAs(in + 8 * 16) << 4, - }; - shifts = simd_batch{ 0, 20, 40, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 20-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 16) >> 56 | SafeLoadAs(in + 8 * 17) << 8, - SafeLoadAs(in + 8 * 17), - }; - shifts = simd_batch{ 16, 36, 0, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 20-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 17), - SafeLoadAs(in + 8 * 17) >> 52 | SafeLoadAs(in + 8 * 18) << 12, - SafeLoadAs(in + 8 * 18), - SafeLoadAs(in + 8 * 18), - }; - shifts = simd_batch{ 32, 0, 8, 28 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - 
// extract 20-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 18) >> 48 | SafeLoadAs(in + 8 * 19) << 16, - SafeLoadAs(in + 8 * 19), - SafeLoadAs(in + 8 * 19), - SafeLoadAs(in + 8 * 19), - }; - shifts = simd_batch{ 0, 4, 24, 44 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 20 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x1fffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 21-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 63 | SafeLoadAs(in + 8 * 1) << 1, - }; - shifts = simd_batch{ 0, 21, 42, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 21-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1) >> 62 | SafeLoadAs(in + 8 * 2) << 2, - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 20, 41, 0, 19 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 21-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2) >> 61 | SafeLoadAs(in + 8 * 3) << 3, - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 40, 0, 18, 39 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 21-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 3) >> 60 | SafeLoadAs(in + 8 * 4) << 4, - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4) >> 59 | SafeLoadAs(in + 8 * 5) << 5, - }; - shifts = simd_batch{ 0, 17, 38, 0 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 4; - - // extract 21-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5) >> 58 | SafeLoadAs(in + 8 * 6) << 6, - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 16, 37, 0, 15 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 21-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6) >> 57 | SafeLoadAs(in + 8 * 7) << 7, - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 36, 0, 14, 35 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 21-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 7) >> 56 | SafeLoadAs(in + 8 * 8) << 8, - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8) >> 55 | SafeLoadAs(in + 8 * 9) << 9, - }; - shifts = simd_batch{ 0, 13, 34, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 21-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9) >> 54 | SafeLoadAs(in + 8 * 10) << 10, - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 12, 33, 0, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 21-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10) >> 53 | SafeLoadAs(in + 8 * 11) << 11, - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11), - }; - shifts = simd_batch{ 32, 0, 10, 31 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 21-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 11) >> 52 | SafeLoadAs(in + 8 * 12) << 12, - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12) >> 51 | SafeLoadAs(in + 8 * 13) << 13, - }; - shifts = 
simd_batch{ 0, 9, 30, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 21-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13) >> 50 | SafeLoadAs(in + 8 * 14) << 14, - SafeLoadAs(in + 8 * 14), - }; - shifts = simd_batch{ 8, 29, 0, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 21-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14) >> 49 | SafeLoadAs(in + 8 * 15) << 15, - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15), - }; - shifts = simd_batch{ 28, 0, 6, 27 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 21-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 15) >> 48 | SafeLoadAs(in + 8 * 16) << 16, - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 16) >> 47 | SafeLoadAs(in + 8 * 17) << 17, - }; - shifts = simd_batch{ 0, 5, 26, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 21-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 17), - SafeLoadAs(in + 8 * 17), - SafeLoadAs(in + 8 * 17) >> 46 | SafeLoadAs(in + 8 * 18) << 18, - SafeLoadAs(in + 8 * 18), - }; - shifts = simd_batch{ 4, 25, 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 21-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 18), - SafeLoadAs(in + 8 * 18) >> 45 | SafeLoadAs(in + 8 * 19) << 19, - SafeLoadAs(in + 8 * 19), - SafeLoadAs(in + 8 * 19), - }; - shifts = simd_batch{ 24, 0, 2, 23 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 21-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 19) >> 44 | SafeLoadAs(in + 8 * 20) << 20, - SafeLoadAs(in + 8 * 20), - SafeLoadAs(in + 8 * 20), - 
SafeLoadAs(in + 8 * 20), - }; - shifts = simd_batch{ 0, 1, 22, 43 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 21 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x3fffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 22-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 44 | SafeLoadAs(in + 8 * 1) << 20, - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 0, 22, 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 22-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1) >> 46 | SafeLoadAs(in + 8 * 2) << 18, - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 24, 0, 4, 26 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 22-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 2) >> 48 | SafeLoadAs(in + 8 * 3) << 16, - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3) >> 50 | SafeLoadAs(in + 8 * 4) << 14, - }; - shifts = simd_batch{ 0, 6, 28, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 22-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4) >> 52 | SafeLoadAs(in + 8 * 5) << 12, - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 8, 30, 0, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 22-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5) >> 54 | SafeLoadAs(in 
+ 8 * 6) << 10, - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 32, 0, 12, 34 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 22-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 6) >> 56 | SafeLoadAs(in + 8 * 7) << 8, - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7) >> 58 | SafeLoadAs(in + 8 * 8) << 6, - }; - shifts = simd_batch{ 0, 14, 36, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 22-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8) >> 60 | SafeLoadAs(in + 8 * 9) << 4, - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 16, 38, 0, 18 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 22-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9) >> 62 | SafeLoadAs(in + 8 * 10) << 2, - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 40, 0, 20, 42 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 22-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11) >> 44 | SafeLoadAs(in + 8 * 12) << 20, - SafeLoadAs(in + 8 * 12), - }; - shifts = simd_batch{ 0, 22, 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 22-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12) >> 46 | SafeLoadAs(in + 8 * 13) << 18, - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13), - }; - shifts = simd_batch{ 24, 0, 4, 26 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 22-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 13) >> 48 | SafeLoadAs(in + 
8 * 14) << 16, - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14) >> 50 | SafeLoadAs(in + 8 * 15) << 14, - }; - shifts = simd_batch{ 0, 6, 28, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 22-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15) >> 52 | SafeLoadAs(in + 8 * 16) << 12, - SafeLoadAs(in + 8 * 16), - }; - shifts = simd_batch{ 8, 30, 0, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 22-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 16) >> 54 | SafeLoadAs(in + 8 * 17) << 10, - SafeLoadAs(in + 8 * 17), - SafeLoadAs(in + 8 * 17), - }; - shifts = simd_batch{ 32, 0, 12, 34 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 22-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 17) >> 56 | SafeLoadAs(in + 8 * 18) << 8, - SafeLoadAs(in + 8 * 18), - SafeLoadAs(in + 8 * 18), - SafeLoadAs(in + 8 * 18) >> 58 | SafeLoadAs(in + 8 * 19) << 6, - }; - shifts = simd_batch{ 0, 14, 36, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 22-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 19), - SafeLoadAs(in + 8 * 19), - SafeLoadAs(in + 8 * 19) >> 60 | SafeLoadAs(in + 8 * 20) << 4, - SafeLoadAs(in + 8 * 20), - }; - shifts = simd_batch{ 16, 38, 0, 18 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 22-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 20), - SafeLoadAs(in + 8 * 20) >> 62 | SafeLoadAs(in + 8 * 21) << 2, - SafeLoadAs(in + 8 * 21), - SafeLoadAs(in + 8 * 21), - }; - shifts = simd_batch{ 40, 0, 20, 42 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 22 * 8; - return in; - 
} -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x7fffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 23-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 46 | SafeLoadAs(in + 8 * 1) << 18, - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 0, 23, 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 23-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1) >> 51 | SafeLoadAs(in + 8 * 2) << 13, - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 28, 0, 10, 33 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 23-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 2) >> 56 | SafeLoadAs(in + 8 * 3) << 8, - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3) >> 61 | SafeLoadAs(in + 8 * 4) << 3, - }; - shifts = simd_batch{ 0, 15, 38, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 23-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4) >> 43 | SafeLoadAs(in + 8 * 5) << 21, - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 20, 0, 2, 25 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 23-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 5) >> 48 | SafeLoadAs(in + 8 * 6) << 16, - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6) >> 53 | SafeLoadAs(in + 8 * 7) << 11, - }; - shifts = simd_batch{ 0, 7, 30, 0 }; - results = (words >> shifts) & 
masks; - results.store_unaligned(out); - out += 4; - - // extract 23-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7) >> 58 | SafeLoadAs(in + 8 * 8) << 6, - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 12, 35, 0, 17 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 23-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8) >> 63 | SafeLoadAs(in + 8 * 9) << 1, - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9) >> 45 | SafeLoadAs(in + 8 * 10) << 19, - }; - shifts = simd_batch{ 40, 0, 22, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 23-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10) >> 50 | SafeLoadAs(in + 8 * 11) << 14, - SafeLoadAs(in + 8 * 11), - }; - shifts = simd_batch{ 4, 27, 0, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 23-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11) >> 55 | SafeLoadAs(in + 8 * 12) << 9, - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12), - }; - shifts = simd_batch{ 32, 0, 14, 37 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 23-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 12) >> 60 | SafeLoadAs(in + 8 * 13) << 4, - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13) >> 42 | SafeLoadAs(in + 8 * 14) << 22, - SafeLoadAs(in + 8 * 14), - }; - shifts = simd_batch{ 0, 19, 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 23-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14) >> 47 | SafeLoadAs(in + 8 * 15) << 17, - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15), - }; - 
shifts = simd_batch{ 24, 0, 6, 29 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 23-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 15) >> 52 | SafeLoadAs(in + 8 * 16) << 12, - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 16) >> 57 | SafeLoadAs(in + 8 * 17) << 7, - }; - shifts = simd_batch{ 0, 11, 34, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 23-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 17), - SafeLoadAs(in + 8 * 17), - SafeLoadAs(in + 8 * 17) >> 62 | SafeLoadAs(in + 8 * 18) << 2, - SafeLoadAs(in + 8 * 18), - }; - shifts = simd_batch{ 16, 39, 0, 21 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 23-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 18) >> 44 | SafeLoadAs(in + 8 * 19) << 20, - SafeLoadAs(in + 8 * 19), - SafeLoadAs(in + 8 * 19), - SafeLoadAs(in + 8 * 19) >> 49 | SafeLoadAs(in + 8 * 20) << 15, - }; - shifts = simd_batch{ 0, 3, 26, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 23-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 20), - SafeLoadAs(in + 8 * 20), - SafeLoadAs(in + 8 * 20) >> 54 | SafeLoadAs(in + 8 * 21) << 10, - SafeLoadAs(in + 8 * 21), - }; - shifts = simd_batch{ 8, 31, 0, 13 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 23-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 21), - SafeLoadAs(in + 8 * 21) >> 59 | SafeLoadAs(in + 8 * 22) << 5, - SafeLoadAs(in + 8 * 22), - SafeLoadAs(in + 8 * 22), - }; - shifts = simd_batch{ 36, 0, 18, 41 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 23 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = 
xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0xffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 24-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 48 | SafeLoadAs(in + 8 * 1) << 16, - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 0, 24, 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 24-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1) >> 56 | SafeLoadAs(in + 8 * 2) << 8, - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 32, 0, 16, 40 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 24-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3) >> 48 | SafeLoadAs(in + 8 * 4) << 16, - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 0, 24, 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 24-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4) >> 56 | SafeLoadAs(in + 8 * 5) << 8, - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 32, 0, 16, 40 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 24-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6) >> 48 | SafeLoadAs(in + 8 * 7) << 16, - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 0, 24, 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 24-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 
8 * 7) >> 56 | SafeLoadAs(in + 8 * 8) << 8, - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 32, 0, 16, 40 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 24-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9) >> 48 | SafeLoadAs(in + 8 * 10) << 16, - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 0, 24, 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 24-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10) >> 56 | SafeLoadAs(in + 8 * 11) << 8, - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11), - }; - shifts = simd_batch{ 32, 0, 16, 40 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 24-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12) >> 48 | SafeLoadAs(in + 8 * 13) << 16, - SafeLoadAs(in + 8 * 13), - }; - shifts = simd_batch{ 0, 24, 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 24-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13) >> 56 | SafeLoadAs(in + 8 * 14) << 8, - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14), - }; - shifts = simd_batch{ 32, 0, 16, 40 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 24-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15) >> 48 | SafeLoadAs(in + 8 * 16) << 16, - SafeLoadAs(in + 8 * 16), - }; - shifts = simd_batch{ 0, 24, 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 24-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 
16) >> 56 | SafeLoadAs(in + 8 * 17) << 8, - SafeLoadAs(in + 8 * 17), - SafeLoadAs(in + 8 * 17), - }; - shifts = simd_batch{ 32, 0, 16, 40 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 24-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 18), - SafeLoadAs(in + 8 * 18), - SafeLoadAs(in + 8 * 18) >> 48 | SafeLoadAs(in + 8 * 19) << 16, - SafeLoadAs(in + 8 * 19), - }; - shifts = simd_batch{ 0, 24, 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 24-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 19), - SafeLoadAs(in + 8 * 19) >> 56 | SafeLoadAs(in + 8 * 20) << 8, - SafeLoadAs(in + 8 * 20), - SafeLoadAs(in + 8 * 20), - }; - shifts = simd_batch{ 32, 0, 16, 40 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 24-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 21), - SafeLoadAs(in + 8 * 21), - SafeLoadAs(in + 8 * 21) >> 48 | SafeLoadAs(in + 8 * 22) << 16, - SafeLoadAs(in + 8 * 22), - }; - shifts = simd_batch{ 0, 24, 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 24-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 22), - SafeLoadAs(in + 8 * 22) >> 56 | SafeLoadAs(in + 8 * 23) << 8, - SafeLoadAs(in + 8 * 23), - SafeLoadAs(in + 8 * 23), - }; - shifts = simd_batch{ 32, 0, 16, 40 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 24 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x1ffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 25-bit bundles 0 to 3 - words = simd_batch{ - 
SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 50 | SafeLoadAs(in + 8 * 1) << 14, - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 0, 25, 0, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 25-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1) >> 61 | SafeLoadAs(in + 8 * 2) << 3, - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2) >> 47 | SafeLoadAs(in + 8 * 3) << 17, - }; - shifts = simd_batch{ 36, 0, 22, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 25-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3) >> 58 | SafeLoadAs(in + 8 * 4) << 6, - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 8, 33, 0, 19 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 25-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 4) >> 44 | SafeLoadAs(in + 8 * 5) << 20, - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5) >> 55 | SafeLoadAs(in + 8 * 6) << 9, - }; - shifts = simd_batch{ 0, 5, 30, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 25-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6) >> 41 | SafeLoadAs(in + 8 * 7) << 23, - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 16, 0, 2, 27 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 25-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 7) >> 52 | SafeLoadAs(in + 8 * 8) << 12, - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8) >> 63 | SafeLoadAs(in + 8 * 9) << 1, - }; - shifts = simd_batch{ 0, 13, 38, 0 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 4; - - // extract 25-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9) >> 49 | SafeLoadAs(in + 8 * 10) << 15, - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 24, 0, 10, 35 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 25-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 10) >> 60 | SafeLoadAs(in + 8 * 11) << 4, - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11) >> 46 | SafeLoadAs(in + 8 * 12) << 18, - SafeLoadAs(in + 8 * 12), - }; - shifts = simd_batch{ 0, 21, 0, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 25-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12) >> 57 | SafeLoadAs(in + 8 * 13) << 7, - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13) >> 43 | SafeLoadAs(in + 8 * 14) << 21, - }; - shifts = simd_batch{ 32, 0, 18, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 25-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14) >> 54 | SafeLoadAs(in + 8 * 15) << 10, - SafeLoadAs(in + 8 * 15), - }; - shifts = simd_batch{ 4, 29, 0, 15 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 25-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 15) >> 40 | SafeLoadAs(in + 8 * 16) << 24, - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 16) >> 51 | SafeLoadAs(in + 8 * 17) << 13, - }; - shifts = simd_batch{ 0, 1, 26, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 25-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 17), - SafeLoadAs(in + 8 * 17), - SafeLoadAs(in + 8 * 17) >> 62 | SafeLoadAs(in + 8 * 18) << 2, 
- SafeLoadAs(in + 8 * 18), - }; - shifts = simd_batch{ 12, 37, 0, 23 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 25-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 18) >> 48 | SafeLoadAs(in + 8 * 19) << 16, - SafeLoadAs(in + 8 * 19), - SafeLoadAs(in + 8 * 19), - SafeLoadAs(in + 8 * 19) >> 59 | SafeLoadAs(in + 8 * 20) << 5, - }; - shifts = simd_batch{ 0, 9, 34, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 25-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 20), - SafeLoadAs(in + 8 * 20) >> 45 | SafeLoadAs(in + 8 * 21) << 19, - SafeLoadAs(in + 8 * 21), - SafeLoadAs(in + 8 * 21), - }; - shifts = simd_batch{ 20, 0, 6, 31 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 25-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 21) >> 56 | SafeLoadAs(in + 8 * 22) << 8, - SafeLoadAs(in + 8 * 22), - SafeLoadAs(in + 8 * 22) >> 42 | SafeLoadAs(in + 8 * 23) << 22, - SafeLoadAs(in + 8 * 23), - }; - shifts = simd_batch{ 0, 17, 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 25-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 23), - SafeLoadAs(in + 8 * 23) >> 53 | SafeLoadAs(in + 8 * 24) << 11, - SafeLoadAs(in + 8 * 24), - SafeLoadAs(in + 8 * 24), - }; - shifts = simd_batch{ 28, 0, 14, 39 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 25 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x3ffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 26-bit bundles 0 to 3 - words = simd_batch{ - 
SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 52 | SafeLoadAs(in + 8 * 1) << 12, - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 0, 26, 0, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 26-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 1) >> 40 | SafeLoadAs(in + 8 * 2) << 24, - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2) >> 54 | SafeLoadAs(in + 8 * 3) << 10, - }; - shifts = simd_batch{ 0, 2, 28, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 26-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3) >> 42 | SafeLoadAs(in + 8 * 4) << 22, - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 16, 0, 4, 30 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 26-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 4) >> 56 | SafeLoadAs(in + 8 * 5) << 8, - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5) >> 44 | SafeLoadAs(in + 8 * 6) << 20, - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 0, 18, 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 26-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6) >> 58 | SafeLoadAs(in + 8 * 7) << 6, - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7) >> 46 | SafeLoadAs(in + 8 * 8) << 18, - }; - shifts = simd_batch{ 32, 0, 20, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 26-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8) >> 60 | SafeLoadAs(in + 8 * 9) << 4, - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 8, 34, 0, 22 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 4; - - // extract 26-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 9) >> 48 | SafeLoadAs(in + 8 * 10) << 16, - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10) >> 62 | SafeLoadAs(in + 8 * 11) << 2, - }; - shifts = simd_batch{ 0, 10, 36, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 26-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11) >> 50 | SafeLoadAs(in + 8 * 12) << 14, - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12), - }; - shifts = simd_batch{ 24, 0, 12, 38 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 26-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13) >> 52 | SafeLoadAs(in + 8 * 14) << 12, - SafeLoadAs(in + 8 * 14), - }; - shifts = simd_batch{ 0, 26, 0, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 26-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 14) >> 40 | SafeLoadAs(in + 8 * 15) << 24, - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15) >> 54 | SafeLoadAs(in + 8 * 16) << 10, - }; - shifts = simd_batch{ 0, 2, 28, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 26-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 16) >> 42 | SafeLoadAs(in + 8 * 17) << 22, - SafeLoadAs(in + 8 * 17), - SafeLoadAs(in + 8 * 17), - }; - shifts = simd_batch{ 16, 0, 4, 30 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 26-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 17) >> 56 | SafeLoadAs(in + 8 * 18) << 8, - SafeLoadAs(in + 8 * 18), - SafeLoadAs(in + 8 * 18) >> 44 | SafeLoadAs(in + 8 * 19) << 
20, - SafeLoadAs(in + 8 * 19), - }; - shifts = simd_batch{ 0, 18, 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 26-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 19), - SafeLoadAs(in + 8 * 19) >> 58 | SafeLoadAs(in + 8 * 20) << 6, - SafeLoadAs(in + 8 * 20), - SafeLoadAs(in + 8 * 20) >> 46 | SafeLoadAs(in + 8 * 21) << 18, - }; - shifts = simd_batch{ 32, 0, 20, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 26-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 21), - SafeLoadAs(in + 8 * 21), - SafeLoadAs(in + 8 * 21) >> 60 | SafeLoadAs(in + 8 * 22) << 4, - SafeLoadAs(in + 8 * 22), - }; - shifts = simd_batch{ 8, 34, 0, 22 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 26-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 22) >> 48 | SafeLoadAs(in + 8 * 23) << 16, - SafeLoadAs(in + 8 * 23), - SafeLoadAs(in + 8 * 23), - SafeLoadAs(in + 8 * 23) >> 62 | SafeLoadAs(in + 8 * 24) << 2, - }; - shifts = simd_batch{ 0, 10, 36, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 26-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 24), - SafeLoadAs(in + 8 * 24) >> 50 | SafeLoadAs(in + 8 * 25) << 14, - SafeLoadAs(in + 8 * 25), - SafeLoadAs(in + 8 * 25), - }; - shifts = simd_batch{ 24, 0, 12, 38 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 26 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x7ffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 27-bit bundles 0 to 3 - words = simd_batch{ - 
SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 54 | SafeLoadAs(in + 8 * 1) << 10, - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 0, 27, 0, 17 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 27-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 1) >> 44 | SafeLoadAs(in + 8 * 2) << 20, - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2) >> 61 | SafeLoadAs(in + 8 * 3) << 3, - }; - shifts = simd_batch{ 0, 7, 34, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 27-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3) >> 51 | SafeLoadAs(in + 8 * 4) << 13, - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4) >> 41 | SafeLoadAs(in + 8 * 5) << 23, - }; - shifts = simd_batch{ 24, 0, 14, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 27-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5) >> 58 | SafeLoadAs(in + 8 * 6) << 6, - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 4, 31, 0, 21 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 27-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 6) >> 48 | SafeLoadAs(in + 8 * 7) << 16, - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7) >> 38 | SafeLoadAs(in + 8 * 8) << 26, - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 0, 11, 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 27-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8) >> 55 | SafeLoadAs(in + 8 * 9) << 9, - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9) >> 45 | SafeLoadAs(in + 8 * 10) << 19, - }; - shifts = simd_batch{ 28, 0, 18, 0 }; - results = (words >> shifts) 
& masks; - results.store_unaligned(out); - out += 4; - - // extract 27-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10) >> 62 | SafeLoadAs(in + 8 * 11) << 2, - SafeLoadAs(in + 8 * 11), - }; - shifts = simd_batch{ 8, 35, 0, 25 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 27-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 11) >> 52 | SafeLoadAs(in + 8 * 12) << 12, - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12) >> 42 | SafeLoadAs(in + 8 * 13) << 22, - SafeLoadAs(in + 8 * 13), - }; - shifts = simd_batch{ 0, 15, 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 27-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13) >> 59 | SafeLoadAs(in + 8 * 14) << 5, - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14) >> 49 | SafeLoadAs(in + 8 * 15) << 15, - }; - shifts = simd_batch{ 32, 0, 22, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 27-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15) >> 39 | SafeLoadAs(in + 8 * 16) << 25, - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 16), - }; - shifts = simd_batch{ 12, 0, 2, 29 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 27-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 16) >> 56 | SafeLoadAs(in + 8 * 17) << 8, - SafeLoadAs(in + 8 * 17), - SafeLoadAs(in + 8 * 17) >> 46 | SafeLoadAs(in + 8 * 18) << 18, - SafeLoadAs(in + 8 * 18), - }; - shifts = simd_batch{ 0, 19, 0, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 27-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 18), - SafeLoadAs(in + 8 * 18) >> 63 | SafeLoadAs(in + 8 * 19) << 1, - SafeLoadAs(in 
+ 8 * 19), - SafeLoadAs(in + 8 * 19) >> 53 | SafeLoadAs(in + 8 * 20) << 11, - }; - shifts = simd_batch{ 36, 0, 26, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 27-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 20), - SafeLoadAs(in + 8 * 20) >> 43 | SafeLoadAs(in + 8 * 21) << 21, - SafeLoadAs(in + 8 * 21), - SafeLoadAs(in + 8 * 21), - }; - shifts = simd_batch{ 16, 0, 6, 33 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 27-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 21) >> 60 | SafeLoadAs(in + 8 * 22) << 4, - SafeLoadAs(in + 8 * 22), - SafeLoadAs(in + 8 * 22) >> 50 | SafeLoadAs(in + 8 * 23) << 14, - SafeLoadAs(in + 8 * 23), - }; - shifts = simd_batch{ 0, 23, 0, 13 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 27-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 23) >> 40 | SafeLoadAs(in + 8 * 24) << 24, - SafeLoadAs(in + 8 * 24), - SafeLoadAs(in + 8 * 24), - SafeLoadAs(in + 8 * 24) >> 57 | SafeLoadAs(in + 8 * 25) << 7, - }; - shifts = simd_batch{ 0, 3, 30, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 27-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 25), - SafeLoadAs(in + 8 * 25) >> 47 | SafeLoadAs(in + 8 * 26) << 17, - SafeLoadAs(in + 8 * 26), - SafeLoadAs(in + 8 * 26), - }; - shifts = simd_batch{ 20, 0, 10, 37 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 27 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0xfffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 
28-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 56 | SafeLoadAs(in + 8 * 1) << 8, - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 0, 28, 0, 20 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 28-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 1) >> 48 | SafeLoadAs(in + 8 * 2) << 16, - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2) >> 40 | SafeLoadAs(in + 8 * 3) << 24, - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 0, 12, 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 28-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3) >> 60 | SafeLoadAs(in + 8 * 4) << 4, - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4) >> 52 | SafeLoadAs(in + 8 * 5) << 12, - }; - shifts = simd_batch{ 32, 0, 24, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 28-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5) >> 44 | SafeLoadAs(in + 8 * 6) << 20, - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 16, 0, 8, 36 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 28-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7) >> 56 | SafeLoadAs(in + 8 * 8) << 8, - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 0, 28, 0, 20 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 28-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 8) >> 48 | SafeLoadAs(in + 8 * 9) << 16, - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9) >> 40 | SafeLoadAs(in + 8 * 10) << 24, - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 0, 12, 0, 4 }; - results = (words 
>> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 28-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10) >> 60 | SafeLoadAs(in + 8 * 11) << 4, - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11) >> 52 | SafeLoadAs(in + 8 * 12) << 12, - }; - shifts = simd_batch{ 32, 0, 24, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 28-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12) >> 44 | SafeLoadAs(in + 8 * 13) << 20, - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13), - }; - shifts = simd_batch{ 16, 0, 8, 36 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 28-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14) >> 56 | SafeLoadAs(in + 8 * 15) << 8, - SafeLoadAs(in + 8 * 15), - }; - shifts = simd_batch{ 0, 28, 0, 20 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 28-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 15) >> 48 | SafeLoadAs(in + 8 * 16) << 16, - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 16) >> 40 | SafeLoadAs(in + 8 * 17) << 24, - SafeLoadAs(in + 8 * 17), - }; - shifts = simd_batch{ 0, 12, 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 28-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 17), - SafeLoadAs(in + 8 * 17) >> 60 | SafeLoadAs(in + 8 * 18) << 4, - SafeLoadAs(in + 8 * 18), - SafeLoadAs(in + 8 * 18) >> 52 | SafeLoadAs(in + 8 * 19) << 12, - }; - shifts = simd_batch{ 32, 0, 24, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 28-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 19), - SafeLoadAs(in + 8 * 19) >> 44 | SafeLoadAs(in + 8 * 20) << 20, - 
SafeLoadAs(in + 8 * 20), - SafeLoadAs(in + 8 * 20), - }; - shifts = simd_batch{ 16, 0, 8, 36 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 28-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 21), - SafeLoadAs(in + 8 * 21), - SafeLoadAs(in + 8 * 21) >> 56 | SafeLoadAs(in + 8 * 22) << 8, - SafeLoadAs(in + 8 * 22), - }; - shifts = simd_batch{ 0, 28, 0, 20 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 28-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 22) >> 48 | SafeLoadAs(in + 8 * 23) << 16, - SafeLoadAs(in + 8 * 23), - SafeLoadAs(in + 8 * 23) >> 40 | SafeLoadAs(in + 8 * 24) << 24, - SafeLoadAs(in + 8 * 24), - }; - shifts = simd_batch{ 0, 12, 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 28-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 24), - SafeLoadAs(in + 8 * 24) >> 60 | SafeLoadAs(in + 8 * 25) << 4, - SafeLoadAs(in + 8 * 25), - SafeLoadAs(in + 8 * 25) >> 52 | SafeLoadAs(in + 8 * 26) << 12, - }; - shifts = simd_batch{ 32, 0, 24, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 28-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 26), - SafeLoadAs(in + 8 * 26) >> 44 | SafeLoadAs(in + 8 * 27) << 20, - SafeLoadAs(in + 8 * 27), - SafeLoadAs(in + 8 * 27), - }; - shifts = simd_batch{ 16, 0, 8, 36 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 28 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x1fffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 29-bit bundles 0 to 3 - 
words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 58 | SafeLoadAs(in + 8 * 1) << 6, - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 0, 29, 0, 23 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 29-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 1) >> 52 | SafeLoadAs(in + 8 * 2) << 12, - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2) >> 46 | SafeLoadAs(in + 8 * 3) << 18, - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 0, 17, 0, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 29-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 3) >> 40 | SafeLoadAs(in + 8 * 4) << 24, - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4) >> 63 | SafeLoadAs(in + 8 * 5) << 1, - }; - shifts = simd_batch{ 0, 5, 34, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 29-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5) >> 57 | SafeLoadAs(in + 8 * 6) << 7, - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6) >> 51 | SafeLoadAs(in + 8 * 7) << 13, - }; - shifts = simd_batch{ 28, 0, 22, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 29-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7) >> 45 | SafeLoadAs(in + 8 * 8) << 19, - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8) >> 39 | SafeLoadAs(in + 8 * 9) << 25, - }; - shifts = simd_batch{ 16, 0, 10, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 29-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9) >> 62 | SafeLoadAs(in + 8 * 10) << 2, - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 4, 33, 0, 27 }; - 
results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 29-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 10) >> 56 | SafeLoadAs(in + 8 * 11) << 8, - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11) >> 50 | SafeLoadAs(in + 8 * 12) << 14, - SafeLoadAs(in + 8 * 12), - }; - shifts = simd_batch{ 0, 21, 0, 15 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 29-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 12) >> 44 | SafeLoadAs(in + 8 * 13) << 20, - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13) >> 38 | SafeLoadAs(in + 8 * 14) << 26, - SafeLoadAs(in + 8 * 14), - }; - shifts = simd_batch{ 0, 9, 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 29-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14) >> 61 | SafeLoadAs(in + 8 * 15) << 3, - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15) >> 55 | SafeLoadAs(in + 8 * 16) << 9, - }; - shifts = simd_batch{ 32, 0, 26, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 29-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 16) >> 49 | SafeLoadAs(in + 8 * 17) << 15, - SafeLoadAs(in + 8 * 17), - SafeLoadAs(in + 8 * 17) >> 43 | SafeLoadAs(in + 8 * 18) << 21, - }; - shifts = simd_batch{ 20, 0, 14, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 29-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 18), - SafeLoadAs(in + 8 * 18) >> 37 | SafeLoadAs(in + 8 * 19) << 27, - SafeLoadAs(in + 8 * 19), - SafeLoadAs(in + 8 * 19), - }; - shifts = simd_batch{ 8, 0, 2, 31 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 29-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 19) >> 60 | 
SafeLoadAs(in + 8 * 20) << 4, - SafeLoadAs(in + 8 * 20), - SafeLoadAs(in + 8 * 20) >> 54 | SafeLoadAs(in + 8 * 21) << 10, - SafeLoadAs(in + 8 * 21), - }; - shifts = simd_batch{ 0, 25, 0, 19 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 29-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 21) >> 48 | SafeLoadAs(in + 8 * 22) << 16, - SafeLoadAs(in + 8 * 22), - SafeLoadAs(in + 8 * 22) >> 42 | SafeLoadAs(in + 8 * 23) << 22, - SafeLoadAs(in + 8 * 23), - }; - shifts = simd_batch{ 0, 13, 0, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 29-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 23) >> 36 | SafeLoadAs(in + 8 * 24) << 28, - SafeLoadAs(in + 8 * 24), - SafeLoadAs(in + 8 * 24), - SafeLoadAs(in + 8 * 24) >> 59 | SafeLoadAs(in + 8 * 25) << 5, - }; - shifts = simd_batch{ 0, 1, 30, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 29-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 25), - SafeLoadAs(in + 8 * 25) >> 53 | SafeLoadAs(in + 8 * 26) << 11, - SafeLoadAs(in + 8 * 26), - SafeLoadAs(in + 8 * 26) >> 47 | SafeLoadAs(in + 8 * 27) << 17, - }; - shifts = simd_batch{ 24, 0, 18, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 29-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 27), - SafeLoadAs(in + 8 * 27) >> 41 | SafeLoadAs(in + 8 * 28) << 23, - SafeLoadAs(in + 8 * 28), - SafeLoadAs(in + 8 * 28), - }; - shifts = simd_batch{ 12, 0, 6, 35 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 29 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t 
kMask = 0x3fffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 30-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 60 | SafeLoadAs(in + 8 * 1) << 4, - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 0, 30, 0, 26 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 30-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 1) >> 56 | SafeLoadAs(in + 8 * 2) << 8, - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2) >> 52 | SafeLoadAs(in + 8 * 3) << 12, - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 0, 22, 0, 18 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 30-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 3) >> 48 | SafeLoadAs(in + 8 * 4) << 16, - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4) >> 44 | SafeLoadAs(in + 8 * 5) << 20, - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 0, 14, 0, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 30-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 5) >> 40 | SafeLoadAs(in + 8 * 6) << 24, - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6) >> 36 | SafeLoadAs(in + 8 * 7) << 28, - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 0, 6, 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 30-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7) >> 62 | SafeLoadAs(in + 8 * 8) << 2, - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8) >> 58 | SafeLoadAs(in + 8 * 9) << 6, - }; - shifts = simd_batch{ 32, 0, 28, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 30-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9) >> 54 | 
SafeLoadAs(in + 8 * 10) << 10, - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10) >> 50 | SafeLoadAs(in + 8 * 11) << 14, - }; - shifts = simd_batch{ 24, 0, 20, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 30-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11) >> 46 | SafeLoadAs(in + 8 * 12) << 18, - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12) >> 42 | SafeLoadAs(in + 8 * 13) << 22, - }; - shifts = simd_batch{ 16, 0, 12, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 30-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13) >> 38 | SafeLoadAs(in + 8 * 14) << 26, - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14), - }; - shifts = simd_batch{ 8, 0, 4, 34 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 30-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15) >> 60 | SafeLoadAs(in + 8 * 16) << 4, - SafeLoadAs(in + 8 * 16), - }; - shifts = simd_batch{ 0, 30, 0, 26 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 30-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 16) >> 56 | SafeLoadAs(in + 8 * 17) << 8, - SafeLoadAs(in + 8 * 17), - SafeLoadAs(in + 8 * 17) >> 52 | SafeLoadAs(in + 8 * 18) << 12, - SafeLoadAs(in + 8 * 18), - }; - shifts = simd_batch{ 0, 22, 0, 18 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 30-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 18) >> 48 | SafeLoadAs(in + 8 * 19) << 16, - SafeLoadAs(in + 8 * 19), - SafeLoadAs(in + 8 * 19) >> 44 | SafeLoadAs(in + 8 * 20) << 20, - SafeLoadAs(in + 8 * 20), - }; - shifts = simd_batch{ 0, 14, 0, 10 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 4; - - // extract 30-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 20) >> 40 | SafeLoadAs(in + 8 * 21) << 24, - SafeLoadAs(in + 8 * 21), - SafeLoadAs(in + 8 * 21) >> 36 | SafeLoadAs(in + 8 * 22) << 28, - SafeLoadAs(in + 8 * 22), - }; - shifts = simd_batch{ 0, 6, 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 30-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 22), - SafeLoadAs(in + 8 * 22) >> 62 | SafeLoadAs(in + 8 * 23) << 2, - SafeLoadAs(in + 8 * 23), - SafeLoadAs(in + 8 * 23) >> 58 | SafeLoadAs(in + 8 * 24) << 6, - }; - shifts = simd_batch{ 32, 0, 28, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 30-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 24), - SafeLoadAs(in + 8 * 24) >> 54 | SafeLoadAs(in + 8 * 25) << 10, - SafeLoadAs(in + 8 * 25), - SafeLoadAs(in + 8 * 25) >> 50 | SafeLoadAs(in + 8 * 26) << 14, - }; - shifts = simd_batch{ 24, 0, 20, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 30-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 26), - SafeLoadAs(in + 8 * 26) >> 46 | SafeLoadAs(in + 8 * 27) << 18, - SafeLoadAs(in + 8 * 27), - SafeLoadAs(in + 8 * 27) >> 42 | SafeLoadAs(in + 8 * 28) << 22, - }; - shifts = simd_batch{ 16, 0, 12, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 30-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 28), - SafeLoadAs(in + 8 * 28) >> 38 | SafeLoadAs(in + 8 * 29) << 26, - SafeLoadAs(in + 8 * 29), - SafeLoadAs(in + 8 * 29), - }; - shifts = simd_batch{ 8, 0, 4, 34 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 30 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - 
static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x7fffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 31-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 62 | SafeLoadAs(in + 8 * 1) << 2, - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 0, 31, 0, 29 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 31-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 1) >> 60 | SafeLoadAs(in + 8 * 2) << 4, - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2) >> 58 | SafeLoadAs(in + 8 * 3) << 6, - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 0, 27, 0, 25 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 31-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 3) >> 56 | SafeLoadAs(in + 8 * 4) << 8, - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4) >> 54 | SafeLoadAs(in + 8 * 5) << 10, - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 0, 23, 0, 21 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 31-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 5) >> 52 | SafeLoadAs(in + 8 * 6) << 12, - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6) >> 50 | SafeLoadAs(in + 8 * 7) << 14, - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 0, 19, 0, 17 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 31-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 7) >> 48 | SafeLoadAs(in + 8 * 8) << 16, - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8) >> 46 | SafeLoadAs(in + 8 * 9) << 18, - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 0, 15, 0, 13 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 4; - - // extract 31-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 9) >> 44 | SafeLoadAs(in + 8 * 10) << 20, - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10) >> 42 | SafeLoadAs(in + 8 * 11) << 22, - SafeLoadAs(in + 8 * 11), - }; - shifts = simd_batch{ 0, 11, 0, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 31-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 11) >> 40 | SafeLoadAs(in + 8 * 12) << 24, - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12) >> 38 | SafeLoadAs(in + 8 * 13) << 26, - SafeLoadAs(in + 8 * 13), - }; - shifts = simd_batch{ 0, 7, 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 31-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 13) >> 36 | SafeLoadAs(in + 8 * 14) << 28, - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14) >> 34 | SafeLoadAs(in + 8 * 15) << 30, - SafeLoadAs(in + 8 * 15), - }; - shifts = simd_batch{ 0, 3, 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 31-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15) >> 63 | SafeLoadAs(in + 8 * 16) << 1, - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 16) >> 61 | SafeLoadAs(in + 8 * 17) << 3, - }; - shifts = simd_batch{ 32, 0, 30, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 31-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 17), - SafeLoadAs(in + 8 * 17) >> 59 | SafeLoadAs(in + 8 * 18) << 5, - SafeLoadAs(in + 8 * 18), - SafeLoadAs(in + 8 * 18) >> 57 | SafeLoadAs(in + 8 * 19) << 7, - }; - shifts = simd_batch{ 28, 0, 26, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 31-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 19), - SafeLoadAs(in + 8 
* 19) >> 55 | SafeLoadAs(in + 8 * 20) << 9, - SafeLoadAs(in + 8 * 20), - SafeLoadAs(in + 8 * 20) >> 53 | SafeLoadAs(in + 8 * 21) << 11, - }; - shifts = simd_batch{ 24, 0, 22, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 31-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 21), - SafeLoadAs(in + 8 * 21) >> 51 | SafeLoadAs(in + 8 * 22) << 13, - SafeLoadAs(in + 8 * 22), - SafeLoadAs(in + 8 * 22) >> 49 | SafeLoadAs(in + 8 * 23) << 15, - }; - shifts = simd_batch{ 20, 0, 18, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 31-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 23), - SafeLoadAs(in + 8 * 23) >> 47 | SafeLoadAs(in + 8 * 24) << 17, - SafeLoadAs(in + 8 * 24), - SafeLoadAs(in + 8 * 24) >> 45 | SafeLoadAs(in + 8 * 25) << 19, - }; - shifts = simd_batch{ 16, 0, 14, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 31-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 25), - SafeLoadAs(in + 8 * 25) >> 43 | SafeLoadAs(in + 8 * 26) << 21, - SafeLoadAs(in + 8 * 26), - SafeLoadAs(in + 8 * 26) >> 41 | SafeLoadAs(in + 8 * 27) << 23, - }; - shifts = simd_batch{ 12, 0, 10, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 31-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 27), - SafeLoadAs(in + 8 * 27) >> 39 | SafeLoadAs(in + 8 * 28) << 25, - SafeLoadAs(in + 8 * 28), - SafeLoadAs(in + 8 * 28) >> 37 | SafeLoadAs(in + 8 * 29) << 27, - }; - shifts = simd_batch{ 8, 0, 6, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 31-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 29), - SafeLoadAs(in + 8 * 29) >> 35 | SafeLoadAs(in + 8 * 30) << 29, - SafeLoadAs(in + 8 * 30), - SafeLoadAs(in + 8 * 30), - }; - shifts = simd_batch{ 4, 0, 2, 33 }; 
- results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 31 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0xffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 32-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1), - }; - shifts = simd_batch{ 0, 32, 0, 32 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 32-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3), - }; - shifts = simd_batch{ 0, 32, 0, 32 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 32-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 0, 32, 0, 32 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 32-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 0, 32, 0, 32 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 32-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 0, 32, 0, 32 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 32-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - 
SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11), - }; - shifts = simd_batch{ 0, 32, 0, 32 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 32-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13), - }; - shifts = simd_batch{ 0, 32, 0, 32 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 32-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15), - }; - shifts = simd_batch{ 0, 32, 0, 32 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 32-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 17), - SafeLoadAs(in + 8 * 17), - }; - shifts = simd_batch{ 0, 32, 0, 32 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 32-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 18), - SafeLoadAs(in + 8 * 18), - SafeLoadAs(in + 8 * 19), - SafeLoadAs(in + 8 * 19), - }; - shifts = simd_batch{ 0, 32, 0, 32 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 32-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 20), - SafeLoadAs(in + 8 * 20), - SafeLoadAs(in + 8 * 21), - SafeLoadAs(in + 8 * 21), - }; - shifts = simd_batch{ 0, 32, 0, 32 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 32-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 22), - SafeLoadAs(in + 8 * 22), - SafeLoadAs(in + 8 * 23), - SafeLoadAs(in + 8 * 23), - }; - shifts = simd_batch{ 0, 32, 0, 32 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 
32-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 24), - SafeLoadAs(in + 8 * 24), - SafeLoadAs(in + 8 * 25), - SafeLoadAs(in + 8 * 25), - }; - shifts = simd_batch{ 0, 32, 0, 32 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 32-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 26), - SafeLoadAs(in + 8 * 26), - SafeLoadAs(in + 8 * 27), - SafeLoadAs(in + 8 * 27), - }; - shifts = simd_batch{ 0, 32, 0, 32 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 32-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 28), - SafeLoadAs(in + 8 * 28), - SafeLoadAs(in + 8 * 29), - SafeLoadAs(in + 8 * 29), - }; - shifts = simd_batch{ 0, 32, 0, 32 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 32-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 30), - SafeLoadAs(in + 8 * 30), - SafeLoadAs(in + 8 * 31), - SafeLoadAs(in + 8 * 31), - }; - shifts = simd_batch{ 0, 32, 0, 32 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 32 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x1ffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 33-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 33 | SafeLoadAs(in + 8 * 1) << 31, - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1) >> 35 | SafeLoadAs(in + 8 * 2) << 29, - }; - shifts = simd_batch{ 0, 0, 2, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 33-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - 
SafeLoadAs(in + 8 * 2) >> 37 | SafeLoadAs(in + 8 * 3) << 27, - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3) >> 39 | SafeLoadAs(in + 8 * 4) << 25, - }; - shifts = simd_batch{ 4, 0, 6, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 33-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4) >> 41 | SafeLoadAs(in + 8 * 5) << 23, - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5) >> 43 | SafeLoadAs(in + 8 * 6) << 21, - }; - shifts = simd_batch{ 8, 0, 10, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 33-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6) >> 45 | SafeLoadAs(in + 8 * 7) << 19, - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7) >> 47 | SafeLoadAs(in + 8 * 8) << 17, - }; - shifts = simd_batch{ 12, 0, 14, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 33-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8) >> 49 | SafeLoadAs(in + 8 * 9) << 15, - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9) >> 51 | SafeLoadAs(in + 8 * 10) << 13, - }; - shifts = simd_batch{ 16, 0, 18, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 33-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10) >> 53 | SafeLoadAs(in + 8 * 11) << 11, - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11) >> 55 | SafeLoadAs(in + 8 * 12) << 9, - }; - shifts = simd_batch{ 20, 0, 22, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 33-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12) >> 57 | SafeLoadAs(in + 8 * 13) << 7, - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13) >> 59 | SafeLoadAs(in + 8 * 14) << 5, - }; - shifts 
= simd_batch{ 24, 0, 26, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 33-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14) >> 61 | SafeLoadAs(in + 8 * 15) << 3, - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15) >> 63 | SafeLoadAs(in + 8 * 16) << 1, - }; - shifts = simd_batch{ 28, 0, 30, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 33-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 16) >> 32 | SafeLoadAs(in + 8 * 17) << 32, - SafeLoadAs(in + 8 * 17), - SafeLoadAs(in + 8 * 17) >> 34 | SafeLoadAs(in + 8 * 18) << 30, - SafeLoadAs(in + 8 * 18), - }; - shifts = simd_batch{ 0, 1, 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 33-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 18) >> 36 | SafeLoadAs(in + 8 * 19) << 28, - SafeLoadAs(in + 8 * 19), - SafeLoadAs(in + 8 * 19) >> 38 | SafeLoadAs(in + 8 * 20) << 26, - SafeLoadAs(in + 8 * 20), - }; - shifts = simd_batch{ 0, 5, 0, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 33-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 20) >> 40 | SafeLoadAs(in + 8 * 21) << 24, - SafeLoadAs(in + 8 * 21), - SafeLoadAs(in + 8 * 21) >> 42 | SafeLoadAs(in + 8 * 22) << 22, - SafeLoadAs(in + 8 * 22), - }; - shifts = simd_batch{ 0, 9, 0, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 33-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 22) >> 44 | SafeLoadAs(in + 8 * 23) << 20, - SafeLoadAs(in + 8 * 23), - SafeLoadAs(in + 8 * 23) >> 46 | SafeLoadAs(in + 8 * 24) << 18, - SafeLoadAs(in + 8 * 24), - }; - shifts = simd_batch{ 0, 13, 0, 15 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 33-bit bundles 48 
to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 24) >> 48 | SafeLoadAs(in + 8 * 25) << 16, - SafeLoadAs(in + 8 * 25), - SafeLoadAs(in + 8 * 25) >> 50 | SafeLoadAs(in + 8 * 26) << 14, - SafeLoadAs(in + 8 * 26), - }; - shifts = simd_batch{ 0, 17, 0, 19 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 33-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 26) >> 52 | SafeLoadAs(in + 8 * 27) << 12, - SafeLoadAs(in + 8 * 27), - SafeLoadAs(in + 8 * 27) >> 54 | SafeLoadAs(in + 8 * 28) << 10, - SafeLoadAs(in + 8 * 28), - }; - shifts = simd_batch{ 0, 21, 0, 23 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 33-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 28) >> 56 | SafeLoadAs(in + 8 * 29) << 8, - SafeLoadAs(in + 8 * 29), - SafeLoadAs(in + 8 * 29) >> 58 | SafeLoadAs(in + 8 * 30) << 6, - SafeLoadAs(in + 8 * 30), - }; - shifts = simd_batch{ 0, 25, 0, 27 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 33-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 30) >> 60 | SafeLoadAs(in + 8 * 31) << 4, - SafeLoadAs(in + 8 * 31), - SafeLoadAs(in + 8 * 31) >> 62 | SafeLoadAs(in + 8 * 32) << 2, - SafeLoadAs(in + 8 * 32), - }; - shifts = simd_batch{ 0, 29, 0, 31 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 33 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x3ffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 34-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 34 | SafeLoadAs(in + 8 * 1) << 30, - SafeLoadAs(in + 8 * 1), - 
SafeLoadAs(in + 8 * 1) >> 38 | SafeLoadAs(in + 8 * 2) << 26, - }; - shifts = simd_batch{ 0, 0, 4, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 34-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2) >> 42 | SafeLoadAs(in + 8 * 3) << 22, - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3) >> 46 | SafeLoadAs(in + 8 * 4) << 18, - }; - shifts = simd_batch{ 8, 0, 12, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 34-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4) >> 50 | SafeLoadAs(in + 8 * 5) << 14, - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5) >> 54 | SafeLoadAs(in + 8 * 6) << 10, - }; - shifts = simd_batch{ 16, 0, 20, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 34-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6) >> 58 | SafeLoadAs(in + 8 * 7) << 6, - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7) >> 62 | SafeLoadAs(in + 8 * 8) << 2, - }; - shifts = simd_batch{ 24, 0, 28, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 34-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 8) >> 32 | SafeLoadAs(in + 8 * 9) << 32, - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9) >> 36 | SafeLoadAs(in + 8 * 10) << 28, - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 0, 2, 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 34-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 10) >> 40 | SafeLoadAs(in + 8 * 11) << 24, - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11) >> 44 | SafeLoadAs(in + 8 * 12) << 20, - SafeLoadAs(in + 8 * 12), - }; - shifts = simd_batch{ 0, 10, 0, 14 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 4; - - // extract 34-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 12) >> 48 | SafeLoadAs(in + 8 * 13) << 16, - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13) >> 52 | SafeLoadAs(in + 8 * 14) << 12, - SafeLoadAs(in + 8 * 14), - }; - shifts = simd_batch{ 0, 18, 0, 22 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 34-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 14) >> 56 | SafeLoadAs(in + 8 * 15) << 8, - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15) >> 60 | SafeLoadAs(in + 8 * 16) << 4, - SafeLoadAs(in + 8 * 16), - }; - shifts = simd_batch{ 0, 26, 0, 30 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 34-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 17), - SafeLoadAs(in + 8 * 17) >> 34 | SafeLoadAs(in + 8 * 18) << 30, - SafeLoadAs(in + 8 * 18), - SafeLoadAs(in + 8 * 18) >> 38 | SafeLoadAs(in + 8 * 19) << 26, - }; - shifts = simd_batch{ 0, 0, 4, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 34-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 19), - SafeLoadAs(in + 8 * 19) >> 42 | SafeLoadAs(in + 8 * 20) << 22, - SafeLoadAs(in + 8 * 20), - SafeLoadAs(in + 8 * 20) >> 46 | SafeLoadAs(in + 8 * 21) << 18, - }; - shifts = simd_batch{ 8, 0, 12, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 34-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 21), - SafeLoadAs(in + 8 * 21) >> 50 | SafeLoadAs(in + 8 * 22) << 14, - SafeLoadAs(in + 8 * 22), - SafeLoadAs(in + 8 * 22) >> 54 | SafeLoadAs(in + 8 * 23) << 10, - }; - shifts = simd_batch{ 16, 0, 20, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 34-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 23), - 
SafeLoadAs(in + 8 * 23) >> 58 | SafeLoadAs(in + 8 * 24) << 6, - SafeLoadAs(in + 8 * 24), - SafeLoadAs(in + 8 * 24) >> 62 | SafeLoadAs(in + 8 * 25) << 2, - }; - shifts = simd_batch{ 24, 0, 28, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 34-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 25) >> 32 | SafeLoadAs(in + 8 * 26) << 32, - SafeLoadAs(in + 8 * 26), - SafeLoadAs(in + 8 * 26) >> 36 | SafeLoadAs(in + 8 * 27) << 28, - SafeLoadAs(in + 8 * 27), - }; - shifts = simd_batch{ 0, 2, 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 34-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 27) >> 40 | SafeLoadAs(in + 8 * 28) << 24, - SafeLoadAs(in + 8 * 28), - SafeLoadAs(in + 8 * 28) >> 44 | SafeLoadAs(in + 8 * 29) << 20, - SafeLoadAs(in + 8 * 29), - }; - shifts = simd_batch{ 0, 10, 0, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 34-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 29) >> 48 | SafeLoadAs(in + 8 * 30) << 16, - SafeLoadAs(in + 8 * 30), - SafeLoadAs(in + 8 * 30) >> 52 | SafeLoadAs(in + 8 * 31) << 12, - SafeLoadAs(in + 8 * 31), - }; - shifts = simd_batch{ 0, 18, 0, 22 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 34-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 31) >> 56 | SafeLoadAs(in + 8 * 32) << 8, - SafeLoadAs(in + 8 * 32), - SafeLoadAs(in + 8 * 32) >> 60 | SafeLoadAs(in + 8 * 33) << 4, - SafeLoadAs(in + 8 * 33), - }; - shifts = simd_batch{ 0, 26, 0, 30 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 34 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, 
uint64_t* out) { - constexpr uint64_t kMask = 0x7ffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 35-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 35 | SafeLoadAs(in + 8 * 1) << 29, - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1) >> 41 | SafeLoadAs(in + 8 * 2) << 23, - }; - shifts = simd_batch{ 0, 0, 6, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 35-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2) >> 47 | SafeLoadAs(in + 8 * 3) << 17, - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3) >> 53 | SafeLoadAs(in + 8 * 4) << 11, - }; - shifts = simd_batch{ 12, 0, 18, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 35-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4) >> 59 | SafeLoadAs(in + 8 * 5) << 5, - SafeLoadAs(in + 8 * 5) >> 30 | SafeLoadAs(in + 8 * 6) << 34, - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 24, 0, 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 35-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 6) >> 36 | SafeLoadAs(in + 8 * 7) << 28, - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7) >> 42 | SafeLoadAs(in + 8 * 8) << 22, - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 0, 7, 0, 13 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 35-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 8) >> 48 | SafeLoadAs(in + 8 * 9) << 16, - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9) >> 54 | SafeLoadAs(in + 8 * 10) << 10, - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 0, 19, 0, 25 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 35-bit bundles 20 to 23 - 
words = simd_batch{ - SafeLoadAs(in + 8 * 10) >> 60 | SafeLoadAs(in + 8 * 11) << 4, - SafeLoadAs(in + 8 * 11) >> 31 | SafeLoadAs(in + 8 * 12) << 33, - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12) >> 37 | SafeLoadAs(in + 8 * 13) << 27, - }; - shifts = simd_batch{ 0, 0, 2, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 35-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13) >> 43 | SafeLoadAs(in + 8 * 14) << 21, - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14) >> 49 | SafeLoadAs(in + 8 * 15) << 15, - }; - shifts = simd_batch{ 8, 0, 14, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 35-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15) >> 55 | SafeLoadAs(in + 8 * 16) << 9, - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 16) >> 61 | SafeLoadAs(in + 8 * 17) << 3, - }; - shifts = simd_batch{ 20, 0, 26, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 35-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 17) >> 32 | SafeLoadAs(in + 8 * 18) << 32, - SafeLoadAs(in + 8 * 18), - SafeLoadAs(in + 8 * 18) >> 38 | SafeLoadAs(in + 8 * 19) << 26, - SafeLoadAs(in + 8 * 19), - }; - shifts = simd_batch{ 0, 3, 0, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 35-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 19) >> 44 | SafeLoadAs(in + 8 * 20) << 20, - SafeLoadAs(in + 8 * 20), - SafeLoadAs(in + 8 * 20) >> 50 | SafeLoadAs(in + 8 * 21) << 14, - SafeLoadAs(in + 8 * 21), - }; - shifts = simd_batch{ 0, 15, 0, 21 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 35-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 21) >> 56 | SafeLoadAs(in + 8 * 22) << 8, - SafeLoadAs(in + 8 * 
22), - SafeLoadAs(in + 8 * 22) >> 62 | SafeLoadAs(in + 8 * 23) << 2, - SafeLoadAs(in + 8 * 23) >> 33 | SafeLoadAs(in + 8 * 24) << 31, - }; - shifts = simd_batch{ 0, 27, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 35-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 24), - SafeLoadAs(in + 8 * 24) >> 39 | SafeLoadAs(in + 8 * 25) << 25, - SafeLoadAs(in + 8 * 25), - SafeLoadAs(in + 8 * 25) >> 45 | SafeLoadAs(in + 8 * 26) << 19, - }; - shifts = simd_batch{ 4, 0, 10, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 35-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 26), - SafeLoadAs(in + 8 * 26) >> 51 | SafeLoadAs(in + 8 * 27) << 13, - SafeLoadAs(in + 8 * 27), - SafeLoadAs(in + 8 * 27) >> 57 | SafeLoadAs(in + 8 * 28) << 7, - }; - shifts = simd_batch{ 16, 0, 22, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 35-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 28), - SafeLoadAs(in + 8 * 28) >> 63 | SafeLoadAs(in + 8 * 29) << 1, - SafeLoadAs(in + 8 * 29) >> 34 | SafeLoadAs(in + 8 * 30) << 30, - SafeLoadAs(in + 8 * 30), - }; - shifts = simd_batch{ 28, 0, 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 35-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 30) >> 40 | SafeLoadAs(in + 8 * 31) << 24, - SafeLoadAs(in + 8 * 31), - SafeLoadAs(in + 8 * 31) >> 46 | SafeLoadAs(in + 8 * 32) << 18, - SafeLoadAs(in + 8 * 32), - }; - shifts = simd_batch{ 0, 11, 0, 17 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 35-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 32) >> 52 | SafeLoadAs(in + 8 * 33) << 12, - SafeLoadAs(in + 8 * 33), - SafeLoadAs(in + 8 * 33) >> 58 | SafeLoadAs(in + 8 * 34) << 6, - SafeLoadAs(in + 8 * 34), - }; - 
shifts = simd_batch{ 0, 23, 0, 29 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 35 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0xfffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 36-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 36 | SafeLoadAs(in + 8 * 1) << 28, - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1) >> 44 | SafeLoadAs(in + 8 * 2) << 20, - }; - shifts = simd_batch{ 0, 0, 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 36-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2) >> 52 | SafeLoadAs(in + 8 * 3) << 12, - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3) >> 60 | SafeLoadAs(in + 8 * 4) << 4, - }; - shifts = simd_batch{ 16, 0, 24, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 36-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 4) >> 32 | SafeLoadAs(in + 8 * 5) << 32, - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5) >> 40 | SafeLoadAs(in + 8 * 6) << 24, - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 0, 4, 0, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 36-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 6) >> 48 | SafeLoadAs(in + 8 * 7) << 16, - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7) >> 56 | SafeLoadAs(in + 8 * 8) << 8, - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 0, 20, 0, 28 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 36-bit bundles 16 to 19 - words = 
simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9) >> 36 | SafeLoadAs(in + 8 * 10) << 28, - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10) >> 44 | SafeLoadAs(in + 8 * 11) << 20, - }; - shifts = simd_batch{ 0, 0, 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 36-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11) >> 52 | SafeLoadAs(in + 8 * 12) << 12, - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12) >> 60 | SafeLoadAs(in + 8 * 13) << 4, - }; - shifts = simd_batch{ 16, 0, 24, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 36-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 13) >> 32 | SafeLoadAs(in + 8 * 14) << 32, - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14) >> 40 | SafeLoadAs(in + 8 * 15) << 24, - SafeLoadAs(in + 8 * 15), - }; - shifts = simd_batch{ 0, 4, 0, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 36-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 15) >> 48 | SafeLoadAs(in + 8 * 16) << 16, - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 16) >> 56 | SafeLoadAs(in + 8 * 17) << 8, - SafeLoadAs(in + 8 * 17), - }; - shifts = simd_batch{ 0, 20, 0, 28 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 36-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 18), - SafeLoadAs(in + 8 * 18) >> 36 | SafeLoadAs(in + 8 * 19) << 28, - SafeLoadAs(in + 8 * 19), - SafeLoadAs(in + 8 * 19) >> 44 | SafeLoadAs(in + 8 * 20) << 20, - }; - shifts = simd_batch{ 0, 0, 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 36-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 20), - SafeLoadAs(in + 8 * 20) >> 52 | SafeLoadAs(in + 8 * 21) << 12, - SafeLoadAs(in + 8 * 21), - SafeLoadAs(in + 
8 * 21) >> 60 | SafeLoadAs(in + 8 * 22) << 4, - }; - shifts = simd_batch{ 16, 0, 24, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 36-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 22) >> 32 | SafeLoadAs(in + 8 * 23) << 32, - SafeLoadAs(in + 8 * 23), - SafeLoadAs(in + 8 * 23) >> 40 | SafeLoadAs(in + 8 * 24) << 24, - SafeLoadAs(in + 8 * 24), - }; - shifts = simd_batch{ 0, 4, 0, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 36-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 24) >> 48 | SafeLoadAs(in + 8 * 25) << 16, - SafeLoadAs(in + 8 * 25), - SafeLoadAs(in + 8 * 25) >> 56 | SafeLoadAs(in + 8 * 26) << 8, - SafeLoadAs(in + 8 * 26), - }; - shifts = simd_batch{ 0, 20, 0, 28 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 36-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 27), - SafeLoadAs(in + 8 * 27) >> 36 | SafeLoadAs(in + 8 * 28) << 28, - SafeLoadAs(in + 8 * 28), - SafeLoadAs(in + 8 * 28) >> 44 | SafeLoadAs(in + 8 * 29) << 20, - }; - shifts = simd_batch{ 0, 0, 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 36-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 29), - SafeLoadAs(in + 8 * 29) >> 52 | SafeLoadAs(in + 8 * 30) << 12, - SafeLoadAs(in + 8 * 30), - SafeLoadAs(in + 8 * 30) >> 60 | SafeLoadAs(in + 8 * 31) << 4, - }; - shifts = simd_batch{ 16, 0, 24, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 36-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 31) >> 32 | SafeLoadAs(in + 8 * 32) << 32, - SafeLoadAs(in + 8 * 32), - SafeLoadAs(in + 8 * 32) >> 40 | SafeLoadAs(in + 8 * 33) << 24, - SafeLoadAs(in + 8 * 33), - }; - shifts = simd_batch{ 0, 4, 0, 12 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 4; - - // extract 36-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 33) >> 48 | SafeLoadAs(in + 8 * 34) << 16, - SafeLoadAs(in + 8 * 34), - SafeLoadAs(in + 8 * 34) >> 56 | SafeLoadAs(in + 8 * 35) << 8, - SafeLoadAs(in + 8 * 35), - }; - shifts = simd_batch{ 0, 20, 0, 28 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 36 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x1fffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 37-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 37 | SafeLoadAs(in + 8 * 1) << 27, - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1) >> 47 | SafeLoadAs(in + 8 * 2) << 17, - }; - shifts = simd_batch{ 0, 0, 10, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 37-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2) >> 57 | SafeLoadAs(in + 8 * 3) << 7, - SafeLoadAs(in + 8 * 3) >> 30 | SafeLoadAs(in + 8 * 4) << 34, - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 20, 0, 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 37-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 4) >> 40 | SafeLoadAs(in + 8 * 5) << 24, - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5) >> 50 | SafeLoadAs(in + 8 * 6) << 14, - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 0, 13, 0, 23 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 37-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 6) >> 60 | SafeLoadAs(in + 8 * 7) << 4, - 
SafeLoadAs(in + 8 * 7) >> 33 | SafeLoadAs(in + 8 * 8) << 31, - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8) >> 43 | SafeLoadAs(in + 8 * 9) << 21, - }; - shifts = simd_batch{ 0, 0, 6, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 37-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9) >> 53 | SafeLoadAs(in + 8 * 10) << 11, - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10) >> 63 | SafeLoadAs(in + 8 * 11) << 1, - }; - shifts = simd_batch{ 16, 0, 26, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 37-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 11) >> 36 | SafeLoadAs(in + 8 * 12) << 28, - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12) >> 46 | SafeLoadAs(in + 8 * 13) << 18, - SafeLoadAs(in + 8 * 13), - }; - shifts = simd_batch{ 0, 9, 0, 19 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 37-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 13) >> 56 | SafeLoadAs(in + 8 * 14) << 8, - SafeLoadAs(in + 8 * 14) >> 29 | SafeLoadAs(in + 8 * 15) << 35, - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15) >> 39 | SafeLoadAs(in + 8 * 16) << 25, - }; - shifts = simd_batch{ 0, 0, 2, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 37-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 16) >> 49 | SafeLoadAs(in + 8 * 17) << 15, - SafeLoadAs(in + 8 * 17), - SafeLoadAs(in + 8 * 17) >> 59 | SafeLoadAs(in + 8 * 18) << 5, - }; - shifts = simd_batch{ 12, 0, 22, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 37-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 18) >> 32 | SafeLoadAs(in + 8 * 19) << 32, - SafeLoadAs(in + 8 * 19), - SafeLoadAs(in + 8 * 19) >> 42 | SafeLoadAs(in + 8 * 
20) << 22, - SafeLoadAs(in + 8 * 20), - }; - shifts = simd_batch{ 0, 5, 0, 15 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 37-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 20) >> 52 | SafeLoadAs(in + 8 * 21) << 12, - SafeLoadAs(in + 8 * 21), - SafeLoadAs(in + 8 * 21) >> 62 | SafeLoadAs(in + 8 * 22) << 2, - SafeLoadAs(in + 8 * 22) >> 35 | SafeLoadAs(in + 8 * 23) << 29, - }; - shifts = simd_batch{ 0, 25, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 37-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 23), - SafeLoadAs(in + 8 * 23) >> 45 | SafeLoadAs(in + 8 * 24) << 19, - SafeLoadAs(in + 8 * 24), - SafeLoadAs(in + 8 * 24) >> 55 | SafeLoadAs(in + 8 * 25) << 9, - }; - shifts = simd_batch{ 8, 0, 18, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 37-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 25) >> 28 | SafeLoadAs(in + 8 * 26) << 36, - SafeLoadAs(in + 8 * 26), - SafeLoadAs(in + 8 * 26) >> 38 | SafeLoadAs(in + 8 * 27) << 26, - SafeLoadAs(in + 8 * 27), - }; - shifts = simd_batch{ 0, 1, 0, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 37-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 27) >> 48 | SafeLoadAs(in + 8 * 28) << 16, - SafeLoadAs(in + 8 * 28), - SafeLoadAs(in + 8 * 28) >> 58 | SafeLoadAs(in + 8 * 29) << 6, - SafeLoadAs(in + 8 * 29) >> 31 | SafeLoadAs(in + 8 * 30) << 33, - }; - shifts = simd_batch{ 0, 21, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 37-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 30), - SafeLoadAs(in + 8 * 30) >> 41 | SafeLoadAs(in + 8 * 31) << 23, - SafeLoadAs(in + 8 * 31), - SafeLoadAs(in + 8 * 31) >> 51 | SafeLoadAs(in + 8 * 32) << 13, - }; - shifts = simd_batch{ 4, 0, 
14, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 37-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 32), - SafeLoadAs(in + 8 * 32) >> 61 | SafeLoadAs(in + 8 * 33) << 3, - SafeLoadAs(in + 8 * 33) >> 34 | SafeLoadAs(in + 8 * 34) << 30, - SafeLoadAs(in + 8 * 34), - }; - shifts = simd_batch{ 24, 0, 0, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 37-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 34) >> 44 | SafeLoadAs(in + 8 * 35) << 20, - SafeLoadAs(in + 8 * 35), - SafeLoadAs(in + 8 * 35) >> 54 | SafeLoadAs(in + 8 * 36) << 10, - SafeLoadAs(in + 8 * 36), - }; - shifts = simd_batch{ 0, 17, 0, 27 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 37 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x3fffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 38-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 38 | SafeLoadAs(in + 8 * 1) << 26, - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1) >> 50 | SafeLoadAs(in + 8 * 2) << 14, - }; - shifts = simd_batch{ 0, 0, 12, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 38-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 2), - SafeLoadAs(in + 8 * 2) >> 62 | SafeLoadAs(in + 8 * 3) << 2, - SafeLoadAs(in + 8 * 3) >> 36 | SafeLoadAs(in + 8 * 4) << 28, - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 24, 0, 0, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 38-bit bundles 8 to 11 - words = simd_batch{ - 
SafeLoadAs(in + 8 * 4) >> 48 | SafeLoadAs(in + 8 * 5) << 16, - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5) >> 60 | SafeLoadAs(in + 8 * 6) << 4, - SafeLoadAs(in + 8 * 6) >> 34 | SafeLoadAs(in + 8 * 7) << 30, - }; - shifts = simd_batch{ 0, 22, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 38-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7) >> 46 | SafeLoadAs(in + 8 * 8) << 18, - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8) >> 58 | SafeLoadAs(in + 8 * 9) << 6, - }; - shifts = simd_batch{ 8, 0, 20, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 38-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 9) >> 32 | SafeLoadAs(in + 8 * 10) << 32, - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10) >> 44 | SafeLoadAs(in + 8 * 11) << 20, - SafeLoadAs(in + 8 * 11), - }; - shifts = simd_batch{ 0, 6, 0, 18 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 38-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 11) >> 56 | SafeLoadAs(in + 8 * 12) << 8, - SafeLoadAs(in + 8 * 12) >> 30 | SafeLoadAs(in + 8 * 13) << 34, - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13) >> 42 | SafeLoadAs(in + 8 * 14) << 22, - }; - shifts = simd_batch{ 0, 0, 4, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 38-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14) >> 54 | SafeLoadAs(in + 8 * 15) << 10, - SafeLoadAs(in + 8 * 15) >> 28 | SafeLoadAs(in + 8 * 16) << 36, - SafeLoadAs(in + 8 * 16), - }; - shifts = simd_batch{ 16, 0, 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 38-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 16) >> 40 | SafeLoadAs(in + 8 * 17) << 24, - SafeLoadAs(in + 8 * 
17), - SafeLoadAs(in + 8 * 17) >> 52 | SafeLoadAs(in + 8 * 18) << 12, - SafeLoadAs(in + 8 * 18), - }; - shifts = simd_batch{ 0, 14, 0, 26 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 38-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 19), - SafeLoadAs(in + 8 * 19) >> 38 | SafeLoadAs(in + 8 * 20) << 26, - SafeLoadAs(in + 8 * 20), - SafeLoadAs(in + 8 * 20) >> 50 | SafeLoadAs(in + 8 * 21) << 14, - }; - shifts = simd_batch{ 0, 0, 12, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 38-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 21), - SafeLoadAs(in + 8 * 21) >> 62 | SafeLoadAs(in + 8 * 22) << 2, - SafeLoadAs(in + 8 * 22) >> 36 | SafeLoadAs(in + 8 * 23) << 28, - SafeLoadAs(in + 8 * 23), - }; - shifts = simd_batch{ 24, 0, 0, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 38-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 23) >> 48 | SafeLoadAs(in + 8 * 24) << 16, - SafeLoadAs(in + 8 * 24), - SafeLoadAs(in + 8 * 24) >> 60 | SafeLoadAs(in + 8 * 25) << 4, - SafeLoadAs(in + 8 * 25) >> 34 | SafeLoadAs(in + 8 * 26) << 30, - }; - shifts = simd_batch{ 0, 22, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 38-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 26), - SafeLoadAs(in + 8 * 26) >> 46 | SafeLoadAs(in + 8 * 27) << 18, - SafeLoadAs(in + 8 * 27), - SafeLoadAs(in + 8 * 27) >> 58 | SafeLoadAs(in + 8 * 28) << 6, - }; - shifts = simd_batch{ 8, 0, 20, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 38-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 28) >> 32 | SafeLoadAs(in + 8 * 29) << 32, - SafeLoadAs(in + 8 * 29), - SafeLoadAs(in + 8 * 29) >> 44 | SafeLoadAs(in + 8 * 30) << 20, - SafeLoadAs(in + 8 * 30), - }; - 
shifts = simd_batch{ 0, 6, 0, 18 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 38-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 30) >> 56 | SafeLoadAs(in + 8 * 31) << 8, - SafeLoadAs(in + 8 * 31) >> 30 | SafeLoadAs(in + 8 * 32) << 34, - SafeLoadAs(in + 8 * 32), - SafeLoadAs(in + 8 * 32) >> 42 | SafeLoadAs(in + 8 * 33) << 22, - }; - shifts = simd_batch{ 0, 0, 4, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 38-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 33), - SafeLoadAs(in + 8 * 33) >> 54 | SafeLoadAs(in + 8 * 34) << 10, - SafeLoadAs(in + 8 * 34) >> 28 | SafeLoadAs(in + 8 * 35) << 36, - SafeLoadAs(in + 8 * 35), - }; - shifts = simd_batch{ 16, 0, 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 38-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 35) >> 40 | SafeLoadAs(in + 8 * 36) << 24, - SafeLoadAs(in + 8 * 36), - SafeLoadAs(in + 8 * 36) >> 52 | SafeLoadAs(in + 8 * 37) << 12, - SafeLoadAs(in + 8 * 37), - }; - shifts = simd_batch{ 0, 14, 0, 26 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 38 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x7fffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 39-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 39 | SafeLoadAs(in + 8 * 1) << 25, - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1) >> 53 | SafeLoadAs(in + 8 * 2) << 11, - }; - shifts = simd_batch{ 0, 0, 14, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 
4; - - // extract 39-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 2) >> 28 | SafeLoadAs(in + 8 * 3) << 36, - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3) >> 42 | SafeLoadAs(in + 8 * 4) << 22, - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 0, 3, 0, 17 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 39-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 4) >> 56 | SafeLoadAs(in + 8 * 5) << 8, - SafeLoadAs(in + 8 * 5) >> 31 | SafeLoadAs(in + 8 * 6) << 33, - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6) >> 45 | SafeLoadAs(in + 8 * 7) << 19, - }; - shifts = simd_batch{ 0, 0, 6, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 39-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7) >> 59 | SafeLoadAs(in + 8 * 8) << 5, - SafeLoadAs(in + 8 * 8) >> 34 | SafeLoadAs(in + 8 * 9) << 30, - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 20, 0, 0, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 39-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 9) >> 48 | SafeLoadAs(in + 8 * 10) << 16, - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10) >> 62 | SafeLoadAs(in + 8 * 11) << 2, - SafeLoadAs(in + 8 * 11) >> 37 | SafeLoadAs(in + 8 * 12) << 27, - }; - shifts = simd_batch{ 0, 23, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 39-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12) >> 51 | SafeLoadAs(in + 8 * 13) << 13, - SafeLoadAs(in + 8 * 13) >> 26 | SafeLoadAs(in + 8 * 14) << 38, - SafeLoadAs(in + 8 * 14), - }; - shifts = simd_batch{ 12, 0, 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 39-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 14) >> 40 
| SafeLoadAs(in + 8 * 15) << 24, - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15) >> 54 | SafeLoadAs(in + 8 * 16) << 10, - SafeLoadAs(in + 8 * 16) >> 29 | SafeLoadAs(in + 8 * 17) << 35, - }; - shifts = simd_batch{ 0, 15, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 39-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 17), - SafeLoadAs(in + 8 * 17) >> 43 | SafeLoadAs(in + 8 * 18) << 21, - SafeLoadAs(in + 8 * 18), - SafeLoadAs(in + 8 * 18) >> 57 | SafeLoadAs(in + 8 * 19) << 7, - }; - shifts = simd_batch{ 4, 0, 18, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 39-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 19) >> 32 | SafeLoadAs(in + 8 * 20) << 32, - SafeLoadAs(in + 8 * 20), - SafeLoadAs(in + 8 * 20) >> 46 | SafeLoadAs(in + 8 * 21) << 18, - SafeLoadAs(in + 8 * 21), - }; - shifts = simd_batch{ 0, 7, 0, 21 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 39-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 21) >> 60 | SafeLoadAs(in + 8 * 22) << 4, - SafeLoadAs(in + 8 * 22) >> 35 | SafeLoadAs(in + 8 * 23) << 29, - SafeLoadAs(in + 8 * 23), - SafeLoadAs(in + 8 * 23) >> 49 | SafeLoadAs(in + 8 * 24) << 15, - }; - shifts = simd_batch{ 0, 0, 10, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 39-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 24), - SafeLoadAs(in + 8 * 24) >> 63 | SafeLoadAs(in + 8 * 25) << 1, - SafeLoadAs(in + 8 * 25) >> 38 | SafeLoadAs(in + 8 * 26) << 26, - SafeLoadAs(in + 8 * 26), - }; - shifts = simd_batch{ 24, 0, 0, 13 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 39-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 26) >> 52 | SafeLoadAs(in + 8 * 27) << 12, - SafeLoadAs(in + 8 * 27) >> 27 | 
SafeLoadAs(in + 8 * 28) << 37, - SafeLoadAs(in + 8 * 28), - SafeLoadAs(in + 8 * 28) >> 41 | SafeLoadAs(in + 8 * 29) << 23, - }; - shifts = simd_batch{ 0, 0, 2, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 39-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 29), - SafeLoadAs(in + 8 * 29) >> 55 | SafeLoadAs(in + 8 * 30) << 9, - SafeLoadAs(in + 8 * 30) >> 30 | SafeLoadAs(in + 8 * 31) << 34, - SafeLoadAs(in + 8 * 31), - }; - shifts = simd_batch{ 16, 0, 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 39-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 31) >> 44 | SafeLoadAs(in + 8 * 32) << 20, - SafeLoadAs(in + 8 * 32), - SafeLoadAs(in + 8 * 32) >> 58 | SafeLoadAs(in + 8 * 33) << 6, - SafeLoadAs(in + 8 * 33) >> 33 | SafeLoadAs(in + 8 * 34) << 31, - }; - shifts = simd_batch{ 0, 19, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 39-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 34), - SafeLoadAs(in + 8 * 34) >> 47 | SafeLoadAs(in + 8 * 35) << 17, - SafeLoadAs(in + 8 * 35), - SafeLoadAs(in + 8 * 35) >> 61 | SafeLoadAs(in + 8 * 36) << 3, - }; - shifts = simd_batch{ 8, 0, 22, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 39-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 36) >> 36 | SafeLoadAs(in + 8 * 37) << 28, - SafeLoadAs(in + 8 * 37), - SafeLoadAs(in + 8 * 37) >> 50 | SafeLoadAs(in + 8 * 38) << 14, - SafeLoadAs(in + 8 * 38), - }; - shifts = simd_batch{ 0, 11, 0, 25 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 39 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* 
in, uint64_t* out) { - constexpr uint64_t kMask = 0xffffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 40-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 40 | SafeLoadAs(in + 8 * 1) << 24, - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1) >> 56 | SafeLoadAs(in + 8 * 2) << 8, - }; - shifts = simd_batch{ 0, 0, 16, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 40-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 2) >> 32 | SafeLoadAs(in + 8 * 3) << 32, - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3) >> 48 | SafeLoadAs(in + 8 * 4) << 16, - SafeLoadAs(in + 8 * 4), - }; - shifts = simd_batch{ 0, 8, 0, 24 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 40-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5) >> 40 | SafeLoadAs(in + 8 * 6) << 24, - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6) >> 56 | SafeLoadAs(in + 8 * 7) << 8, - }; - shifts = simd_batch{ 0, 0, 16, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 40-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 7) >> 32 | SafeLoadAs(in + 8 * 8) << 32, - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8) >> 48 | SafeLoadAs(in + 8 * 9) << 16, - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 0, 8, 0, 24 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 40-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10) >> 40 | SafeLoadAs(in + 8 * 11) << 24, - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11) >> 56 | SafeLoadAs(in + 8 * 12) << 8, - }; - shifts = simd_batch{ 0, 0, 16, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 40-bit bundles 20 to 
23 - words = simd_batch{ - SafeLoadAs(in + 8 * 12) >> 32 | SafeLoadAs(in + 8 * 13) << 32, - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13) >> 48 | SafeLoadAs(in + 8 * 14) << 16, - SafeLoadAs(in + 8 * 14), - }; - shifts = simd_batch{ 0, 8, 0, 24 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 40-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15) >> 40 | SafeLoadAs(in + 8 * 16) << 24, - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 16) >> 56 | SafeLoadAs(in + 8 * 17) << 8, - }; - shifts = simd_batch{ 0, 0, 16, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 40-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 17) >> 32 | SafeLoadAs(in + 8 * 18) << 32, - SafeLoadAs(in + 8 * 18), - SafeLoadAs(in + 8 * 18) >> 48 | SafeLoadAs(in + 8 * 19) << 16, - SafeLoadAs(in + 8 * 19), - }; - shifts = simd_batch{ 0, 8, 0, 24 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 40-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 20), - SafeLoadAs(in + 8 * 20) >> 40 | SafeLoadAs(in + 8 * 21) << 24, - SafeLoadAs(in + 8 * 21), - SafeLoadAs(in + 8 * 21) >> 56 | SafeLoadAs(in + 8 * 22) << 8, - }; - shifts = simd_batch{ 0, 0, 16, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 40-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 22) >> 32 | SafeLoadAs(in + 8 * 23) << 32, - SafeLoadAs(in + 8 * 23), - SafeLoadAs(in + 8 * 23) >> 48 | SafeLoadAs(in + 8 * 24) << 16, - SafeLoadAs(in + 8 * 24), - }; - shifts = simd_batch{ 0, 8, 0, 24 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 40-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 25), - SafeLoadAs(in + 8 * 25) >> 40 | SafeLoadAs(in + 8 * 26) << 24, - SafeLoadAs(in + 8 * 26), - 
SafeLoadAs(in + 8 * 26) >> 56 | SafeLoadAs(in + 8 * 27) << 8, - }; - shifts = simd_batch{ 0, 0, 16, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 40-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 27) >> 32 | SafeLoadAs(in + 8 * 28) << 32, - SafeLoadAs(in + 8 * 28), - SafeLoadAs(in + 8 * 28) >> 48 | SafeLoadAs(in + 8 * 29) << 16, - SafeLoadAs(in + 8 * 29), - }; - shifts = simd_batch{ 0, 8, 0, 24 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 40-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 30), - SafeLoadAs(in + 8 * 30) >> 40 | SafeLoadAs(in + 8 * 31) << 24, - SafeLoadAs(in + 8 * 31), - SafeLoadAs(in + 8 * 31) >> 56 | SafeLoadAs(in + 8 * 32) << 8, - }; - shifts = simd_batch{ 0, 0, 16, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 40-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 32) >> 32 | SafeLoadAs(in + 8 * 33) << 32, - SafeLoadAs(in + 8 * 33), - SafeLoadAs(in + 8 * 33) >> 48 | SafeLoadAs(in + 8 * 34) << 16, - SafeLoadAs(in + 8 * 34), - }; - shifts = simd_batch{ 0, 8, 0, 24 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 40-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 35), - SafeLoadAs(in + 8 * 35) >> 40 | SafeLoadAs(in + 8 * 36) << 24, - SafeLoadAs(in + 8 * 36), - SafeLoadAs(in + 8 * 36) >> 56 | SafeLoadAs(in + 8 * 37) << 8, - }; - shifts = simd_batch{ 0, 0, 16, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 40-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 37) >> 32 | SafeLoadAs(in + 8 * 38) << 32, - SafeLoadAs(in + 8 * 38), - SafeLoadAs(in + 8 * 38) >> 48 | SafeLoadAs(in + 8 * 39) << 16, - SafeLoadAs(in + 8 * 39), - }; - shifts = simd_batch{ 0, 8, 0, 24 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 4; - - in += 40 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x1ffffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 41-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 41 | SafeLoadAs(in + 8 * 1) << 23, - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1) >> 59 | SafeLoadAs(in + 8 * 2) << 5, - }; - shifts = simd_batch{ 0, 0, 18, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 41-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 2) >> 36 | SafeLoadAs(in + 8 * 3) << 28, - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3) >> 54 | SafeLoadAs(in + 8 * 4) << 10, - SafeLoadAs(in + 8 * 4) >> 31 | SafeLoadAs(in + 8 * 5) << 33, - }; - shifts = simd_batch{ 0, 13, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 41-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5) >> 49 | SafeLoadAs(in + 8 * 6) << 15, - SafeLoadAs(in + 8 * 6) >> 26 | SafeLoadAs(in + 8 * 7) << 38, - SafeLoadAs(in + 8 * 7), - }; - shifts = simd_batch{ 8, 0, 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 41-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 7) >> 44 | SafeLoadAs(in + 8 * 8) << 20, - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8) >> 62 | SafeLoadAs(in + 8 * 9) << 2, - SafeLoadAs(in + 8 * 9) >> 39 | SafeLoadAs(in + 8 * 10) << 25, - }; - shifts = simd_batch{ 0, 21, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 41-bit bundles 16 to 19 - words = 
simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10) >> 57 | SafeLoadAs(in + 8 * 11) << 7, - SafeLoadAs(in + 8 * 11) >> 34 | SafeLoadAs(in + 8 * 12) << 30, - SafeLoadAs(in + 8 * 12), - }; - shifts = simd_batch{ 16, 0, 0, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 41-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 12) >> 52 | SafeLoadAs(in + 8 * 13) << 12, - SafeLoadAs(in + 8 * 13) >> 29 | SafeLoadAs(in + 8 * 14) << 35, - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14) >> 47 | SafeLoadAs(in + 8 * 15) << 17, - }; - shifts = simd_batch{ 0, 0, 6, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 41-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 15) >> 24 | SafeLoadAs(in + 8 * 16) << 40, - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 16) >> 42 | SafeLoadAs(in + 8 * 17) << 22, - SafeLoadAs(in + 8 * 17), - }; - shifts = simd_batch{ 0, 1, 0, 19 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 41-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 17) >> 60 | SafeLoadAs(in + 8 * 18) << 4, - SafeLoadAs(in + 8 * 18) >> 37 | SafeLoadAs(in + 8 * 19) << 27, - SafeLoadAs(in + 8 * 19), - SafeLoadAs(in + 8 * 19) >> 55 | SafeLoadAs(in + 8 * 20) << 9, - }; - shifts = simd_batch{ 0, 0, 14, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 41-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 20) >> 32 | SafeLoadAs(in + 8 * 21) << 32, - SafeLoadAs(in + 8 * 21), - SafeLoadAs(in + 8 * 21) >> 50 | SafeLoadAs(in + 8 * 22) << 14, - SafeLoadAs(in + 8 * 22) >> 27 | SafeLoadAs(in + 8 * 23) << 37, - }; - shifts = simd_batch{ 0, 9, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 41-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 
23), - SafeLoadAs(in + 8 * 23) >> 45 | SafeLoadAs(in + 8 * 24) << 19, - SafeLoadAs(in + 8 * 24), - SafeLoadAs(in + 8 * 24) >> 63 | SafeLoadAs(in + 8 * 25) << 1, - }; - shifts = simd_batch{ 4, 0, 22, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 41-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 25) >> 40 | SafeLoadAs(in + 8 * 26) << 24, - SafeLoadAs(in + 8 * 26), - SafeLoadAs(in + 8 * 26) >> 58 | SafeLoadAs(in + 8 * 27) << 6, - SafeLoadAs(in + 8 * 27) >> 35 | SafeLoadAs(in + 8 * 28) << 29, - }; - shifts = simd_batch{ 0, 17, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 41-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 28), - SafeLoadAs(in + 8 * 28) >> 53 | SafeLoadAs(in + 8 * 29) << 11, - SafeLoadAs(in + 8 * 29) >> 30 | SafeLoadAs(in + 8 * 30) << 34, - SafeLoadAs(in + 8 * 30), - }; - shifts = simd_batch{ 12, 0, 0, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 41-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 30) >> 48 | SafeLoadAs(in + 8 * 31) << 16, - SafeLoadAs(in + 8 * 31) >> 25 | SafeLoadAs(in + 8 * 32) << 39, - SafeLoadAs(in + 8 * 32), - SafeLoadAs(in + 8 * 32) >> 43 | SafeLoadAs(in + 8 * 33) << 21, - }; - shifts = simd_batch{ 0, 0, 2, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 41-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 33), - SafeLoadAs(in + 8 * 33) >> 61 | SafeLoadAs(in + 8 * 34) << 3, - SafeLoadAs(in + 8 * 34) >> 38 | SafeLoadAs(in + 8 * 35) << 26, - SafeLoadAs(in + 8 * 35), - }; - shifts = simd_batch{ 20, 0, 0, 15 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 41-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 35) >> 56 | SafeLoadAs(in + 8 * 36) << 8, - SafeLoadAs(in + 8 * 36) >> 
33 | SafeLoadAs(in + 8 * 37) << 31, - SafeLoadAs(in + 8 * 37), - SafeLoadAs(in + 8 * 37) >> 51 | SafeLoadAs(in + 8 * 38) << 13, - }; - shifts = simd_batch{ 0, 0, 10, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 41-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 38) >> 28 | SafeLoadAs(in + 8 * 39) << 36, - SafeLoadAs(in + 8 * 39), - SafeLoadAs(in + 8 * 39) >> 46 | SafeLoadAs(in + 8 * 40) << 18, - SafeLoadAs(in + 8 * 40), - }; - shifts = simd_batch{ 0, 5, 0, 23 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 41 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x3ffffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 42-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 42 | SafeLoadAs(in + 8 * 1) << 22, - SafeLoadAs(in + 8 * 1), - SafeLoadAs(in + 8 * 1) >> 62 | SafeLoadAs(in + 8 * 2) << 2, - }; - shifts = simd_batch{ 0, 0, 20, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 42-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 2) >> 40 | SafeLoadAs(in + 8 * 3) << 24, - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3) >> 60 | SafeLoadAs(in + 8 * 4) << 4, - SafeLoadAs(in + 8 * 4) >> 38 | SafeLoadAs(in + 8 * 5) << 26, - }; - shifts = simd_batch{ 0, 18, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 42-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5) >> 58 | SafeLoadAs(in + 8 * 6) << 6, - SafeLoadAs(in + 8 * 6) >> 36 | SafeLoadAs(in + 8 * 7) << 28, - SafeLoadAs(in + 8 * 7), - }; 
- shifts = simd_batch{ 16, 0, 0, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 42-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 7) >> 56 | SafeLoadAs(in + 8 * 8) << 8, - SafeLoadAs(in + 8 * 8) >> 34 | SafeLoadAs(in + 8 * 9) << 30, - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9) >> 54 | SafeLoadAs(in + 8 * 10) << 10, - }; - shifts = simd_batch{ 0, 0, 12, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 42-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 10) >> 32 | SafeLoadAs(in + 8 * 11) << 32, - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11) >> 52 | SafeLoadAs(in + 8 * 12) << 12, - SafeLoadAs(in + 8 * 12) >> 30 | SafeLoadAs(in + 8 * 13) << 34, - }; - shifts = simd_batch{ 0, 10, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 42-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13) >> 50 | SafeLoadAs(in + 8 * 14) << 14, - SafeLoadAs(in + 8 * 14) >> 28 | SafeLoadAs(in + 8 * 15) << 36, - SafeLoadAs(in + 8 * 15), - }; - shifts = simd_batch{ 8, 0, 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 42-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 15) >> 48 | SafeLoadAs(in + 8 * 16) << 16, - SafeLoadAs(in + 8 * 16) >> 26 | SafeLoadAs(in + 8 * 17) << 38, - SafeLoadAs(in + 8 * 17), - SafeLoadAs(in + 8 * 17) >> 46 | SafeLoadAs(in + 8 * 18) << 18, - }; - shifts = simd_batch{ 0, 0, 4, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 42-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 18) >> 24 | SafeLoadAs(in + 8 * 19) << 40, - SafeLoadAs(in + 8 * 19), - SafeLoadAs(in + 8 * 19) >> 44 | SafeLoadAs(in + 8 * 20) << 20, - SafeLoadAs(in + 8 * 20), - }; - shifts = simd_batch{ 0, 2, 0, 22 }; 
- results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 42-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 21), - SafeLoadAs(in + 8 * 21) >> 42 | SafeLoadAs(in + 8 * 22) << 22, - SafeLoadAs(in + 8 * 22), - SafeLoadAs(in + 8 * 22) >> 62 | SafeLoadAs(in + 8 * 23) << 2, - }; - shifts = simd_batch{ 0, 0, 20, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 42-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 23) >> 40 | SafeLoadAs(in + 8 * 24) << 24, - SafeLoadAs(in + 8 * 24), - SafeLoadAs(in + 8 * 24) >> 60 | SafeLoadAs(in + 8 * 25) << 4, - SafeLoadAs(in + 8 * 25) >> 38 | SafeLoadAs(in + 8 * 26) << 26, - }; - shifts = simd_batch{ 0, 18, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 42-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 26), - SafeLoadAs(in + 8 * 26) >> 58 | SafeLoadAs(in + 8 * 27) << 6, - SafeLoadAs(in + 8 * 27) >> 36 | SafeLoadAs(in + 8 * 28) << 28, - SafeLoadAs(in + 8 * 28), - }; - shifts = simd_batch{ 16, 0, 0, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 42-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 28) >> 56 | SafeLoadAs(in + 8 * 29) << 8, - SafeLoadAs(in + 8 * 29) >> 34 | SafeLoadAs(in + 8 * 30) << 30, - SafeLoadAs(in + 8 * 30), - SafeLoadAs(in + 8 * 30) >> 54 | SafeLoadAs(in + 8 * 31) << 10, - }; - shifts = simd_batch{ 0, 0, 12, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 42-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 31) >> 32 | SafeLoadAs(in + 8 * 32) << 32, - SafeLoadAs(in + 8 * 32), - SafeLoadAs(in + 8 * 32) >> 52 | SafeLoadAs(in + 8 * 33) << 12, - SafeLoadAs(in + 8 * 33) >> 30 | SafeLoadAs(in + 8 * 34) << 34, - }; - shifts = simd_batch{ 0, 10, 0, 0 }; - results = (words >> shifts) & 
masks; - results.store_unaligned(out); - out += 4; - - // extract 42-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 34), - SafeLoadAs(in + 8 * 34) >> 50 | SafeLoadAs(in + 8 * 35) << 14, - SafeLoadAs(in + 8 * 35) >> 28 | SafeLoadAs(in + 8 * 36) << 36, - SafeLoadAs(in + 8 * 36), - }; - shifts = simd_batch{ 8, 0, 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 42-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 36) >> 48 | SafeLoadAs(in + 8 * 37) << 16, - SafeLoadAs(in + 8 * 37) >> 26 | SafeLoadAs(in + 8 * 38) << 38, - SafeLoadAs(in + 8 * 38), - SafeLoadAs(in + 8 * 38) >> 46 | SafeLoadAs(in + 8 * 39) << 18, - }; - shifts = simd_batch{ 0, 0, 4, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 42-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 39) >> 24 | SafeLoadAs(in + 8 * 40) << 40, - SafeLoadAs(in + 8 * 40), - SafeLoadAs(in + 8 * 40) >> 44 | SafeLoadAs(in + 8 * 41) << 20, - SafeLoadAs(in + 8 * 41), - }; - shifts = simd_batch{ 0, 2, 0, 22 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 42 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x7ffffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 43-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 43 | SafeLoadAs(in + 8 * 1) << 21, - SafeLoadAs(in + 8 * 1) >> 22 | SafeLoadAs(in + 8 * 2) << 42, - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 0, 0, 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 43-bit bundles 4 to 7 - words = simd_batch{ - 
SafeLoadAs(in + 8 * 2) >> 44 | SafeLoadAs(in + 8 * 3) << 20, - SafeLoadAs(in + 8 * 3) >> 23 | SafeLoadAs(in + 8 * 4) << 41, - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4) >> 45 | SafeLoadAs(in + 8 * 5) << 19, - }; - shifts = simd_batch{ 0, 0, 2, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 43-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 5) >> 24 | SafeLoadAs(in + 8 * 6) << 40, - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6) >> 46 | SafeLoadAs(in + 8 * 7) << 18, - SafeLoadAs(in + 8 * 7) >> 25 | SafeLoadAs(in + 8 * 8) << 39, - }; - shifts = simd_batch{ 0, 3, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 43-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8) >> 47 | SafeLoadAs(in + 8 * 9) << 17, - SafeLoadAs(in + 8 * 9) >> 26 | SafeLoadAs(in + 8 * 10) << 38, - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 4, 0, 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 43-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 10) >> 48 | SafeLoadAs(in + 8 * 11) << 16, - SafeLoadAs(in + 8 * 11) >> 27 | SafeLoadAs(in + 8 * 12) << 37, - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12) >> 49 | SafeLoadAs(in + 8 * 13) << 15, - }; - shifts = simd_batch{ 0, 0, 6, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 43-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 13) >> 28 | SafeLoadAs(in + 8 * 14) << 36, - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14) >> 50 | SafeLoadAs(in + 8 * 15) << 14, - SafeLoadAs(in + 8 * 15) >> 29 | SafeLoadAs(in + 8 * 16) << 35, - }; - shifts = simd_batch{ 0, 7, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 43-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 
8 * 16), - SafeLoadAs(in + 8 * 16) >> 51 | SafeLoadAs(in + 8 * 17) << 13, - SafeLoadAs(in + 8 * 17) >> 30 | SafeLoadAs(in + 8 * 18) << 34, - SafeLoadAs(in + 8 * 18), - }; - shifts = simd_batch{ 8, 0, 0, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 43-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 18) >> 52 | SafeLoadAs(in + 8 * 19) << 12, - SafeLoadAs(in + 8 * 19) >> 31 | SafeLoadAs(in + 8 * 20) << 33, - SafeLoadAs(in + 8 * 20), - SafeLoadAs(in + 8 * 20) >> 53 | SafeLoadAs(in + 8 * 21) << 11, - }; - shifts = simd_batch{ 0, 0, 10, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 43-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 21) >> 32 | SafeLoadAs(in + 8 * 22) << 32, - SafeLoadAs(in + 8 * 22), - SafeLoadAs(in + 8 * 22) >> 54 | SafeLoadAs(in + 8 * 23) << 10, - SafeLoadAs(in + 8 * 23) >> 33 | SafeLoadAs(in + 8 * 24) << 31, - }; - shifts = simd_batch{ 0, 11, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 43-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 24), - SafeLoadAs(in + 8 * 24) >> 55 | SafeLoadAs(in + 8 * 25) << 9, - SafeLoadAs(in + 8 * 25) >> 34 | SafeLoadAs(in + 8 * 26) << 30, - SafeLoadAs(in + 8 * 26), - }; - shifts = simd_batch{ 12, 0, 0, 13 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 43-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 26) >> 56 | SafeLoadAs(in + 8 * 27) << 8, - SafeLoadAs(in + 8 * 27) >> 35 | SafeLoadAs(in + 8 * 28) << 29, - SafeLoadAs(in + 8 * 28), - SafeLoadAs(in + 8 * 28) >> 57 | SafeLoadAs(in + 8 * 29) << 7, - }; - shifts = simd_batch{ 0, 0, 14, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 43-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 29) >> 36 | SafeLoadAs(in + 8 
* 30) << 28, - SafeLoadAs(in + 8 * 30), - SafeLoadAs(in + 8 * 30) >> 58 | SafeLoadAs(in + 8 * 31) << 6, - SafeLoadAs(in + 8 * 31) >> 37 | SafeLoadAs(in + 8 * 32) << 27, - }; - shifts = simd_batch{ 0, 15, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 43-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 32), - SafeLoadAs(in + 8 * 32) >> 59 | SafeLoadAs(in + 8 * 33) << 5, - SafeLoadAs(in + 8 * 33) >> 38 | SafeLoadAs(in + 8 * 34) << 26, - SafeLoadAs(in + 8 * 34), - }; - shifts = simd_batch{ 16, 0, 0, 17 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 43-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 34) >> 60 | SafeLoadAs(in + 8 * 35) << 4, - SafeLoadAs(in + 8 * 35) >> 39 | SafeLoadAs(in + 8 * 36) << 25, - SafeLoadAs(in + 8 * 36), - SafeLoadAs(in + 8 * 36) >> 61 | SafeLoadAs(in + 8 * 37) << 3, - }; - shifts = simd_batch{ 0, 0, 18, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 43-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 37) >> 40 | SafeLoadAs(in + 8 * 38) << 24, - SafeLoadAs(in + 8 * 38), - SafeLoadAs(in + 8 * 38) >> 62 | SafeLoadAs(in + 8 * 39) << 2, - SafeLoadAs(in + 8 * 39) >> 41 | SafeLoadAs(in + 8 * 40) << 23, - }; - shifts = simd_batch{ 0, 19, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 43-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 40), - SafeLoadAs(in + 8 * 40) >> 63 | SafeLoadAs(in + 8 * 41) << 1, - SafeLoadAs(in + 8 * 41) >> 42 | SafeLoadAs(in + 8 * 42) << 22, - SafeLoadAs(in + 8 * 42), - }; - shifts = simd_batch{ 20, 0, 0, 21 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 43 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static 
constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0xfffffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 44-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 44 | SafeLoadAs(in + 8 * 1) << 20, - SafeLoadAs(in + 8 * 1) >> 24 | SafeLoadAs(in + 8 * 2) << 40, - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 0, 0, 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 44-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 2) >> 48 | SafeLoadAs(in + 8 * 3) << 16, - SafeLoadAs(in + 8 * 3) >> 28 | SafeLoadAs(in + 8 * 4) << 36, - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4) >> 52 | SafeLoadAs(in + 8 * 5) << 12, - }; - shifts = simd_batch{ 0, 0, 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 44-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 5) >> 32 | SafeLoadAs(in + 8 * 6) << 32, - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6) >> 56 | SafeLoadAs(in + 8 * 7) << 8, - SafeLoadAs(in + 8 * 7) >> 36 | SafeLoadAs(in + 8 * 8) << 28, - }; - shifts = simd_batch{ 0, 12, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 44-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8) >> 60 | SafeLoadAs(in + 8 * 9) << 4, - SafeLoadAs(in + 8 * 9) >> 40 | SafeLoadAs(in + 8 * 10) << 24, - SafeLoadAs(in + 8 * 10), - }; - shifts = simd_batch{ 16, 0, 0, 20 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 44-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11) >> 44 | SafeLoadAs(in + 8 * 12) << 20, - SafeLoadAs(in + 8 * 12) >> 24 | SafeLoadAs(in + 8 * 13) << 40, - SafeLoadAs(in + 8 * 13), 
- }; - shifts = simd_batch{ 0, 0, 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 44-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 13) >> 48 | SafeLoadAs(in + 8 * 14) << 16, - SafeLoadAs(in + 8 * 14) >> 28 | SafeLoadAs(in + 8 * 15) << 36, - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15) >> 52 | SafeLoadAs(in + 8 * 16) << 12, - }; - shifts = simd_batch{ 0, 0, 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 44-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 16) >> 32 | SafeLoadAs(in + 8 * 17) << 32, - SafeLoadAs(in + 8 * 17), - SafeLoadAs(in + 8 * 17) >> 56 | SafeLoadAs(in + 8 * 18) << 8, - SafeLoadAs(in + 8 * 18) >> 36 | SafeLoadAs(in + 8 * 19) << 28, - }; - shifts = simd_batch{ 0, 12, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 44-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 19), - SafeLoadAs(in + 8 * 19) >> 60 | SafeLoadAs(in + 8 * 20) << 4, - SafeLoadAs(in + 8 * 20) >> 40 | SafeLoadAs(in + 8 * 21) << 24, - SafeLoadAs(in + 8 * 21), - }; - shifts = simd_batch{ 16, 0, 0, 20 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 44-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 22), - SafeLoadAs(in + 8 * 22) >> 44 | SafeLoadAs(in + 8 * 23) << 20, - SafeLoadAs(in + 8 * 23) >> 24 | SafeLoadAs(in + 8 * 24) << 40, - SafeLoadAs(in + 8 * 24), - }; - shifts = simd_batch{ 0, 0, 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 44-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 24) >> 48 | SafeLoadAs(in + 8 * 25) << 16, - SafeLoadAs(in + 8 * 25) >> 28 | SafeLoadAs(in + 8 * 26) << 36, - SafeLoadAs(in + 8 * 26), - SafeLoadAs(in + 8 * 26) >> 52 | SafeLoadAs(in + 8 * 27) << 12, - }; - shifts = simd_batch{ 0, 0, 
8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 44-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 27) >> 32 | SafeLoadAs(in + 8 * 28) << 32, - SafeLoadAs(in + 8 * 28), - SafeLoadAs(in + 8 * 28) >> 56 | SafeLoadAs(in + 8 * 29) << 8, - SafeLoadAs(in + 8 * 29) >> 36 | SafeLoadAs(in + 8 * 30) << 28, - }; - shifts = simd_batch{ 0, 12, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 44-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 30), - SafeLoadAs(in + 8 * 30) >> 60 | SafeLoadAs(in + 8 * 31) << 4, - SafeLoadAs(in + 8 * 31) >> 40 | SafeLoadAs(in + 8 * 32) << 24, - SafeLoadAs(in + 8 * 32), - }; - shifts = simd_batch{ 16, 0, 0, 20 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 44-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 33), - SafeLoadAs(in + 8 * 33) >> 44 | SafeLoadAs(in + 8 * 34) << 20, - SafeLoadAs(in + 8 * 34) >> 24 | SafeLoadAs(in + 8 * 35) << 40, - SafeLoadAs(in + 8 * 35), - }; - shifts = simd_batch{ 0, 0, 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 44-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 35) >> 48 | SafeLoadAs(in + 8 * 36) << 16, - SafeLoadAs(in + 8 * 36) >> 28 | SafeLoadAs(in + 8 * 37) << 36, - SafeLoadAs(in + 8 * 37), - SafeLoadAs(in + 8 * 37) >> 52 | SafeLoadAs(in + 8 * 38) << 12, - }; - shifts = simd_batch{ 0, 0, 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 44-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 38) >> 32 | SafeLoadAs(in + 8 * 39) << 32, - SafeLoadAs(in + 8 * 39), - SafeLoadAs(in + 8 * 39) >> 56 | SafeLoadAs(in + 8 * 40) << 8, - SafeLoadAs(in + 8 * 40) >> 36 | SafeLoadAs(in + 8 * 41) << 28, - }; - shifts = simd_batch{ 0, 12, 0, 0 }; - results = (words >> 
shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 44-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 41), - SafeLoadAs(in + 8 * 41) >> 60 | SafeLoadAs(in + 8 * 42) << 4, - SafeLoadAs(in + 8 * 42) >> 40 | SafeLoadAs(in + 8 * 43) << 24, - SafeLoadAs(in + 8 * 43), - }; - shifts = simd_batch{ 16, 0, 0, 20 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 44 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x1fffffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 45-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 45 | SafeLoadAs(in + 8 * 1) << 19, - SafeLoadAs(in + 8 * 1) >> 26 | SafeLoadAs(in + 8 * 2) << 38, - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 0, 0, 0, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 45-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 2) >> 52 | SafeLoadAs(in + 8 * 3) << 12, - SafeLoadAs(in + 8 * 3) >> 33 | SafeLoadAs(in + 8 * 4) << 31, - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4) >> 59 | SafeLoadAs(in + 8 * 5) << 5, - }; - shifts = simd_batch{ 0, 0, 14, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 45-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 5) >> 40 | SafeLoadAs(in + 8 * 6) << 24, - SafeLoadAs(in + 8 * 6) >> 21 | SafeLoadAs(in + 8 * 7) << 43, - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7) >> 47 | SafeLoadAs(in + 8 * 8) << 17, - }; - shifts = simd_batch{ 0, 0, 2, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 45-bit bundles 12 
to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 8) >> 28 | SafeLoadAs(in + 8 * 9) << 36, - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9) >> 54 | SafeLoadAs(in + 8 * 10) << 10, - SafeLoadAs(in + 8 * 10) >> 35 | SafeLoadAs(in + 8 * 11) << 29, - }; - shifts = simd_batch{ 0, 9, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 45-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11) >> 61 | SafeLoadAs(in + 8 * 12) << 3, - SafeLoadAs(in + 8 * 12) >> 42 | SafeLoadAs(in + 8 * 13) << 22, - SafeLoadAs(in + 8 * 13) >> 23 | SafeLoadAs(in + 8 * 14) << 41, - }; - shifts = simd_batch{ 16, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 45-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14) >> 49 | SafeLoadAs(in + 8 * 15) << 15, - SafeLoadAs(in + 8 * 15) >> 30 | SafeLoadAs(in + 8 * 16) << 34, - SafeLoadAs(in + 8 * 16), - }; - shifts = simd_batch{ 4, 0, 0, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 45-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 16) >> 56 | SafeLoadAs(in + 8 * 17) << 8, - SafeLoadAs(in + 8 * 17) >> 37 | SafeLoadAs(in + 8 * 18) << 27, - SafeLoadAs(in + 8 * 18), - SafeLoadAs(in + 8 * 18) >> 63 | SafeLoadAs(in + 8 * 19) << 1, - }; - shifts = simd_batch{ 0, 0, 18, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 45-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 19) >> 44 | SafeLoadAs(in + 8 * 20) << 20, - SafeLoadAs(in + 8 * 20) >> 25 | SafeLoadAs(in + 8 * 21) << 39, - SafeLoadAs(in + 8 * 21), - SafeLoadAs(in + 8 * 21) >> 51 | SafeLoadAs(in + 8 * 22) << 13, - }; - shifts = simd_batch{ 0, 0, 6, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 45-bit bundles 32 
to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 22) >> 32 | SafeLoadAs(in + 8 * 23) << 32, - SafeLoadAs(in + 8 * 23), - SafeLoadAs(in + 8 * 23) >> 58 | SafeLoadAs(in + 8 * 24) << 6, - SafeLoadAs(in + 8 * 24) >> 39 | SafeLoadAs(in + 8 * 25) << 25, - }; - shifts = simd_batch{ 0, 13, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 45-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 25) >> 20 | SafeLoadAs(in + 8 * 26) << 44, - SafeLoadAs(in + 8 * 26), - SafeLoadAs(in + 8 * 26) >> 46 | SafeLoadAs(in + 8 * 27) << 18, - SafeLoadAs(in + 8 * 27) >> 27 | SafeLoadAs(in + 8 * 28) << 37, - }; - shifts = simd_batch{ 0, 1, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 45-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 28), - SafeLoadAs(in + 8 * 28) >> 53 | SafeLoadAs(in + 8 * 29) << 11, - SafeLoadAs(in + 8 * 29) >> 34 | SafeLoadAs(in + 8 * 30) << 30, - SafeLoadAs(in + 8 * 30), - }; - shifts = simd_batch{ 8, 0, 0, 15 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 45-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 30) >> 60 | SafeLoadAs(in + 8 * 31) << 4, - SafeLoadAs(in + 8 * 31) >> 41 | SafeLoadAs(in + 8 * 32) << 23, - SafeLoadAs(in + 8 * 32) >> 22 | SafeLoadAs(in + 8 * 33) << 42, - SafeLoadAs(in + 8 * 33), - }; - shifts = simd_batch{ 0, 0, 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 45-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 33) >> 48 | SafeLoadAs(in + 8 * 34) << 16, - SafeLoadAs(in + 8 * 34) >> 29 | SafeLoadAs(in + 8 * 35) << 35, - SafeLoadAs(in + 8 * 35), - SafeLoadAs(in + 8 * 35) >> 55 | SafeLoadAs(in + 8 * 36) << 9, - }; - shifts = simd_batch{ 0, 0, 10, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 45-bit bundles 
52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 36) >> 36 | SafeLoadAs(in + 8 * 37) << 28, - SafeLoadAs(in + 8 * 37), - SafeLoadAs(in + 8 * 37) >> 62 | SafeLoadAs(in + 8 * 38) << 2, - SafeLoadAs(in + 8 * 38) >> 43 | SafeLoadAs(in + 8 * 39) << 21, - }; - shifts = simd_batch{ 0, 17, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 45-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 39) >> 24 | SafeLoadAs(in + 8 * 40) << 40, - SafeLoadAs(in + 8 * 40), - SafeLoadAs(in + 8 * 40) >> 50 | SafeLoadAs(in + 8 * 41) << 14, - SafeLoadAs(in + 8 * 41) >> 31 | SafeLoadAs(in + 8 * 42) << 33, - }; - shifts = simd_batch{ 0, 5, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 45-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 42), - SafeLoadAs(in + 8 * 42) >> 57 | SafeLoadAs(in + 8 * 43) << 7, - SafeLoadAs(in + 8 * 43) >> 38 | SafeLoadAs(in + 8 * 44) << 26, - SafeLoadAs(in + 8 * 44), - }; - shifts = simd_batch{ 12, 0, 0, 19 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 45 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x3fffffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 46-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 46 | SafeLoadAs(in + 8 * 1) << 18, - SafeLoadAs(in + 8 * 1) >> 28 | SafeLoadAs(in + 8 * 2) << 36, - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 0, 0, 0, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 46-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 2) >> 56 | SafeLoadAs(in 
+ 8 * 3) << 8, - SafeLoadAs(in + 8 * 3) >> 38 | SafeLoadAs(in + 8 * 4) << 26, - SafeLoadAs(in + 8 * 4) >> 20 | SafeLoadAs(in + 8 * 5) << 44, - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 0, 0, 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 46-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 5) >> 48 | SafeLoadAs(in + 8 * 6) << 16, - SafeLoadAs(in + 8 * 6) >> 30 | SafeLoadAs(in + 8 * 7) << 34, - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7) >> 58 | SafeLoadAs(in + 8 * 8) << 6, - }; - shifts = simd_batch{ 0, 0, 12, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 46-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 8) >> 40 | SafeLoadAs(in + 8 * 9) << 24, - SafeLoadAs(in + 8 * 9) >> 22 | SafeLoadAs(in + 8 * 10) << 42, - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10) >> 50 | SafeLoadAs(in + 8 * 11) << 14, - }; - shifts = simd_batch{ 0, 0, 4, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 46-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 11) >> 32 | SafeLoadAs(in + 8 * 12) << 32, - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12) >> 60 | SafeLoadAs(in + 8 * 13) << 4, - SafeLoadAs(in + 8 * 13) >> 42 | SafeLoadAs(in + 8 * 14) << 22, - }; - shifts = simd_batch{ 0, 14, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 46-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 14) >> 24 | SafeLoadAs(in + 8 * 15) << 40, - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15) >> 52 | SafeLoadAs(in + 8 * 16) << 12, - SafeLoadAs(in + 8 * 16) >> 34 | SafeLoadAs(in + 8 * 17) << 30, - }; - shifts = simd_batch{ 0, 6, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 46-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 
17), - SafeLoadAs(in + 8 * 17) >> 62 | SafeLoadAs(in + 8 * 18) << 2, - SafeLoadAs(in + 8 * 18) >> 44 | SafeLoadAs(in + 8 * 19) << 20, - SafeLoadAs(in + 8 * 19) >> 26 | SafeLoadAs(in + 8 * 20) << 38, - }; - shifts = simd_batch{ 16, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 46-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 20), - SafeLoadAs(in + 8 * 20) >> 54 | SafeLoadAs(in + 8 * 21) << 10, - SafeLoadAs(in + 8 * 21) >> 36 | SafeLoadAs(in + 8 * 22) << 28, - SafeLoadAs(in + 8 * 22), - }; - shifts = simd_batch{ 8, 0, 0, 18 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 46-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 23), - SafeLoadAs(in + 8 * 23) >> 46 | SafeLoadAs(in + 8 * 24) << 18, - SafeLoadAs(in + 8 * 24) >> 28 | SafeLoadAs(in + 8 * 25) << 36, - SafeLoadAs(in + 8 * 25), - }; - shifts = simd_batch{ 0, 0, 0, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 46-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 25) >> 56 | SafeLoadAs(in + 8 * 26) << 8, - SafeLoadAs(in + 8 * 26) >> 38 | SafeLoadAs(in + 8 * 27) << 26, - SafeLoadAs(in + 8 * 27) >> 20 | SafeLoadAs(in + 8 * 28) << 44, - SafeLoadAs(in + 8 * 28), - }; - shifts = simd_batch{ 0, 0, 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 46-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 28) >> 48 | SafeLoadAs(in + 8 * 29) << 16, - SafeLoadAs(in + 8 * 29) >> 30 | SafeLoadAs(in + 8 * 30) << 34, - SafeLoadAs(in + 8 * 30), - SafeLoadAs(in + 8 * 30) >> 58 | SafeLoadAs(in + 8 * 31) << 6, - }; - shifts = simd_batch{ 0, 0, 12, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 46-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 31) >> 40 | SafeLoadAs(in + 8 * 
32) << 24, - SafeLoadAs(in + 8 * 32) >> 22 | SafeLoadAs(in + 8 * 33) << 42, - SafeLoadAs(in + 8 * 33), - SafeLoadAs(in + 8 * 33) >> 50 | SafeLoadAs(in + 8 * 34) << 14, - }; - shifts = simd_batch{ 0, 0, 4, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 46-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 34) >> 32 | SafeLoadAs(in + 8 * 35) << 32, - SafeLoadAs(in + 8 * 35), - SafeLoadAs(in + 8 * 35) >> 60 | SafeLoadAs(in + 8 * 36) << 4, - SafeLoadAs(in + 8 * 36) >> 42 | SafeLoadAs(in + 8 * 37) << 22, - }; - shifts = simd_batch{ 0, 14, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 46-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 37) >> 24 | SafeLoadAs(in + 8 * 38) << 40, - SafeLoadAs(in + 8 * 38), - SafeLoadAs(in + 8 * 38) >> 52 | SafeLoadAs(in + 8 * 39) << 12, - SafeLoadAs(in + 8 * 39) >> 34 | SafeLoadAs(in + 8 * 40) << 30, - }; - shifts = simd_batch{ 0, 6, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 46-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 40), - SafeLoadAs(in + 8 * 40) >> 62 | SafeLoadAs(in + 8 * 41) << 2, - SafeLoadAs(in + 8 * 41) >> 44 | SafeLoadAs(in + 8 * 42) << 20, - SafeLoadAs(in + 8 * 42) >> 26 | SafeLoadAs(in + 8 * 43) << 38, - }; - shifts = simd_batch{ 16, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 46-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 43), - SafeLoadAs(in + 8 * 43) >> 54 | SafeLoadAs(in + 8 * 44) << 10, - SafeLoadAs(in + 8 * 44) >> 36 | SafeLoadAs(in + 8 * 45) << 28, - SafeLoadAs(in + 8 * 45), - }; - shifts = simd_batch{ 8, 0, 0, 18 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 46 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = 
xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x7fffffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 47-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 47 | SafeLoadAs(in + 8 * 1) << 17, - SafeLoadAs(in + 8 * 1) >> 30 | SafeLoadAs(in + 8 * 2) << 34, - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 0, 0, 0, 13 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 47-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 2) >> 60 | SafeLoadAs(in + 8 * 3) << 4, - SafeLoadAs(in + 8 * 3) >> 43 | SafeLoadAs(in + 8 * 4) << 21, - SafeLoadAs(in + 8 * 4) >> 26 | SafeLoadAs(in + 8 * 5) << 38, - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 0, 0, 0, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 47-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 5) >> 56 | SafeLoadAs(in + 8 * 6) << 8, - SafeLoadAs(in + 8 * 6) >> 39 | SafeLoadAs(in + 8 * 7) << 25, - SafeLoadAs(in + 8 * 7) >> 22 | SafeLoadAs(in + 8 * 8) << 42, - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 0, 0, 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 47-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 8) >> 52 | SafeLoadAs(in + 8 * 9) << 12, - SafeLoadAs(in + 8 * 9) >> 35 | SafeLoadAs(in + 8 * 10) << 29, - SafeLoadAs(in + 8 * 10) >> 18 | SafeLoadAs(in + 8 * 11) << 46, - SafeLoadAs(in + 8 * 11), - }; - shifts = simd_batch{ 0, 0, 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 47-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 11) >> 48 | SafeLoadAs(in + 8 * 12) << 16, - SafeLoadAs(in + 8 * 12) >> 31 | 
SafeLoadAs(in + 8 * 13) << 33, - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13) >> 61 | SafeLoadAs(in + 8 * 14) << 3, - }; - shifts = simd_batch{ 0, 0, 14, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 47-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 14) >> 44 | SafeLoadAs(in + 8 * 15) << 20, - SafeLoadAs(in + 8 * 15) >> 27 | SafeLoadAs(in + 8 * 16) << 37, - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 16) >> 57 | SafeLoadAs(in + 8 * 17) << 7, - }; - shifts = simd_batch{ 0, 0, 10, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 47-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 17) >> 40 | SafeLoadAs(in + 8 * 18) << 24, - SafeLoadAs(in + 8 * 18) >> 23 | SafeLoadAs(in + 8 * 19) << 41, - SafeLoadAs(in + 8 * 19), - SafeLoadAs(in + 8 * 19) >> 53 | SafeLoadAs(in + 8 * 20) << 11, - }; - shifts = simd_batch{ 0, 0, 6, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 47-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 20) >> 36 | SafeLoadAs(in + 8 * 21) << 28, - SafeLoadAs(in + 8 * 21) >> 19 | SafeLoadAs(in + 8 * 22) << 45, - SafeLoadAs(in + 8 * 22), - SafeLoadAs(in + 8 * 22) >> 49 | SafeLoadAs(in + 8 * 23) << 15, - }; - shifts = simd_batch{ 0, 0, 2, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 47-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 23) >> 32 | SafeLoadAs(in + 8 * 24) << 32, - SafeLoadAs(in + 8 * 24), - SafeLoadAs(in + 8 * 24) >> 62 | SafeLoadAs(in + 8 * 25) << 2, - SafeLoadAs(in + 8 * 25) >> 45 | SafeLoadAs(in + 8 * 26) << 19, - }; - shifts = simd_batch{ 0, 15, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 47-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 26) >> 28 | SafeLoadAs(in + 8 * 27) 
<< 36, - SafeLoadAs(in + 8 * 27), - SafeLoadAs(in + 8 * 27) >> 58 | SafeLoadAs(in + 8 * 28) << 6, - SafeLoadAs(in + 8 * 28) >> 41 | SafeLoadAs(in + 8 * 29) << 23, - }; - shifts = simd_batch{ 0, 11, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 47-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 29) >> 24 | SafeLoadAs(in + 8 * 30) << 40, - SafeLoadAs(in + 8 * 30), - SafeLoadAs(in + 8 * 30) >> 54 | SafeLoadAs(in + 8 * 31) << 10, - SafeLoadAs(in + 8 * 31) >> 37 | SafeLoadAs(in + 8 * 32) << 27, - }; - shifts = simd_batch{ 0, 7, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 47-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 32) >> 20 | SafeLoadAs(in + 8 * 33) << 44, - SafeLoadAs(in + 8 * 33), - SafeLoadAs(in + 8 * 33) >> 50 | SafeLoadAs(in + 8 * 34) << 14, - SafeLoadAs(in + 8 * 34) >> 33 | SafeLoadAs(in + 8 * 35) << 31, - }; - shifts = simd_batch{ 0, 3, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 47-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 35), - SafeLoadAs(in + 8 * 35) >> 63 | SafeLoadAs(in + 8 * 36) << 1, - SafeLoadAs(in + 8 * 36) >> 46 | SafeLoadAs(in + 8 * 37) << 18, - SafeLoadAs(in + 8 * 37) >> 29 | SafeLoadAs(in + 8 * 38) << 35, - }; - shifts = simd_batch{ 16, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 47-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 38), - SafeLoadAs(in + 8 * 38) >> 59 | SafeLoadAs(in + 8 * 39) << 5, - SafeLoadAs(in + 8 * 39) >> 42 | SafeLoadAs(in + 8 * 40) << 22, - SafeLoadAs(in + 8 * 40) >> 25 | SafeLoadAs(in + 8 * 41) << 39, - }; - shifts = simd_batch{ 12, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 47-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 
8 * 41), - SafeLoadAs(in + 8 * 41) >> 55 | SafeLoadAs(in + 8 * 42) << 9, - SafeLoadAs(in + 8 * 42) >> 38 | SafeLoadAs(in + 8 * 43) << 26, - SafeLoadAs(in + 8 * 43) >> 21 | SafeLoadAs(in + 8 * 44) << 43, - }; - shifts = simd_batch{ 8, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 47-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 44), - SafeLoadAs(in + 8 * 44) >> 51 | SafeLoadAs(in + 8 * 45) << 13, - SafeLoadAs(in + 8 * 45) >> 34 | SafeLoadAs(in + 8 * 46) << 30, - SafeLoadAs(in + 8 * 46), - }; - shifts = simd_batch{ 4, 0, 0, 17 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 47 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0xffffffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 48-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 48 | SafeLoadAs(in + 8 * 1) << 16, - SafeLoadAs(in + 8 * 1) >> 32 | SafeLoadAs(in + 8 * 2) << 32, - SafeLoadAs(in + 8 * 2), - }; - shifts = simd_batch{ 0, 0, 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 48-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3) >> 48 | SafeLoadAs(in + 8 * 4) << 16, - SafeLoadAs(in + 8 * 4) >> 32 | SafeLoadAs(in + 8 * 5) << 32, - SafeLoadAs(in + 8 * 5), - }; - shifts = simd_batch{ 0, 0, 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 48-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6) >> 48 | SafeLoadAs(in + 8 * 7) << 16, - SafeLoadAs(in + 8 * 7) >> 32 | SafeLoadAs(in + 8 * 
8) << 32, - SafeLoadAs(in + 8 * 8), - }; - shifts = simd_batch{ 0, 0, 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 48-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9) >> 48 | SafeLoadAs(in + 8 * 10) << 16, - SafeLoadAs(in + 8 * 10) >> 32 | SafeLoadAs(in + 8 * 11) << 32, - SafeLoadAs(in + 8 * 11), - }; - shifts = simd_batch{ 0, 0, 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 48-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12) >> 48 | SafeLoadAs(in + 8 * 13) << 16, - SafeLoadAs(in + 8 * 13) >> 32 | SafeLoadAs(in + 8 * 14) << 32, - SafeLoadAs(in + 8 * 14), - }; - shifts = simd_batch{ 0, 0, 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 48-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15) >> 48 | SafeLoadAs(in + 8 * 16) << 16, - SafeLoadAs(in + 8 * 16) >> 32 | SafeLoadAs(in + 8 * 17) << 32, - SafeLoadAs(in + 8 * 17), - }; - shifts = simd_batch{ 0, 0, 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 48-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 18), - SafeLoadAs(in + 8 * 18) >> 48 | SafeLoadAs(in + 8 * 19) << 16, - SafeLoadAs(in + 8 * 19) >> 32 | SafeLoadAs(in + 8 * 20) << 32, - SafeLoadAs(in + 8 * 20), - }; - shifts = simd_batch{ 0, 0, 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 48-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 21), - SafeLoadAs(in + 8 * 21) >> 48 | SafeLoadAs(in + 8 * 22) << 16, - SafeLoadAs(in + 8 * 22) >> 32 | SafeLoadAs(in + 8 * 23) << 32, - SafeLoadAs(in + 8 * 23), - }; - shifts = simd_batch{ 0, 0, 0, 16 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 4; - - // extract 48-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 24), - SafeLoadAs(in + 8 * 24) >> 48 | SafeLoadAs(in + 8 * 25) << 16, - SafeLoadAs(in + 8 * 25) >> 32 | SafeLoadAs(in + 8 * 26) << 32, - SafeLoadAs(in + 8 * 26), - }; - shifts = simd_batch{ 0, 0, 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 48-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 27), - SafeLoadAs(in + 8 * 27) >> 48 | SafeLoadAs(in + 8 * 28) << 16, - SafeLoadAs(in + 8 * 28) >> 32 | SafeLoadAs(in + 8 * 29) << 32, - SafeLoadAs(in + 8 * 29), - }; - shifts = simd_batch{ 0, 0, 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 48-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 30), - SafeLoadAs(in + 8 * 30) >> 48 | SafeLoadAs(in + 8 * 31) << 16, - SafeLoadAs(in + 8 * 31) >> 32 | SafeLoadAs(in + 8 * 32) << 32, - SafeLoadAs(in + 8 * 32), - }; - shifts = simd_batch{ 0, 0, 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 48-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 33), - SafeLoadAs(in + 8 * 33) >> 48 | SafeLoadAs(in + 8 * 34) << 16, - SafeLoadAs(in + 8 * 34) >> 32 | SafeLoadAs(in + 8 * 35) << 32, - SafeLoadAs(in + 8 * 35), - }; - shifts = simd_batch{ 0, 0, 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 48-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 36), - SafeLoadAs(in + 8 * 36) >> 48 | SafeLoadAs(in + 8 * 37) << 16, - SafeLoadAs(in + 8 * 37) >> 32 | SafeLoadAs(in + 8 * 38) << 32, - SafeLoadAs(in + 8 * 38), - }; - shifts = simd_batch{ 0, 0, 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 48-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 39), - 
SafeLoadAs(in + 8 * 39) >> 48 | SafeLoadAs(in + 8 * 40) << 16, - SafeLoadAs(in + 8 * 40) >> 32 | SafeLoadAs(in + 8 * 41) << 32, - SafeLoadAs(in + 8 * 41), - }; - shifts = simd_batch{ 0, 0, 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 48-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 42), - SafeLoadAs(in + 8 * 42) >> 48 | SafeLoadAs(in + 8 * 43) << 16, - SafeLoadAs(in + 8 * 43) >> 32 | SafeLoadAs(in + 8 * 44) << 32, - SafeLoadAs(in + 8 * 44), - }; - shifts = simd_batch{ 0, 0, 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 48-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 45), - SafeLoadAs(in + 8 * 45) >> 48 | SafeLoadAs(in + 8 * 46) << 16, - SafeLoadAs(in + 8 * 46) >> 32 | SafeLoadAs(in + 8 * 47) << 32, - SafeLoadAs(in + 8 * 47), - }; - shifts = simd_batch{ 0, 0, 0, 16 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 48 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x1ffffffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 49-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 49 | SafeLoadAs(in + 8 * 1) << 15, - SafeLoadAs(in + 8 * 1) >> 34 | SafeLoadAs(in + 8 * 2) << 30, - SafeLoadAs(in + 8 * 2) >> 19 | SafeLoadAs(in + 8 * 3) << 45, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 49-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3) >> 53 | SafeLoadAs(in + 8 * 4) << 11, - SafeLoadAs(in + 8 * 4) >> 38 | SafeLoadAs(in + 8 * 5) 
<< 26, - SafeLoadAs(in + 8 * 5) >> 23 | SafeLoadAs(in + 8 * 6) << 41, - }; - shifts = simd_batch{ 4, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 49-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 6), - SafeLoadAs(in + 8 * 6) >> 57 | SafeLoadAs(in + 8 * 7) << 7, - SafeLoadAs(in + 8 * 7) >> 42 | SafeLoadAs(in + 8 * 8) << 22, - SafeLoadAs(in + 8 * 8) >> 27 | SafeLoadAs(in + 8 * 9) << 37, - }; - shifts = simd_batch{ 8, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 49-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9) >> 61 | SafeLoadAs(in + 8 * 10) << 3, - SafeLoadAs(in + 8 * 10) >> 46 | SafeLoadAs(in + 8 * 11) << 18, - SafeLoadAs(in + 8 * 11) >> 31 | SafeLoadAs(in + 8 * 12) << 33, - }; - shifts = simd_batch{ 12, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 49-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 12) >> 16 | SafeLoadAs(in + 8 * 13) << 48, - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13) >> 50 | SafeLoadAs(in + 8 * 14) << 14, - SafeLoadAs(in + 8 * 14) >> 35 | SafeLoadAs(in + 8 * 15) << 29, - }; - shifts = simd_batch{ 0, 1, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 49-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 15) >> 20 | SafeLoadAs(in + 8 * 16) << 44, - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 16) >> 54 | SafeLoadAs(in + 8 * 17) << 10, - SafeLoadAs(in + 8 * 17) >> 39 | SafeLoadAs(in + 8 * 18) << 25, - }; - shifts = simd_batch{ 0, 5, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 49-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 18) >> 24 | SafeLoadAs(in + 8 * 19) << 40, - SafeLoadAs(in + 8 * 19), - SafeLoadAs(in + 8 * 19) >> 
58 | SafeLoadAs(in + 8 * 20) << 6, - SafeLoadAs(in + 8 * 20) >> 43 | SafeLoadAs(in + 8 * 21) << 21, - }; - shifts = simd_batch{ 0, 9, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 49-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 21) >> 28 | SafeLoadAs(in + 8 * 22) << 36, - SafeLoadAs(in + 8 * 22), - SafeLoadAs(in + 8 * 22) >> 62 | SafeLoadAs(in + 8 * 23) << 2, - SafeLoadAs(in + 8 * 23) >> 47 | SafeLoadAs(in + 8 * 24) << 17, - }; - shifts = simd_batch{ 0, 13, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 49-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 24) >> 32 | SafeLoadAs(in + 8 * 25) << 32, - SafeLoadAs(in + 8 * 25) >> 17 | SafeLoadAs(in + 8 * 26) << 47, - SafeLoadAs(in + 8 * 26), - SafeLoadAs(in + 8 * 26) >> 51 | SafeLoadAs(in + 8 * 27) << 13, - }; - shifts = simd_batch{ 0, 0, 2, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 49-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 27) >> 36 | SafeLoadAs(in + 8 * 28) << 28, - SafeLoadAs(in + 8 * 28) >> 21 | SafeLoadAs(in + 8 * 29) << 43, - SafeLoadAs(in + 8 * 29), - SafeLoadAs(in + 8 * 29) >> 55 | SafeLoadAs(in + 8 * 30) << 9, - }; - shifts = simd_batch{ 0, 0, 6, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 49-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 30) >> 40 | SafeLoadAs(in + 8 * 31) << 24, - SafeLoadAs(in + 8 * 31) >> 25 | SafeLoadAs(in + 8 * 32) << 39, - SafeLoadAs(in + 8 * 32), - SafeLoadAs(in + 8 * 32) >> 59 | SafeLoadAs(in + 8 * 33) << 5, - }; - shifts = simd_batch{ 0, 0, 10, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 49-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 33) >> 44 | SafeLoadAs(in + 8 * 34) << 20, - SafeLoadAs(in + 
8 * 34) >> 29 | SafeLoadAs(in + 8 * 35) << 35, - SafeLoadAs(in + 8 * 35), - SafeLoadAs(in + 8 * 35) >> 63 | SafeLoadAs(in + 8 * 36) << 1, - }; - shifts = simd_batch{ 0, 0, 14, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 49-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 36) >> 48 | SafeLoadAs(in + 8 * 37) << 16, - SafeLoadAs(in + 8 * 37) >> 33 | SafeLoadAs(in + 8 * 38) << 31, - SafeLoadAs(in + 8 * 38) >> 18 | SafeLoadAs(in + 8 * 39) << 46, - SafeLoadAs(in + 8 * 39), - }; - shifts = simd_batch{ 0, 0, 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 49-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 39) >> 52 | SafeLoadAs(in + 8 * 40) << 12, - SafeLoadAs(in + 8 * 40) >> 37 | SafeLoadAs(in + 8 * 41) << 27, - SafeLoadAs(in + 8 * 41) >> 22 | SafeLoadAs(in + 8 * 42) << 42, - SafeLoadAs(in + 8 * 42), - }; - shifts = simd_batch{ 0, 0, 0, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 49-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 42) >> 56 | SafeLoadAs(in + 8 * 43) << 8, - SafeLoadAs(in + 8 * 43) >> 41 | SafeLoadAs(in + 8 * 44) << 23, - SafeLoadAs(in + 8 * 44) >> 26 | SafeLoadAs(in + 8 * 45) << 38, - SafeLoadAs(in + 8 * 45), - }; - shifts = simd_batch{ 0, 0, 0, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 49-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 45) >> 60 | SafeLoadAs(in + 8 * 46) << 4, - SafeLoadAs(in + 8 * 46) >> 45 | SafeLoadAs(in + 8 * 47) << 19, - SafeLoadAs(in + 8 * 47) >> 30 | SafeLoadAs(in + 8 * 48) << 34, - SafeLoadAs(in + 8 * 48), - }; - shifts = simd_batch{ 0, 0, 0, 15 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 49 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using 
simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x3ffffffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 50-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 50 | SafeLoadAs(in + 8 * 1) << 14, - SafeLoadAs(in + 8 * 1) >> 36 | SafeLoadAs(in + 8 * 2) << 28, - SafeLoadAs(in + 8 * 2) >> 22 | SafeLoadAs(in + 8 * 3) << 42, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 50-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3) >> 58 | SafeLoadAs(in + 8 * 4) << 6, - SafeLoadAs(in + 8 * 4) >> 44 | SafeLoadAs(in + 8 * 5) << 20, - SafeLoadAs(in + 8 * 5) >> 30 | SafeLoadAs(in + 8 * 6) << 34, - }; - shifts = simd_batch{ 8, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 50-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 6) >> 16 | SafeLoadAs(in + 8 * 7) << 48, - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7) >> 52 | SafeLoadAs(in + 8 * 8) << 12, - SafeLoadAs(in + 8 * 8) >> 38 | SafeLoadAs(in + 8 * 9) << 26, - }; - shifts = simd_batch{ 0, 2, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 50-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 9) >> 24 | SafeLoadAs(in + 8 * 10) << 40, - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10) >> 60 | SafeLoadAs(in + 8 * 11) << 4, - SafeLoadAs(in + 8 * 11) >> 46 | SafeLoadAs(in + 8 * 12) << 18, - }; - shifts = simd_batch{ 0, 10, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 50-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 12) >> 32 | SafeLoadAs(in + 8 * 13) 
<< 32, - SafeLoadAs(in + 8 * 13) >> 18 | SafeLoadAs(in + 8 * 14) << 46, - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14) >> 54 | SafeLoadAs(in + 8 * 15) << 10, - }; - shifts = simd_batch{ 0, 0, 4, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 50-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 15) >> 40 | SafeLoadAs(in + 8 * 16) << 24, - SafeLoadAs(in + 8 * 16) >> 26 | SafeLoadAs(in + 8 * 17) << 38, - SafeLoadAs(in + 8 * 17), - SafeLoadAs(in + 8 * 17) >> 62 | SafeLoadAs(in + 8 * 18) << 2, - }; - shifts = simd_batch{ 0, 0, 12, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 50-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 18) >> 48 | SafeLoadAs(in + 8 * 19) << 16, - SafeLoadAs(in + 8 * 19) >> 34 | SafeLoadAs(in + 8 * 20) << 30, - SafeLoadAs(in + 8 * 20) >> 20 | SafeLoadAs(in + 8 * 21) << 44, - SafeLoadAs(in + 8 * 21), - }; - shifts = simd_batch{ 0, 0, 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 50-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 21) >> 56 | SafeLoadAs(in + 8 * 22) << 8, - SafeLoadAs(in + 8 * 22) >> 42 | SafeLoadAs(in + 8 * 23) << 22, - SafeLoadAs(in + 8 * 23) >> 28 | SafeLoadAs(in + 8 * 24) << 36, - SafeLoadAs(in + 8 * 24), - }; - shifts = simd_batch{ 0, 0, 0, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 50-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 25), - SafeLoadAs(in + 8 * 25) >> 50 | SafeLoadAs(in + 8 * 26) << 14, - SafeLoadAs(in + 8 * 26) >> 36 | SafeLoadAs(in + 8 * 27) << 28, - SafeLoadAs(in + 8 * 27) >> 22 | SafeLoadAs(in + 8 * 28) << 42, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 50-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 
8 * 28), - SafeLoadAs(in + 8 * 28) >> 58 | SafeLoadAs(in + 8 * 29) << 6, - SafeLoadAs(in + 8 * 29) >> 44 | SafeLoadAs(in + 8 * 30) << 20, - SafeLoadAs(in + 8 * 30) >> 30 | SafeLoadAs(in + 8 * 31) << 34, - }; - shifts = simd_batch{ 8, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 50-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 31) >> 16 | SafeLoadAs(in + 8 * 32) << 48, - SafeLoadAs(in + 8 * 32), - SafeLoadAs(in + 8 * 32) >> 52 | SafeLoadAs(in + 8 * 33) << 12, - SafeLoadAs(in + 8 * 33) >> 38 | SafeLoadAs(in + 8 * 34) << 26, - }; - shifts = simd_batch{ 0, 2, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 50-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 34) >> 24 | SafeLoadAs(in + 8 * 35) << 40, - SafeLoadAs(in + 8 * 35), - SafeLoadAs(in + 8 * 35) >> 60 | SafeLoadAs(in + 8 * 36) << 4, - SafeLoadAs(in + 8 * 36) >> 46 | SafeLoadAs(in + 8 * 37) << 18, - }; - shifts = simd_batch{ 0, 10, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 50-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 37) >> 32 | SafeLoadAs(in + 8 * 38) << 32, - SafeLoadAs(in + 8 * 38) >> 18 | SafeLoadAs(in + 8 * 39) << 46, - SafeLoadAs(in + 8 * 39), - SafeLoadAs(in + 8 * 39) >> 54 | SafeLoadAs(in + 8 * 40) << 10, - }; - shifts = simd_batch{ 0, 0, 4, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 50-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 40) >> 40 | SafeLoadAs(in + 8 * 41) << 24, - SafeLoadAs(in + 8 * 41) >> 26 | SafeLoadAs(in + 8 * 42) << 38, - SafeLoadAs(in + 8 * 42), - SafeLoadAs(in + 8 * 42) >> 62 | SafeLoadAs(in + 8 * 43) << 2, - }; - shifts = simd_batch{ 0, 0, 12, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 50-bit bundles 56 to 59 
- words = simd_batch{ - SafeLoadAs(in + 8 * 43) >> 48 | SafeLoadAs(in + 8 * 44) << 16, - SafeLoadAs(in + 8 * 44) >> 34 | SafeLoadAs(in + 8 * 45) << 30, - SafeLoadAs(in + 8 * 45) >> 20 | SafeLoadAs(in + 8 * 46) << 44, - SafeLoadAs(in + 8 * 46), - }; - shifts = simd_batch{ 0, 0, 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 50-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 46) >> 56 | SafeLoadAs(in + 8 * 47) << 8, - SafeLoadAs(in + 8 * 47) >> 42 | SafeLoadAs(in + 8 * 48) << 22, - SafeLoadAs(in + 8 * 48) >> 28 | SafeLoadAs(in + 8 * 49) << 36, - SafeLoadAs(in + 8 * 49), - }; - shifts = simd_batch{ 0, 0, 0, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 50 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x7ffffffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 51-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 51 | SafeLoadAs(in + 8 * 1) << 13, - SafeLoadAs(in + 8 * 1) >> 38 | SafeLoadAs(in + 8 * 2) << 26, - SafeLoadAs(in + 8 * 2) >> 25 | SafeLoadAs(in + 8 * 3) << 39, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 51-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 3), - SafeLoadAs(in + 8 * 3) >> 63 | SafeLoadAs(in + 8 * 4) << 1, - SafeLoadAs(in + 8 * 4) >> 50 | SafeLoadAs(in + 8 * 5) << 14, - SafeLoadAs(in + 8 * 5) >> 37 | SafeLoadAs(in + 8 * 6) << 27, - }; - shifts = simd_batch{ 12, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 51-bit bundles 8 to 11 - words = 
simd_batch{ - SafeLoadAs(in + 8 * 6) >> 24 | SafeLoadAs(in + 8 * 7) << 40, - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7) >> 62 | SafeLoadAs(in + 8 * 8) << 2, - SafeLoadAs(in + 8 * 8) >> 49 | SafeLoadAs(in + 8 * 9) << 15, - }; - shifts = simd_batch{ 0, 11, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 51-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 9) >> 36 | SafeLoadAs(in + 8 * 10) << 28, - SafeLoadAs(in + 8 * 10) >> 23 | SafeLoadAs(in + 8 * 11) << 41, - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11) >> 61 | SafeLoadAs(in + 8 * 12) << 3, - }; - shifts = simd_batch{ 0, 0, 10, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 51-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 12) >> 48 | SafeLoadAs(in + 8 * 13) << 16, - SafeLoadAs(in + 8 * 13) >> 35 | SafeLoadAs(in + 8 * 14) << 29, - SafeLoadAs(in + 8 * 14) >> 22 | SafeLoadAs(in + 8 * 15) << 42, - SafeLoadAs(in + 8 * 15), - }; - shifts = simd_batch{ 0, 0, 0, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 51-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 15) >> 60 | SafeLoadAs(in + 8 * 16) << 4, - SafeLoadAs(in + 8 * 16) >> 47 | SafeLoadAs(in + 8 * 17) << 17, - SafeLoadAs(in + 8 * 17) >> 34 | SafeLoadAs(in + 8 * 18) << 30, - SafeLoadAs(in + 8 * 18) >> 21 | SafeLoadAs(in + 8 * 19) << 43, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 51-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 19), - SafeLoadAs(in + 8 * 19) >> 59 | SafeLoadAs(in + 8 * 20) << 5, - SafeLoadAs(in + 8 * 20) >> 46 | SafeLoadAs(in + 8 * 21) << 18, - SafeLoadAs(in + 8 * 21) >> 33 | SafeLoadAs(in + 8 * 22) << 31, - }; - shifts = simd_batch{ 8, 0, 0, 0 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 4; - - // extract 51-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 22) >> 20 | SafeLoadAs(in + 8 * 23) << 44, - SafeLoadAs(in + 8 * 23), - SafeLoadAs(in + 8 * 23) >> 58 | SafeLoadAs(in + 8 * 24) << 6, - SafeLoadAs(in + 8 * 24) >> 45 | SafeLoadAs(in + 8 * 25) << 19, - }; - shifts = simd_batch{ 0, 7, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 51-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 25) >> 32 | SafeLoadAs(in + 8 * 26) << 32, - SafeLoadAs(in + 8 * 26) >> 19 | SafeLoadAs(in + 8 * 27) << 45, - SafeLoadAs(in + 8 * 27), - SafeLoadAs(in + 8 * 27) >> 57 | SafeLoadAs(in + 8 * 28) << 7, - }; - shifts = simd_batch{ 0, 0, 6, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 51-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 28) >> 44 | SafeLoadAs(in + 8 * 29) << 20, - SafeLoadAs(in + 8 * 29) >> 31 | SafeLoadAs(in + 8 * 30) << 33, - SafeLoadAs(in + 8 * 30) >> 18 | SafeLoadAs(in + 8 * 31) << 46, - SafeLoadAs(in + 8 * 31), - }; - shifts = simd_batch{ 0, 0, 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 51-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 31) >> 56 | SafeLoadAs(in + 8 * 32) << 8, - SafeLoadAs(in + 8 * 32) >> 43 | SafeLoadAs(in + 8 * 33) << 21, - SafeLoadAs(in + 8 * 33) >> 30 | SafeLoadAs(in + 8 * 34) << 34, - SafeLoadAs(in + 8 * 34) >> 17 | SafeLoadAs(in + 8 * 35) << 47, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 51-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 35), - SafeLoadAs(in + 8 * 35) >> 55 | SafeLoadAs(in + 8 * 36) << 9, - SafeLoadAs(in + 8 * 36) >> 42 | SafeLoadAs(in + 8 * 37) << 22, - SafeLoadAs(in + 8 * 37) >> 29 | SafeLoadAs(in + 8 * 38) << 35, - }; - 
shifts = simd_batch{ 4, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 51-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 38) >> 16 | SafeLoadAs(in + 8 * 39) << 48, - SafeLoadAs(in + 8 * 39), - SafeLoadAs(in + 8 * 39) >> 54 | SafeLoadAs(in + 8 * 40) << 10, - SafeLoadAs(in + 8 * 40) >> 41 | SafeLoadAs(in + 8 * 41) << 23, - }; - shifts = simd_batch{ 0, 3, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 51-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 41) >> 28 | SafeLoadAs(in + 8 * 42) << 36, - SafeLoadAs(in + 8 * 42) >> 15 | SafeLoadAs(in + 8 * 43) << 49, - SafeLoadAs(in + 8 * 43), - SafeLoadAs(in + 8 * 43) >> 53 | SafeLoadAs(in + 8 * 44) << 11, - }; - shifts = simd_batch{ 0, 0, 2, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 51-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 44) >> 40 | SafeLoadAs(in + 8 * 45) << 24, - SafeLoadAs(in + 8 * 45) >> 27 | SafeLoadAs(in + 8 * 46) << 37, - SafeLoadAs(in + 8 * 46) >> 14 | SafeLoadAs(in + 8 * 47) << 50, - SafeLoadAs(in + 8 * 47), - }; - shifts = simd_batch{ 0, 0, 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 51-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 47) >> 52 | SafeLoadAs(in + 8 * 48) << 12, - SafeLoadAs(in + 8 * 48) >> 39 | SafeLoadAs(in + 8 * 49) << 25, - SafeLoadAs(in + 8 * 49) >> 26 | SafeLoadAs(in + 8 * 50) << 38, - SafeLoadAs(in + 8 * 50), - }; - shifts = simd_batch{ 0, 0, 0, 13 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 51 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) 
{ - constexpr uint64_t kMask = 0xfffffffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 52-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 52 | SafeLoadAs(in + 8 * 1) << 12, - SafeLoadAs(in + 8 * 1) >> 40 | SafeLoadAs(in + 8 * 2) << 24, - SafeLoadAs(in + 8 * 2) >> 28 | SafeLoadAs(in + 8 * 3) << 36, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 52-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 3) >> 16 | SafeLoadAs(in + 8 * 4) << 48, - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4) >> 56 | SafeLoadAs(in + 8 * 5) << 8, - SafeLoadAs(in + 8 * 5) >> 44 | SafeLoadAs(in + 8 * 6) << 20, - }; - shifts = simd_batch{ 0, 4, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 52-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 6) >> 32 | SafeLoadAs(in + 8 * 7) << 32, - SafeLoadAs(in + 8 * 7) >> 20 | SafeLoadAs(in + 8 * 8) << 44, - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8) >> 60 | SafeLoadAs(in + 8 * 9) << 4, - }; - shifts = simd_batch{ 0, 0, 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 52-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 9) >> 48 | SafeLoadAs(in + 8 * 10) << 16, - SafeLoadAs(in + 8 * 10) >> 36 | SafeLoadAs(in + 8 * 11) << 28, - SafeLoadAs(in + 8 * 11) >> 24 | SafeLoadAs(in + 8 * 12) << 40, - SafeLoadAs(in + 8 * 12), - }; - shifts = simd_batch{ 0, 0, 0, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 52-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 13), - SafeLoadAs(in + 8 * 13) >> 52 | SafeLoadAs(in + 8 * 14) << 12, - SafeLoadAs(in + 8 * 14) >> 40 | SafeLoadAs(in + 8 * 15) << 24, - SafeLoadAs(in + 8 * 15) >> 28 | SafeLoadAs(in + 8 
* 16) << 36, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 52-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 16) >> 16 | SafeLoadAs(in + 8 * 17) << 48, - SafeLoadAs(in + 8 * 17), - SafeLoadAs(in + 8 * 17) >> 56 | SafeLoadAs(in + 8 * 18) << 8, - SafeLoadAs(in + 8 * 18) >> 44 | SafeLoadAs(in + 8 * 19) << 20, - }; - shifts = simd_batch{ 0, 4, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 52-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 19) >> 32 | SafeLoadAs(in + 8 * 20) << 32, - SafeLoadAs(in + 8 * 20) >> 20 | SafeLoadAs(in + 8 * 21) << 44, - SafeLoadAs(in + 8 * 21), - SafeLoadAs(in + 8 * 21) >> 60 | SafeLoadAs(in + 8 * 22) << 4, - }; - shifts = simd_batch{ 0, 0, 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 52-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 22) >> 48 | SafeLoadAs(in + 8 * 23) << 16, - SafeLoadAs(in + 8 * 23) >> 36 | SafeLoadAs(in + 8 * 24) << 28, - SafeLoadAs(in + 8 * 24) >> 24 | SafeLoadAs(in + 8 * 25) << 40, - SafeLoadAs(in + 8 * 25), - }; - shifts = simd_batch{ 0, 0, 0, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 52-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 26), - SafeLoadAs(in + 8 * 26) >> 52 | SafeLoadAs(in + 8 * 27) << 12, - SafeLoadAs(in + 8 * 27) >> 40 | SafeLoadAs(in + 8 * 28) << 24, - SafeLoadAs(in + 8 * 28) >> 28 | SafeLoadAs(in + 8 * 29) << 36, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 52-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 29) >> 16 | SafeLoadAs(in + 8 * 30) << 48, - SafeLoadAs(in + 8 * 30), - SafeLoadAs(in + 8 * 30) >> 56 | SafeLoadAs(in + 8 * 31) << 8, - 
SafeLoadAs(in + 8 * 31) >> 44 | SafeLoadAs(in + 8 * 32) << 20, - }; - shifts = simd_batch{ 0, 4, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 52-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 32) >> 32 | SafeLoadAs(in + 8 * 33) << 32, - SafeLoadAs(in + 8 * 33) >> 20 | SafeLoadAs(in + 8 * 34) << 44, - SafeLoadAs(in + 8 * 34), - SafeLoadAs(in + 8 * 34) >> 60 | SafeLoadAs(in + 8 * 35) << 4, - }; - shifts = simd_batch{ 0, 0, 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 52-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 35) >> 48 | SafeLoadAs(in + 8 * 36) << 16, - SafeLoadAs(in + 8 * 36) >> 36 | SafeLoadAs(in + 8 * 37) << 28, - SafeLoadAs(in + 8 * 37) >> 24 | SafeLoadAs(in + 8 * 38) << 40, - SafeLoadAs(in + 8 * 38), - }; - shifts = simd_batch{ 0, 0, 0, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 52-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 39), - SafeLoadAs(in + 8 * 39) >> 52 | SafeLoadAs(in + 8 * 40) << 12, - SafeLoadAs(in + 8 * 40) >> 40 | SafeLoadAs(in + 8 * 41) << 24, - SafeLoadAs(in + 8 * 41) >> 28 | SafeLoadAs(in + 8 * 42) << 36, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 52-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 42) >> 16 | SafeLoadAs(in + 8 * 43) << 48, - SafeLoadAs(in + 8 * 43), - SafeLoadAs(in + 8 * 43) >> 56 | SafeLoadAs(in + 8 * 44) << 8, - SafeLoadAs(in + 8 * 44) >> 44 | SafeLoadAs(in + 8 * 45) << 20, - }; - shifts = simd_batch{ 0, 4, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 52-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 45) >> 32 | SafeLoadAs(in + 8 * 46) << 32, - SafeLoadAs(in + 8 * 46) >> 20 | SafeLoadAs(in + 8 * 
47) << 44, - SafeLoadAs(in + 8 * 47), - SafeLoadAs(in + 8 * 47) >> 60 | SafeLoadAs(in + 8 * 48) << 4, - }; - shifts = simd_batch{ 0, 0, 8, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 52-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 48) >> 48 | SafeLoadAs(in + 8 * 49) << 16, - SafeLoadAs(in + 8 * 49) >> 36 | SafeLoadAs(in + 8 * 50) << 28, - SafeLoadAs(in + 8 * 50) >> 24 | SafeLoadAs(in + 8 * 51) << 40, - SafeLoadAs(in + 8 * 51), - }; - shifts = simd_batch{ 0, 0, 0, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 52 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x1fffffffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 53-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 53 | SafeLoadAs(in + 8 * 1) << 11, - SafeLoadAs(in + 8 * 1) >> 42 | SafeLoadAs(in + 8 * 2) << 22, - SafeLoadAs(in + 8 * 2) >> 31 | SafeLoadAs(in + 8 * 3) << 33, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 53-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 3) >> 20 | SafeLoadAs(in + 8 * 4) << 44, - SafeLoadAs(in + 8 * 4), - SafeLoadAs(in + 8 * 4) >> 62 | SafeLoadAs(in + 8 * 5) << 2, - SafeLoadAs(in + 8 * 5) >> 51 | SafeLoadAs(in + 8 * 6) << 13, - }; - shifts = simd_batch{ 0, 9, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 53-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 6) >> 40 | SafeLoadAs(in + 8 * 7) << 24, - SafeLoadAs(in + 8 * 7) >> 29 | SafeLoadAs(in + 8 * 8) << 35, - 
SafeLoadAs(in + 8 * 8) >> 18 | SafeLoadAs(in + 8 * 9) << 46, - SafeLoadAs(in + 8 * 9), - }; - shifts = simd_batch{ 0, 0, 0, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 53-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 9) >> 60 | SafeLoadAs(in + 8 * 10) << 4, - SafeLoadAs(in + 8 * 10) >> 49 | SafeLoadAs(in + 8 * 11) << 15, - SafeLoadAs(in + 8 * 11) >> 38 | SafeLoadAs(in + 8 * 12) << 26, - SafeLoadAs(in + 8 * 12) >> 27 | SafeLoadAs(in + 8 * 13) << 37, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 53-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 13) >> 16 | SafeLoadAs(in + 8 * 14) << 48, - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14) >> 58 | SafeLoadAs(in + 8 * 15) << 6, - SafeLoadAs(in + 8 * 15) >> 47 | SafeLoadAs(in + 8 * 16) << 17, - }; - shifts = simd_batch{ 0, 5, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 53-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 16) >> 36 | SafeLoadAs(in + 8 * 17) << 28, - SafeLoadAs(in + 8 * 17) >> 25 | SafeLoadAs(in + 8 * 18) << 39, - SafeLoadAs(in + 8 * 18) >> 14 | SafeLoadAs(in + 8 * 19) << 50, - SafeLoadAs(in + 8 * 19), - }; - shifts = simd_batch{ 0, 0, 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 53-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 19) >> 56 | SafeLoadAs(in + 8 * 20) << 8, - SafeLoadAs(in + 8 * 20) >> 45 | SafeLoadAs(in + 8 * 21) << 19, - SafeLoadAs(in + 8 * 21) >> 34 | SafeLoadAs(in + 8 * 22) << 30, - SafeLoadAs(in + 8 * 22) >> 23 | SafeLoadAs(in + 8 * 23) << 41, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 53-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 
23) >> 12 | SafeLoadAs(in + 8 * 24) << 52, - SafeLoadAs(in + 8 * 24), - SafeLoadAs(in + 8 * 24) >> 54 | SafeLoadAs(in + 8 * 25) << 10, - SafeLoadAs(in + 8 * 25) >> 43 | SafeLoadAs(in + 8 * 26) << 21, - }; - shifts = simd_batch{ 0, 1, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 53-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 26) >> 32 | SafeLoadAs(in + 8 * 27) << 32, - SafeLoadAs(in + 8 * 27) >> 21 | SafeLoadAs(in + 8 * 28) << 43, - SafeLoadAs(in + 8 * 28), - SafeLoadAs(in + 8 * 28) >> 63 | SafeLoadAs(in + 8 * 29) << 1, - }; - shifts = simd_batch{ 0, 0, 10, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 53-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 29) >> 52 | SafeLoadAs(in + 8 * 30) << 12, - SafeLoadAs(in + 8 * 30) >> 41 | SafeLoadAs(in + 8 * 31) << 23, - SafeLoadAs(in + 8 * 31) >> 30 | SafeLoadAs(in + 8 * 32) << 34, - SafeLoadAs(in + 8 * 32) >> 19 | SafeLoadAs(in + 8 * 33) << 45, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 53-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 33), - SafeLoadAs(in + 8 * 33) >> 61 | SafeLoadAs(in + 8 * 34) << 3, - SafeLoadAs(in + 8 * 34) >> 50 | SafeLoadAs(in + 8 * 35) << 14, - SafeLoadAs(in + 8 * 35) >> 39 | SafeLoadAs(in + 8 * 36) << 25, - }; - shifts = simd_batch{ 8, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 53-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 36) >> 28 | SafeLoadAs(in + 8 * 37) << 36, - SafeLoadAs(in + 8 * 37) >> 17 | SafeLoadAs(in + 8 * 38) << 47, - SafeLoadAs(in + 8 * 38), - SafeLoadAs(in + 8 * 38) >> 59 | SafeLoadAs(in + 8 * 39) << 5, - }; - shifts = simd_batch{ 0, 0, 6, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // 
extract 53-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 39) >> 48 | SafeLoadAs(in + 8 * 40) << 16, - SafeLoadAs(in + 8 * 40) >> 37 | SafeLoadAs(in + 8 * 41) << 27, - SafeLoadAs(in + 8 * 41) >> 26 | SafeLoadAs(in + 8 * 42) << 38, - SafeLoadAs(in + 8 * 42) >> 15 | SafeLoadAs(in + 8 * 43) << 49, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 53-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 43), - SafeLoadAs(in + 8 * 43) >> 57 | SafeLoadAs(in + 8 * 44) << 7, - SafeLoadAs(in + 8 * 44) >> 46 | SafeLoadAs(in + 8 * 45) << 18, - SafeLoadAs(in + 8 * 45) >> 35 | SafeLoadAs(in + 8 * 46) << 29, - }; - shifts = simd_batch{ 4, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 53-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 46) >> 24 | SafeLoadAs(in + 8 * 47) << 40, - SafeLoadAs(in + 8 * 47) >> 13 | SafeLoadAs(in + 8 * 48) << 51, - SafeLoadAs(in + 8 * 48), - SafeLoadAs(in + 8 * 48) >> 55 | SafeLoadAs(in + 8 * 49) << 9, - }; - shifts = simd_batch{ 0, 0, 2, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 53-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 49) >> 44 | SafeLoadAs(in + 8 * 50) << 20, - SafeLoadAs(in + 8 * 50) >> 33 | SafeLoadAs(in + 8 * 51) << 31, - SafeLoadAs(in + 8 * 51) >> 22 | SafeLoadAs(in + 8 * 52) << 42, - SafeLoadAs(in + 8 * 52), - }; - shifts = simd_batch{ 0, 0, 0, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 53 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x3fffffffffffff; - - simd_batch masks(kMask); - 
simd_batch words, shifts; - simd_batch results; - // extract 54-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 54 | SafeLoadAs(in + 8 * 1) << 10, - SafeLoadAs(in + 8 * 1) >> 44 | SafeLoadAs(in + 8 * 2) << 20, - SafeLoadAs(in + 8 * 2) >> 34 | SafeLoadAs(in + 8 * 3) << 30, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 54-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 3) >> 24 | SafeLoadAs(in + 8 * 4) << 40, - SafeLoadAs(in + 8 * 4) >> 14 | SafeLoadAs(in + 8 * 5) << 50, - SafeLoadAs(in + 8 * 5), - SafeLoadAs(in + 8 * 5) >> 58 | SafeLoadAs(in + 8 * 6) << 6, - }; - shifts = simd_batch{ 0, 0, 4, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 54-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 6) >> 48 | SafeLoadAs(in + 8 * 7) << 16, - SafeLoadAs(in + 8 * 7) >> 38 | SafeLoadAs(in + 8 * 8) << 26, - SafeLoadAs(in + 8 * 8) >> 28 | SafeLoadAs(in + 8 * 9) << 36, - SafeLoadAs(in + 8 * 9) >> 18 | SafeLoadAs(in + 8 * 10) << 46, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 54-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 10), - SafeLoadAs(in + 8 * 10) >> 62 | SafeLoadAs(in + 8 * 11) << 2, - SafeLoadAs(in + 8 * 11) >> 52 | SafeLoadAs(in + 8 * 12) << 12, - SafeLoadAs(in + 8 * 12) >> 42 | SafeLoadAs(in + 8 * 13) << 22, - }; - shifts = simd_batch{ 8, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 54-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 13) >> 32 | SafeLoadAs(in + 8 * 14) << 32, - SafeLoadAs(in + 8 * 14) >> 22 | SafeLoadAs(in + 8 * 15) << 42, - SafeLoadAs(in + 8 * 15) >> 12 | SafeLoadAs(in + 8 * 16) << 52, - SafeLoadAs(in + 8 * 16), - }; - shifts = simd_batch{ 
0, 0, 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 54-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 16) >> 56 | SafeLoadAs(in + 8 * 17) << 8, - SafeLoadAs(in + 8 * 17) >> 46 | SafeLoadAs(in + 8 * 18) << 18, - SafeLoadAs(in + 8 * 18) >> 36 | SafeLoadAs(in + 8 * 19) << 28, - SafeLoadAs(in + 8 * 19) >> 26 | SafeLoadAs(in + 8 * 20) << 38, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 54-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 20) >> 16 | SafeLoadAs(in + 8 * 21) << 48, - SafeLoadAs(in + 8 * 21), - SafeLoadAs(in + 8 * 21) >> 60 | SafeLoadAs(in + 8 * 22) << 4, - SafeLoadAs(in + 8 * 22) >> 50 | SafeLoadAs(in + 8 * 23) << 14, - }; - shifts = simd_batch{ 0, 6, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 54-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 23) >> 40 | SafeLoadAs(in + 8 * 24) << 24, - SafeLoadAs(in + 8 * 24) >> 30 | SafeLoadAs(in + 8 * 25) << 34, - SafeLoadAs(in + 8 * 25) >> 20 | SafeLoadAs(in + 8 * 26) << 44, - SafeLoadAs(in + 8 * 26), - }; - shifts = simd_batch{ 0, 0, 0, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 54-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 27), - SafeLoadAs(in + 8 * 27) >> 54 | SafeLoadAs(in + 8 * 28) << 10, - SafeLoadAs(in + 8 * 28) >> 44 | SafeLoadAs(in + 8 * 29) << 20, - SafeLoadAs(in + 8 * 29) >> 34 | SafeLoadAs(in + 8 * 30) << 30, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 54-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 30) >> 24 | SafeLoadAs(in + 8 * 31) << 40, - SafeLoadAs(in + 8 * 31) >> 14 | SafeLoadAs(in + 8 * 32) << 50, - SafeLoadAs(in + 8 * 32), - SafeLoadAs(in 
+ 8 * 32) >> 58 | SafeLoadAs(in + 8 * 33) << 6, - }; - shifts = simd_batch{ 0, 0, 4, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 54-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 33) >> 48 | SafeLoadAs(in + 8 * 34) << 16, - SafeLoadAs(in + 8 * 34) >> 38 | SafeLoadAs(in + 8 * 35) << 26, - SafeLoadAs(in + 8 * 35) >> 28 | SafeLoadAs(in + 8 * 36) << 36, - SafeLoadAs(in + 8 * 36) >> 18 | SafeLoadAs(in + 8 * 37) << 46, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 54-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 37), - SafeLoadAs(in + 8 * 37) >> 62 | SafeLoadAs(in + 8 * 38) << 2, - SafeLoadAs(in + 8 * 38) >> 52 | SafeLoadAs(in + 8 * 39) << 12, - SafeLoadAs(in + 8 * 39) >> 42 | SafeLoadAs(in + 8 * 40) << 22, - }; - shifts = simd_batch{ 8, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 54-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 40) >> 32 | SafeLoadAs(in + 8 * 41) << 32, - SafeLoadAs(in + 8 * 41) >> 22 | SafeLoadAs(in + 8 * 42) << 42, - SafeLoadAs(in + 8 * 42) >> 12 | SafeLoadAs(in + 8 * 43) << 52, - SafeLoadAs(in + 8 * 43), - }; - shifts = simd_batch{ 0, 0, 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 54-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 43) >> 56 | SafeLoadAs(in + 8 * 44) << 8, - SafeLoadAs(in + 8 * 44) >> 46 | SafeLoadAs(in + 8 * 45) << 18, - SafeLoadAs(in + 8 * 45) >> 36 | SafeLoadAs(in + 8 * 46) << 28, - SafeLoadAs(in + 8 * 46) >> 26 | SafeLoadAs(in + 8 * 47) << 38, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 54-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 47) >> 16 | SafeLoadAs(in + 8 * 48) 
<< 48, - SafeLoadAs(in + 8 * 48), - SafeLoadAs(in + 8 * 48) >> 60 | SafeLoadAs(in + 8 * 49) << 4, - SafeLoadAs(in + 8 * 49) >> 50 | SafeLoadAs(in + 8 * 50) << 14, - }; - shifts = simd_batch{ 0, 6, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 54-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 50) >> 40 | SafeLoadAs(in + 8 * 51) << 24, - SafeLoadAs(in + 8 * 51) >> 30 | SafeLoadAs(in + 8 * 52) << 34, - SafeLoadAs(in + 8 * 52) >> 20 | SafeLoadAs(in + 8 * 53) << 44, - SafeLoadAs(in + 8 * 53), - }; - shifts = simd_batch{ 0, 0, 0, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 54 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x7fffffffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 55-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 55 | SafeLoadAs(in + 8 * 1) << 9, - SafeLoadAs(in + 8 * 1) >> 46 | SafeLoadAs(in + 8 * 2) << 18, - SafeLoadAs(in + 8 * 2) >> 37 | SafeLoadAs(in + 8 * 3) << 27, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 55-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 3) >> 28 | SafeLoadAs(in + 8 * 4) << 36, - SafeLoadAs(in + 8 * 4) >> 19 | SafeLoadAs(in + 8 * 5) << 45, - SafeLoadAs(in + 8 * 5) >> 10 | SafeLoadAs(in + 8 * 6) << 54, - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 0, 0, 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 55-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 6) >> 56 | SafeLoadAs(in + 8 * 7) << 8, - 
SafeLoadAs(in + 8 * 7) >> 47 | SafeLoadAs(in + 8 * 8) << 17, - SafeLoadAs(in + 8 * 8) >> 38 | SafeLoadAs(in + 8 * 9) << 26, - SafeLoadAs(in + 8 * 9) >> 29 | SafeLoadAs(in + 8 * 10) << 35, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 55-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 10) >> 20 | SafeLoadAs(in + 8 * 11) << 44, - SafeLoadAs(in + 8 * 11) >> 11 | SafeLoadAs(in + 8 * 12) << 53, - SafeLoadAs(in + 8 * 12), - SafeLoadAs(in + 8 * 12) >> 57 | SafeLoadAs(in + 8 * 13) << 7, - }; - shifts = simd_batch{ 0, 0, 2, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 55-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 13) >> 48 | SafeLoadAs(in + 8 * 14) << 16, - SafeLoadAs(in + 8 * 14) >> 39 | SafeLoadAs(in + 8 * 15) << 25, - SafeLoadAs(in + 8 * 15) >> 30 | SafeLoadAs(in + 8 * 16) << 34, - SafeLoadAs(in + 8 * 16) >> 21 | SafeLoadAs(in + 8 * 17) << 43, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 55-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 17) >> 12 | SafeLoadAs(in + 8 * 18) << 52, - SafeLoadAs(in + 8 * 18), - SafeLoadAs(in + 8 * 18) >> 58 | SafeLoadAs(in + 8 * 19) << 6, - SafeLoadAs(in + 8 * 19) >> 49 | SafeLoadAs(in + 8 * 20) << 15, - }; - shifts = simd_batch{ 0, 3, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 55-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 20) >> 40 | SafeLoadAs(in + 8 * 21) << 24, - SafeLoadAs(in + 8 * 21) >> 31 | SafeLoadAs(in + 8 * 22) << 33, - SafeLoadAs(in + 8 * 22) >> 22 | SafeLoadAs(in + 8 * 23) << 42, - SafeLoadAs(in + 8 * 23) >> 13 | SafeLoadAs(in + 8 * 24) << 51, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 4; - - // extract 55-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 24), - SafeLoadAs(in + 8 * 24) >> 59 | SafeLoadAs(in + 8 * 25) << 5, - SafeLoadAs(in + 8 * 25) >> 50 | SafeLoadAs(in + 8 * 26) << 14, - SafeLoadAs(in + 8 * 26) >> 41 | SafeLoadAs(in + 8 * 27) << 23, - }; - shifts = simd_batch{ 4, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 55-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 27) >> 32 | SafeLoadAs(in + 8 * 28) << 32, - SafeLoadAs(in + 8 * 28) >> 23 | SafeLoadAs(in + 8 * 29) << 41, - SafeLoadAs(in + 8 * 29) >> 14 | SafeLoadAs(in + 8 * 30) << 50, - SafeLoadAs(in + 8 * 30), - }; - shifts = simd_batch{ 0, 0, 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 55-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 30) >> 60 | SafeLoadAs(in + 8 * 31) << 4, - SafeLoadAs(in + 8 * 31) >> 51 | SafeLoadAs(in + 8 * 32) << 13, - SafeLoadAs(in + 8 * 32) >> 42 | SafeLoadAs(in + 8 * 33) << 22, - SafeLoadAs(in + 8 * 33) >> 33 | SafeLoadAs(in + 8 * 34) << 31, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 55-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 34) >> 24 | SafeLoadAs(in + 8 * 35) << 40, - SafeLoadAs(in + 8 * 35) >> 15 | SafeLoadAs(in + 8 * 36) << 49, - SafeLoadAs(in + 8 * 36), - SafeLoadAs(in + 8 * 36) >> 61 | SafeLoadAs(in + 8 * 37) << 3, - }; - shifts = simd_batch{ 0, 0, 6, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 55-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 37) >> 52 | SafeLoadAs(in + 8 * 38) << 12, - SafeLoadAs(in + 8 * 38) >> 43 | SafeLoadAs(in + 8 * 39) << 21, - SafeLoadAs(in + 8 * 39) >> 34 | SafeLoadAs(in + 8 * 40) << 30, - SafeLoadAs(in + 8 * 40) >> 25 | 
SafeLoadAs(in + 8 * 41) << 39, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 55-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 41) >> 16 | SafeLoadAs(in + 8 * 42) << 48, - SafeLoadAs(in + 8 * 42), - SafeLoadAs(in + 8 * 42) >> 62 | SafeLoadAs(in + 8 * 43) << 2, - SafeLoadAs(in + 8 * 43) >> 53 | SafeLoadAs(in + 8 * 44) << 11, - }; - shifts = simd_batch{ 0, 7, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 55-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 44) >> 44 | SafeLoadAs(in + 8 * 45) << 20, - SafeLoadAs(in + 8 * 45) >> 35 | SafeLoadAs(in + 8 * 46) << 29, - SafeLoadAs(in + 8 * 46) >> 26 | SafeLoadAs(in + 8 * 47) << 38, - SafeLoadAs(in + 8 * 47) >> 17 | SafeLoadAs(in + 8 * 48) << 47, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 55-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 48), - SafeLoadAs(in + 8 * 48) >> 63 | SafeLoadAs(in + 8 * 49) << 1, - SafeLoadAs(in + 8 * 49) >> 54 | SafeLoadAs(in + 8 * 50) << 10, - SafeLoadAs(in + 8 * 50) >> 45 | SafeLoadAs(in + 8 * 51) << 19, - }; - shifts = simd_batch{ 8, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 55-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 51) >> 36 | SafeLoadAs(in + 8 * 52) << 28, - SafeLoadAs(in + 8 * 52) >> 27 | SafeLoadAs(in + 8 * 53) << 37, - SafeLoadAs(in + 8 * 53) >> 18 | SafeLoadAs(in + 8 * 54) << 46, - SafeLoadAs(in + 8 * 54), - }; - shifts = simd_batch{ 0, 0, 0, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 55 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked 
= 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0xffffffffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 56-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 56 | SafeLoadAs(in + 8 * 1) << 8, - SafeLoadAs(in + 8 * 1) >> 48 | SafeLoadAs(in + 8 * 2) << 16, - SafeLoadAs(in + 8 * 2) >> 40 | SafeLoadAs(in + 8 * 3) << 24, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 56-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 3) >> 32 | SafeLoadAs(in + 8 * 4) << 32, - SafeLoadAs(in + 8 * 4) >> 24 | SafeLoadAs(in + 8 * 5) << 40, - SafeLoadAs(in + 8 * 5) >> 16 | SafeLoadAs(in + 8 * 6) << 48, - SafeLoadAs(in + 8 * 6), - }; - shifts = simd_batch{ 0, 0, 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 56-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 7), - SafeLoadAs(in + 8 * 7) >> 56 | SafeLoadAs(in + 8 * 8) << 8, - SafeLoadAs(in + 8 * 8) >> 48 | SafeLoadAs(in + 8 * 9) << 16, - SafeLoadAs(in + 8 * 9) >> 40 | SafeLoadAs(in + 8 * 10) << 24, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 56-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 10) >> 32 | SafeLoadAs(in + 8 * 11) << 32, - SafeLoadAs(in + 8 * 11) >> 24 | SafeLoadAs(in + 8 * 12) << 40, - SafeLoadAs(in + 8 * 12) >> 16 | SafeLoadAs(in + 8 * 13) << 48, - SafeLoadAs(in + 8 * 13), - }; - shifts = simd_batch{ 0, 0, 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 56-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 14), - SafeLoadAs(in + 8 * 14) >> 56 | SafeLoadAs(in + 8 * 15) << 8, - SafeLoadAs(in + 8 * 15) >> 48 | 
SafeLoadAs(in + 8 * 16) << 16, - SafeLoadAs(in + 8 * 16) >> 40 | SafeLoadAs(in + 8 * 17) << 24, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 56-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 17) >> 32 | SafeLoadAs(in + 8 * 18) << 32, - SafeLoadAs(in + 8 * 18) >> 24 | SafeLoadAs(in + 8 * 19) << 40, - SafeLoadAs(in + 8 * 19) >> 16 | SafeLoadAs(in + 8 * 20) << 48, - SafeLoadAs(in + 8 * 20), - }; - shifts = simd_batch{ 0, 0, 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 56-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 21), - SafeLoadAs(in + 8 * 21) >> 56 | SafeLoadAs(in + 8 * 22) << 8, - SafeLoadAs(in + 8 * 22) >> 48 | SafeLoadAs(in + 8 * 23) << 16, - SafeLoadAs(in + 8 * 23) >> 40 | SafeLoadAs(in + 8 * 24) << 24, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 56-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 24) >> 32 | SafeLoadAs(in + 8 * 25) << 32, - SafeLoadAs(in + 8 * 25) >> 24 | SafeLoadAs(in + 8 * 26) << 40, - SafeLoadAs(in + 8 * 26) >> 16 | SafeLoadAs(in + 8 * 27) << 48, - SafeLoadAs(in + 8 * 27), - }; - shifts = simd_batch{ 0, 0, 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 56-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 28), - SafeLoadAs(in + 8 * 28) >> 56 | SafeLoadAs(in + 8 * 29) << 8, - SafeLoadAs(in + 8 * 29) >> 48 | SafeLoadAs(in + 8 * 30) << 16, - SafeLoadAs(in + 8 * 30) >> 40 | SafeLoadAs(in + 8 * 31) << 24, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 56-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 31) >> 32 | SafeLoadAs(in + 8 * 32) << 32, - SafeLoadAs(in + 8 * 
32) >> 24 | SafeLoadAs(in + 8 * 33) << 40, - SafeLoadAs(in + 8 * 33) >> 16 | SafeLoadAs(in + 8 * 34) << 48, - SafeLoadAs(in + 8 * 34), - }; - shifts = simd_batch{ 0, 0, 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 56-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 35), - SafeLoadAs(in + 8 * 35) >> 56 | SafeLoadAs(in + 8 * 36) << 8, - SafeLoadAs(in + 8 * 36) >> 48 | SafeLoadAs(in + 8 * 37) << 16, - SafeLoadAs(in + 8 * 37) >> 40 | SafeLoadAs(in + 8 * 38) << 24, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 56-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 38) >> 32 | SafeLoadAs(in + 8 * 39) << 32, - SafeLoadAs(in + 8 * 39) >> 24 | SafeLoadAs(in + 8 * 40) << 40, - SafeLoadAs(in + 8 * 40) >> 16 | SafeLoadAs(in + 8 * 41) << 48, - SafeLoadAs(in + 8 * 41), - }; - shifts = simd_batch{ 0, 0, 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 56-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 42), - SafeLoadAs(in + 8 * 42) >> 56 | SafeLoadAs(in + 8 * 43) << 8, - SafeLoadAs(in + 8 * 43) >> 48 | SafeLoadAs(in + 8 * 44) << 16, - SafeLoadAs(in + 8 * 44) >> 40 | SafeLoadAs(in + 8 * 45) << 24, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 56-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 45) >> 32 | SafeLoadAs(in + 8 * 46) << 32, - SafeLoadAs(in + 8 * 46) >> 24 | SafeLoadAs(in + 8 * 47) << 40, - SafeLoadAs(in + 8 * 47) >> 16 | SafeLoadAs(in + 8 * 48) << 48, - SafeLoadAs(in + 8 * 48), - }; - shifts = simd_batch{ 0, 0, 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 56-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 49), - SafeLoadAs(in + 8 * 
49) >> 56 | SafeLoadAs(in + 8 * 50) << 8, - SafeLoadAs(in + 8 * 50) >> 48 | SafeLoadAs(in + 8 * 51) << 16, - SafeLoadAs(in + 8 * 51) >> 40 | SafeLoadAs(in + 8 * 52) << 24, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 56-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 52) >> 32 | SafeLoadAs(in + 8 * 53) << 32, - SafeLoadAs(in + 8 * 53) >> 24 | SafeLoadAs(in + 8 * 54) << 40, - SafeLoadAs(in + 8 * 54) >> 16 | SafeLoadAs(in + 8 * 55) << 48, - SafeLoadAs(in + 8 * 55), - }; - shifts = simd_batch{ 0, 0, 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 56 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x1ffffffffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 57-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 57 | SafeLoadAs(in + 8 * 1) << 7, - SafeLoadAs(in + 8 * 1) >> 50 | SafeLoadAs(in + 8 * 2) << 14, - SafeLoadAs(in + 8 * 2) >> 43 | SafeLoadAs(in + 8 * 3) << 21, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 57-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 3) >> 36 | SafeLoadAs(in + 8 * 4) << 28, - SafeLoadAs(in + 8 * 4) >> 29 | SafeLoadAs(in + 8 * 5) << 35, - SafeLoadAs(in + 8 * 5) >> 22 | SafeLoadAs(in + 8 * 6) << 42, - SafeLoadAs(in + 8 * 6) >> 15 | SafeLoadAs(in + 8 * 7) << 49, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 57-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 7) 
>> 8 | SafeLoadAs(in + 8 * 8) << 56, - SafeLoadAs(in + 8 * 8), - SafeLoadAs(in + 8 * 8) >> 58 | SafeLoadAs(in + 8 * 9) << 6, - SafeLoadAs(in + 8 * 9) >> 51 | SafeLoadAs(in + 8 * 10) << 13, - }; - shifts = simd_batch{ 0, 1, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 57-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 10) >> 44 | SafeLoadAs(in + 8 * 11) << 20, - SafeLoadAs(in + 8 * 11) >> 37 | SafeLoadAs(in + 8 * 12) << 27, - SafeLoadAs(in + 8 * 12) >> 30 | SafeLoadAs(in + 8 * 13) << 34, - SafeLoadAs(in + 8 * 13) >> 23 | SafeLoadAs(in + 8 * 14) << 41, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 57-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 14) >> 16 | SafeLoadAs(in + 8 * 15) << 48, - SafeLoadAs(in + 8 * 15) >> 9 | SafeLoadAs(in + 8 * 16) << 55, - SafeLoadAs(in + 8 * 16), - SafeLoadAs(in + 8 * 16) >> 59 | SafeLoadAs(in + 8 * 17) << 5, - }; - shifts = simd_batch{ 0, 0, 2, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 57-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 17) >> 52 | SafeLoadAs(in + 8 * 18) << 12, - SafeLoadAs(in + 8 * 18) >> 45 | SafeLoadAs(in + 8 * 19) << 19, - SafeLoadAs(in + 8 * 19) >> 38 | SafeLoadAs(in + 8 * 20) << 26, - SafeLoadAs(in + 8 * 20) >> 31 | SafeLoadAs(in + 8 * 21) << 33, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 57-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 21) >> 24 | SafeLoadAs(in + 8 * 22) << 40, - SafeLoadAs(in + 8 * 22) >> 17 | SafeLoadAs(in + 8 * 23) << 47, - SafeLoadAs(in + 8 * 23) >> 10 | SafeLoadAs(in + 8 * 24) << 54, - SafeLoadAs(in + 8 * 24), - }; - shifts = simd_batch{ 0, 0, 0, 3 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 4; - - // extract 57-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 24) >> 60 | SafeLoadAs(in + 8 * 25) << 4, - SafeLoadAs(in + 8 * 25) >> 53 | SafeLoadAs(in + 8 * 26) << 11, - SafeLoadAs(in + 8 * 26) >> 46 | SafeLoadAs(in + 8 * 27) << 18, - SafeLoadAs(in + 8 * 27) >> 39 | SafeLoadAs(in + 8 * 28) << 25, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 57-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 28) >> 32 | SafeLoadAs(in + 8 * 29) << 32, - SafeLoadAs(in + 8 * 29) >> 25 | SafeLoadAs(in + 8 * 30) << 39, - SafeLoadAs(in + 8 * 30) >> 18 | SafeLoadAs(in + 8 * 31) << 46, - SafeLoadAs(in + 8 * 31) >> 11 | SafeLoadAs(in + 8 * 32) << 53, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 57-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 32), - SafeLoadAs(in + 8 * 32) >> 61 | SafeLoadAs(in + 8 * 33) << 3, - SafeLoadAs(in + 8 * 33) >> 54 | SafeLoadAs(in + 8 * 34) << 10, - SafeLoadAs(in + 8 * 34) >> 47 | SafeLoadAs(in + 8 * 35) << 17, - }; - shifts = simd_batch{ 4, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 57-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 35) >> 40 | SafeLoadAs(in + 8 * 36) << 24, - SafeLoadAs(in + 8 * 36) >> 33 | SafeLoadAs(in + 8 * 37) << 31, - SafeLoadAs(in + 8 * 37) >> 26 | SafeLoadAs(in + 8 * 38) << 38, - SafeLoadAs(in + 8 * 38) >> 19 | SafeLoadAs(in + 8 * 39) << 45, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 57-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 39) >> 12 | SafeLoadAs(in + 8 * 40) << 52, - SafeLoadAs(in + 8 * 40), - SafeLoadAs(in + 8 * 40) >> 62 | SafeLoadAs(in + 8 * 41) << 
2, - SafeLoadAs(in + 8 * 41) >> 55 | SafeLoadAs(in + 8 * 42) << 9, - }; - shifts = simd_batch{ 0, 5, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 57-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 42) >> 48 | SafeLoadAs(in + 8 * 43) << 16, - SafeLoadAs(in + 8 * 43) >> 41 | SafeLoadAs(in + 8 * 44) << 23, - SafeLoadAs(in + 8 * 44) >> 34 | SafeLoadAs(in + 8 * 45) << 30, - SafeLoadAs(in + 8 * 45) >> 27 | SafeLoadAs(in + 8 * 46) << 37, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 57-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 46) >> 20 | SafeLoadAs(in + 8 * 47) << 44, - SafeLoadAs(in + 8 * 47) >> 13 | SafeLoadAs(in + 8 * 48) << 51, - SafeLoadAs(in + 8 * 48), - SafeLoadAs(in + 8 * 48) >> 63 | SafeLoadAs(in + 8 * 49) << 1, - }; - shifts = simd_batch{ 0, 0, 6, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 57-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 49) >> 56 | SafeLoadAs(in + 8 * 50) << 8, - SafeLoadAs(in + 8 * 50) >> 49 | SafeLoadAs(in + 8 * 51) << 15, - SafeLoadAs(in + 8 * 51) >> 42 | SafeLoadAs(in + 8 * 52) << 22, - SafeLoadAs(in + 8 * 52) >> 35 | SafeLoadAs(in + 8 * 53) << 29, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 57-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 53) >> 28 | SafeLoadAs(in + 8 * 54) << 36, - SafeLoadAs(in + 8 * 54) >> 21 | SafeLoadAs(in + 8 * 55) << 43, - SafeLoadAs(in + 8 * 55) >> 14 | SafeLoadAs(in + 8 * 56) << 50, - SafeLoadAs(in + 8 * 56), - }; - shifts = simd_batch{ 0, 0, 0, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 57 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using 
simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x3ffffffffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 58-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 58 | SafeLoadAs(in + 8 * 1) << 6, - SafeLoadAs(in + 8 * 1) >> 52 | SafeLoadAs(in + 8 * 2) << 12, - SafeLoadAs(in + 8 * 2) >> 46 | SafeLoadAs(in + 8 * 3) << 18, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 58-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 3) >> 40 | SafeLoadAs(in + 8 * 4) << 24, - SafeLoadAs(in + 8 * 4) >> 34 | SafeLoadAs(in + 8 * 5) << 30, - SafeLoadAs(in + 8 * 5) >> 28 | SafeLoadAs(in + 8 * 6) << 36, - SafeLoadAs(in + 8 * 6) >> 22 | SafeLoadAs(in + 8 * 7) << 42, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 58-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 7) >> 16 | SafeLoadAs(in + 8 * 8) << 48, - SafeLoadAs(in + 8 * 8) >> 10 | SafeLoadAs(in + 8 * 9) << 54, - SafeLoadAs(in + 8 * 9), - SafeLoadAs(in + 8 * 9) >> 62 | SafeLoadAs(in + 8 * 10) << 2, - }; - shifts = simd_batch{ 0, 0, 4, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 58-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 10) >> 56 | SafeLoadAs(in + 8 * 11) << 8, - SafeLoadAs(in + 8 * 11) >> 50 | SafeLoadAs(in + 8 * 12) << 14, - SafeLoadAs(in + 8 * 12) >> 44 | SafeLoadAs(in + 8 * 13) << 20, - SafeLoadAs(in + 8 * 13) >> 38 | SafeLoadAs(in + 8 * 14) << 26, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 58-bit bundles 16 to 19 - 
words = simd_batch{ - SafeLoadAs(in + 8 * 14) >> 32 | SafeLoadAs(in + 8 * 15) << 32, - SafeLoadAs(in + 8 * 15) >> 26 | SafeLoadAs(in + 8 * 16) << 38, - SafeLoadAs(in + 8 * 16) >> 20 | SafeLoadAs(in + 8 * 17) << 44, - SafeLoadAs(in + 8 * 17) >> 14 | SafeLoadAs(in + 8 * 18) << 50, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 58-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 18) >> 8 | SafeLoadAs(in + 8 * 19) << 56, - SafeLoadAs(in + 8 * 19), - SafeLoadAs(in + 8 * 19) >> 60 | SafeLoadAs(in + 8 * 20) << 4, - SafeLoadAs(in + 8 * 20) >> 54 | SafeLoadAs(in + 8 * 21) << 10, - }; - shifts = simd_batch{ 0, 2, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 58-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 21) >> 48 | SafeLoadAs(in + 8 * 22) << 16, - SafeLoadAs(in + 8 * 22) >> 42 | SafeLoadAs(in + 8 * 23) << 22, - SafeLoadAs(in + 8 * 23) >> 36 | SafeLoadAs(in + 8 * 24) << 28, - SafeLoadAs(in + 8 * 24) >> 30 | SafeLoadAs(in + 8 * 25) << 34, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 58-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 25) >> 24 | SafeLoadAs(in + 8 * 26) << 40, - SafeLoadAs(in + 8 * 26) >> 18 | SafeLoadAs(in + 8 * 27) << 46, - SafeLoadAs(in + 8 * 27) >> 12 | SafeLoadAs(in + 8 * 28) << 52, - SafeLoadAs(in + 8 * 28), - }; - shifts = simd_batch{ 0, 0, 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 58-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 29), - SafeLoadAs(in + 8 * 29) >> 58 | SafeLoadAs(in + 8 * 30) << 6, - SafeLoadAs(in + 8 * 30) >> 52 | SafeLoadAs(in + 8 * 31) << 12, - SafeLoadAs(in + 8 * 31) >> 46 | SafeLoadAs(in + 8 * 32) << 18, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results 
= (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 58-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 32) >> 40 | SafeLoadAs(in + 8 * 33) << 24, - SafeLoadAs(in + 8 * 33) >> 34 | SafeLoadAs(in + 8 * 34) << 30, - SafeLoadAs(in + 8 * 34) >> 28 | SafeLoadAs(in + 8 * 35) << 36, - SafeLoadAs(in + 8 * 35) >> 22 | SafeLoadAs(in + 8 * 36) << 42, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 58-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 36) >> 16 | SafeLoadAs(in + 8 * 37) << 48, - SafeLoadAs(in + 8 * 37) >> 10 | SafeLoadAs(in + 8 * 38) << 54, - SafeLoadAs(in + 8 * 38), - SafeLoadAs(in + 8 * 38) >> 62 | SafeLoadAs(in + 8 * 39) << 2, - }; - shifts = simd_batch{ 0, 0, 4, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 58-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 39) >> 56 | SafeLoadAs(in + 8 * 40) << 8, - SafeLoadAs(in + 8 * 40) >> 50 | SafeLoadAs(in + 8 * 41) << 14, - SafeLoadAs(in + 8 * 41) >> 44 | SafeLoadAs(in + 8 * 42) << 20, - SafeLoadAs(in + 8 * 42) >> 38 | SafeLoadAs(in + 8 * 43) << 26, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 58-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 43) >> 32 | SafeLoadAs(in + 8 * 44) << 32, - SafeLoadAs(in + 8 * 44) >> 26 | SafeLoadAs(in + 8 * 45) << 38, - SafeLoadAs(in + 8 * 45) >> 20 | SafeLoadAs(in + 8 * 46) << 44, - SafeLoadAs(in + 8 * 46) >> 14 | SafeLoadAs(in + 8 * 47) << 50, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 58-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 47) >> 8 | SafeLoadAs(in + 8 * 48) << 56, - SafeLoadAs(in + 8 * 48), - SafeLoadAs(in + 8 * 48) >> 
60 | SafeLoadAs(in + 8 * 49) << 4, - SafeLoadAs(in + 8 * 49) >> 54 | SafeLoadAs(in + 8 * 50) << 10, - }; - shifts = simd_batch{ 0, 2, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 58-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 50) >> 48 | SafeLoadAs(in + 8 * 51) << 16, - SafeLoadAs(in + 8 * 51) >> 42 | SafeLoadAs(in + 8 * 52) << 22, - SafeLoadAs(in + 8 * 52) >> 36 | SafeLoadAs(in + 8 * 53) << 28, - SafeLoadAs(in + 8 * 53) >> 30 | SafeLoadAs(in + 8 * 54) << 34, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 58-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 54) >> 24 | SafeLoadAs(in + 8 * 55) << 40, - SafeLoadAs(in + 8 * 55) >> 18 | SafeLoadAs(in + 8 * 56) << 46, - SafeLoadAs(in + 8 * 56) >> 12 | SafeLoadAs(in + 8 * 57) << 52, - SafeLoadAs(in + 8 * 57), - }; - shifts = simd_batch{ 0, 0, 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 58 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x7ffffffffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 59-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 59 | SafeLoadAs(in + 8 * 1) << 5, - SafeLoadAs(in + 8 * 1) >> 54 | SafeLoadAs(in + 8 * 2) << 10, - SafeLoadAs(in + 8 * 2) >> 49 | SafeLoadAs(in + 8 * 3) << 15, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 59-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 3) >> 44 | SafeLoadAs(in + 8 * 4) << 20, - SafeLoadAs(in + 8 * 4) 
>> 39 | SafeLoadAs(in + 8 * 5) << 25, - SafeLoadAs(in + 8 * 5) >> 34 | SafeLoadAs(in + 8 * 6) << 30, - SafeLoadAs(in + 8 * 6) >> 29 | SafeLoadAs(in + 8 * 7) << 35, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 59-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 7) >> 24 | SafeLoadAs(in + 8 * 8) << 40, - SafeLoadAs(in + 8 * 8) >> 19 | SafeLoadAs(in + 8 * 9) << 45, - SafeLoadAs(in + 8 * 9) >> 14 | SafeLoadAs(in + 8 * 10) << 50, - SafeLoadAs(in + 8 * 10) >> 9 | SafeLoadAs(in + 8 * 11) << 55, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 59-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 11), - SafeLoadAs(in + 8 * 11) >> 63 | SafeLoadAs(in + 8 * 12) << 1, - SafeLoadAs(in + 8 * 12) >> 58 | SafeLoadAs(in + 8 * 13) << 6, - SafeLoadAs(in + 8 * 13) >> 53 | SafeLoadAs(in + 8 * 14) << 11, - }; - shifts = simd_batch{ 4, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 59-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 14) >> 48 | SafeLoadAs(in + 8 * 15) << 16, - SafeLoadAs(in + 8 * 15) >> 43 | SafeLoadAs(in + 8 * 16) << 21, - SafeLoadAs(in + 8 * 16) >> 38 | SafeLoadAs(in + 8 * 17) << 26, - SafeLoadAs(in + 8 * 17) >> 33 | SafeLoadAs(in + 8 * 18) << 31, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 59-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 18) >> 28 | SafeLoadAs(in + 8 * 19) << 36, - SafeLoadAs(in + 8 * 19) >> 23 | SafeLoadAs(in + 8 * 20) << 41, - SafeLoadAs(in + 8 * 20) >> 18 | SafeLoadAs(in + 8 * 21) << 46, - SafeLoadAs(in + 8 * 21) >> 13 | SafeLoadAs(in + 8 * 22) << 51, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - 
results.store_unaligned(out); - out += 4; - - // extract 59-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 22) >> 8 | SafeLoadAs(in + 8 * 23) << 56, - SafeLoadAs(in + 8 * 23), - SafeLoadAs(in + 8 * 23) >> 62 | SafeLoadAs(in + 8 * 24) << 2, - SafeLoadAs(in + 8 * 24) >> 57 | SafeLoadAs(in + 8 * 25) << 7, - }; - shifts = simd_batch{ 0, 3, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 59-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 25) >> 52 | SafeLoadAs(in + 8 * 26) << 12, - SafeLoadAs(in + 8 * 26) >> 47 | SafeLoadAs(in + 8 * 27) << 17, - SafeLoadAs(in + 8 * 27) >> 42 | SafeLoadAs(in + 8 * 28) << 22, - SafeLoadAs(in + 8 * 28) >> 37 | SafeLoadAs(in + 8 * 29) << 27, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 59-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 29) >> 32 | SafeLoadAs(in + 8 * 30) << 32, - SafeLoadAs(in + 8 * 30) >> 27 | SafeLoadAs(in + 8 * 31) << 37, - SafeLoadAs(in + 8 * 31) >> 22 | SafeLoadAs(in + 8 * 32) << 42, - SafeLoadAs(in + 8 * 32) >> 17 | SafeLoadAs(in + 8 * 33) << 47, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 59-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 33) >> 12 | SafeLoadAs(in + 8 * 34) << 52, - SafeLoadAs(in + 8 * 34) >> 7 | SafeLoadAs(in + 8 * 35) << 57, - SafeLoadAs(in + 8 * 35), - SafeLoadAs(in + 8 * 35) >> 61 | SafeLoadAs(in + 8 * 36) << 3, - }; - shifts = simd_batch{ 0, 0, 2, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 59-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 36) >> 56 | SafeLoadAs(in + 8 * 37) << 8, - SafeLoadAs(in + 8 * 37) >> 51 | SafeLoadAs(in + 8 * 38) << 13, - SafeLoadAs(in + 8 * 38) >> 46 | SafeLoadAs(in + 8 * 39) << 18, 
- SafeLoadAs(in + 8 * 39) >> 41 | SafeLoadAs(in + 8 * 40) << 23, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 59-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 40) >> 36 | SafeLoadAs(in + 8 * 41) << 28, - SafeLoadAs(in + 8 * 41) >> 31 | SafeLoadAs(in + 8 * 42) << 33, - SafeLoadAs(in + 8 * 42) >> 26 | SafeLoadAs(in + 8 * 43) << 38, - SafeLoadAs(in + 8 * 43) >> 21 | SafeLoadAs(in + 8 * 44) << 43, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 59-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 44) >> 16 | SafeLoadAs(in + 8 * 45) << 48, - SafeLoadAs(in + 8 * 45) >> 11 | SafeLoadAs(in + 8 * 46) << 53, - SafeLoadAs(in + 8 * 46) >> 6 | SafeLoadAs(in + 8 * 47) << 58, - SafeLoadAs(in + 8 * 47), - }; - shifts = simd_batch{ 0, 0, 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 59-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 47) >> 60 | SafeLoadAs(in + 8 * 48) << 4, - SafeLoadAs(in + 8 * 48) >> 55 | SafeLoadAs(in + 8 * 49) << 9, - SafeLoadAs(in + 8 * 49) >> 50 | SafeLoadAs(in + 8 * 50) << 14, - SafeLoadAs(in + 8 * 50) >> 45 | SafeLoadAs(in + 8 * 51) << 19, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 59-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 51) >> 40 | SafeLoadAs(in + 8 * 52) << 24, - SafeLoadAs(in + 8 * 52) >> 35 | SafeLoadAs(in + 8 * 53) << 29, - SafeLoadAs(in + 8 * 53) >> 30 | SafeLoadAs(in + 8 * 54) << 34, - SafeLoadAs(in + 8 * 54) >> 25 | SafeLoadAs(in + 8 * 55) << 39, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 59-bit bundles 60 to 63 - words = simd_batch{ - 
SafeLoadAs(in + 8 * 55) >> 20 | SafeLoadAs(in + 8 * 56) << 44, - SafeLoadAs(in + 8 * 56) >> 15 | SafeLoadAs(in + 8 * 57) << 49, - SafeLoadAs(in + 8 * 57) >> 10 | SafeLoadAs(in + 8 * 58) << 54, - SafeLoadAs(in + 8 * 58), - }; - shifts = simd_batch{ 0, 0, 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 59 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0xfffffffffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 60-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 60 | SafeLoadAs(in + 8 * 1) << 4, - SafeLoadAs(in + 8 * 1) >> 56 | SafeLoadAs(in + 8 * 2) << 8, - SafeLoadAs(in + 8 * 2) >> 52 | SafeLoadAs(in + 8 * 3) << 12, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 60-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 3) >> 48 | SafeLoadAs(in + 8 * 4) << 16, - SafeLoadAs(in + 8 * 4) >> 44 | SafeLoadAs(in + 8 * 5) << 20, - SafeLoadAs(in + 8 * 5) >> 40 | SafeLoadAs(in + 8 * 6) << 24, - SafeLoadAs(in + 8 * 6) >> 36 | SafeLoadAs(in + 8 * 7) << 28, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 60-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 7) >> 32 | SafeLoadAs(in + 8 * 8) << 32, - SafeLoadAs(in + 8 * 8) >> 28 | SafeLoadAs(in + 8 * 9) << 36, - SafeLoadAs(in + 8 * 9) >> 24 | SafeLoadAs(in + 8 * 10) << 40, - SafeLoadAs(in + 8 * 10) >> 20 | SafeLoadAs(in + 8 * 11) << 44, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - 
// extract 60-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 11) >> 16 | SafeLoadAs(in + 8 * 12) << 48, - SafeLoadAs(in + 8 * 12) >> 12 | SafeLoadAs(in + 8 * 13) << 52, - SafeLoadAs(in + 8 * 13) >> 8 | SafeLoadAs(in + 8 * 14) << 56, - SafeLoadAs(in + 8 * 14), - }; - shifts = simd_batch{ 0, 0, 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 60-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 15), - SafeLoadAs(in + 8 * 15) >> 60 | SafeLoadAs(in + 8 * 16) << 4, - SafeLoadAs(in + 8 * 16) >> 56 | SafeLoadAs(in + 8 * 17) << 8, - SafeLoadAs(in + 8 * 17) >> 52 | SafeLoadAs(in + 8 * 18) << 12, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 60-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 18) >> 48 | SafeLoadAs(in + 8 * 19) << 16, - SafeLoadAs(in + 8 * 19) >> 44 | SafeLoadAs(in + 8 * 20) << 20, - SafeLoadAs(in + 8 * 20) >> 40 | SafeLoadAs(in + 8 * 21) << 24, - SafeLoadAs(in + 8 * 21) >> 36 | SafeLoadAs(in + 8 * 22) << 28, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 60-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 22) >> 32 | SafeLoadAs(in + 8 * 23) << 32, - SafeLoadAs(in + 8 * 23) >> 28 | SafeLoadAs(in + 8 * 24) << 36, - SafeLoadAs(in + 8 * 24) >> 24 | SafeLoadAs(in + 8 * 25) << 40, - SafeLoadAs(in + 8 * 25) >> 20 | SafeLoadAs(in + 8 * 26) << 44, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 60-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 26) >> 16 | SafeLoadAs(in + 8 * 27) << 48, - SafeLoadAs(in + 8 * 27) >> 12 | SafeLoadAs(in + 8 * 28) << 52, - SafeLoadAs(in + 8 * 28) >> 8 | SafeLoadAs(in + 8 * 29) << 56, - SafeLoadAs(in + 8 * 29), - }; - shifts = 
simd_batch{ 0, 0, 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 60-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 30), - SafeLoadAs(in + 8 * 30) >> 60 | SafeLoadAs(in + 8 * 31) << 4, - SafeLoadAs(in + 8 * 31) >> 56 | SafeLoadAs(in + 8 * 32) << 8, - SafeLoadAs(in + 8 * 32) >> 52 | SafeLoadAs(in + 8 * 33) << 12, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 60-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 33) >> 48 | SafeLoadAs(in + 8 * 34) << 16, - SafeLoadAs(in + 8 * 34) >> 44 | SafeLoadAs(in + 8 * 35) << 20, - SafeLoadAs(in + 8 * 35) >> 40 | SafeLoadAs(in + 8 * 36) << 24, - SafeLoadAs(in + 8 * 36) >> 36 | SafeLoadAs(in + 8 * 37) << 28, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 60-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 37) >> 32 | SafeLoadAs(in + 8 * 38) << 32, - SafeLoadAs(in + 8 * 38) >> 28 | SafeLoadAs(in + 8 * 39) << 36, - SafeLoadAs(in + 8 * 39) >> 24 | SafeLoadAs(in + 8 * 40) << 40, - SafeLoadAs(in + 8 * 40) >> 20 | SafeLoadAs(in + 8 * 41) << 44, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 60-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 41) >> 16 | SafeLoadAs(in + 8 * 42) << 48, - SafeLoadAs(in + 8 * 42) >> 12 | SafeLoadAs(in + 8 * 43) << 52, - SafeLoadAs(in + 8 * 43) >> 8 | SafeLoadAs(in + 8 * 44) << 56, - SafeLoadAs(in + 8 * 44), - }; - shifts = simd_batch{ 0, 0, 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 60-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 45), - SafeLoadAs(in + 8 * 45) >> 60 | SafeLoadAs(in + 8 * 46) << 4, - SafeLoadAs(in + 8 * 46) >> 56 | 
SafeLoadAs(in + 8 * 47) << 8, - SafeLoadAs(in + 8 * 47) >> 52 | SafeLoadAs(in + 8 * 48) << 12, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 60-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 48) >> 48 | SafeLoadAs(in + 8 * 49) << 16, - SafeLoadAs(in + 8 * 49) >> 44 | SafeLoadAs(in + 8 * 50) << 20, - SafeLoadAs(in + 8 * 50) >> 40 | SafeLoadAs(in + 8 * 51) << 24, - SafeLoadAs(in + 8 * 51) >> 36 | SafeLoadAs(in + 8 * 52) << 28, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 60-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 52) >> 32 | SafeLoadAs(in + 8 * 53) << 32, - SafeLoadAs(in + 8 * 53) >> 28 | SafeLoadAs(in + 8 * 54) << 36, - SafeLoadAs(in + 8 * 54) >> 24 | SafeLoadAs(in + 8 * 55) << 40, - SafeLoadAs(in + 8 * 55) >> 20 | SafeLoadAs(in + 8 * 56) << 44, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 60-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 56) >> 16 | SafeLoadAs(in + 8 * 57) << 48, - SafeLoadAs(in + 8 * 57) >> 12 | SafeLoadAs(in + 8 * 58) << 52, - SafeLoadAs(in + 8 * 58) >> 8 | SafeLoadAs(in + 8 * 59) << 56, - SafeLoadAs(in + 8 * 59), - }; - shifts = simd_batch{ 0, 0, 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 60 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x1fffffffffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 61-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 
8 * 0) >> 61 | SafeLoadAs(in + 8 * 1) << 3, - SafeLoadAs(in + 8 * 1) >> 58 | SafeLoadAs(in + 8 * 2) << 6, - SafeLoadAs(in + 8 * 2) >> 55 | SafeLoadAs(in + 8 * 3) << 9, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 61-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 3) >> 52 | SafeLoadAs(in + 8 * 4) << 12, - SafeLoadAs(in + 8 * 4) >> 49 | SafeLoadAs(in + 8 * 5) << 15, - SafeLoadAs(in + 8 * 5) >> 46 | SafeLoadAs(in + 8 * 6) << 18, - SafeLoadAs(in + 8 * 6) >> 43 | SafeLoadAs(in + 8 * 7) << 21, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 61-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 7) >> 40 | SafeLoadAs(in + 8 * 8) << 24, - SafeLoadAs(in + 8 * 8) >> 37 | SafeLoadAs(in + 8 * 9) << 27, - SafeLoadAs(in + 8 * 9) >> 34 | SafeLoadAs(in + 8 * 10) << 30, - SafeLoadAs(in + 8 * 10) >> 31 | SafeLoadAs(in + 8 * 11) << 33, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 61-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 11) >> 28 | SafeLoadAs(in + 8 * 12) << 36, - SafeLoadAs(in + 8 * 12) >> 25 | SafeLoadAs(in + 8 * 13) << 39, - SafeLoadAs(in + 8 * 13) >> 22 | SafeLoadAs(in + 8 * 14) << 42, - SafeLoadAs(in + 8 * 14) >> 19 | SafeLoadAs(in + 8 * 15) << 45, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 61-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 15) >> 16 | SafeLoadAs(in + 8 * 16) << 48, - SafeLoadAs(in + 8 * 16) >> 13 | SafeLoadAs(in + 8 * 17) << 51, - SafeLoadAs(in + 8 * 17) >> 10 | SafeLoadAs(in + 8 * 18) << 54, - SafeLoadAs(in + 8 * 18) >> 7 | SafeLoadAs(in + 8 * 19) << 57, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> 
shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 61-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 19) >> 4 | SafeLoadAs(in + 8 * 20) << 60, - SafeLoadAs(in + 8 * 20), - SafeLoadAs(in + 8 * 20) >> 62 | SafeLoadAs(in + 8 * 21) << 2, - SafeLoadAs(in + 8 * 21) >> 59 | SafeLoadAs(in + 8 * 22) << 5, - }; - shifts = simd_batch{ 0, 1, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 61-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 22) >> 56 | SafeLoadAs(in + 8 * 23) << 8, - SafeLoadAs(in + 8 * 23) >> 53 | SafeLoadAs(in + 8 * 24) << 11, - SafeLoadAs(in + 8 * 24) >> 50 | SafeLoadAs(in + 8 * 25) << 14, - SafeLoadAs(in + 8 * 25) >> 47 | SafeLoadAs(in + 8 * 26) << 17, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 61-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 26) >> 44 | SafeLoadAs(in + 8 * 27) << 20, - SafeLoadAs(in + 8 * 27) >> 41 | SafeLoadAs(in + 8 * 28) << 23, - SafeLoadAs(in + 8 * 28) >> 38 | SafeLoadAs(in + 8 * 29) << 26, - SafeLoadAs(in + 8 * 29) >> 35 | SafeLoadAs(in + 8 * 30) << 29, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 61-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 30) >> 32 | SafeLoadAs(in + 8 * 31) << 32, - SafeLoadAs(in + 8 * 31) >> 29 | SafeLoadAs(in + 8 * 32) << 35, - SafeLoadAs(in + 8 * 32) >> 26 | SafeLoadAs(in + 8 * 33) << 38, - SafeLoadAs(in + 8 * 33) >> 23 | SafeLoadAs(in + 8 * 34) << 41, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 61-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 34) >> 20 | SafeLoadAs(in + 8 * 35) << 44, - SafeLoadAs(in + 8 * 35) >> 17 | SafeLoadAs(in + 8 * 36) << 47, - 
SafeLoadAs(in + 8 * 36) >> 14 | SafeLoadAs(in + 8 * 37) << 50, - SafeLoadAs(in + 8 * 37) >> 11 | SafeLoadAs(in + 8 * 38) << 53, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 61-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 38) >> 8 | SafeLoadAs(in + 8 * 39) << 56, - SafeLoadAs(in + 8 * 39) >> 5 | SafeLoadAs(in + 8 * 40) << 59, - SafeLoadAs(in + 8 * 40), - SafeLoadAs(in + 8 * 40) >> 63 | SafeLoadAs(in + 8 * 41) << 1, - }; - shifts = simd_batch{ 0, 0, 2, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 61-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 41) >> 60 | SafeLoadAs(in + 8 * 42) << 4, - SafeLoadAs(in + 8 * 42) >> 57 | SafeLoadAs(in + 8 * 43) << 7, - SafeLoadAs(in + 8 * 43) >> 54 | SafeLoadAs(in + 8 * 44) << 10, - SafeLoadAs(in + 8 * 44) >> 51 | SafeLoadAs(in + 8 * 45) << 13, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 61-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 45) >> 48 | SafeLoadAs(in + 8 * 46) << 16, - SafeLoadAs(in + 8 * 46) >> 45 | SafeLoadAs(in + 8 * 47) << 19, - SafeLoadAs(in + 8 * 47) >> 42 | SafeLoadAs(in + 8 * 48) << 22, - SafeLoadAs(in + 8 * 48) >> 39 | SafeLoadAs(in + 8 * 49) << 25, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 61-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 49) >> 36 | SafeLoadAs(in + 8 * 50) << 28, - SafeLoadAs(in + 8 * 50) >> 33 | SafeLoadAs(in + 8 * 51) << 31, - SafeLoadAs(in + 8 * 51) >> 30 | SafeLoadAs(in + 8 * 52) << 34, - SafeLoadAs(in + 8 * 52) >> 27 | SafeLoadAs(in + 8 * 53) << 37, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - 
// extract 61-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 53) >> 24 | SafeLoadAs(in + 8 * 54) << 40, - SafeLoadAs(in + 8 * 54) >> 21 | SafeLoadAs(in + 8 * 55) << 43, - SafeLoadAs(in + 8 * 55) >> 18 | SafeLoadAs(in + 8 * 56) << 46, - SafeLoadAs(in + 8 * 56) >> 15 | SafeLoadAs(in + 8 * 57) << 49, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 61-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 57) >> 12 | SafeLoadAs(in + 8 * 58) << 52, - SafeLoadAs(in + 8 * 58) >> 9 | SafeLoadAs(in + 8 * 59) << 55, - SafeLoadAs(in + 8 * 59) >> 6 | SafeLoadAs(in + 8 * 60) << 58, - SafeLoadAs(in + 8 * 60), - }; - shifts = simd_batch{ 0, 0, 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 61 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x3fffffffffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 62-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 62 | SafeLoadAs(in + 8 * 1) << 2, - SafeLoadAs(in + 8 * 1) >> 60 | SafeLoadAs(in + 8 * 2) << 4, - SafeLoadAs(in + 8 * 2) >> 58 | SafeLoadAs(in + 8 * 3) << 6, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 62-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 3) >> 56 | SafeLoadAs(in + 8 * 4) << 8, - SafeLoadAs(in + 8 * 4) >> 54 | SafeLoadAs(in + 8 * 5) << 10, - SafeLoadAs(in + 8 * 5) >> 52 | SafeLoadAs(in + 8 * 6) << 12, - SafeLoadAs(in + 8 * 6) >> 50 | SafeLoadAs(in + 8 * 7) << 14, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> 
shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 62-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 7) >> 48 | SafeLoadAs(in + 8 * 8) << 16, - SafeLoadAs(in + 8 * 8) >> 46 | SafeLoadAs(in + 8 * 9) << 18, - SafeLoadAs(in + 8 * 9) >> 44 | SafeLoadAs(in + 8 * 10) << 20, - SafeLoadAs(in + 8 * 10) >> 42 | SafeLoadAs(in + 8 * 11) << 22, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 62-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 11) >> 40 | SafeLoadAs(in + 8 * 12) << 24, - SafeLoadAs(in + 8 * 12) >> 38 | SafeLoadAs(in + 8 * 13) << 26, - SafeLoadAs(in + 8 * 13) >> 36 | SafeLoadAs(in + 8 * 14) << 28, - SafeLoadAs(in + 8 * 14) >> 34 | SafeLoadAs(in + 8 * 15) << 30, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 62-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 15) >> 32 | SafeLoadAs(in + 8 * 16) << 32, - SafeLoadAs(in + 8 * 16) >> 30 | SafeLoadAs(in + 8 * 17) << 34, - SafeLoadAs(in + 8 * 17) >> 28 | SafeLoadAs(in + 8 * 18) << 36, - SafeLoadAs(in + 8 * 18) >> 26 | SafeLoadAs(in + 8 * 19) << 38, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 62-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 19) >> 24 | SafeLoadAs(in + 8 * 20) << 40, - SafeLoadAs(in + 8 * 20) >> 22 | SafeLoadAs(in + 8 * 21) << 42, - SafeLoadAs(in + 8 * 21) >> 20 | SafeLoadAs(in + 8 * 22) << 44, - SafeLoadAs(in + 8 * 22) >> 18 | SafeLoadAs(in + 8 * 23) << 46, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 62-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 23) >> 16 | SafeLoadAs(in + 8 * 24) << 48, - SafeLoadAs(in + 8 * 24) >> 14 | 
SafeLoadAs(in + 8 * 25) << 50, - SafeLoadAs(in + 8 * 25) >> 12 | SafeLoadAs(in + 8 * 26) << 52, - SafeLoadAs(in + 8 * 26) >> 10 | SafeLoadAs(in + 8 * 27) << 54, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 62-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 27) >> 8 | SafeLoadAs(in + 8 * 28) << 56, - SafeLoadAs(in + 8 * 28) >> 6 | SafeLoadAs(in + 8 * 29) << 58, - SafeLoadAs(in + 8 * 29) >> 4 | SafeLoadAs(in + 8 * 30) << 60, - SafeLoadAs(in + 8 * 30), - }; - shifts = simd_batch{ 0, 0, 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 62-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 31), - SafeLoadAs(in + 8 * 31) >> 62 | SafeLoadAs(in + 8 * 32) << 2, - SafeLoadAs(in + 8 * 32) >> 60 | SafeLoadAs(in + 8 * 33) << 4, - SafeLoadAs(in + 8 * 33) >> 58 | SafeLoadAs(in + 8 * 34) << 6, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 62-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 34) >> 56 | SafeLoadAs(in + 8 * 35) << 8, - SafeLoadAs(in + 8 * 35) >> 54 | SafeLoadAs(in + 8 * 36) << 10, - SafeLoadAs(in + 8 * 36) >> 52 | SafeLoadAs(in + 8 * 37) << 12, - SafeLoadAs(in + 8 * 37) >> 50 | SafeLoadAs(in + 8 * 38) << 14, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 62-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 38) >> 48 | SafeLoadAs(in + 8 * 39) << 16, - SafeLoadAs(in + 8 * 39) >> 46 | SafeLoadAs(in + 8 * 40) << 18, - SafeLoadAs(in + 8 * 40) >> 44 | SafeLoadAs(in + 8 * 41) << 20, - SafeLoadAs(in + 8 * 41) >> 42 | SafeLoadAs(in + 8 * 42) << 22, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // 
extract 62-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 42) >> 40 | SafeLoadAs(in + 8 * 43) << 24, - SafeLoadAs(in + 8 * 43) >> 38 | SafeLoadAs(in + 8 * 44) << 26, - SafeLoadAs(in + 8 * 44) >> 36 | SafeLoadAs(in + 8 * 45) << 28, - SafeLoadAs(in + 8 * 45) >> 34 | SafeLoadAs(in + 8 * 46) << 30, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 62-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 46) >> 32 | SafeLoadAs(in + 8 * 47) << 32, - SafeLoadAs(in + 8 * 47) >> 30 | SafeLoadAs(in + 8 * 48) << 34, - SafeLoadAs(in + 8 * 48) >> 28 | SafeLoadAs(in + 8 * 49) << 36, - SafeLoadAs(in + 8 * 49) >> 26 | SafeLoadAs(in + 8 * 50) << 38, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 62-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 50) >> 24 | SafeLoadAs(in + 8 * 51) << 40, - SafeLoadAs(in + 8 * 51) >> 22 | SafeLoadAs(in + 8 * 52) << 42, - SafeLoadAs(in + 8 * 52) >> 20 | SafeLoadAs(in + 8 * 53) << 44, - SafeLoadAs(in + 8 * 53) >> 18 | SafeLoadAs(in + 8 * 54) << 46, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 62-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 54) >> 16 | SafeLoadAs(in + 8 * 55) << 48, - SafeLoadAs(in + 8 * 55) >> 14 | SafeLoadAs(in + 8 * 56) << 50, - SafeLoadAs(in + 8 * 56) >> 12 | SafeLoadAs(in + 8 * 57) << 52, - SafeLoadAs(in + 8 * 57) >> 10 | SafeLoadAs(in + 8 * 58) << 54, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 62-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 58) >> 8 | SafeLoadAs(in + 8 * 59) << 56, - SafeLoadAs(in + 8 * 59) >> 6 | SafeLoadAs(in + 8 * 60) << 58, - SafeLoadAs(in + 8 * 60) >> 4 | 
SafeLoadAs(in + 8 * 61) << 60, - SafeLoadAs(in + 8 * 61), - }; - shifts = simd_batch{ 0, 0, 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 62 * 8; - return in; - } -}; - -template<> -struct Simd256UnpackerForWidth { - - using simd_batch = xsimd::make_sized_batch_t; - static constexpr int kValuesUnpacked = 64; - - static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { - constexpr uint64_t kMask = 0x7fffffffffffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - // extract 63-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 8 * 0), - SafeLoadAs(in + 8 * 0) >> 63 | SafeLoadAs(in + 8 * 1) << 1, - SafeLoadAs(in + 8 * 1) >> 62 | SafeLoadAs(in + 8 * 2) << 2, - SafeLoadAs(in + 8 * 2) >> 61 | SafeLoadAs(in + 8 * 3) << 3, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 63-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 8 * 3) >> 60 | SafeLoadAs(in + 8 * 4) << 4, - SafeLoadAs(in + 8 * 4) >> 59 | SafeLoadAs(in + 8 * 5) << 5, - SafeLoadAs(in + 8 * 5) >> 58 | SafeLoadAs(in + 8 * 6) << 6, - SafeLoadAs(in + 8 * 6) >> 57 | SafeLoadAs(in + 8 * 7) << 7, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 63-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 8 * 7) >> 56 | SafeLoadAs(in + 8 * 8) << 8, - SafeLoadAs(in + 8 * 8) >> 55 | SafeLoadAs(in + 8 * 9) << 9, - SafeLoadAs(in + 8 * 9) >> 54 | SafeLoadAs(in + 8 * 10) << 10, - SafeLoadAs(in + 8 * 10) >> 53 | SafeLoadAs(in + 8 * 11) << 11, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 63-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 8 * 11) >> 52 | SafeLoadAs(in + 8 * 12) << 12, - SafeLoadAs(in + 8 * 12) >> 51 | SafeLoadAs(in 
+ 8 * 13) << 13, - SafeLoadAs(in + 8 * 13) >> 50 | SafeLoadAs(in + 8 * 14) << 14, - SafeLoadAs(in + 8 * 14) >> 49 | SafeLoadAs(in + 8 * 15) << 15, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 63-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 8 * 15) >> 48 | SafeLoadAs(in + 8 * 16) << 16, - SafeLoadAs(in + 8 * 16) >> 47 | SafeLoadAs(in + 8 * 17) << 17, - SafeLoadAs(in + 8 * 17) >> 46 | SafeLoadAs(in + 8 * 18) << 18, - SafeLoadAs(in + 8 * 18) >> 45 | SafeLoadAs(in + 8 * 19) << 19, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 63-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 8 * 19) >> 44 | SafeLoadAs(in + 8 * 20) << 20, - SafeLoadAs(in + 8 * 20) >> 43 | SafeLoadAs(in + 8 * 21) << 21, - SafeLoadAs(in + 8 * 21) >> 42 | SafeLoadAs(in + 8 * 22) << 22, - SafeLoadAs(in + 8 * 22) >> 41 | SafeLoadAs(in + 8 * 23) << 23, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 63-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 8 * 23) >> 40 | SafeLoadAs(in + 8 * 24) << 24, - SafeLoadAs(in + 8 * 24) >> 39 | SafeLoadAs(in + 8 * 25) << 25, - SafeLoadAs(in + 8 * 25) >> 38 | SafeLoadAs(in + 8 * 26) << 26, - SafeLoadAs(in + 8 * 26) >> 37 | SafeLoadAs(in + 8 * 27) << 27, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 63-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 8 * 27) >> 36 | SafeLoadAs(in + 8 * 28) << 28, - SafeLoadAs(in + 8 * 28) >> 35 | SafeLoadAs(in + 8 * 29) << 29, - SafeLoadAs(in + 8 * 29) >> 34 | SafeLoadAs(in + 8 * 30) << 30, - SafeLoadAs(in + 8 * 30) >> 33 | SafeLoadAs(in + 8 * 31) << 31, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> 
shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 63-bit bundles 32 to 35 - words = simd_batch{ - SafeLoadAs(in + 8 * 31) >> 32 | SafeLoadAs(in + 8 * 32) << 32, - SafeLoadAs(in + 8 * 32) >> 31 | SafeLoadAs(in + 8 * 33) << 33, - SafeLoadAs(in + 8 * 33) >> 30 | SafeLoadAs(in + 8 * 34) << 34, - SafeLoadAs(in + 8 * 34) >> 29 | SafeLoadAs(in + 8 * 35) << 35, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 63-bit bundles 36 to 39 - words = simd_batch{ - SafeLoadAs(in + 8 * 35) >> 28 | SafeLoadAs(in + 8 * 36) << 36, - SafeLoadAs(in + 8 * 36) >> 27 | SafeLoadAs(in + 8 * 37) << 37, - SafeLoadAs(in + 8 * 37) >> 26 | SafeLoadAs(in + 8 * 38) << 38, - SafeLoadAs(in + 8 * 38) >> 25 | SafeLoadAs(in + 8 * 39) << 39, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 63-bit bundles 40 to 43 - words = simd_batch{ - SafeLoadAs(in + 8 * 39) >> 24 | SafeLoadAs(in + 8 * 40) << 40, - SafeLoadAs(in + 8 * 40) >> 23 | SafeLoadAs(in + 8 * 41) << 41, - SafeLoadAs(in + 8 * 41) >> 22 | SafeLoadAs(in + 8 * 42) << 42, - SafeLoadAs(in + 8 * 42) >> 21 | SafeLoadAs(in + 8 * 43) << 43, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 63-bit bundles 44 to 47 - words = simd_batch{ - SafeLoadAs(in + 8 * 43) >> 20 | SafeLoadAs(in + 8 * 44) << 44, - SafeLoadAs(in + 8 * 44) >> 19 | SafeLoadAs(in + 8 * 45) << 45, - SafeLoadAs(in + 8 * 45) >> 18 | SafeLoadAs(in + 8 * 46) << 46, - SafeLoadAs(in + 8 * 46) >> 17 | SafeLoadAs(in + 8 * 47) << 47, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 63-bit bundles 48 to 51 - words = simd_batch{ - SafeLoadAs(in + 8 * 47) >> 16 | SafeLoadAs(in + 8 * 48) << 48, - SafeLoadAs(in + 8 * 48) >> 
15 | SafeLoadAs(in + 8 * 49) << 49, - SafeLoadAs(in + 8 * 49) >> 14 | SafeLoadAs(in + 8 * 50) << 50, - SafeLoadAs(in + 8 * 50) >> 13 | SafeLoadAs(in + 8 * 51) << 51, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 63-bit bundles 52 to 55 - words = simd_batch{ - SafeLoadAs(in + 8 * 51) >> 12 | SafeLoadAs(in + 8 * 52) << 52, - SafeLoadAs(in + 8 * 52) >> 11 | SafeLoadAs(in + 8 * 53) << 53, - SafeLoadAs(in + 8 * 53) >> 10 | SafeLoadAs(in + 8 * 54) << 54, - SafeLoadAs(in + 8 * 54) >> 9 | SafeLoadAs(in + 8 * 55) << 55, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 63-bit bundles 56 to 59 - words = simd_batch{ - SafeLoadAs(in + 8 * 55) >> 8 | SafeLoadAs(in + 8 * 56) << 56, - SafeLoadAs(in + 8 * 56) >> 7 | SafeLoadAs(in + 8 * 57) << 57, - SafeLoadAs(in + 8 * 57) >> 6 | SafeLoadAs(in + 8 * 58) << 58, - SafeLoadAs(in + 8 * 58) >> 5 | SafeLoadAs(in + 8 * 59) << 59, - }; - shifts = simd_batch{ 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 63-bit bundles 60 to 63 - words = simd_batch{ - SafeLoadAs(in + 8 * 59) >> 4 | SafeLoadAs(in + 8 * 60) << 60, - SafeLoadAs(in + 8 * 60) >> 3 | SafeLoadAs(in + 8 * 61) << 61, - SafeLoadAs(in + 8 * 61) >> 2 | SafeLoadAs(in + 8 * 62) << 62, - SafeLoadAs(in + 8 * 62), - }; - shifts = simd_batch{ 0, 0, 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 63 * 8; - return in; - } -}; - - -} // namespace -} // namespace arrow::internal From b38fd4d88a9a42f7d24f838b542d90bb0a23eaa0 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 30 Oct 2025 12:07:29 +0100 Subject: [PATCH 25/64] Remove uint8_t fallback --- .../arrow/util/bpacking_simd_impl_internal.h | 26 ++++++------------- 1 file changed, 8 insertions(+), 18 deletions(-) diff --git 
a/cpp/src/arrow/util/bpacking_simd_impl_internal.h b/cpp/src/arrow/util/bpacking_simd_impl_internal.h index cc67bafec43..1741de7cc39 100644 --- a/cpp/src/arrow/util/bpacking_simd_impl_internal.h +++ b/cpp/src/arrow/util/bpacking_simd_impl_internal.h @@ -584,7 +584,14 @@ struct LargeKernel { const auto low_swizzled = swizzle_bytes(bytes, kLowSwizzles); const auto low_words = xsimd::bitwise_cast(low_swizzled); - const auto low_shifted = right_shift_by_excess(low_words, kLowRShifts); + simd_batch low_shifted; + if constexpr (kShape.unpacked_byte_size() == 1) { + // The logic of the fallback in right_shift_by_excess does not work for this single + // byte case case, so we use directly xsimd and its scalar fallback. + low_shifted = low_words >> kLowRShifts; + } else { + low_shifted = right_shift_by_excess(low_words, kLowRShifts); + } const auto high_swizzled = swizzle_bytes(bytes, kHighSwizzles); const auto high_words = xsimd::bitwise_cast(high_swizzled); @@ -641,23 +648,6 @@ struct Kernel : DispatchKernelType { using Base::unpack; }; -template -struct Kernel - : Kernel { - using Base = DispatchKernelType; - using Base::kValuesUnpacked; - using unpacked_type = uint8_t; - - static const uint8_t* unpack(const uint8_t* in, unpacked_type* out) { - uint16_t buffer[kValuesUnpacked] = {}; - in = Base::unpack(in, buffer); - for (int k = 0; k < kValuesUnpacked; ++k) { - out[k] = static_cast(buffer[k]); - } - return in; - } -}; - template struct Kernel : Kernel { From 3cb34a6d77dae98aa1c88851b0190d9da342ad20 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 30 Oct 2025 15:35:14 +0100 Subject: [PATCH 26/64] Add boolean simd implementation --- .../arrow/util/bpacking_simd_impl_internal.h | 47 +++++++++---------- cpp/src/arrow/util/type_traits.h | 4 ++ 2 files changed, 25 insertions(+), 26 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_simd_impl_internal.h b/cpp/src/arrow/util/bpacking_simd_impl_internal.h index 1741de7cc39..6136b39c6e5 100644 --- 
a/cpp/src/arrow/util/bpacking_simd_impl_internal.h +++ b/cpp/src/arrow/util/bpacking_simd_impl_internal.h @@ -27,6 +27,7 @@ #include "arrow/util/bit_util.h" #include "arrow/util/bpacking_dispatch_internal.h" +#include "arrow/util/type_traits.h" namespace arrow::internal { @@ -38,7 +39,6 @@ namespace arrow::internal { // - array to batch constant to xsimd // - Shifts per swizzle can be improved when self.packed_max_byte_spread == 1 and the // byte can be reused (when val_bit_width divides packed_max_byte_spread). -// - Try for uint16_t and uint8_t and bool (currently copy) // - Add unpack_exact to benchmarks // - Reduce input size on small bit width using a broadcast. // - For Avx2: @@ -112,7 +112,10 @@ struct KernelTraits { }; using unpacked_type = UnpackedUint; - using simd_batch = xsimd::make_sized_batch_t; + // The integer type to work with, `unpacked_type` or an appropriate type for bool. + using uint_type = std::conditional_t, + SizedUint, unpacked_type>; + using simd_batch = xsimd::make_sized_batch_t; using simd_bytes = xsimd::make_sized_batch_t; using arch_type = typename simd_batch::arch_type; }; @@ -184,6 +187,7 @@ constexpr MediumKernelPlanSize BuildMediumPlanSize(const KernelShape& shape) { template struct MediumKernelPlan { using Traits = KernelTraits; + using uint_type = typename Traits::uint_type; static constexpr auto kShape = Traits::kShape; static constexpr auto kPlanSize = BuildMediumPlanSize(kShape); @@ -193,7 +197,7 @@ struct MediumKernelPlan { using SwizzlesPerRead = std::array; using SwizzlesPerKernel = std::array; - using Shift = std::array; + using Shift = std::array; using ShiftsPerSwizzle = std::array; using ShiftsPerRead = std::array; using ShitsPerKernel = std::array; @@ -212,7 +216,7 @@ struct MediumKernelPlan { ReadsPerKernel reads; SwizzlesPerKernel swizzles; ShitsPerKernel shifts; - UnpackedUint mask = bit_util::LeastSignificantBitMask(kPackedBitSize); + uint_type mask = bit_util::LeastSignificantBitMask(kPackedBitSize); }; template 
@@ -427,6 +431,7 @@ struct MediumKernel { static constexpr auto kShape = kPlan.kShape; using Traits = typename decltype(kPlan)::Traits; using unpacked_type = typename Traits::unpacked_type; + using uint_type = typename Traits::uint_type; using simd_batch = typename Traits::simd_batch; using simd_bytes = typename Traits::simd_bytes; using arch_type = typename Traits::arch_type; @@ -448,7 +453,12 @@ struct MediumKernel { // can use the fallback on these platforms. const auto shifted = right_shift_by_excess(words, kRightShifts); const auto vals = shifted & kMask; - xsimd::store_unaligned(out + kOutOffset, vals); + if constexpr (std::is_same_v) { + const xsimd::batch_bool bools(vals); + bools.store_unaligned(out + kOutOffset); + } else { + vals.store_unaligned(out + kOutOffset); + } } template @@ -458,7 +468,7 @@ struct MediumKernel { constexpr auto kSwizzles = make_batch_constant(); const auto swizzled = swizzle_bytes(bytes, kSwizzles); - const auto words = xsimd::bitwise_cast(swizzled); + const auto words = xsimd::bitwise_cast(swizzled); (unpack_one_shift_impl(words, out), ...); } @@ -487,6 +497,7 @@ struct MediumKernel { template struct LargeKernelPlan { using Traits = KernelTraits; + using uint_type = typename Traits::uint_type; static constexpr auto kShape = Traits::kShape; static constexpr int kUnpackedPerkernel = std::lcm(kShape.unpacked_per_simd(), 8); @@ -498,7 +509,7 @@ struct LargeKernelPlan { using Swizzle = std::array; using SwizzlesPerKernel = std::array; - using Shift = std::array; + using Shift = std::array; using ShitsPerKernel = std::array; ReadsPerKernel reads; @@ -506,12 +517,13 @@ struct LargeKernelPlan { SwizzlesPerKernel high_swizzles; ShitsPerKernel low_rshifts; ShitsPerKernel high_lshifts; - UnpackedUint mask; + uint_type mask; }; template constexpr LargeKernelPlan BuildLargePlan() { using Plan = LargeKernelPlan; + using uint_type = typename Plan::Traits::uint_type; constexpr auto kShape = Plan::kShape; static_assert(kShape.is_large()); 
constexpr int kOverBytes = @@ -550,7 +562,7 @@ constexpr LargeKernelPlan BuildLarge } } - plan.mask = bit_util::LeastSignificantBitMask(kPackedBitSize); + plan.mask = bit_util::LeastSignificantBitMask(kPackedBitSize); return plan; } @@ -648,21 +660,4 @@ struct Kernel : DispatchKernelType { using Base::unpack; }; -template -struct Kernel - : Kernel { - using Base = DispatchKernelType; - using Base::kValuesUnpacked; - using unpacked_type = bool; - - static const uint8_t* unpack(const uint8_t* in, unpacked_type* out) { - uint16_t buffer[kValuesUnpacked] = {}; - in = Base::unpack(in, buffer); - for (int k = 0; k < kValuesUnpacked; ++k) { - out[k] = static_cast(buffer[k]); - } - return in; - } -}; - } // namespace arrow::internal diff --git a/cpp/src/arrow/util/type_traits.h b/cpp/src/arrow/util/type_traits.h index 9c3b388dab2..dfd968670fb 100644 --- a/cpp/src/arrow/util/type_traits.h +++ b/cpp/src/arrow/util/type_traits.h @@ -69,5 +69,9 @@ struct SizedIntImpl<8> { template using SizedInt = typename SizedIntImpl::type; +// Map a number of bytes to a type +template +using SizedUint = std::make_unsigned_t>; + } // namespace internal } // namespace arrow From 29ffdb31bb8d992afc413548918a772ee6138949 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 30 Oct 2025 15:58:27 +0100 Subject: [PATCH 27/64] Use std::is_base_of for arch detection --- cpp/src/arrow/util/bpacking_simd_impl_internal.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_simd_impl_internal.h b/cpp/src/arrow/util/bpacking_simd_impl_internal.h index 6136b39c6e5..e98698d14b8 100644 --- a/cpp/src/arrow/util/bpacking_simd_impl_internal.h +++ b/cpp/src/arrow/util/bpacking_simd_impl_internal.h @@ -342,7 +342,7 @@ constexpr SwizzleBiLaneGenericPlan BuildSwizzleBiLaneGenericPlan( template auto swizzle_bytes(const xsimd::batch& batch, xsimd::batch_constant mask) { - if constexpr (xsimd::supported_architectures::contains()) { + if constexpr (std::is_base_of_v) 
{ static constexpr auto kPlan = BuildSwizzleBiLaneGenericPlan(std::array{kIdx...}); static constexpr auto kSelfSwizzleArr = kPlan.self_lane; constexpr auto kSelfSwizzle = make_batch_constant(); @@ -369,8 +369,8 @@ auto swizzle_bytes(const xsimd::batch& batch, template auto left_shift_no_overflow(const xsimd::batch& batch, xsimd::batch_constant shifts) { - constexpr bool kHasSse2 = xsimd::supported_architectures::contains(); - constexpr bool kHasAvx2 = xsimd::supported_architectures::contains(); + constexpr bool kHasSse2 = std::is_base_of_v; + constexpr bool kHasAvx2 = std::is_base_of_v; if constexpr (kHasSse2 && !kHasAvx2) { static constexpr auto kShiftsArr = std::array{kShifts...}; @@ -401,8 +401,8 @@ auto left_shift_no_overflow(const xsimd::batch& batch, template auto right_shift_by_excess(const xsimd::batch& batch, xsimd::batch_constant shifts) { - constexpr bool kHasSse2 = xsimd::supported_architectures::contains(); - constexpr bool kHasAvx2 = xsimd::supported_architectures::contains(); + constexpr bool kHasSse2 = std::is_base_of_v; + constexpr bool kHasAvx2 = std::is_base_of_v; if constexpr (kHasSse2 && !kHasAvx2) { static constexpr auto kShiftsArr = std::array{kShifts...}; From bb97c69c9ab0a0d8577f0e45bc4dc5a64cbee982 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Mon, 17 Nov 2025 18:28:46 +0100 Subject: [PATCH 28/64] Improve swizzle --- .../arrow/util/bpacking_simd_impl_internal.h | 50 +++++++++++++------ 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_simd_impl_internal.h b/cpp/src/arrow/util/bpacking_simd_impl_internal.h index e98698d14b8..edfcd49c4af 100644 --- a/cpp/src/arrow/util/bpacking_simd_impl_internal.h +++ b/cpp/src/arrow/util/bpacking_simd_impl_internal.h @@ -39,12 +39,12 @@ namespace arrow::internal { // - array to batch constant to xsimd // - Shifts per swizzle can be improved when self.packed_max_byte_spread == 1 and the // byte can be reused (when val_bit_width divides packed_max_byte_spread). 
-// - Add unpack_exact to benchmarks // - Reduce input size on small bit width using a broadcast. // - For Avx2: // - Inspect how swizzle across lanes are handled: _mm256_shuffle_epi8 not used? // - Investigate AVX2 with 128 bit register // - Fix overreading problem +// - Improve Swizzle by computing which bigger swapable slots are free template constexpr Arr BuildConstantArray(typename Arr::value_type val) { @@ -316,22 +316,12 @@ constexpr SwizzleBiLaneGenericPlan BuildSwizzleBiLaneGenericPlan( plan.self_lane[k] = kAsZero; plan.cross_lane[k] = kAsZero; } else { - if (is_first_lane_idx) { - if (is_first_lane_mask) { - plan.self_lane[k] = mask[k]; - plan.cross_lane[k] = kAsZero; - } else { - plan.self_lane[k] = kAsZero; - plan.cross_lane[k] = mask[k] - kSizeHalf; - } + if (is_first_lane_idx == is_first_lane_mask) { + plan.self_lane[k] = mask[k] % kSizeHalf; + plan.cross_lane[k] = kAsZero; } else { - if (is_first_lane_mask) { - plan.self_lane[k] = kAsZero; - plan.cross_lane[k] = mask[k]; // Indices given within lane - } else { - plan.self_lane[k] = mask[k] - kSizeHalf; // Indices given within lane - plan.cross_lane[k] = kAsZero; - } + plan.self_lane[k] = kAsZero; + plan.cross_lane[k] = mask[k] % kSizeHalf; } } } @@ -339,6 +329,17 @@ constexpr SwizzleBiLaneGenericPlan BuildSwizzleBiLaneGenericPlan( return plan; } +template +constexpr bool isOnlyFromHigh(xsimd::batch_constant) { + return ((Vals >= (sizeof...(Vals) / 2)) && ...); +} + +template +constexpr bool isOnlyFromLow(xsimd::batch_constant) { + return ((Vals < (sizeof...(Vals) / 2)) && ...); +} + +/// Merged in xsimd 14.0, simply use swizzle template auto swizzle_bytes(const xsimd::batch& batch, xsimd::batch_constant mask) { @@ -349,6 +350,23 @@ auto swizzle_bytes(const xsimd::batch& batch, static constexpr auto kCrossSwizzleArr = kPlan.cross_lane; constexpr auto kCrossSwizzle = make_batch_constant(); + struct LaneMask { + static constexpr uint8_t get(uint8_t i, uint8_t n) { + constexpr auto kMask = 
std::array{kIdx...}; + return kMask[i] % (kMask.size() / 2); + } + }; + + constexpr auto kLaneMask = xsimd::make_batch_constant(); + if constexpr (isOnlyFromLow(mask)) { + auto broadcast = _mm256_permute2x128_si256(batch, batch, 0x00); // [low | low] + return _mm256_shuffle_epi8(broadcast, kLaneMask.as_batch()); + } + if constexpr (isOnlyFromHigh(mask)) { + auto broadcast = _mm256_permute2x128_si256(batch, batch, 0x11); // [high | high] + return _mm256_shuffle_epi8(broadcast, kLaneMask.as_batch()); + } + auto self = _mm256_shuffle_epi8(batch, kSelfSwizzle.as_batch()); auto swapped = _mm256_permute2x128_si256(batch, batch, 0x01); auto cross = _mm256_shuffle_epi8(swapped, kCrossSwizzle.as_batch()); From 329f8eaf1e7ce60a81f9e5cf5935a3deb698536d Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Mon, 17 Nov 2025 18:29:29 +0100 Subject: [PATCH 29/64] Only use lshift hack when available --- .../arrow/util/bpacking_simd_impl_internal.h | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_simd_impl_internal.h b/cpp/src/arrow/util/bpacking_simd_impl_internal.h index edfcd49c4af..d7c0d851a7e 100644 --- a/cpp/src/arrow/util/bpacking_simd_impl_internal.h +++ b/cpp/src/arrow/util/bpacking_simd_impl_internal.h @@ -398,8 +398,15 @@ auto left_shift_no_overflow(const xsimd::batch& batch, }; constexpr auto kMults = xsimd::make_batch_constant(); - return batch * kMults; - + // TODO in xsimd 14.0 this can be simplified to + // constexpr auto kMults = xsimd::make_batch_constant() << shits; + if constexpr (sizeof(Int) == sizeof(uint16_t)) { + return _mm_mullo_epi16(batch, kMults.as_batch()); + } + if constexpr (sizeof(Int) == sizeof(uint16_t)) { + // TODO that is latency 10 so maybe it is not worth it + return _mm_mullo_epi32(batch, kMults.as_batch()); + } } else { return batch << shifts; } @@ -434,6 +441,15 @@ auto right_shift_by_excess(const xsimd::batch& batch, }; constexpr auto kMults = xsimd::make_batch_constant(); + if 
constexpr (sizeof(Int) == sizeof(uint16_t)) { + return xsimd::batch(_mm_mullo_epi16(batch, kMults.as_batch())) >> + kMaxRightShift; + } + if constexpr (sizeof(Int) == sizeof(uint16_t)) { + // TODO that is latency 10 so maybe it is not worth it + return xsimd::batch(_mm_mullo_epi32(batch, kMults.as_batch())) >> + kMaxRightShift; + } return (batch * kMults) >> kMaxRightShift; } else { From 439275166f6823747b1037e60f02dda6a324dff2 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Mon, 17 Nov 2025 18:34:01 +0100 Subject: [PATCH 30/64] Fix return type --- cpp/src/arrow/util/bpacking_simd_impl_internal.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_simd_impl_internal.h b/cpp/src/arrow/util/bpacking_simd_impl_internal.h index d7c0d851a7e..d9c13f4ed21 100644 --- a/cpp/src/arrow/util/bpacking_simd_impl_internal.h +++ b/cpp/src/arrow/util/bpacking_simd_impl_internal.h @@ -342,7 +342,8 @@ constexpr bool isOnlyFromLow(xsimd::batch_constant) { /// Merged in xsimd 14.0, simply use swizzle template auto swizzle_bytes(const xsimd::batch& batch, - xsimd::batch_constant mask) { + xsimd::batch_constant mask) + -> xsimd::batch { if constexpr (std::is_base_of_v) { static constexpr auto kPlan = BuildSwizzleBiLaneGenericPlan(std::array{kIdx...}); static constexpr auto kSelfSwizzleArr = kPlan.self_lane; @@ -370,7 +371,7 @@ auto swizzle_bytes(const xsimd::batch& batch, auto self = _mm256_shuffle_epi8(batch, kSelfSwizzle.as_batch()); auto swapped = _mm256_permute2x128_si256(batch, batch, 0x01); auto cross = _mm256_shuffle_epi8(swapped, kCrossSwizzle.as_batch()); - return xsimd::batch(_mm256_or_si256(self, cross)); + return _mm256_or_si256(self, cross); } else { return xsimd::swizzle(batch, mask); } @@ -386,7 +387,8 @@ auto swizzle_bytes(const xsimd::batch& batch, // http://arxiv.org/abs/1209.2137 template auto left_shift_no_overflow(const xsimd::batch& batch, - xsimd::batch_constant shifts) { + xsimd::batch_constant shifts) + -> 
xsimd::batch { constexpr bool kHasSse2 = std::is_base_of_v; constexpr bool kHasAvx2 = std::is_base_of_v; @@ -407,9 +409,8 @@ auto left_shift_no_overflow(const xsimd::batch& batch, // TODO that is latency 10 so maybe it is not worth it return _mm_mullo_epi32(batch, kMults.as_batch()); } - } else { - return batch << shifts; } + return batch << shifts; } // Intel x86-64 does not have variable right shifts before AVX2. From 1fae6381f55246bd2ede28b5ff49cdc9c9fa2def Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Tue, 18 Nov 2025 15:35:47 +0100 Subject: [PATCH 31/64] Fix shift included size --- cpp/src/arrow/util/bpacking_simd_impl_internal.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_simd_impl_internal.h b/cpp/src/arrow/util/bpacking_simd_impl_internal.h index d9c13f4ed21..d9449804dd2 100644 --- a/cpp/src/arrow/util/bpacking_simd_impl_internal.h +++ b/cpp/src/arrow/util/bpacking_simd_impl_internal.h @@ -405,7 +405,7 @@ auto left_shift_no_overflow(const xsimd::batch& batch, if constexpr (sizeof(Int) == sizeof(uint16_t)) { return _mm_mullo_epi16(batch, kMults.as_batch()); } - if constexpr (sizeof(Int) == sizeof(uint16_t)) { + if constexpr (sizeof(Int) == sizeof(uint32_t)) { // TODO that is latency 10 so maybe it is not worth it return _mm_mullo_epi32(batch, kMults.as_batch()); } @@ -446,7 +446,7 @@ auto right_shift_by_excess(const xsimd::batch& batch, return xsimd::batch(_mm_mullo_epi16(batch, kMults.as_batch())) >> kMaxRightShift; } - if constexpr (sizeof(Int) == sizeof(uint16_t)) { + if constexpr (sizeof(Int) == sizeof(uint32_t)) { // TODO that is latency 10 so maybe it is not worth it return xsimd::batch(_mm_mullo_epi32(batch, kMults.as_batch())) >> kMaxRightShift; From 415f2acb045551ff4861bf9564030dbca46362db Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Wed, 19 Nov 2025 14:01:53 +0100 Subject: [PATCH 32/64] Add Avx2 uint16_t shift fallback --- .../arrow/util/bpacking_simd_impl_internal.h | 85 
++++++++++++------- 1 file changed, 54 insertions(+), 31 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_simd_impl_internal.h b/cpp/src/arrow/util/bpacking_simd_impl_internal.h index d9449804dd2..31befd4639b 100644 --- a/cpp/src/arrow/util/bpacking_simd_impl_internal.h +++ b/cpp/src/arrow/util/bpacking_simd_impl_internal.h @@ -34,15 +34,10 @@ namespace arrow::internal { // https://github.com/fast-pack/LittleIntPacker/blob/master/src/horizontalpacking32.c // TODO // - _mm_cvtepi8_epi32 -// - no _mm_srlv_epi32 (128bit) in xsimd with AVX2 required arch // - no need for while loop (for up to 8 is sufficient) -// - array to batch constant to xsimd // - Shifts per swizzle can be improved when self.packed_max_byte_spread == 1 and the // byte can be reused (when val_bit_width divides packed_max_byte_spread). // - Reduce input size on small bit width using a broadcast. -// - For Avx2: -// - Inspect how swizzle across lanes are handled: _mm256_shuffle_epi8 not used? -// - Investigate AVX2 with 128 bit register // - Fix overreading problem // - Improve Swizzle by computing which bigger swapable slots are free @@ -391,17 +386,19 @@ auto left_shift_no_overflow(const xsimd::batch& batch, -> xsimd::batch { constexpr bool kHasSse2 = std::is_base_of_v; constexpr bool kHasAvx2 = std::is_base_of_v; + static_assert(!(kHasSse2 && kHasAvx2), "The hierarchy are different in xsimd"); - if constexpr (kHasSse2 && !kHasAvx2) { - static constexpr auto kShiftsArr = std::array{kShifts...}; + static constexpr auto kShiftsArr = std::array{kShifts...}; - struct MakeMults { - static constexpr Int get(int i, int n) { return Int{1} << kShiftsArr.at(i); } - }; + struct MakeMults { + static constexpr Int get(int i, int n) { return Int{1} << kShiftsArr.at(i); } + }; + + // TODO in xsimd 14.0 this can be simplified to + // constexpr auto kMults = xsimd::make_batch_constant() << shits; + constexpr auto kMults = xsimd::make_batch_constant(); - constexpr auto kMults = xsimd::make_batch_constant(); - 
// TODO in xsimd 14.0 this can be simplified to - // constexpr auto kMults = xsimd::make_batch_constant() << shits; + if constexpr (kHasSse2) { if constexpr (sizeof(Int) == sizeof(uint16_t)) { return _mm_mullo_epi16(batch, kMults.as_batch()); } @@ -410,6 +407,15 @@ auto left_shift_no_overflow(const xsimd::batch& batch, return _mm_mullo_epi32(batch, kMults.as_batch()); } } + if constexpr (kHasAvx2) { + if constexpr (sizeof(Int) == sizeof(uint8_t)) { + // TODO fallback + } + if constexpr (sizeof(Int) == sizeof(uint16_t)) { + return _mm256_mullo_epi16(batch, kMults.as_batch()); + } + } + return batch << shifts; } @@ -429,33 +435,50 @@ auto right_shift_by_excess(const xsimd::batch& batch, xsimd::batch_constant shifts) { constexpr bool kHasSse2 = std::is_base_of_v; constexpr bool kHasAvx2 = std::is_base_of_v; + static_assert(!(kHasSse2 && kHasAvx2), "The hierarchy are different in xsimd"); - if constexpr (kHasSse2 && !kHasAvx2) { - static constexpr auto kShiftsArr = std::array{kShifts...}; - static constexpr Int kMaxRightShift = max_value(kShiftsArr); + static constexpr auto kShiftsArr = std::array{kShifts...}; + static constexpr Int kMaxRightShift = max_value(kShiftsArr); - struct MakeMults { - static constexpr Int get(int i, int n) { - // Equivalent to left shift of kMaxRightShift - kRightShifts.at(i). - return Int{1} << (kMaxRightShift - kShiftsArr.at(i)); - } - }; + struct MakeMults { + static constexpr Int get(int i, int n) { + // Equivalent to left shift of kMaxRightShift - kRightShifts.at(i). 
+ return Int{1} << (kMaxRightShift - kShiftsArr.at(i)); + } + }; + + // TODO in xsimd 14.0 this can be simplified to + // constexpr auto kMults = xsimd::make_batch_constant() - + // shifts; and then forwarded to left_shift + constexpr auto kMults = xsimd::make_batch_constant(); - constexpr auto kMults = xsimd::make_batch_constant(); + if constexpr (kHasSse2) { if constexpr (sizeof(Int) == sizeof(uint16_t)) { - return xsimd::batch(_mm_mullo_epi16(batch, kMults.as_batch())) >> - kMaxRightShift; + auto lshifted = _mm_mullo_epi16(batch, kMults.as_batch()); + // TODO(xsimd 14.0) + // return xsimd::bitwise_rshift(lshifted); + return xsimd::batch(lshifted) >> kMaxRightShift; } if constexpr (sizeof(Int) == sizeof(uint32_t)) { // TODO that is latency 10 so maybe it is not worth it - return xsimd::batch(_mm_mullo_epi32(batch, kMults.as_batch())) >> - kMaxRightShift; + auto lshifted = _mm_mullo_epi32(batch, kMults.as_batch()); + // TODO(xsimd 14.0) + // return xsimd::bitwise_rshift(lshifted); + return xsimd::batch(lshifted) >> kMaxRightShift; + } + } + if constexpr (kHasAvx2) { + if constexpr (sizeof(Int) == sizeof(uint8_t)) { + // TODO fallback + } + if constexpr (sizeof(Int) == sizeof(uint16_t)) { + auto lshifted = _mm256_mullo_epi16(batch, kMults.as_batch()); + // TODO(xsimd 14.0) + // return xsimd::bitwise_rshift(lshifted); + return xsimd::batch(lshifted) >> kMaxRightShift; } - return (batch * kMults) >> kMaxRightShift; - - } else { - return batch >> shifts; } + return batch >> shifts; } template From 557a0b0c9884203c39d8b9023cfb1e7f9c757177 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Wed, 19 Nov 2025 15:16:15 +0100 Subject: [PATCH 33/64] Refactor make_mult --- .../arrow/util/bpacking_simd_impl_internal.h | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_simd_impl_internal.h b/cpp/src/arrow/util/bpacking_simd_impl_internal.h index 31befd4639b..67a8d331ad2 100644 --- 
a/cpp/src/arrow/util/bpacking_simd_impl_internal.h +++ b/cpp/src/arrow/util/bpacking_simd_impl_internal.h @@ -372,6 +372,17 @@ auto swizzle_bytes(const xsimd::batch& batch, } } +template +struct MakeMults { + static constexpr auto kShiftsArr = std::array{kShifts...}; + static constexpr Int get(int i, int n) { return Int{1} << kShiftsArr.at(i); } +}; + +template +constexpr auto make_mult(xsimd::batch_constant) { + return xsimd::make_batch_constant>(); +} + // Intel x86-64 does not have variable left shifts before AVX2. // // We replace the variable left shift by a variable multiply with a power of two. @@ -388,15 +399,9 @@ auto left_shift_no_overflow(const xsimd::batch& batch, constexpr bool kHasAvx2 = std::is_base_of_v; static_assert(!(kHasSse2 && kHasAvx2), "The hierarchy are different in xsimd"); - static constexpr auto kShiftsArr = std::array{kShifts...}; - - struct MakeMults { - static constexpr Int get(int i, int n) { return Int{1} << kShiftsArr.at(i); } - }; - // TODO in xsimd 14.0 this can be simplified to // constexpr auto kMults = xsimd::make_batch_constant() << shits; - constexpr auto kMults = xsimd::make_batch_constant(); + constexpr auto kMults = make_mult(shifts); if constexpr (kHasSse2) { if constexpr (sizeof(Int) == sizeof(uint16_t)) { From 9de59288dd165d626a4019065e9833d75292f6cd Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Wed, 19 Nov 2025 15:20:43 +0100 Subject: [PATCH 34/64] Add Avx2 lshift unint8_t fallback --- .../arrow/util/bpacking_simd_impl_internal.h | 39 +++++++++++++++---- 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_simd_impl_internal.h b/cpp/src/arrow/util/bpacking_simd_impl_internal.h index 67a8d331ad2..55f22d90cd2 100644 --- a/cpp/src/arrow/util/bpacking_simd_impl_internal.h +++ b/cpp/src/arrow/util/bpacking_simd_impl_internal.h @@ -383,23 +383,29 @@ constexpr auto make_mult(xsimd::batch_constant) { return xsimd::make_batch_constant>(); } -// Intel x86-64 does not have variable left 
shifts before AVX2. +template +struct SelectStride { + static constexpr auto kShiftsArr = std::array{kVals...}; + + static constexpr Int get(Int i, Int n) { return kShiftsArr[kLength * i + kOffset]; } +}; + +// Fallback for variable shift left. // // We replace the variable left shift by a variable multiply with a power of two. -// The behaviour is the same sa long as there are no overflow. // // This trick is borrowed from Daniel Lemire and Leonid Boytsov, Decoding billions of // integers per second through vectorization, Software Practice & Experience 45 (1), 2015. // http://arxiv.org/abs/1209.2137 template -auto left_shift_no_overflow(const xsimd::batch& batch, - xsimd::batch_constant shifts) +auto left_shift(const xsimd::batch& batch, + xsimd::batch_constant shifts) -> xsimd::batch { constexpr bool kHasSse2 = std::is_base_of_v; constexpr bool kHasAvx2 = std::is_base_of_v; static_assert(!(kHasSse2 && kHasAvx2), "The hierarchy are different in xsimd"); - // TODO in xsimd 14.0 this can be simplified to + // TODO(xsimd-14) this can be simplified to // constexpr auto kMults = xsimd::make_batch_constant() << shits; constexpr auto kMults = make_mult(shifts); @@ -408,13 +414,30 @@ auto left_shift_no_overflow(const xsimd::batch& batch, return _mm_mullo_epi16(batch, kMults.as_batch()); } if constexpr (sizeof(Int) == sizeof(uint32_t)) { - // TODO that is latency 10 so maybe it is not worth it return _mm_mullo_epi32(batch, kMults.as_batch()); } } if constexpr (kHasAvx2) { if constexpr (sizeof(Int) == sizeof(uint8_t)) { - // TODO fallback + auto batch16 = xsimd::bitwise_cast(batch); + + constexpr auto kShifts0 = + xsimd::make_batch_constant>(); + constexpr auto kMults0 = make_mult(kShifts0); + const auto shifted0 = _mm256_mullo_epi16(batch16, kMults0.as_batch()); + const auto mask0 = decltype(batch16)(0x00FF); + const auto masked0 = _mm256_and_si256(shifted0, mask0); + + constexpr auto kShifts1 = + xsimd::make_batch_constant>(); + constexpr auto kMults1 = 
make_mult(kShifts1); + const auto mask1 = decltype(batch16)(0xFF00); + const auto masked1 = _mm256_and_si256(batch16, mask1); + const auto shifted1 = _mm256_mullo_epi16(masked1, kMults1.as_batch()); + + return _mm256_or_si256(masked0, shifted1); } if constexpr (sizeof(Int) == sizeof(uint16_t)) { return _mm256_mullo_epi16(batch, kMults.as_batch()); @@ -670,7 +693,7 @@ struct LargeKernel { const auto high_swizzled = swizzle_bytes(bytes, kHighSwizzles); const auto high_words = xsimd::bitwise_cast(high_swizzled); - const auto high_shifted = left_shift_no_overflow(high_words, kHighLShifts); + const auto high_shifted = left_shift(high_words, kHighLShifts); // We can have a single mask and apply it after OR because the shifts will ensure that // there are zeros where the high/low values are incomplete. From 0e39313c8faa2237fbe2006d7d0af999468ee629 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Wed, 19 Nov 2025 15:51:24 +0100 Subject: [PATCH 35/64] Refactor right shift excess --- .../arrow/util/bpacking_simd_impl_internal.h | 61 +++++++------------ 1 file changed, 22 insertions(+), 39 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_simd_impl_internal.h b/cpp/src/arrow/util/bpacking_simd_impl_internal.h index 55f22d90cd2..9ab86cd2a4e 100644 --- a/cpp/src/arrow/util/bpacking_simd_impl_internal.h +++ b/cpp/src/arrow/util/bpacking_simd_impl_internal.h @@ -447,7 +447,7 @@ auto left_shift(const xsimd::batch& batch, return batch << shifts; } -// Intel x86-64 does not have variable right shifts before AVX2. +// Fallback for variable shift right. 
// // When we know that the relevant bits will not overflow, we can instead shift left all // values to align them with the one with the largest right shifts followed by a constant @@ -465,47 +465,30 @@ auto right_shift_by_excess(const xsimd::batch& batch, constexpr bool kHasAvx2 = std::is_base_of_v; static_assert(!(kHasSse2 && kHasAvx2), "The hierarchy are different in xsimd"); - static constexpr auto kShiftsArr = std::array{kShifts...}; - static constexpr Int kMaxRightShift = max_value(kShiftsArr); - - struct MakeMults { - static constexpr Int get(int i, int n) { - // Equivalent to left shift of kMaxRightShift - kRightShifts.at(i). - return Int{1} << (kMaxRightShift - kShiftsArr.at(i)); - } - }; + // These conditions are the ones matched in `left_shift`, i.e. the ones where variable + // shift right will not be available. + if constexpr ((kHasSse2 & + (sizeof(Int) == sizeof(uint16_t) || sizeof(Int) == sizeof(uint32_t))) || + (kHasAvx2 & + (sizeof(Int) == sizeof(uint8_t) || sizeof(Int) == sizeof(uint16_t)))) { + static constexpr auto kShiftsArr = std::array{kShifts...}; + static constexpr Int kMaxRightShift = max_value(kShiftsArr); + + struct MakeShifts { + static constexpr Int get(int i, int n) { return kMaxRightShift - kShiftsArr.at(i); } + }; - // TODO in xsimd 14.0 this can be simplified to - // constexpr auto kMults = xsimd::make_batch_constant() - - // shifts; and then forwarded to left_shift - constexpr auto kMults = xsimd::make_batch_constant(); + // TODO(xsimd-14) this can be simplified to + // constexpr auto kRShifts = xsimd::make_batch_constant() - + // shifts; + constexpr auto kLShifts = xsimd::make_batch_constant(); - if constexpr (kHasSse2) { - if constexpr (sizeof(Int) == sizeof(uint16_t)) { - auto lshifted = _mm_mullo_epi16(batch, kMults.as_batch()); - // TODO(xsimd 14.0) - // return xsimd::bitwise_rshift(lshifted); - return xsimd::batch(lshifted) >> kMaxRightShift; - } - if constexpr (sizeof(Int) == sizeof(uint32_t)) { - // TODO that is latency 10 
so maybe it is not worth it - auto lshifted = _mm_mullo_epi32(batch, kMults.as_batch()); - // TODO(xsimd 14.0) - // return xsimd::bitwise_rshift(lshifted); - return xsimd::batch(lshifted) >> kMaxRightShift; - } - } - if constexpr (kHasAvx2) { - if constexpr (sizeof(Int) == sizeof(uint8_t)) { - // TODO fallback - } - if constexpr (sizeof(Int) == sizeof(uint16_t)) { - auto lshifted = _mm256_mullo_epi16(batch, kMults.as_batch()); - // TODO(xsimd 14.0) - // return xsimd::bitwise_rshift(lshifted); - return xsimd::batch(lshifted) >> kMaxRightShift; - } + const auto lshifted = left_shift(batch, kLShifts); + // TODO(xsimd-14) + // return xsimd::bitwise_rshift(lshifted); + return xsimd::batch(lshifted) >> kMaxRightShift; } + return batch >> shifts; } From a4a2cde0a476620f434c080d525e1b91dbdaece9 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 20 Nov 2025 10:52:51 +0100 Subject: [PATCH 36/64] Refactor make_mult --- cpp/src/arrow/util/bpacking_simd_impl_internal.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_simd_impl_internal.h b/cpp/src/arrow/util/bpacking_simd_impl_internal.h index 9ab86cd2a4e..797d2ce0f47 100644 --- a/cpp/src/arrow/util/bpacking_simd_impl_internal.h +++ b/cpp/src/arrow/util/bpacking_simd_impl_internal.h @@ -372,15 +372,9 @@ auto swizzle_bytes(const xsimd::batch& batch, } } -template -struct MakeMults { - static constexpr auto kShiftsArr = std::array{kShifts...}; - static constexpr Int get(int i, int n) { return Int{1} << kShiftsArr.at(i); } -}; - template constexpr auto make_mult(xsimd::batch_constant) { - return xsimd::make_batch_constant>(); + return xsimd::batch_constant(1u << kShifts)...>(); } template From 7a10f2ae8e7bc31d8dea689dbe2bf86283f567a1 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 20 Nov 2025 11:56:04 +0100 Subject: [PATCH 37/64] Add SSE var shift uint8_t fallback to uint16_t --- .../arrow/util/bpacking_simd_impl_internal.h | 72 +++++++++---------- 1 file changed, 34 
insertions(+), 38 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_simd_impl_internal.h b/cpp/src/arrow/util/bpacking_simd_impl_internal.h index 797d2ce0f47..97f0652b5e1 100644 --- a/cpp/src/arrow/util/bpacking_simd_impl_internal.h +++ b/cpp/src/arrow/util/bpacking_simd_impl_internal.h @@ -377,13 +377,19 @@ constexpr auto make_mult(xsimd::batch_constant) { return xsimd::batch_constant(1u << kShifts)...>(); } -template +template struct SelectStride { static constexpr auto kShiftsArr = std::array{kVals...}; - static constexpr Int get(Int i, Int n) { return kShiftsArr[kLength * i + kOffset]; } + static constexpr Int get(int i, int n) { return kShiftsArr[kLength * i + kOffset]; } }; +template +constexpr auto select_stride(xsimd::batch_constant) { + return xsimd::make_batch_constant< + ToInt, Arch, SelectStride>(); +} + // Fallback for variable shift left. // // We replace the variable left shift by a variable multiply with a power of two. @@ -403,39 +409,29 @@ auto left_shift(const xsimd::batch& batch, // constexpr auto kMults = xsimd::make_batch_constant() << shits; constexpr auto kMults = make_mult(shifts); - if constexpr (kHasSse2) { - if constexpr (sizeof(Int) == sizeof(uint16_t)) { - return _mm_mullo_epi16(batch, kMults.as_batch()); - } - if constexpr (sizeof(Int) == sizeof(uint32_t)) { - return _mm_mullo_epi32(batch, kMults.as_batch()); - } + constexpr auto IntSize = sizeof(Int); + + // Sizes and architecture for which there is no variable left shift and there is a + // multiplication + if constexpr ( // + (kHasSse2 && (IntSize == sizeof(uint16_t) || IntSize == sizeof(uint32_t))) // + || (kHasAvx2 && (IntSize == sizeof(uint16_t))) // + ) { + return batch * kMults; } - if constexpr (kHasAvx2) { - if constexpr (sizeof(Int) == sizeof(uint8_t)) { - auto batch16 = xsimd::bitwise_cast(batch); - - constexpr auto kShifts0 = - xsimd::make_batch_constant>(); - constexpr auto kMults0 = make_mult(kShifts0); - const auto shifted0 = _mm256_mullo_epi16(batch16, 
kMults0.as_batch()); - const auto mask0 = decltype(batch16)(0x00FF); - const auto masked0 = _mm256_and_si256(shifted0, mask0); - - constexpr auto kShifts1 = - xsimd::make_batch_constant>(); - constexpr auto kMults1 = make_mult(kShifts1); - const auto mask1 = decltype(batch16)(0xFF00); - const auto masked1 = _mm256_and_si256(batch16, mask1); - const auto shifted1 = _mm256_mullo_epi16(masked1, kMults1.as_batch()); - - return _mm256_or_si256(masked0, shifted1); - } - if constexpr (sizeof(Int) == sizeof(uint16_t)) { - return _mm256_mullo_epi16(batch, kMults.as_batch()); - } + + // Architecture for which there is no variable left shift on uint8_t but a fallback + // exists for uint16_t. + if constexpr ((kHasSse2 || kHasAvx2) && (IntSize == sizeof(uint8_t))) { + const auto batch16 = xsimd::bitwise_cast(batch); + + constexpr auto kShifts0 = select_stride(shifts); + const auto shifted0 = left_shift(batch16, kShifts0) & 0x00FF; + + constexpr auto kShifts1 = select_stride(shifts); + const auto shifted1 = left_shift(batch16 & 0xFF00, kShifts1); + + return xsimd::bitwise_cast(shifted0 | shifted1); } return batch << shifts; @@ -461,10 +457,10 @@ auto right_shift_by_excess(const xsimd::batch& batch, // These conditions are the ones matched in `left_shift`, i.e. the ones where variable // shift right will not be available. 
- if constexpr ((kHasSse2 & - (sizeof(Int) == sizeof(uint16_t) || sizeof(Int) == sizeof(uint32_t))) || - (kHasAvx2 & - (sizeof(Int) == sizeof(uint8_t) || sizeof(Int) == sizeof(uint16_t)))) { + if constexpr ( // + (kHasSse2 && (sizeof(Int) == sizeof(uint8_t) || sizeof(Int) == sizeof(uint16_t) || + sizeof(Int) == sizeof(uint32_t))) || + (kHasAvx2 && (sizeof(Int) == sizeof(uint8_t) || sizeof(Int) == sizeof(uint16_t)))) { static constexpr auto kShiftsArr = std::array{kShifts...}; static constexpr Int kMaxRightShift = max_value(kShiftsArr); From 6f5bc9c7d39bed73a3ee1d22faca795d0b68472e Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 20 Nov 2025 18:17:12 +0100 Subject: [PATCH 38/64] Implement size reading reduction --- .../arrow/util/bpacking_simd_impl_internal.h | 67 +++++++++++++++---- 1 file changed, 54 insertions(+), 13 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_simd_impl_internal.h b/cpp/src/arrow/util/bpacking_simd_impl_internal.h index 97f0652b5e1..6a39c8e8e26 100644 --- a/cpp/src/arrow/util/bpacking_simd_impl_internal.h +++ b/cpp/src/arrow/util/bpacking_simd_impl_internal.h @@ -136,16 +136,29 @@ struct MediumKernelPlanSize { } }; -constexpr MediumKernelPlanSize BuildMediumPlanSize(const KernelShape& shape) { +struct MediumKernelOptions { + /// An indicative limit on the number of values unpacked by the kernel. + /// This is a heuristic setting: other constraints such as alignment may not always make + /// small values feasibles. Must be a power of two. + int unpacked_per_kernel_limit = -1; +}; + +constexpr MediumKernelPlanSize BuildMediumPlanSize(const KernelShape& shape, + const MediumKernelOptions& options) { const int shifts_per_swizzle = shape.unpacked_byte_size() / shape.packed_max_spread_bytes(); const int vals_per_swizzle = shifts_per_swizzle * shape.unpacked_per_simd(); + // Using `unpacked_per_kernel_limit` to influence the number of swizzles per reads. 
+ const auto packed_per_read_for_offset = [&](int bit_offset) -> int { + const int best = (shape.simd_bit_size() - bit_offset) / shape.packed_bit_size(); + const int limit = options.unpacked_per_kernel_limit; + return (best > limit) && (limit > 0) ? limit : best; + }; + const auto swizzles_per_read_for_offset = [&](int bit_offset) -> int { - const int vals_per_simd = - (shape.simd_bit_size() - bit_offset) / shape.packed_bit_size(); - return vals_per_simd / vals_per_swizzle; + return packed_per_read_for_offset(bit_offset) / vals_per_swizzle; }; // If after a whole swizzle reading iteration we fall unaligned, the remaining @@ -179,12 +192,15 @@ constexpr MediumKernelPlanSize BuildMediumPlanSize(const KernelShape& shape) { }; } -template +// TODO(C++20) Non type template parameter for MediumKernelOptions. +template struct MediumKernelPlan { using Traits = KernelTraits; using uint_type = typename Traits::uint_type; static constexpr auto kShape = Traits::kShape; - static constexpr auto kPlanSize = BuildMediumPlanSize(kShape); + static constexpr auto kPlanSize = BuildMediumPlanSize( + kShape, {/* .unpacked_per_kernel_limit= */ kUnpackedPerKernelLimit}); using ReadsPerKernel = std::array; @@ -208,15 +224,22 @@ struct MediumKernelPlan { return unpacked_per_read() * kPlanSize.reads_per_kernel(); } + static constexpr int bytes_per_read() { + return unpacked_per_read() * kShape.packed_bit_size(); + } + ReadsPerKernel reads; SwizzlesPerKernel swizzles; ShitsPerKernel shifts; uint_type mask = bit_util::LeastSignificantBitMask(kPackedBitSize); }; -template -constexpr MediumKernelPlan BuildMediumPlan() { - using Plan = MediumKernelPlan; +// TODO(C++20) Non type template parameter for MediumKernelOptions. 
+template +constexpr auto BuildMediumPlan() { + using Plan = MediumKernelPlan; constexpr auto kShape = Plan::kShape; constexpr auto kPlanSize = Plan::kPlanSize; static_assert(kShape.is_medium()); @@ -482,10 +505,12 @@ auto right_shift_by_excess(const xsimd::batch& batch, return batch >> shifts; } -template +// TODO(C++20) Non type template parameter for MediumKernelOptions. +template struct MediumKernel { - static constexpr auto kPlan = - BuildMediumPlan(); + static constexpr auto kPlan = BuildMediumPlan(); static constexpr auto kPlanSize = kPlan.kPlanSize; static constexpr auto kShape = kPlan.kShape; using Traits = typename decltype(kPlan)::Traits; @@ -531,11 +556,27 @@ struct MediumKernel { (unpack_one_shift_impl(words, out), ...); } + template + static simd_bytes load_bytes_as(const uint8_t* in) { + const Uint val = util::SafeLoadAs(in); + const auto batch = xsimd::batch(val); + return xsimd::bitwise_cast(batch); + } + + static simd_bytes load_bytes(const uint8_t* in) { + if constexpr (kPlan.bytes_per_read() > static_cast(sizeof(uint64_t))) { + return simd_bytes::load_unaligned(in); + } else if constexpr (kPlan.bytes_per_read() > static_cast(sizeof(uint32_t))) { + return load_bytes_as(in); + } + return load_bytes_as(in); + } + template static void unpack_one_read_impl(const uint8_t* in, unpacked_type* out, std::integer_sequence) { using ShiftSeq = std::make_integer_sequence; - const auto bytes = simd_bytes::load_unaligned(in + kPlan.reads.at(kReadIdx)); + const auto bytes = load_bytes(in + kPlan.reads.at(kReadIdx)); (unpack_one_swizzle_impl(bytes, out, ShiftSeq{}), ...); } From 5c26e2fd99c00ea0706b9c7a9275e7ef19bddb3b Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Mon, 24 Nov 2025 12:45:49 +0100 Subject: [PATCH 39/64] Add fallback Avx2 right shift --- .../arrow/util/bpacking_simd_impl_internal.h | 26 +++++++++++++++---- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_simd_impl_internal.h 
b/cpp/src/arrow/util/bpacking_simd_impl_internal.h index 6a39c8e8e26..7e22d5a1ef8 100644 --- a/cpp/src/arrow/util/bpacking_simd_impl_internal.h +++ b/cpp/src/arrow/util/bpacking_simd_impl_internal.h @@ -478,12 +478,28 @@ auto right_shift_by_excess(const xsimd::batch& batch, constexpr bool kHasAvx2 = std::is_base_of_v; static_assert(!(kHasSse2 && kHasAvx2), "The hierarchy are different in xsimd"); + constexpr auto IntSize = sizeof(Int); + + // Architecture for which there is no variable right shift but a larger fallback exists. + if constexpr (kHasAvx2 && (IntSize == sizeof(uint8_t) || IntSize == sizeof(uint16_t))) { + using twice_uint = SizedUint<2 * IntSize>; + + const auto batch2 = xsimd::bitwise_cast(batch); + + constexpr auto kShifts0 = select_stride(shifts); + constexpr auto kMask0 = bit_util::LeastSignificantBitMask(8 * IntSize); + const auto shifted0 = right_shift_by_excess(batch2 & kMask0, kShifts0); + + constexpr auto kShifts1 = select_stride(shifts); + constexpr auto kMask1 = kMask0 << (8 * IntSize); + const auto shifted1 = right_shift_by_excess(batch2, kShifts1) & kMask1; + + return xsimd::bitwise_cast(shifted0 | shifted1); + } + // These conditions are the ones matched in `left_shift`, i.e. the ones where variable - // shift right will not be available. - if constexpr ( // - (kHasSse2 && (sizeof(Int) == sizeof(uint8_t) || sizeof(Int) == sizeof(uint16_t) || - sizeof(Int) == sizeof(uint32_t))) || - (kHasAvx2 && (sizeof(Int) == sizeof(uint8_t) || sizeof(Int) == sizeof(uint16_t)))) { + // shift right will not be available but a left shift (fallback) exists. 
+ if constexpr (kHasSse2 && (IntSize != sizeof(uint64_t))) { static constexpr auto kShiftsArr = std::array{kShifts...}; static constexpr Int kMaxRightShift = max_value(kShiftsArr); From 75505270857fc8f28c1d85bffcb9f052af3ba281 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Wed, 26 Nov 2025 10:24:47 +0100 Subject: [PATCH 40/64] Refactor static dispatch --- .../arrow/util/bpacking_simd_impl_internal.h | 39 ++++++++++--------- 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_simd_impl_internal.h b/cpp/src/arrow/util/bpacking_simd_impl_internal.h index 7e22d5a1ef8..18bbb84a368 100644 --- a/cpp/src/arrow/util/bpacking_simd_impl_internal.h +++ b/cpp/src/arrow/util/bpacking_simd_impl_internal.h @@ -744,8 +744,9 @@ struct LargeKernel { } }; +/// A Kernel that does not extract anything, leaving all work to the naive implementation. template -struct OversizedKernel { +struct NoOpKernel { using unpacked_type = UnpackedUint; static constexpr int kValuesUnpacked = 0; @@ -753,27 +754,29 @@ struct OversizedKernel { static const uint8_t* unpack(const uint8_t* in, unpacked_type* out) { return in; } }; +/******************************* + * Kernel static dispatching * + *******************************/ + +template +struct Kernel; + template -constexpr auto DispatchKernel() { - using kTraits = KernelTraits; - if constexpr (kTraits::kShape.is_medium()) { - return MediumKernel{}; - } else if constexpr (kTraits::kShape.is_large()) { - return LargeKernel{}; - } else { - return OversizedKernel{}; - } -} +struct Kernel::kShape.is_medium()>> + : MediumKernel {}; template -using DispatchKernelType = - decltype(DispatchKernel()); +struct Kernel::kShape.is_large()>> + : LargeKernel {}; template -struct Kernel : DispatchKernelType { - using Base = DispatchKernelType; - using Base::kValuesUnpacked; - using Base::unpack; -}; +struct Kernel::kShape.is_oversized()>> + : NoOpKernel {}; } // namespace arrow::internal From 
6fae07c8e599be53222603e529a693f71a7209a9 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Wed, 26 Nov 2025 10:46:32 +0100 Subject: [PATCH 41/64] Forward oversized to larger uint when possible --- .../arrow/util/bpacking_simd_impl_internal.h | 52 +++++++++++++++---- 1 file changed, 43 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_simd_impl_internal.h b/cpp/src/arrow/util/bpacking_simd_impl_internal.h index 18bbb84a368..9dc108775e9 100644 --- a/cpp/src/arrow/util/bpacking_simd_impl_internal.h +++ b/cpp/src/arrow/util/bpacking_simd_impl_internal.h @@ -754,29 +754,63 @@ struct NoOpKernel { static const uint8_t* unpack(const uint8_t* in, unpacked_type* out) { return in; } }; +template +struct ForwardToKernel : WorkingKernel { + using unpacked_type = UnpackedUint; + + static constexpr int kValuesUnpacked = WorkingKernel::kValuesUnpacked; + + static const uint8_t* unpack(const uint8_t* in, unpacked_type* out) { + using working_type = typename WorkingKernel::unpacked_type; + + working_type buffer[kValuesUnpacked] = {}; + in = WorkingKernel::unpack(in, buffer); + for (int k = 0; k < kValuesUnpacked; ++k) { + out[k] = static_cast(buffer[k]); + } + return in; + } +}; + /******************************* * Kernel static dispatching * *******************************/ +/// The public kernel exposed for any size. 
template struct Kernel; template -struct Kernel::kShape.is_medium()>> +struct Kernel< // + UnpackedUint, kPackedBitSize, kSimdBitSize, + std::enable_if_t< + KernelTraits::kShape.is_medium()>> : MediumKernel {}; template -struct Kernel::kShape.is_large()>> +struct Kernel< // + UnpackedUint, kPackedBitSize, kSimdBitSize, + std::enable_if_t< + KernelTraits::kShape.is_large()>> : LargeKernel {}; template -struct Kernel::kShape.is_oversized()>> +struct Kernel< // + UnpackedUint, kPackedBitSize, kSimdBitSize, + std::enable_if_t< + KernelTraits::kShape.is_oversized() && + sizeof(UnpackedUint) < sizeof(uint64_t)>> + : ForwardToKernel, kPackedBitSize, + kSimdBitSize>> {}; + +template +struct Kernel< // + UnpackedUint, kPackedBitSize, kSimdBitSize, + std::enable_if_t< + KernelTraits::kShape.is_oversized() && + sizeof(UnpackedUint) == sizeof(uint64_t)>> : NoOpKernel {}; } // namespace arrow::internal From b761efc0b146eefffac290f587fd4ff34fcb1a11 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Wed, 26 Nov 2025 10:57:28 +0100 Subject: [PATCH 42/64] Add arch detection functions --- cpp/src/arrow/util/bpacking_simd_impl_internal.h | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_simd_impl_internal.h b/cpp/src/arrow/util/bpacking_simd_impl_internal.h index 9dc108775e9..99f2309bf17 100644 --- a/cpp/src/arrow/util/bpacking_simd_impl_internal.h +++ b/cpp/src/arrow/util/bpacking_simd_impl_internal.h @@ -413,6 +413,12 @@ constexpr auto select_stride(xsimd::batch_constant) { ToInt, Arch, SelectStride>(); } +template +constexpr bool HasSse2 = std::is_base_of_v; + +template +constexpr bool HasAvx2 = std::is_base_of_v; + // Fallback for variable shift left. // // We replace the variable left shift by a variable multiply with a power of two. 
@@ -424,8 +430,8 @@ template auto left_shift(const xsimd::batch& batch, xsimd::batch_constant shifts) -> xsimd::batch { - constexpr bool kHasSse2 = std::is_base_of_v; - constexpr bool kHasAvx2 = std::is_base_of_v; + constexpr bool kHasSse2 = HasSse2; + constexpr bool kHasAvx2 = HasAvx2; static_assert(!(kHasSse2 && kHasAvx2), "The hierarchy are different in xsimd"); // TODO(xsimd-14) this can be simplified to @@ -474,8 +480,8 @@ auto left_shift(const xsimd::batch& batch, template auto right_shift_by_excess(const xsimd::batch& batch, xsimd::batch_constant shifts) { - constexpr bool kHasSse2 = std::is_base_of_v; - constexpr bool kHasAvx2 = std::is_base_of_v; + constexpr bool kHasSse2 = HasSse2; + constexpr bool kHasAvx2 = HasAvx2; static_assert(!(kHasSse2 && kHasAvx2), "The hierarchy are different in xsimd"); constexpr auto IntSize = sizeof(Int); From 2007d6b57c5acddcbe2d7f0781746d2f1a6e70b9 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Wed, 26 Nov 2025 11:04:38 +0100 Subject: [PATCH 43/64] Refactor traits usage --- .../arrow/util/bpacking_simd_impl_internal.h | 36 +++++++++---------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_simd_impl_internal.h b/cpp/src/arrow/util/bpacking_simd_impl_internal.h index 99f2309bf17..1581da7a08f 100644 --- a/cpp/src/arrow/util/bpacking_simd_impl_internal.h +++ b/cpp/src/arrow/util/bpacking_simd_impl_internal.h @@ -784,39 +784,37 @@ struct ForwardToKernel : WorkingKernel { *******************************/ /// The public kernel exposed for any size. 
-template +template , + typename = void> struct Kernel; -template +template struct Kernel< // - UnpackedUint, kPackedBitSize, kSimdBitSize, - std::enable_if_t< - KernelTraits::kShape.is_medium()>> + UnpackedUint, kPackedBitSize, kSimdBitSize, Traits, + std::enable_if_t> : MediumKernel {}; -template +template struct Kernel< // - UnpackedUint, kPackedBitSize, kSimdBitSize, - std::enable_if_t< - KernelTraits::kShape.is_large()>> + UnpackedUint, kPackedBitSize, kSimdBitSize, Traits, + std::enable_if_t> : LargeKernel {}; -template +template struct Kernel< // - UnpackedUint, kPackedBitSize, kSimdBitSize, - std::enable_if_t< - KernelTraits::kShape.is_oversized() && - sizeof(UnpackedUint) < sizeof(uint64_t)>> + UnpackedUint, kPackedBitSize, kSimdBitSize, Traits, + std::enable_if_t> : ForwardToKernel, kPackedBitSize, kSimdBitSize>> {}; -template +template struct Kernel< // - UnpackedUint, kPackedBitSize, kSimdBitSize, - std::enable_if_t< - KernelTraits::kShape.is_oversized() && - sizeof(UnpackedUint) == sizeof(uint64_t)>> + UnpackedUint, kPackedBitSize, kSimdBitSize, Traits, + std::enable_if_t> : NoOpKernel {}; } // namespace arrow::internal From 5716616cba4c43a3f784a6f17e56a4e013a0142a Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Wed, 26 Nov 2025 11:29:47 +0100 Subject: [PATCH 44/64] Forward x86_64 unpack64 to unpack32 --- .../arrow/util/bpacking_simd_impl_internal.h | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/util/bpacking_simd_impl_internal.h b/cpp/src/arrow/util/bpacking_simd_impl_internal.h index 1581da7a08f..7fe3f10aeef 100644 --- a/cpp/src/arrow/util/bpacking_simd_impl_internal.h +++ b/cpp/src/arrow/util/bpacking_simd_impl_internal.h @@ -789,12 +789,28 @@ template struct Kernel; +template +constexpr bool MediumShouldUseUint32 = + (HasSse2 || HasSse2)&& // + (Traits::kShape.unpacked_byte_size() == sizeof(uint64_t)) && + (Traits::kShape.packed_bit_size() < 32) && + KernelTraits::kShape.is_medium(); + template 
struct Kernel< // UnpackedUint, kPackedBitSize, kSimdBitSize, Traits, - std::enable_if_t> + std::enable_if_t>> : MediumKernel {}; +template +struct Kernel< // + UnpackedUint, kPackedBitSize, kSimdBitSize, Traits, + std::enable_if_t>> + : ForwardToKernel, kPackedBitSize, + kSimdBitSize>> {}; + template struct Kernel< // UnpackedUint, kPackedBitSize, kSimdBitSize, Traits, From 7e291d6814566b2a9870dbad3efca3c044230bd1 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Wed, 26 Nov 2025 14:16:04 +0100 Subject: [PATCH 45/64] Simplify template usage --- .../arrow/util/bpacking_simd_impl_internal.h | 91 +++++++++---------- 1 file changed, 43 insertions(+), 48 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_simd_impl_internal.h b/cpp/src/arrow/util/bpacking_simd_impl_internal.h index 7fe3f10aeef..2c63a155e84 100644 --- a/cpp/src/arrow/util/bpacking_simd_impl_internal.h +++ b/cpp/src/arrow/util/bpacking_simd_impl_internal.h @@ -115,6 +115,18 @@ struct KernelTraits { using arch_type = typename simd_batch::arch_type; }; +template +using KernelTraitsWithUnpack = + KernelTraits; + +template +using KernelTraitsHalf = + KernelTraitsWithUnpack>; + +template +using KernelTraitsDouble = + KernelTraitsWithUnpack>; + struct MediumKernelPlanSize { int reads_per_kernel_; int swizzles_per_read_; @@ -136,15 +148,16 @@ struct MediumKernelPlanSize { } }; +template struct MediumKernelOptions { /// An indicative limit on the number of values unpacked by the kernel. /// This is a heuristic setting: other constraints such as alignment may not always make /// small values feasibles. Must be a power of two. 
- int unpacked_per_kernel_limit = -1; + static constexpr int kUnpackedPerKernelLimit = kUnpackedPerKernelLimit_; }; -constexpr MediumKernelPlanSize BuildMediumPlanSize(const KernelShape& shape, - const MediumKernelOptions& options) { +template +constexpr MediumKernelPlanSize BuildMediumPlanSize(const KernelShape& shape) { const int shifts_per_swizzle = shape.unpacked_byte_size() / shape.packed_max_spread_bytes(); @@ -153,7 +166,7 @@ constexpr MediumKernelPlanSize BuildMediumPlanSize(const KernelShape& shape, // Using `unpacked_per_kernel_limit` to influence the number of swizzles per reads. const auto packed_per_read_for_offset = [&](int bit_offset) -> int { const int best = (shape.simd_bit_size() - bit_offset) / shape.packed_bit_size(); - const int limit = options.unpacked_per_kernel_limit; + const int limit = KernelOptions::kUnpackedPerKernelLimit; return (best > limit) && (limit > 0) ? limit : best; }; @@ -192,15 +205,12 @@ constexpr MediumKernelPlanSize BuildMediumPlanSize(const KernelShape& shape, }; } -// TODO(C++20) Non type template parameter for MediumKernelOptions. -template +template struct MediumKernelPlan { - using Traits = KernelTraits; + using Traits = KernelTraits; using uint_type = typename Traits::uint_type; static constexpr auto kShape = Traits::kShape; - static constexpr auto kPlanSize = BuildMediumPlanSize( - kShape, {/* .unpacked_per_kernel_limit= */ kUnpackedPerKernelLimit}); + static constexpr auto kPlanSize = BuildMediumPlanSize(kShape); using ReadsPerKernel = std::array; @@ -231,15 +241,12 @@ struct MediumKernelPlan { ReadsPerKernel reads; SwizzlesPerKernel swizzles; ShitsPerKernel shifts; - uint_type mask = bit_util::LeastSignificantBitMask(kPackedBitSize); + uint_type mask = bit_util::LeastSignificantBitMask(kShape.packed_bit_size()); }; -// TODO(C++20) Non type template parameter for MediumKernelOptions. 
-template +template constexpr auto BuildMediumPlan() { - using Plan = MediumKernelPlan; + using Plan = MediumKernelPlan; constexpr auto kShape = Plan::kShape; constexpr auto kPlanSize = Plan::kPlanSize; static_assert(kShape.is_medium()); @@ -527,12 +534,9 @@ auto right_shift_by_excess(const xsimd::batch& batch, return batch >> shifts; } -// TODO(C++20) Non type template parameter for MediumKernelOptions. -template +template > struct MediumKernel { - static constexpr auto kPlan = BuildMediumPlan(); + static constexpr auto kPlan = BuildMediumPlan(); static constexpr auto kPlanSize = kPlan.kPlanSize; static constexpr auto kShape = kPlan.kShape; using Traits = typename decltype(kPlan)::Traits; @@ -616,9 +620,9 @@ struct MediumKernel { } }; -template +template struct LargeKernelPlan { - using Traits = KernelTraits; + using Traits = KernelTraits; using uint_type = typename Traits::uint_type; static constexpr auto kShape = Traits::kShape; @@ -642,9 +646,9 @@ struct LargeKernelPlan { uint_type mask; }; -template -constexpr LargeKernelPlan BuildLargePlan() { - using Plan = LargeKernelPlan; +template +constexpr LargeKernelPlan BuildLargePlan() { + using Plan = LargeKernelPlan; using uint_type = typename Plan::Traits::uint_type; constexpr auto kShape = Plan::kShape; static_assert(kShape.is_large()); @@ -684,15 +688,14 @@ constexpr LargeKernelPlan BuildLarge } } - plan.mask = bit_util::LeastSignificantBitMask(kPackedBitSize); + plan.mask = bit_util::LeastSignificantBitMask(kShape.packed_bit_size()); return plan; } -template +template struct LargeKernel { - static constexpr auto kPlan = - BuildLargePlan(); + static constexpr auto kPlan = BuildLargePlan(); static constexpr auto kShape = kPlan.kShape; using Traits = typename decltype(kPlan)::Traits; using unpacked_type = typename Traits::unpacked_type; @@ -751,19 +754,18 @@ struct LargeKernel { }; /// A Kernel that does not extract anything, leaving all work to the naive implementation. 
-template +template struct NoOpKernel { - using unpacked_type = UnpackedUint; + using unpacked_type = typename KernelTraits::unpacked_type; static constexpr int kValuesUnpacked = 0; static const uint8_t* unpack(const uint8_t* in, unpacked_type* out) { return in; } }; -template +template struct ForwardToKernel : WorkingKernel { - using unpacked_type = UnpackedUint; + using unpacked_type = typename KernelTraits::unpacked_type; static constexpr int kValuesUnpacked = WorkingKernel::kValuesUnpacked; @@ -794,43 +796,36 @@ constexpr bool MediumShouldUseUint32 = (HasSse2 || HasSse2)&& // (Traits::kShape.unpacked_byte_size() == sizeof(uint64_t)) && (Traits::kShape.packed_bit_size() < 32) && - KernelTraits::kShape.is_medium(); + KernelTraitsWithUnpack::kShape.is_medium(); template struct Kernel< // UnpackedUint, kPackedBitSize, kSimdBitSize, Traits, std::enable_if_t>> - : MediumKernel {}; + : MediumKernel {}; template struct Kernel< // UnpackedUint, kPackedBitSize, kSimdBitSize, Traits, std::enable_if_t>> - : ForwardToKernel, kPackedBitSize, - kSimdBitSize>> {}; + : ForwardToKernel>> {}; template struct Kernel< // UnpackedUint, kPackedBitSize, kSimdBitSize, Traits, - std::enable_if_t> - : LargeKernel {}; + std::enable_if_t> : LargeKernel {}; template struct Kernel< // UnpackedUint, kPackedBitSize, kSimdBitSize, Traits, std::enable_if_t> - : ForwardToKernel, kPackedBitSize, - kSimdBitSize>> {}; + : ForwardToKernel>> {}; template struct Kernel< // UnpackedUint, kPackedBitSize, kSimdBitSize, Traits, std::enable_if_t> - : NoOpKernel {}; + sizeof(UnpackedUint) == sizeof(uint64_t)>> : NoOpKernel {}; } // namespace arrow::internal From 424de019de4b88a0163d5886ad15e734d6d3e121 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Wed, 26 Nov 2025 15:05:16 +0100 Subject: [PATCH 46/64] Reorganize and doc --- .../arrow/util/bpacking_simd_impl_internal.h | 559 ++++++++++-------- 1 file changed, 300 insertions(+), 259 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_simd_impl_internal.h 
b/cpp/src/arrow/util/bpacking_simd_impl_internal.h index 2c63a155e84..2c90d10d239 100644 --- a/cpp/src/arrow/util/bpacking_simd_impl_internal.h +++ b/cpp/src/arrow/util/bpacking_simd_impl_internal.h @@ -15,6 +15,15 @@ // specific language governing permissions and limitations // under the License. +/// Simd integer unpacking kernels, that is small functions that efficiently operate over +/// a fixed input size. +/// +/// This a generalization of the algorithm from Daniel Lemire and Leonid Boytsov, +/// Decoding billions of integers per second through vectorization, Software Practice & +/// Experience 45 (1), 2015. +/// http://arxiv.org/abs/1209.2137 +/// https://github.com/fast-pack/LittleIntPacker/blob/master/src/horizontalpacking32.c + #pragma once #include @@ -31,25 +40,265 @@ namespace arrow::internal { -// https://github.com/fast-pack/LittleIntPacker/blob/master/src/horizontalpacking32.c -// TODO -// - _mm_cvtepi8_epi32 -// - no need for while loop (for up to 8 is sufficient) -// - Shifts per swizzle can be improved when self.packed_max_byte_spread == 1 and the -// byte can be reused (when val_bit_width divides packed_max_byte_spread). -// - Reduce input size on small bit width using a broadcast. -// - Fix overreading problem -// - Improve Swizzle by computing which bigger swapable slots are free +/********************* + * xsimd utilities * + *********************/ -template -constexpr Arr BuildConstantArray(typename Arr::value_type val) { - Arr out = {}; - for (auto& v : out) { - v = val; +/// Simple constexpr maximum element suited for non empty arrays. 
+template +constexpr T max_value(const std::array& arr) { + static_assert(N > 0); + T out = 0; + for (const T& v : arr) { + if (v > out) { + out = v; + } } return out; } +template +constexpr auto make_batch_constant_impl(std::index_sequence) { + using Array = std::decay_t; + using value_type = typename Array::value_type; + + return xsimd::batch_constant{}; +} + +/// Make a ``xsimd::batch_constant`` from a static constexpr array. +template +constexpr auto make_batch_constant() { + return make_batch_constant_impl(std::make_index_sequence()); +} + +template +struct SwizzleBiLaneGenericPlan { + using ByteSwizzle = std::array; + + ByteSwizzle self_lane; + ByteSwizzle cross_lane; +}; + +template +constexpr SwizzleBiLaneGenericPlan BuildSwizzleBiLaneGenericPlan( + std::array mask) { + constexpr T kAsZero = 0x80; // Most significant bit of the byte must be 1 + constexpr std::size_t kSize = N; + constexpr std::size_t kSizeHalf = kSize / 2; + + SwizzleBiLaneGenericPlan plan = {}; + + for (std::size_t k = 0; k < kSize; ++k) { + const bool is_defined = (0 <= mask[k]) && (mask[k] < kSize); + const bool is_first_lane_idx = k < kSizeHalf; + const bool is_first_lane_mask = mask[k] < kSizeHalf; + + if (!is_defined) { + plan.self_lane[k] = kAsZero; + plan.cross_lane[k] = kAsZero; + } else { + if (is_first_lane_idx == is_first_lane_mask) { + plan.self_lane[k] = mask[k] % kSizeHalf; + plan.cross_lane[k] = kAsZero; + } else { + plan.self_lane[k] = kAsZero; + plan.cross_lane[k] = mask[k] % kSizeHalf; + } + } + } + + return plan; +} + +template +constexpr bool isOnlyFromHigh(xsimd::batch_constant) { + return ((Vals >= (sizeof...(Vals) / 2)) && ...); +} + +template +constexpr bool isOnlyFromLow(xsimd::batch_constant) { + return ((Vals < (sizeof...(Vals) / 2)) && ...); +} + +/// Wrapper around ``xsimd::swizzle`` with optimizations and non implemented sizes. +/// TODO(xsimd 14.0) Merged and can be replaced with ``xsimd::swizzle``. 
+template +auto swizzle_bytes(const xsimd::batch& batch, + xsimd::batch_constant mask) + -> xsimd::batch { + if constexpr (std::is_base_of_v) { + static constexpr auto kPlan = BuildSwizzleBiLaneGenericPlan(std::array{kIdx...}); + static constexpr auto kSelfSwizzleArr = kPlan.self_lane; + constexpr auto kSelfSwizzle = make_batch_constant(); + static constexpr auto kCrossSwizzleArr = kPlan.cross_lane; + constexpr auto kCrossSwizzle = make_batch_constant(); + + struct LaneMask { + static constexpr uint8_t get(uint8_t i, uint8_t n) { + constexpr auto kMask = std::array{kIdx...}; + return kMask[i] % (kMask.size() / 2); + } + }; + + constexpr auto kLaneMask = xsimd::make_batch_constant(); + if constexpr (isOnlyFromLow(mask)) { + auto broadcast = _mm256_permute2x128_si256(batch, batch, 0x00); // [low | low] + return _mm256_shuffle_epi8(broadcast, kLaneMask.as_batch()); + } + if constexpr (isOnlyFromHigh(mask)) { + auto broadcast = _mm256_permute2x128_si256(batch, batch, 0x11); // [high | high] + return _mm256_shuffle_epi8(broadcast, kLaneMask.as_batch()); + } + + auto self = _mm256_shuffle_epi8(batch, kSelfSwizzle.as_batch()); + auto swapped = _mm256_permute2x128_si256(batch, batch, 0x01); + auto cross = _mm256_shuffle_epi8(swapped, kCrossSwizzle.as_batch()); + return _mm256_or_si256(self, cross); + } else { + return xsimd::swizzle(batch, mask); + } +} + +template +constexpr auto make_mult(xsimd::batch_constant) { + return xsimd::batch_constant(1u << kShifts)...>(); +} + +template +struct SelectStride { + static constexpr auto kShiftsArr = std::array{kVals...}; + + static constexpr Int get(int i, int n) { return kShiftsArr[kLength * i + kOffset]; } +}; + +template +constexpr auto select_stride(xsimd::batch_constant) { + return xsimd::make_batch_constant< + ToInt, Arch, SelectStride>(); +} + +template +constexpr bool HasSse2 = std::is_base_of_v; + +template +constexpr bool HasAvx2 = std::is_base_of_v; + +/// Wrapper around ``xsimd::bitwise_lshift`` with optimizations for 
non implemented sizes. +// +// We replace the variable left shift by a variable multiply with a power of two. +// +// This trick is borrowed from Daniel Lemire and Leonid Boytsov, Decoding billions of +// integers per second through vectorization, Software Practice & Experience 45 (1), 2015. +// http://arxiv.org/abs/1209.2137 +// +/// TODO(xsimd) Tracking in https://github.com/xtensor-stack/xsimd/pull/1220 +template +auto left_shift(const xsimd::batch& batch, + xsimd::batch_constant shifts) + -> xsimd::batch { + constexpr bool kHasSse2 = HasSse2; + constexpr bool kHasAvx2 = HasAvx2; + static_assert(!(kHasSse2 && kHasAvx2), "The hierarchy are different in xsimd"); + + // TODO(xsimd-14) this can be simplified to + // constexpr auto kMults = xsimd::make_batch_constant() << shits; + constexpr auto kMults = make_mult(shifts); + + constexpr auto IntSize = sizeof(Int); + + // Sizes and architecture for which there is no variable left shift and there is a + // multiplication + if constexpr ( // + (kHasSse2 && (IntSize == sizeof(uint16_t) || IntSize == sizeof(uint32_t))) // + || (kHasAvx2 && (IntSize == sizeof(uint16_t))) // + ) { + return batch * kMults; + } + + // Architecture for which there is no variable left shift on uint8_t but a fallback + // exists for uint16_t. + if constexpr ((kHasSse2 || kHasAvx2) && (IntSize == sizeof(uint8_t))) { + const auto batch16 = xsimd::bitwise_cast(batch); + + constexpr auto kShifts0 = select_stride(shifts); + const auto shifted0 = left_shift(batch16, kShifts0) & 0x00FF; + + constexpr auto kShifts1 = select_stride(shifts); + const auto shifted1 = left_shift(batch16 & 0xFF00, kShifts1); + + return xsimd::bitwise_cast(shifted0 | shifted1); + } + + return batch << shifts; +} + +/// Fallback for variable shift right. +/// +/// When we know that the relevant bits will not overflow, we can instead shift left all +/// values to align them with the one with the largest right shifts followed by a constant +/// shift on all values. 
+/// In doing so, we replace the variable left shift by a variable multiply with a power of +/// two. +/// +/// This trick is borrowed from Daniel Lemire and Leonid Boytsov, Decoding billions of +/// integers per second through vectorization, Software Practice & Experience 45 (1), +/// 2015. http://arxiv.org/abs/1209.2137 +template +auto right_shift_by_excess(const xsimd::batch& batch, + xsimd::batch_constant shifts) { + constexpr bool kHasSse2 = HasSse2; + constexpr bool kHasAvx2 = HasAvx2; + static_assert(!(kHasSse2 && kHasAvx2), "The hierarchy are different in xsimd"); + + constexpr auto IntSize = sizeof(Int); + + // Architecture for which there is no variable right shift but a larger fallback exists. + /// TODO(xsimd) Tracking for Avx2 in https://github.com/xtensor-stack/xsimd/pull/1220 + if constexpr (kHasAvx2 && (IntSize == sizeof(uint8_t) || IntSize == sizeof(uint16_t))) { + using twice_uint = SizedUint<2 * IntSize>; + + const auto batch2 = xsimd::bitwise_cast(batch); + + constexpr auto kShifts0 = select_stride(shifts); + constexpr auto kMask0 = bit_util::LeastSignificantBitMask(8 * IntSize); + const auto shifted0 = right_shift_by_excess(batch2 & kMask0, kShifts0); + + constexpr auto kShifts1 = select_stride(shifts); + constexpr auto kMask1 = kMask0 << (8 * IntSize); + const auto shifted1 = right_shift_by_excess(batch2, kShifts1) & kMask1; + + return xsimd::bitwise_cast(shifted0 | shifted1); + } + + // These conditions are the ones matched in `left_shift`, i.e. the ones where variable + // shift right will not be available but a left shift (fallback) exists. 
+ if constexpr (kHasSse2 && (IntSize != sizeof(uint64_t))) { + static constexpr auto kShiftsArr = std::array{kShifts...}; + static constexpr Int kMaxRightShift = max_value(kShiftsArr); + + struct MakeShifts { + static constexpr Int get(int i, int n) { return kMaxRightShift - kShiftsArr.at(i); } + }; + + // TODO(xsimd 14.0) this can be simplified to + // constexpr auto kRShifts = xsimd::make_batch_constant() - + // shifts; + constexpr auto kLShifts = xsimd::make_batch_constant(); + + const auto lshifted = left_shift(batch, kLShifts); + // TODO(xsimd 14.0) this can be simplified to + // return xsimd::bitwise_rshift(lshifted); + return xsimd::batch(lshifted) >> kMaxRightShift; + } + + return batch >> shifts; +} + +/**************************** + * Properties of a kernel * + ****************************/ + +/// \see KernelShape constexpr bool PackedIsOversizedForSimd(int simd_bit_size, int unpacked_bit_size, int packed_bit_size) { const int unpacked_per_simd = simd_bit_size / unpacked_bit_size; @@ -70,6 +319,21 @@ constexpr bool PackedIsOversizedForSimd(int simd_bit_size, int unpacked_bit_size return false; } +/// Different sizes of a given kernel. +/// +/// When integers are bit-packed, they spread over multiple bytes. +/// For instance, integers packed voer three bits quickly spread over two bytes (on the +/// value `C` below) despite three bits being much smaller than a single byte. +/// +/// ``` +/// |A|A|A|B|B|B|C|C| |C|D|D|D|E|E|E|F| ... +/// ``` +/// +/// When the spread is smaller or equal to the unsigned integer to unpack to, we classify +/// if as "medium". When it is strictly larger it classifies as "large". +/// On rare occasions, due to offsets in reading a subsequent batch, we may not even be +/// able to read as many packed values as we should extract in a batch (mainly unpack 63 +/// bits to uint64_t on 128 bit SIMD). We classify this as "oversized". 
struct KernelShape { const int simd_bit_size_; const int unpacked_bit_size_; @@ -98,6 +362,7 @@ struct KernelShape { constexpr int packed_max_spread_bytes() const { return packed_max_spread_bytes_; } }; +/// Packing all useful and derived information about a kernel in a single type. template struct KernelTraits { static constexpr KernelShape kShape = { @@ -127,6 +392,10 @@ template using KernelTraitsDouble = KernelTraitsWithUnpack>; +/****************** + * MediumKernel * + ******************/ + struct MediumKernelPlanSize { int reads_per_kernel_; int swizzles_per_read_; @@ -244,6 +513,15 @@ struct MediumKernelPlan { uint_type mask = bit_util::LeastSignificantBitMask(kShape.packed_bit_size()); }; +template +constexpr Arr BuildConstantArray(typename Arr::value_type val) { + Arr out = {}; + for (auto& v : out) { + v = val; + } + return out; +} + template constexpr auto BuildMediumPlan() { using Plan = MediumKernelPlan; @@ -289,251 +567,6 @@ constexpr auto BuildMediumPlan() { return plan; } -/// Simple constexpr maximum element suited for non empty arrays. 
-template -constexpr T max_value(const std::array& arr) { - static_assert(N > 0); - T out = 0; - for (const T& v : arr) { - if (v > out) { - out = v; - } - } - return out; -} - -template -constexpr auto make_batch_constant_impl(std::index_sequence) { - using Array = std::decay_t; - using value_type = typename Array::value_type; - - return xsimd::batch_constant{}; -} - -template -constexpr auto make_batch_constant() { - return make_batch_constant_impl(std::make_index_sequence()); -} - -template -struct SwizzleBiLaneGenericPlan { - using ByteSwizzle = std::array; - - ByteSwizzle self_lane; - ByteSwizzle cross_lane; -}; - -template -constexpr SwizzleBiLaneGenericPlan BuildSwizzleBiLaneGenericPlan( - std::array mask) { - constexpr T kAsZero = 0x80; // Most significant bit of the byte must be 1 - constexpr std::size_t kSize = N; - constexpr std::size_t kSizeHalf = kSize / 2; - - SwizzleBiLaneGenericPlan plan = {}; - - for (std::size_t k = 0; k < kSize; ++k) { - const bool is_defined = (0 <= mask[k]) && (mask[k] < kSize); - const bool is_first_lane_idx = k < kSizeHalf; - const bool is_first_lane_mask = mask[k] < kSizeHalf; - - if (!is_defined) { - plan.self_lane[k] = kAsZero; - plan.cross_lane[k] = kAsZero; - } else { - if (is_first_lane_idx == is_first_lane_mask) { - plan.self_lane[k] = mask[k] % kSizeHalf; - plan.cross_lane[k] = kAsZero; - } else { - plan.self_lane[k] = kAsZero; - plan.cross_lane[k] = mask[k] % kSizeHalf; - } - } - } - - return plan; -} - -template -constexpr bool isOnlyFromHigh(xsimd::batch_constant) { - return ((Vals >= (sizeof...(Vals) / 2)) && ...); -} - -template -constexpr bool isOnlyFromLow(xsimd::batch_constant) { - return ((Vals < (sizeof...(Vals) / 2)) && ...); -} - -/// Merged in xsimd 14.0, simply use swizzle -template -auto swizzle_bytes(const xsimd::batch& batch, - xsimd::batch_constant mask) - -> xsimd::batch { - if constexpr (std::is_base_of_v) { - static constexpr auto kPlan = BuildSwizzleBiLaneGenericPlan(std::array{kIdx...}); - 
static constexpr auto kSelfSwizzleArr = kPlan.self_lane; - constexpr auto kSelfSwizzle = make_batch_constant(); - static constexpr auto kCrossSwizzleArr = kPlan.cross_lane; - constexpr auto kCrossSwizzle = make_batch_constant(); - - struct LaneMask { - static constexpr uint8_t get(uint8_t i, uint8_t n) { - constexpr auto kMask = std::array{kIdx...}; - return kMask[i] % (kMask.size() / 2); - } - }; - - constexpr auto kLaneMask = xsimd::make_batch_constant(); - if constexpr (isOnlyFromLow(mask)) { - auto broadcast = _mm256_permute2x128_si256(batch, batch, 0x00); // [low | low] - return _mm256_shuffle_epi8(broadcast, kLaneMask.as_batch()); - } - if constexpr (isOnlyFromHigh(mask)) { - auto broadcast = _mm256_permute2x128_si256(batch, batch, 0x11); // [high | high] - return _mm256_shuffle_epi8(broadcast, kLaneMask.as_batch()); - } - - auto self = _mm256_shuffle_epi8(batch, kSelfSwizzle.as_batch()); - auto swapped = _mm256_permute2x128_si256(batch, batch, 0x01); - auto cross = _mm256_shuffle_epi8(swapped, kCrossSwizzle.as_batch()); - return _mm256_or_si256(self, cross); - } else { - return xsimd::swizzle(batch, mask); - } -} - -template -constexpr auto make_mult(xsimd::batch_constant) { - return xsimd::batch_constant(1u << kShifts)...>(); -} - -template -struct SelectStride { - static constexpr auto kShiftsArr = std::array{kVals...}; - - static constexpr Int get(int i, int n) { return kShiftsArr[kLength * i + kOffset]; } -}; - -template -constexpr auto select_stride(xsimd::batch_constant) { - return xsimd::make_batch_constant< - ToInt, Arch, SelectStride>(); -} - -template -constexpr bool HasSse2 = std::is_base_of_v; - -template -constexpr bool HasAvx2 = std::is_base_of_v; - -// Fallback for variable shift left. -// -// We replace the variable left shift by a variable multiply with a power of two. 
-// -// This trick is borrowed from Daniel Lemire and Leonid Boytsov, Decoding billions of -// integers per second through vectorization, Software Practice & Experience 45 (1), 2015. -// http://arxiv.org/abs/1209.2137 -template -auto left_shift(const xsimd::batch& batch, - xsimd::batch_constant shifts) - -> xsimd::batch { - constexpr bool kHasSse2 = HasSse2; - constexpr bool kHasAvx2 = HasAvx2; - static_assert(!(kHasSse2 && kHasAvx2), "The hierarchy are different in xsimd"); - - // TODO(xsimd-14) this can be simplified to - // constexpr auto kMults = xsimd::make_batch_constant() << shits; - constexpr auto kMults = make_mult(shifts); - - constexpr auto IntSize = sizeof(Int); - - // Sizes and architecture for which there is no variable left shift and there is a - // multiplication - if constexpr ( // - (kHasSse2 && (IntSize == sizeof(uint16_t) || IntSize == sizeof(uint32_t))) // - || (kHasAvx2 && (IntSize == sizeof(uint16_t))) // - ) { - return batch * kMults; - } - - // Architecture for which there is no variable left shift on uint8_t but a fallback - // exists for uint16_t. - if constexpr ((kHasSse2 || kHasAvx2) && (IntSize == sizeof(uint8_t))) { - const auto batch16 = xsimd::bitwise_cast(batch); - - constexpr auto kShifts0 = select_stride(shifts); - const auto shifted0 = left_shift(batch16, kShifts0) & 0x00FF; - - constexpr auto kShifts1 = select_stride(shifts); - const auto shifted1 = left_shift(batch16 & 0xFF00, kShifts1); - - return xsimd::bitwise_cast(shifted0 | shifted1); - } - - return batch << shifts; -} - -// Fallback for variable shift right. -// -// When we know that the relevant bits will not overflow, we can instead shift left all -// values to align them with the one with the largest right shifts followed by a constant -// shift on all values. -// In doing so, we replace the variable left shift by a variable multiply with a power of -// two. 
-// -// This trick is borrowed from Daniel Lemire and Leonid Boytsov, Decoding billions of -// integers per second through vectorization, Software Practice & Experience 45 (1), 2015. -// http://arxiv.org/abs/1209.2137 -template -auto right_shift_by_excess(const xsimd::batch& batch, - xsimd::batch_constant shifts) { - constexpr bool kHasSse2 = HasSse2; - constexpr bool kHasAvx2 = HasAvx2; - static_assert(!(kHasSse2 && kHasAvx2), "The hierarchy are different in xsimd"); - - constexpr auto IntSize = sizeof(Int); - - // Architecture for which there is no variable right shift but a larger fallback exists. - if constexpr (kHasAvx2 && (IntSize == sizeof(uint8_t) || IntSize == sizeof(uint16_t))) { - using twice_uint = SizedUint<2 * IntSize>; - - const auto batch2 = xsimd::bitwise_cast(batch); - - constexpr auto kShifts0 = select_stride(shifts); - constexpr auto kMask0 = bit_util::LeastSignificantBitMask(8 * IntSize); - const auto shifted0 = right_shift_by_excess(batch2 & kMask0, kShifts0); - - constexpr auto kShifts1 = select_stride(shifts); - constexpr auto kMask1 = kMask0 << (8 * IntSize); - const auto shifted1 = right_shift_by_excess(batch2, kShifts1) & kMask1; - - return xsimd::bitwise_cast(shifted0 | shifted1); - } - - // These conditions are the ones matched in `left_shift`, i.e. the ones where variable - // shift right will not be available but a left shift (fallback) exists. 
- if constexpr (kHasSse2 && (IntSize != sizeof(uint64_t))) { - static constexpr auto kShiftsArr = std::array{kShifts...}; - static constexpr Int kMaxRightShift = max_value(kShiftsArr); - - struct MakeShifts { - static constexpr Int get(int i, int n) { return kMaxRightShift - kShiftsArr.at(i); } - }; - - // TODO(xsimd-14) this can be simplified to - // constexpr auto kRShifts = xsimd::make_batch_constant() - - // shifts; - constexpr auto kLShifts = xsimd::make_batch_constant(); - - const auto lshifted = left_shift(batch, kLShifts); - // TODO(xsimd-14) - // return xsimd::bitwise_rshift(lshifted); - return xsimd::batch(lshifted) >> kMaxRightShift; - } - - return batch >> shifts; -} - template > struct MediumKernel { static constexpr auto kPlan = BuildMediumPlan(); @@ -620,6 +653,10 @@ struct MediumKernel { } }; +/***************** + * LargeKernel * + *****************/ + template struct LargeKernelPlan { using Traits = KernelTraits; @@ -753,6 +790,10 @@ struct LargeKernel { } }; +/********************* + * Utility Kernels * + *********************/ + /// A Kernel that does not extract anything, leaving all work to the naive implementation. template struct NoOpKernel { From e6d7baad9bb33d45bbcab9c96499787684d5b709 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Wed, 26 Nov 2025 15:06:00 +0100 Subject: [PATCH 47/64] Refactor KernelDispatch and remove Oversized dispatch --- .../arrow/util/bpacking_simd_impl_internal.h | 53 ++++++++----------- 1 file changed, 23 insertions(+), 30 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_simd_impl_internal.h b/cpp/src/arrow/util/bpacking_simd_impl_internal.h index 2c90d10d239..07985714f02 100644 --- a/cpp/src/arrow/util/bpacking_simd_impl_internal.h +++ b/cpp/src/arrow/util/bpacking_simd_impl_internal.h @@ -826,12 +826,10 @@ struct ForwardToKernel : WorkingKernel { * Kernel static dispatching * *******************************/ -/// The public kernel exposed for any size. 
-template , - typename = void> -struct Kernel; +template +struct KernelDispatch; +// Benchmarking show unpack to uint64_t is underperforming on SSE4.2 and Avx2 template constexpr bool MediumShouldUseUint32 = (HasSse2 || HasSse2)&& // @@ -839,34 +837,29 @@ constexpr bool MediumShouldUseUint32 = (Traits::kShape.packed_bit_size() < 32) && KernelTraitsWithUnpack::kShape.is_medium(); -template -struct Kernel< // - UnpackedUint, kPackedBitSize, kSimdBitSize, Traits, - std::enable_if_t>> +template +struct KernelDispatch>> : MediumKernel {}; -template -struct Kernel< // - UnpackedUint, kPackedBitSize, kSimdBitSize, Traits, - std::enable_if_t>> +template +struct KernelDispatch< + Traits, std::enable_if_t>> : ForwardToKernel>> {}; -template -struct Kernel< // - UnpackedUint, kPackedBitSize, kSimdBitSize, Traits, - std::enable_if_t> : LargeKernel {}; - -template -struct Kernel< // - UnpackedUint, kPackedBitSize, kSimdBitSize, Traits, - std::enable_if_t> - : ForwardToKernel>> {}; - -template -struct Kernel< // - UnpackedUint, kPackedBitSize, kSimdBitSize, Traits, - std::enable_if_t> : NoOpKernel {}; +// Large kernel +template +struct KernelDispatch> + : LargeKernel {}; + +// Oversize kernel is only a few edge cases +template +struct KernelDispatch> + : NoOpKernel {}; + +/// The public kernel exposed for any size. 
+template +struct Kernel : KernelDispatch> { +}; } // namespace arrow::internal From b2bd763d726ded4ded18be739e6aca9b161c6079 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Wed, 26 Nov 2025 15:34:06 +0100 Subject: [PATCH 48/64] Forward large unpack8 to unpack16 on SSE2 --- .../arrow/util/bpacking_simd_impl_internal.h | 25 +++++++++++-------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_simd_impl_internal.h b/cpp/src/arrow/util/bpacking_simd_impl_internal.h index 07985714f02..2769f2be81c 100644 --- a/cpp/src/arrow/util/bpacking_simd_impl_internal.h +++ b/cpp/src/arrow/util/bpacking_simd_impl_internal.h @@ -384,14 +384,6 @@ template using KernelTraitsWithUnpack = KernelTraits; -template -using KernelTraitsHalf = - KernelTraitsWithUnpack>; - -template -using KernelTraitsDouble = - KernelTraitsWithUnpack>; - /****************** * MediumKernel * ******************/ @@ -845,13 +837,24 @@ struct KernelDispatch struct KernelDispatch< Traits, std::enable_if_t>> - : ForwardToKernel>> {}; + : ForwardToKernel>> {}; + +// Benchmarking show large unpack to uint8_t is underperforming on SSE4.2 +template +constexpr bool LargeShouldUseUint16 = HasSse2 && + (Traits::kShape.unpacked_byte_size() == + sizeof(uint8_t)); -// Large kernel template -struct KernelDispatch> +struct KernelDispatch< + Traits, std::enable_if_t>> : LargeKernel {}; +template +struct KernelDispatch< + Traits, std::enable_if_t>> + : ForwardToKernel>> {}; + // Oversize kernel is only a few edge cases template struct KernelDispatch> From 2ab462c43083c420b45bb68da7ef93e25df406f4 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Wed, 26 Nov 2025 15:37:50 +0100 Subject: [PATCH 49/64] Use fallback right shift on large uint8_t avx2 --- cpp/src/arrow/util/bpacking_simd_impl_internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/util/bpacking_simd_impl_internal.h b/cpp/src/arrow/util/bpacking_simd_impl_internal.h index 2769f2be81c..2882bac0a5e 
100644 --- a/cpp/src/arrow/util/bpacking_simd_impl_internal.h +++ b/cpp/src/arrow/util/bpacking_simd_impl_internal.h @@ -751,7 +751,7 @@ struct LargeKernel { const auto low_swizzled = swizzle_bytes(bytes, kLowSwizzles); const auto low_words = xsimd::bitwise_cast(low_swizzled); simd_batch low_shifted; - if constexpr (kShape.unpacked_byte_size() == 1) { + if constexpr (kShape.unpacked_byte_size() == 1 && HasSse2) { // The logic of the fallback in right_shift_by_excess does not work for this single // byte case case, so we use directly xsimd and its scalar fallback. low_shifted = low_words >> kLowRShifts; From bc9d27f70b8d72f85fcad8b73537f8f2ad7a7787 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 27 Nov 2025 11:40:23 +0100 Subject: [PATCH 50/64] Fix enable_if --- .../arrow/util/bpacking_simd_impl_internal.h | 45 +++++++++---------- 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_simd_impl_internal.h b/cpp/src/arrow/util/bpacking_simd_impl_internal.h index 2882bac0a5e..e39ab62420e 100644 --- a/cpp/src/arrow/util/bpacking_simd_impl_internal.h +++ b/cpp/src/arrow/util/bpacking_simd_impl_internal.h @@ -818,9 +818,6 @@ struct ForwardToKernel : WorkingKernel { * Kernel static dispatching * *******************************/ -template -struct KernelDispatch; - // Benchmarking show unpack to uint64_t is underperforming on SSE4.2 and Avx2 template constexpr bool MediumShouldUseUint32 = @@ -829,36 +826,36 @@ constexpr bool MediumShouldUseUint32 = (Traits::kShape.packed_bit_size() < 32) && KernelTraitsWithUnpack::kShape.is_medium(); -template -struct KernelDispatch>> - : MediumKernel {}; - -template -struct KernelDispatch< - Traits, std::enable_if_t>> - : ForwardToKernel>> {}; - // Benchmarking show large unpack to uint8_t is underperforming on SSE4.2 template constexpr bool LargeShouldUseUint16 = HasSse2 && (Traits::kShape.unpacked_byte_size() == sizeof(uint8_t)); +// A ``std::enable_if`` that works on MSVC template -struct 
KernelDispatch< - Traits, std::enable_if_t>> - : LargeKernel {}; - -template -struct KernelDispatch< - Traits, std::enable_if_t>> - : ForwardToKernel>> {}; +constexpr auto KernelDispatchImpl() { + if constexpr (Traits::kShape.is_medium()) { + if constexpr (MediumShouldUseUint32) { + using Kernel32 = MediumKernel>; + return ForwardToKernel{}; + } else { + return MediumKernel{}; + } + } else if constexpr (Traits::kShape.is_large()) { + if constexpr (LargeShouldUseUint16) { + using Kernel16 = MediumKernel>; + return ForwardToKernel{}; + } else { + return LargeKernel{}; + } + } else if constexpr (Traits::kShape.is_oversized()) { + return NoOpKernel{}; + } +} -// Oversize kernel is only a few edge cases template -struct KernelDispatch> - : NoOpKernel {}; +using KernelDispatch = decltype(KernelDispatchImpl()); /// The public kernel exposed for any size. template From c9d72783da706e510036b6d4feaacb07d052d510 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 27 Nov 2025 14:09:26 +0100 Subject: [PATCH 51/64] Add missing header --- cpp/src/arrow/util/bpacking_simd_avx2.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/arrow/util/bpacking_simd_avx2.cc b/cpp/src/arrow/util/bpacking_simd_avx2.cc index c41f9fc3836..d3a4bd362c1 100644 --- a/cpp/src/arrow/util/bpacking_simd_avx2.cc +++ b/cpp/src/arrow/util/bpacking_simd_avx2.cc @@ -17,6 +17,7 @@ #include "arrow/util/bpacking_dispatch_internal.h" #include "arrow/util/bpacking_simd_impl_internal.h" +#include "arrow/util/bpacking_simd_internal.h" namespace arrow::internal { From 814d8b4b1ef43bb14d9f1676d3cd28b53028d03b Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 27 Nov 2025 14:29:22 +0100 Subject: [PATCH 52/64] fmt --- cpp/src/arrow/util/bpacking_benchmark.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/util/bpacking_benchmark.cc b/cpp/src/arrow/util/bpacking_benchmark.cc index e053226fbfb..38df4a3befc 100644 --- a/cpp/src/arrow/util/bpacking_benchmark.cc +++ 
b/cpp/src/arrow/util/bpacking_benchmark.cc @@ -106,7 +106,9 @@ constexpr std::initializer_list kBitWidths32 = {1, 2, 8, 20}; constexpr std::initializer_list kBitWidths64 = {1, 2, 8, 20, 47}; static const std::vector> kBitWidthsNumValuesBool = { - {0, 1}, kNumValuesRange}; + {0, 1}, + kNumValuesRange, +}; static const std::vector> kBitWidthsNumValues8 = { kBitWidths8, kNumValuesRange, From 8a8f9ba655615ba3945ffffbe20e89bb850d0675 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 27 Nov 2025 17:47:46 +0100 Subject: [PATCH 53/64] Add SSE4.2 to dynamic dispatch --- cpp/src/arrow/util/bpacking.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/util/bpacking.cc b/cpp/src/arrow/util/bpacking.cc index fdb1c5a52ac..99ff18ccdc6 100644 --- a/cpp/src/arrow/util/bpacking.cc +++ b/cpp/src/arrow/util/bpacking.cc @@ -34,9 +34,11 @@ struct UnpackDynamicFunction { static constexpr auto implementations() { return std::array { - // Current SIMD unpack algorithm works terribly on SSE4.2 due to lack of variable - // rhsift and poor xsimd fallback. 
+#if defined(ARROW_HAVE_SSE4_2) + Implementation{DispatchLevel::NONE, &unpack_sse4_2}, +#else Implementation{DispatchLevel::NONE, &unpack_scalar}, +#endif #if defined(ARROW_HAVE_RUNTIME_AVX2) Implementation{DispatchLevel::AVX2, &unpack_avx2}, #endif From 62efbb016e490553672fa5b998ec4c199f15041a Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 27 Nov 2025 15:00:32 +0100 Subject: [PATCH 54/64] Rename bpacking_simd_impl > bpacking_simd_kernel --- cpp/src/arrow/util/bpacking_simd_avx2.cc | 2 +- cpp/src/arrow/util/bpacking_simd_default.cc | 2 +- ...ing_simd_impl_internal.h => bpacking_simd_kernel_internal.h} | 0 3 files changed, 2 insertions(+), 2 deletions(-) rename cpp/src/arrow/util/{bpacking_simd_impl_internal.h => bpacking_simd_kernel_internal.h} (100%) diff --git a/cpp/src/arrow/util/bpacking_simd_avx2.cc b/cpp/src/arrow/util/bpacking_simd_avx2.cc index d3a4bd362c1..19533b1785c 100644 --- a/cpp/src/arrow/util/bpacking_simd_avx2.cc +++ b/cpp/src/arrow/util/bpacking_simd_avx2.cc @@ -16,8 +16,8 @@ // under the License. 
#include "arrow/util/bpacking_dispatch_internal.h" -#include "arrow/util/bpacking_simd_impl_internal.h" #include "arrow/util/bpacking_simd_internal.h" +#include "arrow/util/bpacking_simd_kernel_internal.h" namespace arrow::internal { diff --git a/cpp/src/arrow/util/bpacking_simd_default.cc b/cpp/src/arrow/util/bpacking_simd_default.cc index 1835157f4e2..78a25ee2182 100644 --- a/cpp/src/arrow/util/bpacking_simd_default.cc +++ b/cpp/src/arrow/util/bpacking_simd_default.cc @@ -24,8 +24,8 @@ #if defined(UNPACK_PLATFORM) # include "arrow/util/bpacking_dispatch_internal.h" -# include "arrow/util/bpacking_simd_impl_internal.h" # include "arrow/util/bpacking_simd_internal.h" +# include "arrow/util/bpacking_simd_kernel_internal.h" namespace arrow::internal { diff --git a/cpp/src/arrow/util/bpacking_simd_impl_internal.h b/cpp/src/arrow/util/bpacking_simd_kernel_internal.h similarity index 100% rename from cpp/src/arrow/util/bpacking_simd_impl_internal.h rename to cpp/src/arrow/util/bpacking_simd_kernel_internal.h From b95d89d1461b1d64f5385de2bdc4b338fe850a12 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 27 Nov 2025 15:02:39 +0100 Subject: [PATCH 55/64] Restore modifications to simd_codegen --- cpp/src/arrow/util/bpacking_simd_codegen.py | 44 ++++----------------- 1 file changed, 7 insertions(+), 37 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_simd_codegen.py b/cpp/src/arrow/util/bpacking_simd_codegen.py index 029472eb9fa..3218fe24cb6 100755 --- a/cpp/src/arrow/util/bpacking_simd_codegen.py +++ b/cpp/src/arrow/util/bpacking_simd_codegen.py @@ -46,20 +46,6 @@ """ -def packed_max_spread_bytes(width: int, bit_offset: int = 0) -> int: - max_spread = (width + 7) // 8 - start = bit_offset - while True: - byte_start = start // 8 - byte_end = (start + width - 1) // 8 - spread = byte_end - byte_start + 1 - max_spread = max(spread, max_spread) - start += width - if start % 8 == bit_offset: - break - return max_spread - - @dataclasses.dataclass class 
UnpackStructGenerator: out_type: str @@ -174,28 +160,13 @@ def print_struct_k(self, bit: int): print("template<>") print(f"struct {self.struct_specialization(bit)} {{") print() - - if packed_max_spread_bytes(bit) <= self.out_byte_width: - print( - " using Dispatch = Kernel<" - f"{self.out_type}, {bit}, {self.simd_bit_width}>;" - ) - print() - print(" static constexpr int kValuesUnpacked = Dispatch::kValuesUnpacked;") - print() - print( - f" static const uint8_t* unpack(const uint8_t* in, {self.out_type}* out) {{" - ) - print(" return Dispatch::unpack(in, out);") - print(" }") - else: - print( - " using simd_batch = xsimd::make_sized_batch_t<" - f"{self.out_type}, {self.simd_value_count}>;" - ) - print(f" static constexpr int kValuesUnpacked = {self.out_bit_width};") - print() - self.print_unpack_bit_func(bit) + print( + " using simd_batch = xsimd::make_sized_batch_t<" + f"{self.out_type}, {self.simd_value_count}>;" + ) + print(f" static constexpr int kValuesUnpacked = {self.out_bit_width};") + print() + self.print_unpack_bit_func(bit) print("};") def print_uint32_fallback_struct(self): @@ -253,7 +224,6 @@ def print_headers(self): print("#include ") print() print('#include "arrow/util/ubsan.h"') - print('#include "arrow/util/bpacking_simd_impl_internal.h"') def print_file_top(self): print("#pragma once") From 4036ec6686f2eee59d51e035abfef295380f79a7 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 27 Nov 2025 17:34:54 +0100 Subject: [PATCH 56/64] Reduce reading size and declare bytes read --- .../util/bpacking_simd_kernel_internal.h | 63 +++++++++++++------ 1 file changed, 44 insertions(+), 19 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_simd_kernel_internal.h b/cpp/src/arrow/util/bpacking_simd_kernel_internal.h index e39ab62420e..9e8e755a88c 100644 --- a/cpp/src/arrow/util/bpacking_simd_kernel_internal.h +++ b/cpp/src/arrow/util/bpacking_simd_kernel_internal.h @@ -466,6 +466,15 @@ constexpr MediumKernelPlanSize BuildMediumPlanSize(const KernelShape& 
shape) { }; } +constexpr int reduced_bytes_per_read(int bits_per_read, int simd_byte_size) { + if (bits_per_read <= static_cast(8 * sizeof(uint32_t))) { + return sizeof(uint32_t); + } else if (bits_per_read <= static_cast(8 * sizeof(uint64_t))) { + return sizeof(uint64_t); + } + return simd_byte_size; +} + template struct MediumKernelPlan { using Traits = KernelTraits; @@ -496,9 +505,12 @@ struct MediumKernelPlan { } static constexpr int bytes_per_read() { - return unpacked_per_read() * kShape.packed_bit_size(); + const auto bits_per_read = unpacked_per_read() * kShape.packed_bit_size(); + return reduced_bytes_per_read(bits_per_read, kShape.simd_byte_size()); } + constexpr int total_bytes_read() const { return reads.back() + bytes_per_read(); } + ReadsPerKernel reads; SwizzlesPerKernel swizzles; ShitsPerKernel shifts; @@ -559,6 +571,22 @@ constexpr auto BuildMediumPlan() { return plan; } +template +xsimd::batch load_bytes_as(const uint8_t* in) { + const Uint val = util::SafeLoadAs(in); + const auto batch = xsimd::batch(val); + return xsimd::bitwise_cast(batch); +} + +template +xsimd::batch load_bytes(const uint8_t* in) { + if constexpr (kBytes <= sizeof(uint64_t)) { + return load_bytes_as, Arch>(in); + } + using simd_bytes = xsimd::batch; + return simd_bytes::load_unaligned(in); +} + template > struct MediumKernel { static constexpr auto kPlan = BuildMediumPlan(); @@ -572,6 +600,7 @@ struct MediumKernel { using arch_type = typename Traits::arch_type; static constexpr int kValuesUnpacked = kPlan.unpacked_per_kernel(); + static constexpr int kBytesRead = kPlan.total_bytes_read(); template static void unpack_one_shift_impl(const simd_batch& words, unpacked_type* out) { @@ -607,27 +636,12 @@ struct MediumKernel { (unpack_one_shift_impl(words, out), ...); } - template - static simd_bytes load_bytes_as(const uint8_t* in) { - const Uint val = util::SafeLoadAs(in); - const auto batch = xsimd::batch(val); - return xsimd::bitwise_cast(batch); - } - - static simd_bytes 
load_bytes(const uint8_t* in) { - if constexpr (kPlan.bytes_per_read() > static_cast(sizeof(uint64_t))) { - return simd_bytes::load_unaligned(in); - } else if constexpr (kPlan.bytes_per_read() > static_cast(sizeof(uint32_t))) { - return load_bytes_as(in); - } - return load_bytes_as(in); - } - template static void unpack_one_read_impl(const uint8_t* in, unpacked_type* out, std::integer_sequence) { using ShiftSeq = std::make_integer_sequence; - const auto bytes = load_bytes(in + kPlan.reads.at(kReadIdx)); + const auto bytes = + load_bytes(in + kPlan.reads.at(kReadIdx)); (unpack_one_swizzle_impl(bytes, out, ShiftSeq{}), ...); } @@ -658,6 +672,7 @@ struct LargeKernelPlan { static constexpr int kUnpackedPerkernel = std::lcm(kShape.unpacked_per_simd(), 8); static constexpr int kReadsPerKernel = static_cast(bit_util::CeilDiv( kUnpackedPerkernel * kShape.packed_bit_size(), kShape.simd_bit_size())); + static constexpr int kUnpackedPerRead = kUnpackedPerkernel / kReadsPerKernel; using ReadsPerKernel = std::array; @@ -667,6 +682,13 @@ struct LargeKernelPlan { using Shift = std::array; using ShitsPerKernel = std::array; + static constexpr int bytes_per_read() { + const auto bits_per_read = kUnpackedPerRead * kShape.packed_bit_size(); + return reduced_bytes_per_read(bits_per_read, kShape.simd_byte_size()); + } + + constexpr int total_bytes_read() const { return reads.back() + bytes_per_read(); } + ReadsPerKernel reads; SwizzlesPerKernel low_swizzles; SwizzlesPerKernel high_swizzles; @@ -733,6 +755,7 @@ struct LargeKernel { using arch_type = typename Traits::arch_type; static constexpr int kValuesUnpacked = kPlan.kUnpackedPerkernel; + static constexpr int kBytesRead = kPlan.total_bytes_read(); template static void unpack_one_read_impl(const uint8_t* in, unpacked_type* out) { @@ -746,7 +769,8 @@ struct LargeKernel { static constexpr auto kHighLShiftsArr = kPlan.high_lshifts.at(kReadIdx); constexpr auto kHighLShifts = make_batch_constant(); - const auto bytes = 
simd_bytes::load_unaligned(in + kPlan.reads.at(kReadIdx)); + const auto bytes = + load_bytes(in + kPlan.reads.at(kReadIdx)); const auto low_swizzled = swizzle_bytes(bytes, kLowSwizzles); const auto low_words = xsimd::bitwise_cast(low_swizzled); @@ -792,6 +816,7 @@ struct NoOpKernel { using unpacked_type = typename KernelTraits::unpacked_type; static constexpr int kValuesUnpacked = 0; + static constexpr int kBytesRead = 0; static const uint8_t* unpack(const uint8_t* in, unpacked_type* out) { return in; } }; From 867ea9708fe74613b1439bdf938fc18a52fb003d Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 27 Nov 2025 17:36:28 +0100 Subject: [PATCH 57/64] Add kBytesRead to scalar code --- cpp/src/arrow/util/bpacking_scalar_codegen.py | 8 ++ .../util/bpacking_scalar_generated_internal.h | 97 +++++++++++++++++++ 2 files changed, 105 insertions(+) diff --git a/cpp/src/arrow/util/bpacking_scalar_codegen.py b/cpp/src/arrow/util/bpacking_scalar_codegen.py index 39b1190d921..91bf596df5c 100644 --- a/cpp/src/arrow/util/bpacking_scalar_codegen.py +++ b/cpp/src/arrow/util/bpacking_scalar_codegen.py @@ -114,6 +114,9 @@ def total_out_values(self, bit: int) -> int: self.total_in_values * bit + self.out_bit_width - 1 ) // self.out_bit_width + def total_in_bytes(self, bit: int) -> int: + return (self.total_in_values * bit) // 8 + def total_out_bytes(self, bit: int) -> int: return ( self.total_in_values * bit + self.out_byte_width - 1 @@ -186,6 +189,7 @@ def print_struct_k(self, bit: int) -> None: print(f"struct {self.struct_specialization(bit)} {{") print() print(f" static constexpr int kValuesUnpacked = {self.total_in_values};") + print(f" static constexpr int kBytesRead = {self.total_in_bytes(bit)};") print() self.print_unpack_k(bit) print("};") @@ -199,6 +203,10 @@ def print_uint32_fallback_struct(self): " static constexpr int kValuesUnpacked = " f"{self.struct_name}::kValuesUnpacked;" ) + print( + " static constexpr int kBytesRead = " + f"{self.struct_name}::kBytesRead;" + ) 
print() print( diff --git a/cpp/src/arrow/util/bpacking_scalar_generated_internal.h b/cpp/src/arrow/util/bpacking_scalar_generated_internal.h index dc21fddd5d5..24256189a33 100644 --- a/cpp/src/arrow/util/bpacking_scalar_generated_internal.h +++ b/cpp/src/arrow/util/bpacking_scalar_generated_internal.h @@ -41,6 +41,7 @@ template struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = ScalarUnpackerForWidth::kValuesUnpacked; + static constexpr int kBytesRead = ScalarUnpackerForWidth::kBytesRead; static const uint8_t* unpack(const uint8_t* in, bool* out) { uint32_t buffer[kValuesUnpacked] = {}; @@ -56,6 +57,7 @@ template struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = ScalarUnpackerForWidth::kValuesUnpacked; + static constexpr int kBytesRead = ScalarUnpackerForWidth::kBytesRead; static const uint8_t* unpack(const uint8_t* in, uint8_t* out) { uint32_t buffer[kValuesUnpacked] = {}; @@ -71,6 +73,7 @@ template struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = ScalarUnpackerForWidth::kValuesUnpacked; + static constexpr int kBytesRead = ScalarUnpackerForWidth::kBytesRead; static const uint8_t* unpack(const uint8_t* in, uint16_t* out) { uint32_t buffer[kValuesUnpacked] = {}; @@ -86,6 +89,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 4; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 1) - uint32_t{1}); @@ -132,6 +136,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 8; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 2) - uint32_t{1}); @@ -179,6 +184,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 12; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { 
constexpr uint32_t mask = ((uint32_t{1} << 3) - uint32_t{1}); @@ -227,6 +233,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 16; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 4) - uint32_t{1}); @@ -276,6 +283,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 20; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 5) - uint32_t{1}); @@ -326,6 +334,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 24; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 6) - uint32_t{1}); @@ -377,6 +386,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 28; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 7) - uint32_t{1}); @@ -429,6 +439,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 32; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 8) - uint32_t{1}); @@ -482,6 +493,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 36; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 9) - uint32_t{1}); @@ -536,6 +548,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 40; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 10) - uint32_t{1}); @@ -591,6 +604,7 @@ 
template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 44; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 11) - uint32_t{1}); @@ -647,6 +661,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 48; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 12) - uint32_t{1}); @@ -704,6 +719,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 52; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 13) - uint32_t{1}); @@ -762,6 +778,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 56; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 14) - uint32_t{1}); @@ -821,6 +838,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 60; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 15) - uint32_t{1}); @@ -881,6 +899,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 64; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 16) - uint32_t{1}); @@ -942,6 +961,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 68; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 17) - uint32_t{1}); @@ -1004,6 +1024,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked 
= 32; + static constexpr int kBytesRead = 72; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 18) - uint32_t{1}); @@ -1067,6 +1088,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 76; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 19) - uint32_t{1}); @@ -1131,6 +1153,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 80; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 20) - uint32_t{1}); @@ -1196,6 +1219,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 84; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 21) - uint32_t{1}); @@ -1262,6 +1286,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 88; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 22) - uint32_t{1}); @@ -1329,6 +1354,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 92; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 23) - uint32_t{1}); @@ -1397,6 +1423,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 96; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 24) - uint32_t{1}); @@ -1466,6 +1493,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 100; static const 
uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 25) - uint32_t{1}); @@ -1536,6 +1564,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 104; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 26) - uint32_t{1}); @@ -1607,6 +1636,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 108; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 27) - uint32_t{1}); @@ -1679,6 +1709,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 112; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 28) - uint32_t{1}); @@ -1752,6 +1783,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 116; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 29) - uint32_t{1}); @@ -1826,6 +1858,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 120; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 30) - uint32_t{1}); @@ -1901,6 +1934,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 124; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 31) - uint32_t{1}); @@ -1978,6 +2012,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 4; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr 
uint64_t mask = ((uint64_t{1} << 1) - uint64_t{1}); @@ -2024,6 +2059,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 8; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 2) - uint64_t{1}); @@ -2070,6 +2106,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 12; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 3) - uint64_t{1}); @@ -2117,6 +2154,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 16; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 4) - uint64_t{1}); @@ -2164,6 +2202,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 20; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 5) - uint64_t{1}); @@ -2212,6 +2251,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 24; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 6) - uint64_t{1}); @@ -2260,6 +2300,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 28; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 7) - uint64_t{1}); @@ -2309,6 +2350,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 32; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 8) - uint64_t{1}); @@ -2358,6 +2400,7 @@ 
template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 36; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 9) - uint64_t{1}); @@ -2408,6 +2451,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 40; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 10) - uint64_t{1}); @@ -2458,6 +2502,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 44; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 11) - uint64_t{1}); @@ -2509,6 +2554,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 48; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 12) - uint64_t{1}); @@ -2560,6 +2606,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 52; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 13) - uint64_t{1}); @@ -2612,6 +2659,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 56; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 14) - uint64_t{1}); @@ -2664,6 +2712,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 60; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 15) - uint64_t{1}); @@ -2717,6 +2766,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int 
kValuesUnpacked = 32; + static constexpr int kBytesRead = 64; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 16) - uint64_t{1}); @@ -2770,6 +2820,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 68; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 17) - uint64_t{1}); @@ -2824,6 +2875,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 72; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 18) - uint64_t{1}); @@ -2878,6 +2930,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 76; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 19) - uint64_t{1}); @@ -2933,6 +2986,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 80; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 20) - uint64_t{1}); @@ -2988,6 +3042,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 84; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 21) - uint64_t{1}); @@ -3044,6 +3099,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 88; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 22) - uint64_t{1}); @@ -3100,6 +3156,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 92; 
static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 23) - uint64_t{1}); @@ -3157,6 +3214,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 96; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 24) - uint64_t{1}); @@ -3214,6 +3272,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 100; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 25) - uint64_t{1}); @@ -3272,6 +3331,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 104; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 26) - uint64_t{1}); @@ -3330,6 +3390,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 108; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 27) - uint64_t{1}); @@ -3389,6 +3450,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 112; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 28) - uint64_t{1}); @@ -3448,6 +3510,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 116; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 29) - uint64_t{1}); @@ -3508,6 +3571,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 120; static const uint8_t* unpack(const uint8_t* in, uint64_t* 
out) { constexpr uint64_t mask = ((uint64_t{1} << 30) - uint64_t{1}); @@ -3568,6 +3632,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 124; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 31) - uint64_t{1}); @@ -3629,6 +3694,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 128; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 32) - uint64_t{1}); @@ -3690,6 +3756,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 132; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 33) - uint64_t{1}); @@ -3752,6 +3819,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 136; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 34) - uint64_t{1}); @@ -3814,6 +3882,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 140; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 35) - uint64_t{1}); @@ -3877,6 +3946,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 144; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 36) - uint64_t{1}); @@ -3940,6 +4010,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 148; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 37) - 
uint64_t{1}); @@ -4004,6 +4075,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 152; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 38) - uint64_t{1}); @@ -4068,6 +4140,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 156; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 39) - uint64_t{1}); @@ -4133,6 +4206,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 160; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 40) - uint64_t{1}); @@ -4198,6 +4272,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 164; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 41) - uint64_t{1}); @@ -4264,6 +4339,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 168; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 42) - uint64_t{1}); @@ -4330,6 +4406,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 172; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 43) - uint64_t{1}); @@ -4397,6 +4474,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 176; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 44) - uint64_t{1}); @@ -4464,6 +4542,7 @@ template<> struct 
ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 180; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 45) - uint64_t{1}); @@ -4532,6 +4611,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 184; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 46) - uint64_t{1}); @@ -4600,6 +4680,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 188; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 47) - uint64_t{1}); @@ -4669,6 +4750,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 192; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 48) - uint64_t{1}); @@ -4738,6 +4820,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 196; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 49) - uint64_t{1}); @@ -4808,6 +4891,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 200; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 50) - uint64_t{1}); @@ -4878,6 +4962,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 204; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 51) - uint64_t{1}); @@ -4949,6 +5034,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int 
kValuesUnpacked = 32; + static constexpr int kBytesRead = 208; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 52) - uint64_t{1}); @@ -5020,6 +5106,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 212; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 53) - uint64_t{1}); @@ -5092,6 +5179,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 216; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 54) - uint64_t{1}); @@ -5164,6 +5252,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 220; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 55) - uint64_t{1}); @@ -5237,6 +5326,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 224; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 56) - uint64_t{1}); @@ -5310,6 +5400,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 228; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 57) - uint64_t{1}); @@ -5384,6 +5475,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 232; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 58) - uint64_t{1}); @@ -5458,6 +5550,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 
236; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 59) - uint64_t{1}); @@ -5533,6 +5626,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 240; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 60) - uint64_t{1}); @@ -5608,6 +5702,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 244; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 61) - uint64_t{1}); @@ -5684,6 +5779,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 248; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 62) - uint64_t{1}); @@ -5760,6 +5856,7 @@ template<> struct ScalarUnpackerForWidth { static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 252; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 63) - uint64_t{1}); From 9f45892fefd1890c40cda0fec4f6d9c5b5ed7f9f Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 27 Nov 2025 17:41:45 +0100 Subject: [PATCH 58/64] Add kBytesRead to simd 512 generated code --- .../bpacking_simd512_generated_internal.h | 97 +++++++++++++++++++ cpp/src/arrow/util/bpacking_simd_codegen.py | 8 +- 2 files changed, 104 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/util/bpacking_simd512_generated_internal.h b/cpp/src/arrow/util/bpacking_simd512_generated_internal.h index bb79fbca5b4..263f1d79ff8 100644 --- a/cpp/src/arrow/util/bpacking_simd512_generated_internal.h +++ b/cpp/src/arrow/util/bpacking_simd512_generated_internal.h @@ -40,6 +40,7 @@ template struct Simd512UnpackerForWidth { static constexpr int kValuesUnpacked = 
Simd512UnpackerForWidth::kValuesUnpacked; + static constexpr int kBytesRead = Simd512UnpackerForWidth::kBytesRead; static const uint8_t* unpack(const uint8_t* in, bool* out) { uint32_t buffer[kValuesUnpacked] = {}; @@ -55,6 +56,7 @@ template struct Simd512UnpackerForWidth { static constexpr int kValuesUnpacked = Simd512UnpackerForWidth::kValuesUnpacked; + static constexpr int kBytesRead = Simd512UnpackerForWidth::kBytesRead; static const uint8_t* unpack(const uint8_t* in, uint8_t* out) { uint32_t buffer[kValuesUnpacked] = {}; @@ -70,6 +72,7 @@ template struct Simd512UnpackerForWidth { static constexpr int kValuesUnpacked = Simd512UnpackerForWidth::kValuesUnpacked; + static constexpr int kBytesRead = Simd512UnpackerForWidth::kBytesRead; static const uint8_t* unpack(const uint8_t* in, uint16_t* out) { uint32_t buffer[kValuesUnpacked] = {}; @@ -86,6 +89,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 4; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x1; @@ -151,6 +155,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 8; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x3; @@ -216,6 +221,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 12; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x7; @@ -281,6 +287,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 16; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0xf; @@ -346,6 +353,7 @@ struct 
Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 20; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x1f; @@ -411,6 +419,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 24; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x3f; @@ -476,6 +485,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 28; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x7f; @@ -541,6 +551,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 32; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0xff; @@ -606,6 +617,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 36; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x1ff; @@ -671,6 +683,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 40; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x3ff; @@ -736,6 +749,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 44; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x7ff; @@ -801,6 +815,7 @@ struct Simd512UnpackerForWidth { using 
simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 48; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0xfff; @@ -866,6 +881,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 52; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x1fff; @@ -931,6 +947,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 56; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x3fff; @@ -996,6 +1013,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 60; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x7fff; @@ -1061,6 +1079,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 64; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0xffff; @@ -1126,6 +1145,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 68; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x1ffff; @@ -1191,6 +1211,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 72; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x3ffff; @@ -1256,6 +1277,7 @@ struct Simd512UnpackerForWidth { using simd_batch = 
xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 76; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x7ffff; @@ -1321,6 +1343,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 80; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0xfffff; @@ -1386,6 +1409,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 84; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x1fffff; @@ -1451,6 +1475,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 88; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x3fffff; @@ -1516,6 +1541,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 92; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x7fffff; @@ -1581,6 +1607,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 96; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0xffffff; @@ -1646,6 +1673,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 100; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x1ffffff; @@ -1711,6 +1739,7 @@ struct Simd512UnpackerForWidth { using simd_batch 
= xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 104; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x3ffffff; @@ -1776,6 +1805,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 108; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x7ffffff; @@ -1841,6 +1871,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 112; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0xfffffff; @@ -1906,6 +1937,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 116; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x1fffffff; @@ -1971,6 +2003,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 120; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x3fffffff; @@ -2036,6 +2069,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 32; + static constexpr int kBytesRead = 124; static const uint8_t* unpack(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x7fffffff; @@ -2102,6 +2136,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 8; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0x1; @@ -2247,6 +2282,7 @@ struct Simd512UnpackerForWidth { using 
simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 16; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0x3; @@ -2392,6 +2428,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 24; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0x7; @@ -2537,6 +2574,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 32; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0xf; @@ -2682,6 +2720,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 40; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0x1f; @@ -2827,6 +2866,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 48; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0x3f; @@ -2972,6 +3012,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 56; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0x7f; @@ -3117,6 +3158,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 64; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0xff; @@ -3262,6 +3304,7 @@ struct Simd512UnpackerForWidth { using simd_batch = 
xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 72; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0x1ff; @@ -3407,6 +3450,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 80; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0x3ff; @@ -3552,6 +3596,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 88; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0x7ff; @@ -3697,6 +3742,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 96; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0xfff; @@ -3842,6 +3888,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 104; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0x1fff; @@ -3987,6 +4034,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 112; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0x3fff; @@ -4132,6 +4180,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 120; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0x7fff; @@ -4277,6 +4326,7 @@ struct Simd512UnpackerForWidth { using simd_batch = 
xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 128; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0xffff; @@ -4422,6 +4472,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 136; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0x1ffff; @@ -4567,6 +4618,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 144; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0x3ffff; @@ -4712,6 +4764,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 152; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0x7ffff; @@ -4857,6 +4910,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 160; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0xfffff; @@ -5002,6 +5056,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 168; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0x1fffff; @@ -5147,6 +5202,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 176; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0x3fffff; @@ -5292,6 +5348,7 @@ struct Simd512UnpackerForWidth { using 
simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 184; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0x7fffff; @@ -5437,6 +5494,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 192; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0xffffff; @@ -5582,6 +5640,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 200; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0x1ffffff; @@ -5727,6 +5786,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 208; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0x3ffffff; @@ -5872,6 +5932,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 216; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0x7ffffff; @@ -6017,6 +6078,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 224; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0xfffffff; @@ -6162,6 +6224,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 232; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0x1fffffff; @@ -6307,6 +6370,7 @@ struct 
Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 240; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0x3fffffff; @@ -6452,6 +6516,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 248; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0x7fffffff; @@ -6597,6 +6662,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 256; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0xffffffff; @@ -6742,6 +6808,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 264; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0x1ffffffff; @@ -6887,6 +6954,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 272; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0x3ffffffff; @@ -7032,6 +7100,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 280; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0x7ffffffff; @@ -7177,6 +7246,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 288; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0xfffffffff; 
@@ -7322,6 +7392,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 296; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0x1fffffffff; @@ -7467,6 +7538,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 304; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0x3fffffffff; @@ -7612,6 +7684,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 312; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0x7fffffffff; @@ -7757,6 +7830,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 320; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0xffffffffff; @@ -7902,6 +7976,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 328; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0x1ffffffffff; @@ -8047,6 +8122,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 336; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0x3ffffffffff; @@ -8192,6 +8268,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 344; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { 
constexpr uint64_t kMask = 0x7ffffffffff; @@ -8337,6 +8414,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 352; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0xfffffffffff; @@ -8482,6 +8560,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 360; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0x1fffffffffff; @@ -8627,6 +8706,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 368; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0x3fffffffffff; @@ -8772,6 +8852,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 376; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0x7fffffffffff; @@ -8917,6 +8998,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 384; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0xffffffffffff; @@ -9062,6 +9144,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 392; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0x1ffffffffffff; @@ -9207,6 +9290,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 400; static const 
uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0x3ffffffffffff; @@ -9352,6 +9436,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 408; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0x7ffffffffffff; @@ -9497,6 +9582,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 416; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0xfffffffffffff; @@ -9642,6 +9728,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 424; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0x1fffffffffffff; @@ -9787,6 +9874,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 432; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0x3fffffffffffff; @@ -9932,6 +10020,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 440; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0x7fffffffffffff; @@ -10077,6 +10166,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 448; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0xffffffffffffff; @@ -10222,6 +10312,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int 
kValuesUnpacked = 64; + static constexpr int kBytesRead = 456; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0x1ffffffffffffff; @@ -10367,6 +10458,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 464; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0x3ffffffffffffff; @@ -10512,6 +10604,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 472; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0x7ffffffffffffff; @@ -10657,6 +10750,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 480; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0xfffffffffffffff; @@ -10802,6 +10896,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 488; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0x1fffffffffffffff; @@ -10947,6 +11042,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 496; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0x3fffffffffffffff; @@ -11092,6 +11188,7 @@ struct Simd512UnpackerForWidth { using simd_batch = xsimd::make_sized_batch_t; static constexpr int kValuesUnpacked = 64; + static constexpr int kBytesRead = 504; static const uint8_t* unpack(const uint8_t* in, uint64_t* out) { constexpr uint64_t kMask = 0x7fffffffffffffff; diff --git 
a/cpp/src/arrow/util/bpacking_simd_codegen.py b/cpp/src/arrow/util/bpacking_simd_codegen.py index 3218fe24cb6..df9b19bcf20 100755 --- a/cpp/src/arrow/util/bpacking_simd_codegen.py +++ b/cpp/src/arrow/util/bpacking_simd_codegen.py @@ -164,7 +164,9 @@ def print_struct_k(self, bit: int): " using simd_batch = xsimd::make_sized_batch_t<" f"{self.out_type}, {self.simd_value_count}>;" ) - print(f" static constexpr int kValuesUnpacked = {self.out_bit_width};") + n_values_unpacked = self.out_bit_width # This is a property of this algorithm + print(f" static constexpr int kValuesUnpacked = {n_values_unpacked};") + print(f" static constexpr int kBytesRead = {(n_values_unpacked * bit) // 8};") print() self.print_unpack_bit_func(bit) print("};") @@ -178,6 +180,10 @@ def print_uint32_fallback_struct(self): " static constexpr int kValuesUnpacked = " f"{self.struct_name}::kValuesUnpacked;" ) + print( + " static constexpr int kBytesRead = " + f"{self.struct_name}::kBytesRead;" + ) print() print( From 9aa7c74480ec04aa058a43532c71c6ac30a5b659 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 27 Nov 2025 17:42:36 +0100 Subject: [PATCH 59/64] Prevent overreading --- cpp/src/arrow/util/bpacking_dispatch_internal.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_dispatch_internal.h b/cpp/src/arrow/util/bpacking_dispatch_internal.h index 79026536a4d..6a50651c023 100644 --- a/cpp/src/arrow/util/bpacking_dispatch_internal.h +++ b/cpp/src/arrow/util/bpacking_dispatch_internal.h @@ -189,17 +189,18 @@ void unpack_width(const uint8_t* in, UnpackedUInt* out, int batch_size, int bit_ } else { using UnpackerForWidth = Unpacker; constexpr auto kValuesUnpacked = UnpackerForWidth::kValuesUnpacked; + constexpr auto kBytesRead = UnpackerForWidth::kBytesRead; if constexpr (kValuesUnpacked > 0) { // Running the optimized kernel for batch extraction - const int unpacker_iter_count = batch_size / kValuesUnpacked; + const int bytes_to_unpack = 
(batch_size * kPackedBitWidth) / 8; + const int unpacker_iter_count = + std::min(batch_size / kValuesUnpacked, bytes_to_unpack / kBytesRead); for (int i = 0; i < unpacker_iter_count; ++i) { in = UnpackerForWidth::unpack(in, out); out += kValuesUnpacked; } batch_size -= unpacker_iter_count * kValuesUnpacked; - ARROW_DCHECK_LT(batch_size, kValuesUnpacked); - ARROW_COMPILER_ASSUME(batch_size < kValuesUnpacked); } // Running the epilog for the remaining values that don't fit in a kernel From c629263bdce53b97ea587273986d3e1bff14291b Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Fri, 28 Nov 2025 11:19:34 +0100 Subject: [PATCH 60/64] Fix pessimit overeading guard --- cpp/src/arrow/util/bpacking_dispatch_internal.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_dispatch_internal.h b/cpp/src/arrow/util/bpacking_dispatch_internal.h index 6a50651c023..f7f65bea533 100644 --- a/cpp/src/arrow/util/bpacking_dispatch_internal.h +++ b/cpp/src/arrow/util/bpacking_dispatch_internal.h @@ -192,15 +192,14 @@ void unpack_width(const uint8_t* in, UnpackedUInt* out, int batch_size, int bit_ constexpr auto kBytesRead = UnpackerForWidth::kBytesRead; if constexpr (kValuesUnpacked > 0) { + const uint8_t* last_in = + in + bit_util::CeilDiv(batch_size * kPackedBitWidth, 8) - kBytesRead; // Running the optimized kernel for batch extraction - const int bytes_to_unpack = (batch_size * kPackedBitWidth) / 8; - const int unpacker_iter_count = - std::min(batch_size / kValuesUnpacked, bytes_to_unpack / kBytesRead); - for (int i = 0; i < unpacker_iter_count; ++i) { + while ((batch_size > kValuesUnpacked) && (in <= last_in)) { in = UnpackerForWidth::unpack(in, out); out += kValuesUnpacked; + batch_size -= kValuesUnpacked; } - batch_size -= unpacker_iter_count * kValuesUnpacked; } // Running the epilog for the remaining values that don't fit in a kernel From 0191392f102809c99440155121ddc9e07b59bb4b Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: 
Mon, 1 Dec 2025 15:21:31 +0100 Subject: [PATCH 61/64] Fix overreading guard comparison --- cpp/src/arrow/util/bpacking_dispatch_internal.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_dispatch_internal.h b/cpp/src/arrow/util/bpacking_dispatch_internal.h index f7f65bea533..87bef51a01a 100644 --- a/cpp/src/arrow/util/bpacking_dispatch_internal.h +++ b/cpp/src/arrow/util/bpacking_dispatch_internal.h @@ -188,18 +188,26 @@ void unpack_width(const uint8_t* in, UnpackedUInt* out, int batch_size, int bit_ return unpack_full(in, out, batch_size); } else { using UnpackerForWidth = Unpacker; + // Number of values extracted by one iteration of the kernel constexpr auto kValuesUnpacked = UnpackerForWidth::kValuesUnpacked; + // Number of bytes read, but not necessarily unpacked, by one iteration of the + // kernel. This constant prevents reading past buffer end. constexpr auto kBytesRead = UnpackerForWidth::kBytesRead; if constexpr (kValuesUnpacked > 0) { - const uint8_t* last_in = - in + bit_util::CeilDiv(batch_size * kPackedBitWidth, 8) - kBytesRead; + const uint8_t* in_end = in + bit_util::CeilDiv(batch_size * kPackedBitWidth, 8); + const uint8_t* in_last = in_end - kBytesRead; // Running the optimized kernel for batch extraction - while ((batch_size > kValuesUnpacked) && (in <= last_in)) { + while ((batch_size >= kValuesUnpacked) && (in <= in_last)) { in = UnpackerForWidth::unpack(in, out); out += kValuesUnpacked; batch_size -= kValuesUnpacked; } + + // Performance check making sure we ran the kernel loop as much as possible: + // Either we ran out because we could not pack enough values, or because we would + // overread.
+ ARROW_DCHECK((batch_size < kValuesUnpacked) || (in_end - in) < kBytesRead); } // Running the epilog for the remaining values that don't fit in a kernel From 8e7a758e5f48c7fa8aa16580f608bb19291e1f34 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Mon, 1 Dec 2025 18:34:10 +0100 Subject: [PATCH 62/64] Add UnpackOptions and max_read_bytes --- .../arrow/util/bit_stream_utils_internal.h | 13 +- cpp/src/arrow/util/bpacking.cc | 17 +- cpp/src/arrow/util/bpacking_benchmark.cc | 11 +- .../arrow/util/bpacking_dispatch_internal.h | 410 ++++++++++++------ cpp/src/arrow/util/bpacking_internal.h | 35 +- cpp/src/arrow/util/bpacking_scalar.cc | 16 +- cpp/src/arrow/util/bpacking_scalar_internal.h | 18 +- cpp/src/arrow/util/bpacking_simd_avx2.cc | 16 +- cpp/src/arrow/util/bpacking_simd_avx512.cc | 16 +- cpp/src/arrow/util/bpacking_simd_default.cc | 15 +- cpp/src/arrow/util/bpacking_simd_internal.h | 67 ++- cpp/src/arrow/util/bpacking_test.cc | 104 +++-- cpp/src/arrow/util/rle_encoding_internal.h | 44 +- cpp/src/arrow/util/rle_encoding_test.cc | 9 +- 14 files changed, 479 insertions(+), 312 deletions(-) diff --git a/cpp/src/arrow/util/bit_stream_utils_internal.h b/cpp/src/arrow/util/bit_stream_utils_internal.h index 376de56a9af..f77eabf958f 100644 --- a/cpp/src/arrow/util/bit_stream_utils_internal.h +++ b/cpp/src/arrow/util/bit_stream_utils_internal.h @@ -273,14 +273,19 @@ inline int BitReader::GetBatch(int num_bits, T* v, int batch_size) { batch_size = static_cast(remaining_bits / num_bits); } + const ::arrow::internal::UnpackOptions opts{ + /* .batch_size= */ batch_size, + /* .bit_width= */ num_bits, + /* .bit_offset= */ bit_offset_, + /* .max_read_bytes= */ max_bytes_ - byte_offset_, + }; + if constexpr (std::is_same_v) { - ::arrow::internal::unpack(buffer_ + byte_offset_, v, batch_size, num_bits, - bit_offset_); + ::arrow::internal::unpack(buffer_ + byte_offset_, v, opts); } else { ::arrow::internal::unpack(buffer_ + byte_offset_, - reinterpret_cast*>(v), batch_size, - num_bits, 
bit_offset_); + reinterpret_cast*>(v), opts); } Advance(batch_size * num_bits); diff --git a/cpp/src/arrow/util/bpacking.cc b/cpp/src/arrow/util/bpacking.cc index 99ff18ccdc6..a39f3610085 100644 --- a/cpp/src/arrow/util/bpacking.cc +++ b/cpp/src/arrow/util/bpacking.cc @@ -17,7 +17,6 @@ #include -#include "arrow/util/bpacking_dispatch_internal.h" #include "arrow/util/bpacking_internal.h" #include "arrow/util/bpacking_scalar_internal.h" #include "arrow/util/bpacking_simd_internal.h" @@ -52,19 +51,19 @@ struct UnpackDynamicFunction { } // namespace template -void unpack(const uint8_t* in, Uint* out, int batch_size, int num_bits, int bit_offset) { +void unpack(const uint8_t* in, Uint* out, const UnpackOptions& opts) { #if defined(ARROW_HAVE_NEON) - return unpack_neon(in, out, batch_size, num_bits, bit_offset); + return unpack_neon(in, out, opts); #else static DynamicDispatch > dispatch; - return dispatch.func(in, out, batch_size, num_bits, bit_offset); + return dispatch.func(in, out, opts); #endif } -template void unpack(const uint8_t*, bool*, int, int, int); -template void unpack(const uint8_t*, uint8_t*, int, int, int); -template void unpack(const uint8_t*, uint16_t*, int, int, int); -template void unpack(const uint8_t*, uint32_t*, int, int, int); -template void unpack(const uint8_t*, uint64_t*, int, int, int); +template void unpack(const uint8_t*, bool*, const UnpackOptions&); +template void unpack(const uint8_t*, uint8_t*, const UnpackOptions&); +template void unpack(const uint8_t*, uint16_t*, const UnpackOptions&); +template void unpack(const uint8_t*, uint32_t*, const UnpackOptions&); +template void unpack(const uint8_t*, uint64_t*, const UnpackOptions&); } // namespace arrow::internal diff --git a/cpp/src/arrow/util/bpacking_benchmark.cc b/cpp/src/arrow/util/bpacking_benchmark.cc index 38df4a3befc..69ce84c37f3 100644 --- a/cpp/src/arrow/util/bpacking_benchmark.cc +++ b/cpp/src/arrow/util/bpacking_benchmark.cc @@ -34,7 +34,7 @@ namespace arrow::internal { 
namespace { template -using UnpackFunc = void (*)(const uint8_t*, Int*, int, int, int); +using UnpackFunc = void (*)(const uint8_t*, Int*, const UnpackOptions&); /// Get the number of bytes associate with a packing. constexpr int32_t GetNumBytes(int32_t num_values, int32_t bit_width) { @@ -89,8 +89,15 @@ void BM_Unpack(benchmark::State& state, bool aligned, UnpackFunc unpack, bo auto unpacked = std::make_unique(num_values); + const ::arrow::internal::UnpackOptions opts{ + /* .batch_size= */ num_values, + /* .bit_width= */ bit_width, + /* .bit_offset= */ 0, + /* .max_read_bytes= */ -1, + }; + for (auto _ : state) { - unpack(packed_ptr, unpacked.get(), num_values, bit_width, /* bit_offset = */ 0); + unpack(packed_ptr, unpacked.get(), opts); benchmark::ClobberMemory(); } state.SetItemsProcessed(num_values * state.iterations()); diff --git a/cpp/src/arrow/util/bpacking_dispatch_internal.h b/cpp/src/arrow/util/bpacking_dispatch_internal.h index 87bef51a01a..509337f0b31 100644 --- a/cpp/src/arrow/util/bpacking_dispatch_internal.h +++ b/cpp/src/arrow/util/bpacking_dispatch_internal.h @@ -22,6 +22,7 @@ #include #include "arrow/util/bit_util.h" +#include "arrow/util/bpacking_internal.h" #include "arrow/util/endian.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" @@ -166,13 +167,26 @@ int unpack_exact(const uint8_t* in, Uint* out, int batch_size, int bit_offset) { /// fixed amount of values (usually constrained by SIMD batch sizes and /// byte alignment). /// @tparam UnpackedUInt The type in which we unpack the values. +/// @param batch_size The number of values to unpack. +/// @param max_read_bytes The maximum size of the input byte array that can be read. +/// This is used to safely overread. +/// Negative value to deduce from batch_size. 
template typename Unpacker, typename UnpackedUInt> -void unpack_width(const uint8_t* in, UnpackedUInt* out, int batch_size, int bit_offset) { +void unpack_width(const uint8_t* in, UnpackedUInt* out, int batch_size, int bit_offset, + int max_read_bytes) { if constexpr (kPackedBitWidth == 0) { // Easy case to handle, simply setting memory to zero. return unpack_null(in, out, batch_size); } else { + // Number of bytes to read according to batch_size. + const int bytes_batch = static_cast( + bit_util::BytesForBits(batch_size * kPackedBitWidth + bit_offset)); + // If specified, max_read_bytes must be at least the bytes needed to extract the + // number of desired values. + ARROW_DCHECK(max_read_bytes < 0 || bytes_batch <= max_read_bytes); + const uint8_t* in_end = in + (max_read_bytes >= 0 ? max_read_bytes : bytes_batch); + // In case of misalignment, we need to run the prolog until aligned. int extracted = unpack_exact(in, out, batch_size, bit_offset); // We either extracted everything or found a alignment @@ -195,7 +209,6 @@ void unpack_width(const uint8_t* in, UnpackedUInt* out, int batch_size, int bit_ constexpr auto kBytesRead = UnpackerForWidth::kBytesRead; if constexpr (kValuesUnpacked > 0) { - const uint8_t* in_end = in + bit_util::CeilDiv(batch_size * kPackedBitWidth, 8); const uint8_t* in_last = in_end - kBytesRead; // Running the optimized kernel for batch extraction while ((batch_size >= kValuesUnpacked) && (in <= in_last)) { @@ -219,276 +232,401 @@ void unpack_width(const uint8_t* in, UnpackedUInt* out, int batch_size, int bit_ } template