diff --git a/cpp/src/arrow/util/bit_stream_utils_internal.h b/cpp/src/arrow/util/bit_stream_utils_internal.h
index 376de56a9af..f77eabf958f 100644
--- a/cpp/src/arrow/util/bit_stream_utils_internal.h
+++ b/cpp/src/arrow/util/bit_stream_utils_internal.h
@@ -273,14 +273,19 @@ inline int BitReader::GetBatch(int num_bits, T* v, int batch_size) {
     batch_size = static_cast<int>(remaining_bits / num_bits);
   }
 
+  const ::arrow::internal::UnpackOptions opts{
+      /* .batch_size= */ batch_size,
+      /* .bit_width= */ num_bits,
+      /* .bit_offset= */ bit_offset_,
+      /* .max_read_bytes= */ max_bytes_ - byte_offset_,
+  };
+
   if constexpr (std::is_same_v<T, bool>) {
-    ::arrow::internal::unpack(buffer_ + byte_offset_, v, batch_size, num_bits,
-                              bit_offset_);
+    ::arrow::internal::unpack(buffer_ + byte_offset_, v, opts);
   } else {
     ::arrow::internal::unpack(buffer_ + byte_offset_,
-                              reinterpret_cast<std::make_unsigned_t<T>*>(v), batch_size,
-                              num_bits, bit_offset_);
+                              reinterpret_cast<std::make_unsigned_t<T>*>(v), opts);
   }
 
   Advance(batch_size * num_bits);
diff --git a/cpp/src/arrow/util/bpacking.cc b/cpp/src/arrow/util/bpacking.cc
index fdb1c5a52ac..a39f3610085 100644
--- a/cpp/src/arrow/util/bpacking.cc
+++ b/cpp/src/arrow/util/bpacking.cc
@@ -17,7 +17,6 @@
 
 #include <cstdint>
 
-#include "arrow/util/bpacking_dispatch_internal.h"
 #include "arrow/util/bpacking_internal.h"
 #include "arrow/util/bpacking_scalar_internal.h"
 #include "arrow/util/bpacking_simd_internal.h"
@@ -34,9 +33,11 @@ struct UnpackDynamicFunction {
   static constexpr auto implementations() {
     return std::array{
-        // Current SIMD unpack algorithm works terribly on SSE4.2 due to lack of variable
-        // rhsift and poor xsimd fallback.
+#if defined(ARROW_HAVE_SSE4_2)
+        Implementation{DispatchLevel::NONE, &unpack_sse4_2<Uint>},
+#else
         Implementation{DispatchLevel::NONE, &unpack_scalar<Uint>},
+#endif
 #if defined(ARROW_HAVE_RUNTIME_AVX2)
         Implementation{DispatchLevel::AVX2, &unpack_avx2<Uint>},
 #endif
@@ -50,19 +51,19 @@
 }  // namespace
 
 template <typename Uint>
-void unpack(const uint8_t* in, Uint* out, int batch_size, int num_bits, int bit_offset) {
+void unpack(const uint8_t* in, Uint* out, const UnpackOptions& opts) {
 #if defined(ARROW_HAVE_NEON)
-  return unpack_neon(in, out, batch_size, num_bits, bit_offset);
+  return unpack_neon(in, out, opts);
 #else
   static DynamicDispatch<UnpackDynamicFunction<Uint>> dispatch;
-  return dispatch.func(in, out, batch_size, num_bits, bit_offset);
+  return dispatch.func(in, out, opts);
 #endif
 }
 
-template void unpack(const uint8_t*, bool*, int, int, int);
-template void unpack(const uint8_t*, uint8_t*, int, int, int);
-template void unpack(const uint8_t*, uint16_t*, int, int, int);
-template void unpack(const uint8_t*, uint32_t*, int, int, int);
-template void unpack(const uint8_t*, uint64_t*, int, int, int);
+template void unpack(const uint8_t*, bool*, const UnpackOptions&);
+template void unpack(const uint8_t*, uint8_t*, const UnpackOptions&);
+template void unpack(const uint8_t*, uint16_t*, const UnpackOptions&);
+template void unpack(const uint8_t*, uint32_t*, const UnpackOptions&);
+template void unpack(const uint8_t*, uint64_t*, const UnpackOptions&);
 
 }  // namespace arrow::internal
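For context: the UnpackOptions aggregate consumed by the new unpack signature is not defined in the hunks shown here (it presumably lives in bpacking_internal.h, which both translation units include). A minimal sketch consistent with the designated-initializer comments above; the field names come from those comments, while the types and ordering are assumptions:

// Hypothetical sketch of the options aggregate, not part of this diff.
struct UnpackOptions {
  int batch_size;      // number of values to unpack
  int bit_width;       // width in bits of one packed value
  int bit_offset;      // bit position of the first value inside the first byte
  int max_read_bytes;  // readable bytes in the input; negative = deduce from batch_size
};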
diff --git a/cpp/src/arrow/util/bpacking_benchmark.cc b/cpp/src/arrow/util/bpacking_benchmark.cc
index 7094656eeef..69ce84c37f3 100644
--- a/cpp/src/arrow/util/bpacking_benchmark.cc
+++ b/cpp/src/arrow/util/bpacking_benchmark.cc
@@ -15,6 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#include <memory>
 #include <string>
 #include <vector>
@@ -33,7 +34,7 @@ namespace arrow::internal {
 namespace {
 
 template <typename Int>
-using UnpackFunc = void (*)(const uint8_t*, Int*, int, int, int);
+using UnpackFunc = void (*)(const uint8_t*, Int*, const UnpackOptions&);
 
 /// Get the number of bytes associate with a packing.
 constexpr int32_t GetNumBytes(int32_t num_values, int32_t bit_width) {
@@ -86,33 +87,62 @@ void BM_Unpack(benchmark::State& state, bool aligned, UnpackFunc<Int> unpack, bo
   const uint8_t* packed_ptr =
       GetNextAlignedByte(packed.data(), sizeof(Int)) + (aligned ? 0 : 1);
 
-  std::vector<Int> unpacked(num_values, 0);
+  auto unpacked = std::make_unique<Int[]>(num_values);
+
+  const ::arrow::internal::UnpackOptions opts{
+      /* .batch_size= */ num_values,
+      /* .bit_width= */ bit_width,
+      /* .bit_offset= */ 0,
+      /* .max_read_bytes= */ -1,
+  };
 
   for (auto _ : state) {
-    unpack(packed_ptr, unpacked.data(), num_values, bit_width, /* bit_offset = */ 0);
+    unpack(packed_ptr, unpacked.get(), opts);
     benchmark::ClobberMemory();
   }
 
   state.SetItemsProcessed(num_values * state.iterations());
 }
 
-constexpr int32_t kMinRange = 64;
-constexpr int32_t kMaxRange = 32768;
+/// Currently, the minimum unpack SIMD kernel size is 32 and the bit packing encoder will
+/// not emit runs larger than 512 (though other implementations might), so we bias the
+/// benchmarks towards a rather small scale.
+static const auto kNumValuesRange = benchmark::CreateRange(32, 512, 2);
 
+constexpr std::initializer_list<int64_t> kBitWidths8 = {1, 2, 8};
 constexpr std::initializer_list<int64_t> kBitWidths16 = {1, 2, 8, 13};
 constexpr std::initializer_list<int64_t> kBitWidths32 = {1, 2, 8, 20};
 constexpr std::initializer_list<int64_t> kBitWidths64 = {1, 2, 8, 20, 47};
+
+static const std::vector<std::vector<int64_t>> kBitWidthsNumValuesBool = {
+    {0, 1},
+    kNumValuesRange,
+};
+static const std::vector<std::vector<int64_t>> kBitWidthsNumValues8 = {
+    kBitWidths8,
+    kNumValuesRange,
+};
 static const std::vector<std::vector<int64_t>> kBitWidthsNumValues16 = {
     kBitWidths16,
-    benchmark::CreateRange(kMinRange, kMaxRange, /*multi=*/32),
+    kNumValuesRange,
 };
 static const std::vector<std::vector<int64_t>> kBitWidthsNumValues32 = {
     kBitWidths32,
-    benchmark::CreateRange(kMinRange, kMaxRange, /*multi=*/32),
+    kNumValuesRange,
 };
 static const std::vector<std::vector<int64_t>> kBitWidthsNumValues64 = {
     kBitWidths64,
-    benchmark::CreateRange(kMinRange, kMaxRange, /*multi=*/32),
+    kNumValuesRange,
 };
+
+/// Nudge for MSVC template inside BENCHMARK_CAPTURE macro.
+void BM_UnpackBool(benchmark::State& state, bool aligned, UnpackFunc<bool> unpack,
+                   bool skip = false, std::string skip_msg = "") {
+  return BM_Unpack<bool>(state, aligned, unpack, skip, std::move(skip_msg));
+}
+
+/// Nudge for MSVC template inside BENCHMARK_CAPTURE macro.
+void BM_UnpackUint8(benchmark::State& state, bool aligned, UnpackFunc<uint8_t> unpack,
+                    bool skip = false, std::string skip_msg = "") {
+  return BM_Unpack<uint8_t>(state, aligned, unpack, skip, std::move(skip_msg));
+}
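For scale, benchmark::CreateRange(32, 512, 2) expands to the multiplicative sequence {32, 64, 128, 256, 512}, and ArgsProduct registers the full cross product of bit widths and batch sizes. A worked illustration for the 32-bit case (the benchmark name shown is illustrative):

// kBitWidths32 = {1, 2, 8, 20} and kNumValuesRange = {32, 64, 128, 256, 512},
// so each BENCHMARK_CAPTURE below registers 4 * 5 = 20 cases, e.g.
//   BM_UnpackUint32/ScalarUnaligned/8/256  (bit_width = 8, num_values = 256)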
 
 /// Nudge for MSVC template inside BENCHMARK_CAPTURE macro.
 void BM_UnpackUint16(benchmark::State& state, bool aligned, UnpackFunc<uint16_t> unpack,
                      bool skip = false, std::string skip_msg = "") {
@@ -129,6 +159,10 @@ void BM_UnpackUint64(benchmark::State& state, bool aligned, UnpackFunc<uint64_t>
   return BM_Unpack<uint64_t>(state, aligned, unpack, skip, std::move(skip_msg));
 }
 
+BENCHMARK_CAPTURE(BM_UnpackBool, ScalarUnaligned, false, &unpack_scalar<bool>)
+    ->ArgsProduct(kBitWidthsNumValuesBool);
+BENCHMARK_CAPTURE(BM_UnpackUint8, ScalarUnaligned, false, &unpack_scalar<uint8_t>)
+    ->ArgsProduct(kBitWidthsNumValues8);
 BENCHMARK_CAPTURE(BM_UnpackUint16, ScalarUnaligned, false, &unpack_scalar<uint16_t>)
     ->ArgsProduct(kBitWidthsNumValues16);
 BENCHMARK_CAPTURE(BM_UnpackUint32, ScalarUnaligned, false, &unpack_scalar<uint32
@@ -136,7 +170,28 @@ BENCHMARK_CAPTURE(BM_UnpackUint32, ScalarUnaligned, false, &unpack_scalar<uint32
 BENCHMARK_CAPTURE(BM_UnpackUint64, ScalarUnaligned, false, &unpack_scalar<uint64_t>)
     ->ArgsProduct(kBitWidthsNumValues64);
 
+#if defined(ARROW_HAVE_SSE4_2)
+BENCHMARK_CAPTURE(BM_UnpackBool, Sse42Unaligned, false, &unpack_sse4_2<bool>)
+    ->ArgsProduct(kBitWidthsNumValuesBool);
+BENCHMARK_CAPTURE(BM_UnpackUint8, Sse42Unaligned, false, &unpack_sse4_2<uint8_t>)
+    ->ArgsProduct(kBitWidthsNumValues8);
+BENCHMARK_CAPTURE(BM_UnpackUint16, Sse42Unaligned, false, &unpack_sse4_2<uint16_t>)
+    ->ArgsProduct(kBitWidthsNumValues16);
+BENCHMARK_CAPTURE(BM_UnpackUint32, Sse42Unaligned, false, &unpack_sse4_2<uint32_t>)
+    ->ArgsProduct(kBitWidthsNumValues32);
+BENCHMARK_CAPTURE(BM_UnpackUint64, Sse42Unaligned, false, &unpack_sse4_2<uint64_t>)
+    ->ArgsProduct(kBitWidthsNumValues64);
+#endif
+
 #if defined(ARROW_HAVE_RUNTIME_AVX2)
+BENCHMARK_CAPTURE(BM_UnpackBool, Avx2Unaligned, false, &unpack_avx2<bool>,
+                  !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2),
+                  "Avx2 not available")
+    ->ArgsProduct(kBitWidthsNumValuesBool);
+BENCHMARK_CAPTURE(BM_UnpackUint8, Avx2Unaligned, false, &unpack_avx2<uint8_t>,
+                  !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2),
+                  "Avx2 not available")
+    ->ArgsProduct(kBitWidthsNumValues8);
 BENCHMARK_CAPTURE(BM_UnpackUint16, Avx2Unaligned, false, &unpack_avx2<uint16_t>,
                   !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2),
                   "Avx2 not available")
@@ -152,6 +207,14 @@ BENCHMARK_CAPTURE(BM_UnpackUint64, Avx2Unaligned, false, &unpack_avx2<uint64_t>,
 #endif
 
 #if defined(ARROW_HAVE_RUNTIME_AVX512)
+BENCHMARK_CAPTURE(BM_UnpackBool, Avx512Unaligned, false, &unpack_avx512<bool>,
+                  !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512),
+                  "Avx512 not available")
+    ->ArgsProduct(kBitWidthsNumValuesBool);
+BENCHMARK_CAPTURE(BM_UnpackUint8, Avx512Unaligned, false, &unpack_avx512<uint8_t>,
+                  !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512),
+                  "Avx512 not available")
+    ->ArgsProduct(kBitWidthsNumValues8);
 BENCHMARK_CAPTURE(BM_UnpackUint16, Avx512Unaligned, false, &unpack_avx512<uint16_t>,
                   !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512),
                   "Avx512 not available")
@@ -167,6 +230,10 @@ BENCHMARK_CAPTURE(BM_UnpackUint64, Avx512Unaligned, false, &unpack_avx512<uint64
 #endif
 
 #if defined(ARROW_HAVE_NEON)
+BENCHMARK_CAPTURE(BM_UnpackBool, NeonUnaligned, false, &unpack_neon<bool>)
+    ->ArgsProduct(kBitWidthsNumValuesBool);
+BENCHMARK_CAPTURE(BM_UnpackUint8, NeonUnaligned, false, &unpack_neon<uint8_t>)
+    ->ArgsProduct(kBitWidthsNumValues8);
 BENCHMARK_CAPTURE(BM_UnpackUint16, NeonUnaligned, false, &unpack_neon<uint16_t>)
     ->ArgsProduct(kBitWidthsNumValues16);
 BENCHMARK_CAPTURE(BM_UnpackUint32, NeonUnaligned, false, &unpack_neon<uint32_t>)
@@ -175,6 +242,16 @@ BENCHMARK_CAPTURE(BM_UnpackUint64, NeonUnaligned, false, &unpack_neon<uint64_t>)
     ->ArgsProduct(kBitWidthsNumValues64);
 #endif
 
+BENCHMARK_CAPTURE(BM_UnpackBool, DynamicAligned, true, &unpack<bool>)
+    ->ArgsProduct(kBitWidthsNumValuesBool);
+BENCHMARK_CAPTURE(BM_UnpackBool, DynamicUnaligned, false, &unpack<bool>)
+    ->ArgsProduct(kBitWidthsNumValuesBool);
+
+BENCHMARK_CAPTURE(BM_UnpackUint8, DynamicAligned, true, &unpack<uint8_t>)
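To make the new read bound in unpack_width (below) concrete, here is the arithmetic for one hypothetical call; the numbers are chosen purely for illustration:

// batch_size = 100, kPackedBitWidth = 3, bit_offset = 2:
//   bits needed = 100 * 3 + 2 = 302
//   bytes_batch = BytesForBits(302) = 38
//
// max_read_bytes < 0  => in_end = in + 38: a tight bound, no overread allowed.
// max_read_bytes = 64 => in_end = in + 64: kernels may read up to 26 bytes past
//                        the last packed bit, but never past in_end.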
+    ->ArgsProduct(kBitWidthsNumValues8);
+BENCHMARK_CAPTURE(BM_UnpackUint8, DynamicUnaligned, false, &unpack<uint8_t>)
+    ->ArgsProduct(kBitWidthsNumValues8);
+
 BENCHMARK_CAPTURE(BM_UnpackUint16, DynamicAligned, true, &unpack<uint16_t>)
     ->ArgsProduct(kBitWidthsNumValues16);
 BENCHMARK_CAPTURE(BM_UnpackUint16, DynamicUnaligned, false, &unpack<uint16_t>)
diff --git a/cpp/src/arrow/util/bpacking_dispatch_internal.h b/cpp/src/arrow/util/bpacking_dispatch_internal.h
index eed5542808d..509337f0b31 100644
--- a/cpp/src/arrow/util/bpacking_dispatch_internal.h
+++ b/cpp/src/arrow/util/bpacking_dispatch_internal.h
@@ -22,6 +22,7 @@
 #include <cstring>
 
 #include "arrow/util/bit_util.h"
+#include "arrow/util/bpacking_internal.h"
 #include "arrow/util/endian.h"
 #include "arrow/util/logging.h"
 #include "arrow/util/macros.h"
@@ -166,13 +167,26 @@ int unpack_exact(const uint8_t* in, Uint* out, int batch_size, int bit_offset) {
 /// fixed amount of values (usually constrained by SIMD batch sizes and
 /// byte alignment).
 /// @tparam UnpackedUInt The type in which we unpack the values.
+/// @param batch_size The number of values to unpack.
+/// @param max_read_bytes The maximum size of the input byte array that can be read.
+///        This is used to safely overread.
+///        A negative value means the bound is deduced from batch_size.
 template <int kPackedBitWidth, template <int, typename> typename Unpacker,
           typename UnpackedUInt>
-void unpack_width(const uint8_t* in, UnpackedUInt* out, int batch_size, int bit_offset) {
+void unpack_width(const uint8_t* in, UnpackedUInt* out, int batch_size, int bit_offset,
+                  int max_read_bytes) {
   if constexpr (kPackedBitWidth == 0) {
     // Easy case to handle, simply setting memory to zero.
     return unpack_null(in, out, batch_size);
   } else {
+    // Number of bytes to read according to batch_size.
+    const int bytes_batch = static_cast<int>(
+        bit_util::BytesForBits(batch_size * kPackedBitWidth + bit_offset));
+    // If specified, max_read_bytes must be at least the number of bytes needed to
+    // extract the desired number of values.
+    ARROW_DCHECK(max_read_bytes < 0 || bytes_batch <= max_read_bytes);
+    const uint8_t* in_end = in + (max_read_bytes >= 0 ? max_read_bytes : bytes_batch);
+
     // In case of misalignment, we need to run the prolog until aligned.
     int extracted = unpack_exact<kPackedBitWidth>(in, out, batch_size, bit_offset);
     // We either extracted everything or found a alignment
@@ -188,20 +202,29 @@ void unpack_width(const uint8_t* in, UnpackedUInt* out, int batch_size, int bit_
       return unpack_full(in, out, batch_size);
     } else {
       using UnpackerForWidth = Unpacker<kPackedBitWidth, UnpackedUInt>;
+      // Number of values extracted by one iteration of the kernel.
       constexpr auto kValuesUnpacked = UnpackerForWidth::kValuesUnpacked;
+      // Number of bytes read, but not necessarily unpacked, by one iteration of the
+      // kernel. This constant prevents reading past the end of the buffer.
+      constexpr auto kBytesRead = UnpackerForWidth::kBytesRead;
+
+      if constexpr (kValuesUnpacked > 0) {
+        const uint8_t* in_last = in_end - kBytesRead;
+        // Running the optimized kernel for batch extraction.
+        while ((batch_size >= kValuesUnpacked) && (in <= in_last)) {
+          in = UnpackerForWidth::unpack(in, out);
+          out += kValuesUnpacked;
+          batch_size -= kValuesUnpacked;
+        }
 
-      // Running the optimized kernel for batch extraction
-      const int unpacker_iter_count = batch_size / kValuesUnpacked;
-      for (int i = 0; i < unpacker_iter_count; ++i) {
-        in = UnpackerForWidth::unpack(in, out);
-        out += kValuesUnpacked;
+        // Performance check making sure we ran the kernel loop as much as possible:
+        // either we stopped because not enough values were left to fill a kernel, or
+        // because we would overread.
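The rewritten loop replaces the precomputed iteration count with two guards: batch_size >= kValuesUnpacked (a full kernel's worth of values is still needed) and in <= in_last (a full kernel read stays within in_end). All it assumes of a kernel is two constants and a step function; a hypothetical unpacker satisfying that contract is sketched below (member names from the diff, concrete values purely illustrative):

// Illustrative only: not one of the real Arrow kernels.
template <int kPackedBitWidth, typename UnpackedUInt>
struct ExampleUnpacker {
  // Values produced by one unpack() call.
  static constexpr int kValuesUnpacked = 32;
  // Upper bound on bytes touched by one call. It may exceed the
  // 32 * kPackedBitWidth / 8 bytes actually consumed because the kernel
  // issues fixed-size SIMD loads; the `in <= in_last` guard keeps
  // in + kBytesRead <= in_end on every iteration.
  static constexpr int kBytesRead = ((32 * kPackedBitWidth + 7) / 8 + 15) / 16 * 16;
  // Unpacks kValuesUnpacked values into `out` and returns `in` advanced by
  // the bytes actually consumed.
  static const uint8_t* unpack(const uint8_t* in, UnpackedUInt* out);
};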
+        ARROW_DCHECK((batch_size < kValuesUnpacked) || (in_end - in) < kBytesRead);
       }
-      batch_size -= unpacker_iter_count * kValuesUnpacked;
 
       // Running the epilog for the remaining values that don't fit in a kernel
-      ARROW_DCHECK_LT(batch_size, kValuesUnpacked);
       ARROW_DCHECK_GE(batch_size, 0);
-      ARROW_COMPILER_ASSUME(batch_size < kValuesUnpacked);
       ARROW_COMPILER_ASSUME(batch_size >= 0);
       unpack_exact<kPackedBitWidth>(in, out, batch_size, /* bit_offset= */ 0);
     }
@@ -209,276 +232,401 @@ void unpack_width(const uint8_t* in, UnpackedUInt* out, int batch_size, int bit_
 }
 
 template