From 18f501bcb99fc5a96e33eb2a595bd9c646d07d0b Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Thu, 29 Jan 2026 15:58:09 +0000 Subject: [PATCH 01/14] feat[btrblocks]: add a dynamic btr blocks compressor Signed-off-by: Joe Isaacs --- Cargo.lock | 1 + vortex-btrblocks/Cargo.toml | 1 + vortex-btrblocks/src/builder.rs | 176 +++++++++++++++++++++ vortex-btrblocks/src/float.rs | 55 ++++--- vortex-btrblocks/src/integer.rs | 65 ++++---- vortex-btrblocks/src/lib.rs | 197 +++++++++++++++++++++--- vortex-btrblocks/src/string.rs | 36 +++-- vortex-layout/src/layouts/compressed.rs | 16 +- 8 files changed, 456 insertions(+), 91 deletions(-) create mode 100644 vortex-btrblocks/src/builder.rs diff --git a/Cargo.lock b/Cargo.lock index 9cc10fe4415..9e35cf2f424 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10290,6 +10290,7 @@ name = "vortex-btrblocks" version = "0.1.0" dependencies = [ "codspeed-divan-compat", + "enum-iterator", "getrandom 0.3.4", "itertools 0.14.0", "num-traits", diff --git a/vortex-btrblocks/Cargo.toml b/vortex-btrblocks/Cargo.toml index dedb6fb5ade..85d7a8d4d67 100644 --- a/vortex-btrblocks/Cargo.toml +++ b/vortex-btrblocks/Cargo.toml @@ -14,6 +14,7 @@ rust-version = { workspace = true } version = { workspace = true } [dependencies] +enum-iterator = { workspace = true } getrandom_v03 = { workspace = true } itertools = { workspace = true } num-traits = { workspace = true } diff --git a/vortex-btrblocks/src/builder.rs b/vortex-btrblocks/src/builder.rs new file mode 100644 index 00000000000..2fa282a8305 --- /dev/null +++ b/vortex-btrblocks/src/builder.rs @@ -0,0 +1,176 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Builder for configuring `BtrBlocksCompressor` instances. + +use enum_iterator::all; +use vortex_utils::aliases::hash_set::HashSet; + +use crate::BtrBlocksCompressor; +use crate::BtrBlocksCompressorConfig; +use crate::FloatCode; +use crate::IntCode; +use crate::StringCode; + +/// Builder for creating configured [`BtrBlocksCompressor`] instances. +/// +/// Use this builder to configure which compression schemes are allowed for each data type. +/// By default, all schemes are enabled. +/// +/// # Examples +/// +/// ```rust +/// use vortex_btrblocks::{BtrBlocksCompressorBuilder, IntCode, FloatCode}; +/// +/// // Default compressor - all schemes allowed +/// let compressor = BtrBlocksCompressorBuilder::new().build(); +/// +/// // Exclude specific schemes +/// let compressor = BtrBlocksCompressorBuilder::new() +/// .exclude_int([IntCode::Dict]) +/// .build(); +/// +/// // Exclude then re-include +/// let compressor = BtrBlocksCompressorBuilder::new() +/// .exclude_int([IntCode::Dict, IntCode::Rle]) +/// .include_int([IntCode::Dict]) +/// .build(); +/// ``` +#[derive(Debug, Clone)] +pub struct BtrBlocksCompressorBuilder { + int_schemes: HashSet, + float_schemes: HashSet, + string_schemes: HashSet, +} + +impl Default for BtrBlocksCompressorBuilder { + fn default() -> Self { + Self::new() + } +} + +impl BtrBlocksCompressorBuilder { + /// Creates a new builder with all schemes enabled. + pub fn new() -> Self { + Self { + int_schemes: all::().collect(), + float_schemes: all::().collect(), + string_schemes: all::().collect(), + } + } + + /// Excludes the specified integer compression schemes (set difference). 
+ /// + /// # Example + /// + /// ```rust + /// use vortex_btrblocks::{BtrBlocksCompressorBuilder, IntCode}; + /// + /// let compressor = BtrBlocksCompressorBuilder::new() + /// .exclude_int([IntCode::Dict, IntCode::Rle]) + /// .build(); + /// ``` + pub fn exclude_int(mut self, schemes: impl IntoIterator) -> Self { + for scheme in schemes { + self.int_schemes.remove(&scheme); + } + self + } + + /// Excludes the specified float compression schemes (set difference). + /// + /// # Example + /// + /// ```rust + /// use vortex_btrblocks::{BtrBlocksCompressorBuilder, FloatCode}; + /// + /// let compressor = BtrBlocksCompressorBuilder::new() + /// .exclude_float([FloatCode::Dict, FloatCode::Alp]) + /// .build(); + /// ``` + pub fn exclude_float(mut self, schemes: impl IntoIterator) -> Self { + for scheme in schemes { + self.float_schemes.remove(&scheme); + } + self + } + + /// Excludes the specified string compression schemes (set difference). + /// + /// # Example + /// + /// ```rust + /// use vortex_btrblocks::{BtrBlocksCompressorBuilder, StringCode}; + /// + /// let compressor = BtrBlocksCompressorBuilder::new() + /// .exclude_string([StringCode::Dict, StringCode::Fsst]) + /// .build(); + /// ``` + pub fn exclude_string(mut self, schemes: impl IntoIterator) -> Self { + for scheme in schemes { + self.string_schemes.remove(&scheme); + } + self + } + + /// Includes the specified integer compression schemes (set union). + /// + /// # Example + /// + /// ```rust + /// use vortex_btrblocks::{BtrBlocksCompressorBuilder, IntCode}; + /// + /// let compressor = BtrBlocksCompressorBuilder::new() + /// .exclude_int([IntCode::Dict, IntCode::Rle]) + /// .include_int([IntCode::Dict]) // re-enables Dict + /// .build(); + /// ``` + pub fn include_int(mut self, schemes: impl IntoIterator) -> Self { + self.int_schemes.extend(schemes); + self + } + + /// Includes the specified float compression schemes (set union). + /// + /// # Example + /// + /// ```rust + /// use vortex_btrblocks::{BtrBlocksCompressorBuilder, FloatCode}; + /// + /// let compressor = BtrBlocksCompressorBuilder::new() + /// .exclude_float([FloatCode::Alp, FloatCode::AlpRd]) + /// .include_float([FloatCode::Alp]) // re-enables Alp + /// .build(); + /// ``` + pub fn include_float(mut self, schemes: impl IntoIterator) -> Self { + self.float_schemes.extend(schemes); + self + } + + /// Includes the specified string compression schemes (set union). + /// + /// # Example + /// + /// ```rust + /// use vortex_btrblocks::{BtrBlocksCompressorBuilder, StringCode}; + /// + /// let compressor = BtrBlocksCompressorBuilder::new() + /// .exclude_string([StringCode::Dict, StringCode::Fsst]) + /// .include_string([StringCode::Dict]) // re-enables Dict + /// .build(); + /// ``` + pub fn include_string(mut self, schemes: impl IntoIterator) -> Self { + self.string_schemes.extend(schemes); + self + } + + /// Builds the configured `BtrBlocksCompressor`. 
+ pub fn build(self) -> BtrBlocksCompressor { + let config = BtrBlocksCompressorConfig::from_schemes( + self.int_schemes, + self.float_schemes, + self.string_schemes, + ); + BtrBlocksCompressor::from_config(config) + } +} diff --git a/vortex-btrblocks/src/float.rs b/vortex-btrblocks/src/float.rs index 795d2fb0fcb..017ccdb055e 100644 --- a/vortex-btrblocks/src/float.rs +++ b/vortex-btrblocks/src/float.rs @@ -4,6 +4,7 @@ pub(crate) mod dictionary; mod stats; +use enum_iterator::Sequence; use vortex_alp::ALPArray; use vortex_alp::ALPVTable; use vortex_alp::RDEncoder; @@ -65,19 +66,30 @@ impl Compressor for FloatCompressor { } fn dict_scheme_code() -> FloatCode { - DICT_SCHEME + FloatCode::Dict } } -const UNCOMPRESSED_SCHEME: FloatCode = FloatCode(0); -const CONSTANT_SCHEME: FloatCode = FloatCode(1); -const ALP_SCHEME: FloatCode = FloatCode(2); -const ALPRD_SCHEME: FloatCode = FloatCode(3); -const DICT_SCHEME: FloatCode = FloatCode(4); -const RUN_END_SCHEME: FloatCode = FloatCode(5); -const RUN_LENGTH_SCHEME: FloatCode = FloatCode(6); - -const SPARSE_SCHEME: FloatCode = FloatCode(7); +/// Unique identifier for float compression schemes. +#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash, Sequence)] +pub enum FloatCode { + /// No compression applied. + Uncompressed, + /// Constant encoding for arrays with a single distinct value. + Constant, + /// ALP (Adaptive Lossless floating-Point) encoding. + Alp, + /// ALPRD (ALP with Right Division) encoding variant. + AlpRd, + /// Dictionary encoding for low-cardinality float values. + Dict, + /// Run-end encoding. + RunEnd, + /// RLE encoding - generic run-length encoding. + Rle, + /// Sparse encoding for null-dominated arrays. + Sparse, +} #[derive(Debug, Copy, Clone)] struct UncompressedScheme; @@ -98,7 +110,7 @@ struct DictScheme; pub struct NullDominated; pub const RLE_FLOAT_SCHEME: RLEScheme = RLEScheme::new( - RUN_LENGTH_SCHEME, + FloatCode::Rle, |values, is_sample, allowed_cascading, excludes| { FloatCompressor::compress(values, is_sample, allowed_cascading, excludes) }, @@ -109,7 +121,7 @@ impl Scheme for UncompressedScheme { type CodeType = FloatCode; fn code(&self) -> FloatCode { - UNCOMPRESSED_SCHEME + FloatCode::Uncompressed } fn expected_compression_ratio( @@ -138,7 +150,7 @@ impl Scheme for ConstantScheme { type CodeType = FloatCode; fn code(&self) -> FloatCode { - CONSTANT_SCHEME + FloatCode::Constant } fn expected_compression_ratio( @@ -194,15 +206,12 @@ impl Scheme for ConstantScheme { } } -#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)] -pub struct FloatCode(u8); - impl Scheme for ALPScheme { type StatsType = FloatStats; type CodeType = FloatCode; fn code(&self) -> FloatCode { - ALP_SCHEME + FloatCode::Alp } fn expected_compression_ratio( @@ -247,10 +256,10 @@ impl Scheme for ALPScheme { // Patches are not compressed. They should be infrequent, and if they are not then we want // to keep them linear for easy indexing. 
let mut int_excludes = Vec::new(); - if excludes.contains(&DICT_SCHEME) { + if excludes.contains(&FloatCode::Dict) { int_excludes.push(integer::DictScheme.code()); } - if excludes.contains(&RUN_END_SCHEME) { + if excludes.contains(&FloatCode::RunEnd) { int_excludes.push(integer::RunEndScheme.code()); } @@ -268,7 +277,7 @@ impl Scheme for ALPRDScheme { type CodeType = FloatCode; fn code(&self) -> FloatCode { - ALPRD_SCHEME + FloatCode::AlpRd } fn expected_compression_ratio( @@ -321,7 +330,7 @@ impl Scheme for DictScheme { type CodeType = FloatCode; fn code(&self) -> FloatCode { - DICT_SCHEME + FloatCode::Dict } fn expected_compression_ratio( @@ -383,7 +392,7 @@ impl Scheme for DictScheme { &dict_array.values().to_primitive(), is_sample, allowed_cascading - 1, - &[DICT_SCHEME], + &[FloatCode::Dict], )?; // SAFETY: compressing codes or values does not alter the invariants @@ -402,7 +411,7 @@ impl Scheme for NullDominated { type CodeType = FloatCode; fn code(&self) -> Self::CodeType { - SPARSE_SCHEME + FloatCode::Sparse } fn expected_compression_ratio( diff --git a/vortex-btrblocks/src/integer.rs b/vortex-btrblocks/src/integer.rs index 459b067b8fe..53fcf06aa9f 100644 --- a/vortex-btrblocks/src/integer.rs +++ b/vortex-btrblocks/src/integer.rs @@ -4,9 +4,7 @@ pub mod dictionary; mod stats; -use std::fmt::Debug; -use std::hash::Hash; - +use enum_iterator::Sequence; pub use stats::IntegerStats; use vortex_array::ArrayRef; use vortex_array::IntoArray; @@ -70,7 +68,7 @@ impl Compressor for IntCompressor { } fn dict_scheme_code() -> IntCode { - DICT_SCHEME + IntCode::Dict } } @@ -105,19 +103,30 @@ pub trait IntegerScheme: Scheme {} // Auto-impl impl IntegerScheme for T where T: Scheme {} -#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)] -pub struct IntCode(u8); - -const UNCOMPRESSED_SCHEME: IntCode = IntCode(0); -const CONSTANT_SCHEME: IntCode = IntCode(1); -const FOR_SCHEME: IntCode = IntCode(2); -const ZIGZAG_SCHEME: IntCode = IntCode(3); -const BITPACKING_SCHEME: IntCode = IntCode(4); -const SPARSE_SCHEME: IntCode = IntCode(5); -const DICT_SCHEME: IntCode = IntCode(6); -const RUN_END_SCHEME: IntCode = IntCode(7); -const SEQUENCE_SCHEME: IntCode = IntCode(8); -const RUN_LENGTH_SCHEME: IntCode = IntCode(9); +/// Unique identifier for integer compression schemes. +#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash, Sequence)] +pub enum IntCode { + /// No compression applied. + Uncompressed, + /// Constant encoding for arrays with a single distinct value. + Constant, + /// Frame of Reference encoding - subtracts minimum value then bitpacks. + For, + /// ZigZag encoding - transforms negative integers to positive for better bitpacking. + ZigZag, + /// BitPacking encoding - compresses non-negative integers by reducing bit width. + BitPacking, + /// Sparse encoding - optimizes null-dominated or single-value-dominated arrays. + Sparse, + /// Dictionary encoding - creates a dictionary of unique values. + Dict, + /// Run-end encoding - run-length encoding with end positions. + RunEnd, + /// Sequence encoding - detects sequential patterns. + Sequence, + /// RLE encoding - generic run-length encoding. 
+ Rle, +} #[derive(Debug, Copy, Clone)] pub struct UncompressedScheme; @@ -150,7 +159,7 @@ pub struct SequenceScheme; const RUN_END_THRESHOLD: u32 = 4; pub const RLE_INTEGER_SCHEME: RLEScheme = RLEScheme::new( - RUN_LENGTH_SCHEME, + IntCode::Rle, |values, is_sample, allowed_cascading, excludes| { IntCompressor::compress_no_dict(values, is_sample, allowed_cascading, excludes) }, @@ -161,7 +170,7 @@ impl Scheme for UncompressedScheme { type CodeType = IntCode; fn code(&self) -> IntCode { - UNCOMPRESSED_SCHEME + IntCode::Uncompressed } fn expected_compression_ratio( @@ -191,7 +200,7 @@ impl Scheme for ConstantScheme { type CodeType = IntCode; fn code(&self) -> IntCode { - CONSTANT_SCHEME + IntCode::Constant } fn is_constant(&self) -> bool { @@ -252,7 +261,7 @@ impl Scheme for FORScheme { type CodeType = IntCode; fn code(&self) -> IntCode { - FOR_SCHEME + IntCode::For } fn expected_compression_ratio( @@ -335,7 +344,7 @@ impl Scheme for ZigZagScheme { type CodeType = IntCode; fn code(&self) -> IntCode { - ZIGZAG_SCHEME + IntCode::ZigZag } fn expected_compression_ratio( @@ -405,7 +414,7 @@ impl Scheme for BitPackingScheme { type CodeType = IntCode; fn code(&self) -> IntCode { - BITPACKING_SCHEME + IntCode::BitPacking } fn expected_compression_ratio( @@ -461,7 +470,7 @@ impl Scheme for SparseScheme { type CodeType = IntCode; fn code(&self) -> IntCode { - SPARSE_SCHEME + IntCode::Sparse } // We can avoid asserting the encoding tree instead. @@ -574,7 +583,7 @@ impl Scheme for DictScheme { type CodeType = IntCode; fn code(&self) -> IntCode { - DICT_SCHEME + IntCode::Dict } fn expected_compression_ratio( @@ -633,7 +642,7 @@ impl Scheme for DictScheme { // Cascade the codes child // Don't allow SequenceArray as the codes child as it merely adds extra indirection without actually compressing data. - let mut new_excludes = vec![DICT_SCHEME, SEQUENCE_SCHEME]; + let mut new_excludes = vec![IntCode::Dict, IntCode::Sequence]; new_excludes.extend_from_slice(excludes); let compressed_codes = IntCompressor::compress_no_dict( @@ -659,7 +668,7 @@ impl Scheme for RunEndScheme { type CodeType = IntCode; fn code(&self) -> IntCode { - RUN_END_SCHEME + IntCode::RunEnd } fn expected_compression_ratio( @@ -740,7 +749,7 @@ impl Scheme for SequenceScheme { type CodeType = IntCode; fn code(&self) -> Self::CodeType { - SEQUENCE_SCHEME + IntCode::Sequence } fn expected_compression_ratio( diff --git a/vortex-btrblocks/src/lib.rs b/vortex-btrblocks/src/lib.rs index 7888fa671dd..2b50589dcdc 100644 --- a/vortex-btrblocks/src/lib.rs +++ b/vortex-btrblocks/src/lib.rs @@ -21,18 +21,25 @@ //! # Example //! //! ```rust -//! use vortex_btrblocks::BtrBlocksCompressor; +//! use vortex_btrblocks::{BtrBlocksCompressor, BtrBlocksCompressorBuilder, IntCode}; //! use vortex_array::Array; //! +//! // Default compressor with all schemes enabled //! let compressor = BtrBlocksCompressor::default(); -//! // let compressed = compressor.compress(&array)?; +//! +//! // Configure with builder to exclude specific schemes +//! let compressor = BtrBlocksCompressorBuilder::new() +//! .exclude_int([IntCode::Dict]) +//! .build(); //! ``` //! //! 
[BtrBlocks]: https://www.cs.cit.tum.de/fileadmin/w00cfj/dis/papers/btrblocks.pdf use std::fmt::Debug; use std::hash::Hash; +use std::sync::Arc; +use enum_iterator::all; use vortex_array::Array; use vortex_array::ArrayRef; use vortex_array::Canonical; @@ -55,18 +62,23 @@ use vortex_dtype::Nullability; use vortex_dtype::datetime::Timestamp; use vortex_error::VortexExpect; use vortex_error::VortexResult; +use vortex_utils::aliases::hash_set::HashSet; use crate::decimal::compress_decimal; +pub use crate::float::FloatCode; pub use crate::float::FloatCompressor; pub use crate::float::FloatStats; pub use crate::float::dictionary::dictionary_encode as float_dictionary_encode; +pub use crate::integer::IntCode; pub use crate::integer::IntCompressor; pub use crate::integer::IntegerStats; pub use crate::integer::dictionary::dictionary_encode as integer_dictionary_encode; +pub use crate::string::StringCode; pub use crate::string::StringCompressor; pub use crate::string::StringStats; pub use crate::temporal::compress_temporal; +mod builder; mod decimal; mod float; mod integer; @@ -76,6 +88,8 @@ mod sample; mod string; mod temporal; +pub use builder::BtrBlocksCompressorBuilder; + /// Configures how stats are generated. pub struct GenerateStatsOptions { /// Should distinct values should be counted during stats generation. @@ -357,6 +371,49 @@ pub trait Compressor { } } +/// Configuration for allowed compression schemes. +/// +/// This is immutable after construction and can be shared across multiple compression calls. +/// Use [`BtrBlocksCompressorBuilder`] to create a custom configuration. +#[derive(Debug, Clone)] +pub struct BtrBlocksCompressorConfig { + /// Allowed integer compression schemes. + int_schemes: HashSet, + + /// Allowed float compression schemes. + float_schemes: HashSet, + + /// Allowed string compression schemes. + string_schemes: HashSet, +} + +impl Default for BtrBlocksCompressorConfig { + fn default() -> Self { + Self { + int_schemes: all::().collect(), + float_schemes: all::().collect(), + string_schemes: all::().collect(), + } + } +} + +impl BtrBlocksCompressorConfig { + /// Creates a config from the given allowed schemes. + /// + /// This is used by [`BtrBlocksCompressorBuilder::build`]. + pub(crate) fn from_schemes( + int_schemes: HashSet, + float_schemes: HashSet, + string_schemes: HashSet, + ) -> Self { + Self { + int_schemes, + float_schemes, + string_schemes, + } + } +} + /// The main compressor type implementing BtrBlocks-inspired compression. /// /// This compressor applies adaptive compression schemes to arrays based on their data types @@ -369,26 +426,122 @@ pub trait Compressor { /// 3. Recursively compressing nested structures /// 4. Applying type-specific compression for primitives, strings, and temporal data /// +/// Use [`BtrBlocksCompressorBuilder`] to configure which compression schemes are enabled. +/// /// # Examples /// /// ```rust -/// use vortex_btrblocks::BtrBlocksCompressor; -/// use vortex_array::Array; +/// use vortex_btrblocks::{BtrBlocksCompressor, BtrBlocksCompressorBuilder, IntCode}; /// +/// // Default compressor - all schemes allowed /// let compressor = BtrBlocksCompressor::default(); -/// // let compressed = compressor.compress(&array)?; +/// +/// // Exclude specific schemes using the builder +/// let compressor = BtrBlocksCompressorBuilder::new() +/// .exclude_int([IntCode::Dict]) +/// .build(); /// ``` -#[derive(Default, Debug, Clone)] +#[derive(Debug, Clone)] pub struct BtrBlocksCompressor { - /// Whether to exclude ints from dictionary encoding. 
- /// - /// When `true`, integer arrays will not use dictionary compression schemes, - /// which can be useful when the data has high cardinality or when dictionary - /// overhead would exceed compression benefits. - pub exclude_int_dict_encoding: bool, + /// Immutable configuration for allowed schemes. + config: Arc, + + /// Runtime integer excludes used during recursion. + int_excludes: Vec, + + /// Runtime float excludes used during recursion. + float_excludes: Vec, + + /// Runtime string excludes used during recursion. + string_excludes: Vec, +} + +impl Default for BtrBlocksCompressor { + fn default() -> Self { + Self { + config: Arc::new(BtrBlocksCompressorConfig::default()), + int_excludes: Vec::new(), + float_excludes: Vec::new(), + string_excludes: Vec::new(), + } + } } impl BtrBlocksCompressor { + /// Creates a new compressor with default settings (all schemes allowed). + pub fn new() -> Self { + Self::default() + } + + /// Creates a compressor from a config. + /// + /// This is used by [`BtrBlocksCompressorBuilder::build`]. + pub(crate) fn from_config(config: BtrBlocksCompressorConfig) -> Self { + Self { + config: Arc::new(config), + int_excludes: Vec::new(), + float_excludes: Vec::new(), + string_excludes: Vec::new(), + } + } + + fn int_excludes(&self) -> Vec { + all::() + .filter(|c| !self.config.int_schemes.contains(c) || self.int_excludes.contains(c)) + .collect() + } + + fn float_excludes(&self) -> Vec { + all::() + .filter(|c| !self.config.float_schemes.contains(c) || self.float_excludes.contains(c)) + .collect() + } + + fn string_excludes(&self) -> Vec { + all::() + .filter(|c| !self.config.string_schemes.contains(c) || self.string_excludes.contains(c)) + .collect() + } + + /// Returns a new compressor with additional runtime int excludes for recursion. + #[allow(dead_code)] + fn with_int_excludes(&self, excludes: impl IntoIterator) -> Self { + let mut int_excludes = self.int_excludes.clone(); + int_excludes.extend(excludes); + Self { + config: Arc::clone(&self.config), + int_excludes, + float_excludes: self.float_excludes.clone(), + string_excludes: self.string_excludes.clone(), + } + } + + /// Returns a new compressor with additional runtime float excludes for recursion. + #[allow(dead_code)] + fn with_float_excludes(&self, excludes: impl IntoIterator) -> Self { + let mut float_excludes = self.float_excludes.clone(); + float_excludes.extend(excludes); + Self { + config: Arc::clone(&self.config), + int_excludes: self.int_excludes.clone(), + float_excludes, + string_excludes: self.string_excludes.clone(), + } + } + + /// Returns a new compressor with additional runtime string excludes for recursion. + #[allow(dead_code)] + fn with_string_excludes(&self, excludes: impl IntoIterator) -> Self { + let mut string_excludes = self.string_excludes.clone(); + string_excludes.extend(excludes); + Self { + config: Arc::clone(&self.config), + int_excludes: self.int_excludes.clone(), + float_excludes: self.float_excludes.clone(), + string_excludes, + } + } + /// Compresses an array using BtrBlocks-inspired compression. /// /// First canonicalizes and compacts the array, then applies optimal compression schemes. 
@@ -412,13 +565,14 @@ impl BtrBlocksCompressor { Canonical::Bool(bool_array) => Ok(bool_array.into_array()), Canonical::Primitive(primitive) => { if primitive.ptype().is_int() { - if self.exclude_int_dict_encoding { - IntCompressor::compress_no_dict(&primitive, false, MAX_CASCADE, &[]) - } else { - IntCompressor::compress(&primitive, false, MAX_CASCADE, &[]) - } + IntCompressor::compress(&primitive, false, MAX_CASCADE, &self.int_excludes()) } else { - FloatCompressor::compress(&primitive, false, MAX_CASCADE, &[]) + FloatCompressor::compress( + &primitive, + false, + MAX_CASCADE, + &self.float_excludes(), + ) } } Canonical::Decimal(decimal) => compress_decimal(&decimal), @@ -483,7 +637,12 @@ impl BtrBlocksCompressor { .dtype() .eq_ignore_nullability(&DType::Utf8(Nullability::NonNullable)) { - StringCompressor::compress(&strings, false, MAX_CASCADE, &[]) + StringCompressor::compress( + &strings, + false, + MAX_CASCADE, + &self.string_excludes(), + ) } else { // Binary arrays do not compress Ok(strings.into_array()) diff --git a/vortex-btrblocks/src/string.rs b/vortex-btrblocks/src/string.rs index 2ee8d036ea9..560003be95c 100644 --- a/vortex-btrblocks/src/string.rs +++ b/vortex-btrblocks/src/string.rs @@ -1,6 +1,7 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +use enum_iterator::Sequence; use vortex_array::ArrayRef; use vortex_array::IntoArray; use vortex_array::ToCanonical; @@ -128,7 +129,7 @@ impl Compressor for StringCompressor { } fn dict_scheme_code() -> StringCode { - DICT_SCHEME + StringCode::Dict } } @@ -151,22 +152,27 @@ pub struct ConstantScheme; #[derive(Debug, Copy, Clone)] pub struct NullDominated; -#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)] -pub struct StringCode(u8); - -const UNCOMPRESSED_SCHEME: StringCode = StringCode(0); -const DICT_SCHEME: StringCode = StringCode(1); -const FSST_SCHEME: StringCode = StringCode(2); -const CONSTANT_SCHEME: StringCode = StringCode(3); - -const SPARSE_SCHEME: StringCode = StringCode(4); +/// Unique identifier for string compression schemes. +#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash, Sequence)] +pub enum StringCode { + /// No compression applied. + Uncompressed, + /// Dictionary encoding for low-cardinality strings. + Dict, + /// FSST (Fast Static Symbol Table) compression. + Fsst, + /// Constant encoding for arrays with a single distinct value. + Constant, + /// Sparse encoding for null-dominated arrays. 
+ Sparse, +} impl Scheme for UncompressedScheme { type StatsType = StringStats; type CodeType = StringCode; fn code(&self) -> StringCode { - UNCOMPRESSED_SCHEME + StringCode::Uncompressed } fn expected_compression_ratio( @@ -195,7 +201,7 @@ impl Scheme for DictScheme { type CodeType = StringCode; fn code(&self) -> StringCode { - DICT_SCHEME + StringCode::Dict } fn expected_compression_ratio( @@ -271,7 +277,7 @@ impl Scheme for FSSTScheme { type CodeType = StringCode; fn code(&self) -> StringCode { - FSST_SCHEME + StringCode::Fsst } fn compress( @@ -321,7 +327,7 @@ impl Scheme for ConstantScheme { type CodeType = StringCode; fn code(&self) -> Self::CodeType { - CONSTANT_SCHEME + StringCode::Constant } fn is_constant(&self) -> bool { @@ -382,7 +388,7 @@ impl Scheme for NullDominated { type CodeType = StringCode; fn code(&self) -> Self::CodeType { - SPARSE_SCHEME + StringCode::Sparse } fn expected_compression_ratio( diff --git a/vortex-layout/src/layouts/compressed.rs b/vortex-layout/src/layouts/compressed.rs index ca6000afc2f..83e5e93f001 100644 --- a/vortex-layout/src/layouts/compressed.rs +++ b/vortex-layout/src/layouts/compressed.rs @@ -10,6 +10,8 @@ use vortex_array::ArrayContext; use vortex_array::ArrayRef; use vortex_array::expr::stats::Stat; use vortex_btrblocks::BtrBlocksCompressor; +use vortex_btrblocks::BtrBlocksCompressorBuilder; +use vortex_btrblocks::IntCode; use vortex_error::VortexResult; use vortex_io::runtime::Handle; @@ -75,12 +77,14 @@ impl CompressingStrategy { /// Set `exclude_int_dict_encoding` to true to prevent dictionary encoding of integer arrays, /// which is useful when compressing dictionary codes to avoid recursive dictionary encoding. pub fn new_btrblocks(child: S, exclude_int_dict_encoding: bool) -> Self { - Self::new( - child, - Arc::new(BtrBlocksCompressor { - exclude_int_dict_encoding, - }), - ) + let compressor = if exclude_int_dict_encoding { + BtrBlocksCompressorBuilder::new() + .exclude_int([IntCode::Dict]) + .build() + } else { + BtrBlocksCompressor::default() + }; + Self::new(child, Arc::new(compressor)) } /// Create a new writer that compresses using a `CompactCompressor` to compress chunks. 
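Patch 01 above replaces the old `exclude_int_dict_encoding` flag with per-type scheme sets configured through `BtrBlocksCompressorBuilder`. Before moving on to patch 02, here is a minimal usage sketch combining the exclude/include calls introduced in this patch; it assumes only the builder, `IntCode`, `FloatCode`, and `StringCode` items added above, and the final commented-out call mirrors the hedged `compress` example from the crate docs.

```rust
use vortex_btrblocks::{BtrBlocksCompressorBuilder, FloatCode, IntCode, StringCode};

// Excludes are set differences and includes are set unions over the allowed
// scheme sets, so later calls refine earlier ones: Dict is re-enabled for
// integers here while Rle stays excluded.
let compressor = BtrBlocksCompressorBuilder::new()
    .exclude_int([IntCode::Dict, IntCode::Rle])
    .include_int([IntCode::Dict])
    .exclude_float([FloatCode::AlpRd])
    .exclude_string([StringCode::Fsst])
    .build();

// Compression then draws only from the remaining allowed schemes.
// let compressed = compressor.compress(&array)?;
```

The `CompressingStrategy::new_btrblocks` change in `vortex-layout/src/layouts/compressed.rs` above uses exactly this path, excluding `IntCode::Dict` to avoid recursively dictionary-encoding dictionary codes.
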
From 08ea56e318246c0839d012214357c8028d7a728a Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Fri, 30 Jan 2026 15:33:22 +0000 Subject: [PATCH 02/14] wip Signed-off-by: Joe Isaacs --- Cargo.lock | 1 + vortex-btrblocks/Cargo.toml | 1 + vortex-btrblocks/benches/compress.rs | 4 +- vortex-btrblocks/src/builder.rs | 83 ++- vortex-btrblocks/src/decimal.rs | 17 +- vortex-btrblocks/src/float.rs | 615 ++++++++++++------- vortex-btrblocks/src/integer.rs | 852 ++++++++++++++++----------- vortex-btrblocks/src/lib.rs | 432 ++++++++------ vortex-btrblocks/src/patches.rs | 2 + vortex-btrblocks/src/rle.rs | 115 ++-- vortex-btrblocks/src/string.rs | 187 ++++-- vortex-btrblocks/src/temporal.rs | 22 +- 12 files changed, 1427 insertions(+), 904 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9e35cf2f424..104337ecd5a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10298,6 +10298,7 @@ dependencies = [ "rustc-hash", "test-with", "tracing", + "tracing-subscriber", "vortex-alp", "vortex-array", "vortex-buffer", diff --git a/vortex-btrblocks/Cargo.toml b/vortex-btrblocks/Cargo.toml index 85d7a8d4d67..679a81f33da 100644 --- a/vortex-btrblocks/Cargo.toml +++ b/vortex-btrblocks/Cargo.toml @@ -41,6 +41,7 @@ vortex-zigzag = { workspace = true } [dev-dependencies] divan = { workspace = true } test-with = { workspace = true } +tracing-subscriber = { workspace = true } vortex-array = { workspace = true, features = ["_test-harness"] } [features] diff --git a/vortex-btrblocks/benches/compress.rs b/vortex-btrblocks/benches/compress.rs index 0c51c0efdec..d352aa6064b 100644 --- a/vortex-btrblocks/benches/compress.rs +++ b/vortex-btrblocks/benches/compress.rs @@ -15,8 +15,8 @@ mod benchmarks { use vortex_array::ArrayRef; use vortex_array::IntoArray; use vortex_array::ToCanonical; - use vortex_btrblocks::Compressor; use vortex_btrblocks::IntCompressor; + use vortex_btrblocks::compress; use vortex_buffer::buffer_mut; use vortex_utils::aliases::hash_set::HashSet; @@ -46,7 +46,7 @@ mod benchmarks { .with_inputs(|| &array) .input_counter(|array| ItemsCount::new(array.len())) .input_counter(|array| BytesCount::of_many::(array.len())) - .bench_refs(|array| IntCompressor::compress(array, false, 3, &[]).unwrap()); + .bench_refs(|array| compress(&IntCompressor::default(), array, false, 3, &[]).unwrap()); } } diff --git a/vortex-btrblocks/src/builder.rs b/vortex-btrblocks/src/builder.rs index 2fa282a8305..06f04bc9086 100644 --- a/vortex-btrblocks/src/builder.rs +++ b/vortex-btrblocks/src/builder.rs @@ -3,14 +3,22 @@ //! Builder for configuring `BtrBlocksCompressor` instances. -use enum_iterator::all; +use itertools::Itertools; use vortex_utils::aliases::hash_set::HashSet; use crate::BtrBlocksCompressor; -use crate::BtrBlocksCompressorConfig; use crate::FloatCode; +use crate::FloatCompressor; use crate::IntCode; +use crate::IntCompressor; use crate::StringCode; +use crate::StringCompressor; +use crate::float::ALL_FLOAT_SCHEMES; +use crate::float::FloatScheme; +use crate::integer::ALL_INT_SCHEMES; +use crate::integer::IntegerScheme; +use crate::string::ALL_STRING_SCHEMES; +use crate::string::StringScheme; /// Builder for creating configured [`BtrBlocksCompressor`] instances. 
/// @@ -38,9 +46,9 @@ use crate::StringCode; /// ``` #[derive(Debug, Clone)] pub struct BtrBlocksCompressorBuilder { - int_schemes: HashSet, - float_schemes: HashSet, - string_schemes: HashSet, + int_schemes: HashSet<&'static dyn IntegerScheme>, + float_schemes: HashSet<&'static dyn FloatScheme>, + string_schemes: HashSet<&'static dyn StringScheme>, } impl Default for BtrBlocksCompressorBuilder { @@ -53,9 +61,9 @@ impl BtrBlocksCompressorBuilder { /// Creates a new builder with all schemes enabled. pub fn new() -> Self { Self { - int_schemes: all::().collect(), - float_schemes: all::().collect(), - string_schemes: all::().collect(), + int_schemes: ALL_INT_SCHEMES.iter().copied().collect(), + float_schemes: ALL_FLOAT_SCHEMES.iter().copied().collect(), + string_schemes: ALL_STRING_SCHEMES.iter().copied().collect(), } } @@ -70,10 +78,9 @@ impl BtrBlocksCompressorBuilder { /// .exclude_int([IntCode::Dict, IntCode::Rle]) /// .build(); /// ``` - pub fn exclude_int(mut self, schemes: impl IntoIterator) -> Self { - for scheme in schemes { - self.int_schemes.remove(&scheme); - } + pub fn exclude_int(mut self, codes: impl IntoIterator) -> Self { + let codes: HashSet<_> = codes.into_iter().collect(); + self.int_schemes.retain(|s| !codes.contains(&s.code())); self } @@ -88,10 +95,9 @@ impl BtrBlocksCompressorBuilder { /// .exclude_float([FloatCode::Dict, FloatCode::Alp]) /// .build(); /// ``` - pub fn exclude_float(mut self, schemes: impl IntoIterator) -> Self { - for scheme in schemes { - self.float_schemes.remove(&scheme); - } + pub fn exclude_float(mut self, codes: impl IntoIterator) -> Self { + let codes: HashSet<_> = codes.into_iter().collect(); + self.float_schemes.retain(|s| !codes.contains(&s.code())); self } @@ -106,10 +112,9 @@ impl BtrBlocksCompressorBuilder { /// .exclude_string([StringCode::Dict, StringCode::Fsst]) /// .build(); /// ``` - pub fn exclude_string(mut self, schemes: impl IntoIterator) -> Self { - for scheme in schemes { - self.string_schemes.remove(&scheme); - } + pub fn exclude_string(mut self, codes: impl IntoIterator) -> Self { + let codes: HashSet<_> = codes.into_iter().collect(); + self.string_schemes.retain(|s| !codes.contains(&s.code())); self } @@ -125,8 +130,13 @@ impl BtrBlocksCompressorBuilder { /// .include_int([IntCode::Dict]) // re-enables Dict /// .build(); /// ``` - pub fn include_int(mut self, schemes: impl IntoIterator) -> Self { - self.int_schemes.extend(schemes); + pub fn include_int(mut self, codes: impl IntoIterator) -> Self { + let codes: HashSet<_> = codes.into_iter().collect(); + for scheme in ALL_INT_SCHEMES { + if codes.contains(&scheme.code()) { + self.int_schemes.insert(*scheme); + } + } self } @@ -142,8 +152,13 @@ impl BtrBlocksCompressorBuilder { /// .include_float([FloatCode::Alp]) // re-enables Alp /// .build(); /// ``` - pub fn include_float(mut self, schemes: impl IntoIterator) -> Self { - self.float_schemes.extend(schemes); + pub fn include_float(mut self, codes: impl IntoIterator) -> Self { + let codes: HashSet<_> = codes.into_iter().collect(); + for scheme in ALL_FLOAT_SCHEMES { + if codes.contains(&scheme.code()) { + self.float_schemes.insert(*scheme); + } + } self } @@ -159,18 +174,22 @@ impl BtrBlocksCompressorBuilder { /// .include_string([StringCode::Dict]) // re-enables Dict /// .build(); /// ``` - pub fn include_string(mut self, schemes: impl IntoIterator) -> Self { - self.string_schemes.extend(schemes); + pub fn include_string(mut self, codes: impl IntoIterator) -> Self { + let codes: HashSet<_> = codes.into_iter().collect(); + 
for scheme in ALL_STRING_SCHEMES { + if codes.contains(&scheme.code()) { + self.string_schemes.insert(*scheme); + } + } self } /// Builds the configured `BtrBlocksCompressor`. pub fn build(self) -> BtrBlocksCompressor { - let config = BtrBlocksCompressorConfig::from_schemes( - self.int_schemes, - self.float_schemes, - self.string_schemes, - ); - BtrBlocksCompressor::from_config(config) + BtrBlocksCompressor { + int_schemes: self.int_schemes.into_iter().collect_vec(), + float_schemes: self.float_schemes.into_iter().collect_vec(), + string_schemes: self.string_schemes.into_iter().collect_vec(), + } } } diff --git a/vortex-btrblocks/src/decimal.rs b/vortex-btrblocks/src/decimal.rs index 68711aaf6fd..431ffe57bf4 100644 --- a/vortex-btrblocks/src/decimal.rs +++ b/vortex-btrblocks/src/decimal.rs @@ -2,6 +2,7 @@ // SPDX-FileCopyrightText: Copyright the Vortex contributors use vortex_array::ArrayRef; +use vortex_array::Canonical; use vortex_array::arrays::DecimalArray; use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::narrowed_decimal; @@ -10,13 +11,18 @@ use vortex_decimal_byte_parts::DecimalBytePartsArray; use vortex_error::VortexResult; use vortex_scalar::DecimalType; -use crate::Compressor; +use crate::BtrBlocksCompressor; +use crate::CanonicalCompressor; +use crate::Excludes; use crate::IntCompressor; use crate::MAX_CASCADE; // TODO(joe): add support splitting i128/256 buffers into chunks primitive values for compression. // 2 for i128 and 4 for i256 -pub fn compress_decimal(decimal: &DecimalArray) -> VortexResult { +pub fn compress_decimal( + compressor: &BtrBlocksCompressor, + decimal: &DecimalArray, +) -> VortexResult { let decimal = narrowed_decimal(decimal.clone()); let validity = decimal.validity(); let prim = match decimal.values_type() { @@ -27,7 +33,12 @@ pub fn compress_decimal(decimal: &DecimalArray) -> VortexResult { _ => return Ok(decimal.to_array()), }; - let compressed = IntCompressor::compress(&prim, false, MAX_CASCADE, &[])?; + let compressed = compressor.compress_canonical( + Canonical::Primitive(prim), + false, + MAX_CASCADE, + Excludes::none(), + )?; DecimalBytePartsArray::try_new(compressed, decimal.decimal_dtype()).map(|d| d.to_array()) } diff --git a/vortex-btrblocks/src/float.rs b/vortex-btrblocks/src/float.rs index 017ccdb055e..b2f2eaff340 100644 --- a/vortex-btrblocks/src/float.rs +++ b/vortex-btrblocks/src/float.rs @@ -4,18 +4,25 @@ pub(crate) mod dictionary; mod stats; +use std::hash::Hash; +use std::hash::Hasher; + use enum_iterator::Sequence; use vortex_alp::ALPArray; use vortex_alp::ALPVTable; use vortex_alp::RDEncoder; use vortex_alp::alp_encode; use vortex_array::ArrayRef; +use vortex_array::Canonical; use vortex_array::IntoArray; use vortex_array::ToCanonical; use vortex_array::arrays::ConstantArray; use vortex_array::arrays::DictArray; +use vortex_array::arrays::DictArrayParts; use vortex_array::arrays::MaskedArray; +use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::PrimitiveVTable; +use vortex_array::vtable::VTable; use vortex_array::vtable::ValidityHelper; use vortex_dtype::PType; use vortex_error::VortexResult; @@ -25,48 +32,141 @@ use vortex_sparse::SparseArray; use vortex_sparse::SparseVTable; pub use self::stats::FloatStats; +use crate::BtrBlocksCompressor; +use crate::CanonicalCompressor; use crate::Compressor; use crate::CompressorStats; +use crate::Excludes; use crate::GenerateStatsOptions; +use crate::IntCode; use crate::Scheme; +use crate::compress; use crate::estimate_compression_ratio_with_sampling; use 
crate::float::dictionary::dictionary_encode; use crate::integer; use crate::integer::IntCompressor; use crate::integer::IntegerStats; use crate::patches::compress_patches; +use crate::rle; use crate::rle::RLEScheme; -pub trait FloatScheme: Scheme {} +pub trait FloatScheme: Scheme + Send + Sync {} + +impl FloatScheme for T where T: Scheme + Send + Sync +{} -impl FloatScheme for T where T: Scheme {} +impl PartialEq for dyn FloatScheme { + fn eq(&self, other: &Self) -> bool { + self.code() == other.code() + } +} + +impl Eq for dyn FloatScheme {} + +impl Hash for dyn FloatScheme { + fn hash(&self, state: &mut H) { + self.code().hash(state) + } +} + +/// All available float compression schemes. +pub const ALL_FLOAT_SCHEMES: &[&dyn FloatScheme] = &[ + &UncompressedScheme, + &ConstantScheme, + &ALPScheme, + &ALPRDScheme, + // &DictScheme, + &NullDominated, + // &RLE_FLOAT_SCHEME, +]; /// [`Compressor`] for floating-point numbers. -pub struct FloatCompressor; +#[derive(Clone)] +pub struct FloatCompressor<'a> { + /// tmp + pub btr_blocks_compressor: &'a BtrBlocksCompressor, // schemes: Vec<&'static dyn FloatScheme>, +} -impl Compressor for FloatCompressor { +// impl Default for FloatCompressor { +// fn default() -> Self { +// Self { +// schemes: ALL_FLOAT_SCHEMES.to_vec(), +// } +// } +// } + +// impl<'a> FloatCompressor<'a> { +// /// Creates a new compressor with all schemes enabled. +// // pub fn new() -> Self { +// // Self:: +// // } +// +// /// Creates a compressor with only the specified schemes. +// pub fn with_schemes(schemes: Vec<&'static dyn FloatScheme>) -> Self { +// Self { schemes } +// } +// +// /// Creates a compressor excluding schemes with the given codes. +// pub fn excluding(excludes: &[FloatCode]) -> Self { +// Self { +// schemes: ALL_FLOAT_SCHEMES +// .iter() +// .filter(|s| !excludes.contains(&s.code())) +// .copied() +// .collect(), +// } +// } +// +// /// Compress with default settings (static helper for internal use). 
+// pub(crate) fn compress_static( +// array: &PrimitiveArray, +// is_sample: bool, +// allowed_cascading: usize, +// excludes: &[FloatCode], +// ) -> VortexResult { +// let compressor = if excludes.is_empty() { +// Self::default() +// } else { +// Self::excluding(excludes) +// }; +// compress(&compressor, array, is_sample, allowed_cascading, excludes) +// } +// } + +impl<'a> Compressor for FloatCompressor<'a> { type ArrayVTable = PrimitiveVTable; type SchemeType = dyn FloatScheme; type StatsType = FloatStats; - fn schemes() -> &'static [&'static Self::SchemeType] { - &[ - &UncompressedScheme, - &ConstantScheme, - &ALPScheme, - &ALPRDScheme, - &DictScheme, - &NullDominated, - &RLE_FLOAT_SCHEME, - ] + fn gen_stats(&self, array: &::Array) -> Self::StatsType { + if self + .btr_blocks_compressor + .float_schemes + .iter() + .any(|s| s.code() == DictScheme.code()) + { + FloatStats::generate_opts( + array, + GenerateStatsOptions { + count_distinct_values: true, + }, + ) + } else { + FloatStats::generate_opts( + array, + GenerateStatsOptions { + count_distinct_values: false, + }, + ) + } } - fn default_scheme() -> &'static Self::SchemeType { - &UncompressedScheme + fn schemes(&self) -> &[&'static dyn FloatScheme] { + &self.btr_blocks_compressor.float_schemes } - fn dict_scheme_code() -> FloatCode { - FloatCode::Dict + fn default_scheme(&self) -> &'static Self::SchemeType { + &UncompressedScheme } } @@ -91,30 +191,46 @@ pub enum FloatCode { Sparse, } -#[derive(Debug, Copy, Clone)] +#[derive(Debug, Copy, Clone, PartialEq, Eq)] struct UncompressedScheme; -#[derive(Debug, Copy, Clone)] +#[derive(Debug, Copy, Clone, PartialEq, Eq)] struct ConstantScheme; -#[derive(Debug, Copy, Clone)] +#[derive(Debug, Copy, Clone, PartialEq, Eq)] struct ALPScheme; -#[derive(Debug, Copy, Clone)] +#[derive(Debug, Copy, Clone, PartialEq, Eq)] struct ALPRDScheme; -#[derive(Debug, Copy, Clone)] +#[derive(Debug, Copy, Clone, PartialEq, Eq)] struct DictScheme; -#[derive(Debug, Copy, Clone)] +#[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct NullDominated; -pub const RLE_FLOAT_SCHEME: RLEScheme = RLEScheme::new( - FloatCode::Rle, - |values, is_sample, allowed_cascading, excludes| { - FloatCompressor::compress(values, is_sample, allowed_cascading, excludes) - }, -); +/// Configuration for float RLE compression. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct FloatRLEConfig; + +// impl rle::RLEConfig for FloatRLEConfig { +// type Stats = FloatStats; +// type Code = FloatCode; +// +// const CODE: FloatCode = FloatCode::Rle; +// +// fn compress_values( +// values: &PrimitiveArray, +// is_sample: bool, +// allowed_cascading: usize, +// excludes: &[FloatCode], +// ) -> VortexResult { +// +// } +// } +// +// /// RLE scheme for float compression. 
+// pub const RLE_FLOAT_SCHEME: RLEScheme = RLEScheme::new(); impl Scheme for UncompressedScheme { type StatsType = FloatStats; @@ -126,6 +242,7 @@ impl Scheme for UncompressedScheme { fn expected_compression_ratio( &self, + _compressor: &BtrBlocksCompressor, _stats: &Self::StatsType, _is_sample: bool, _allowed_cascading: usize, @@ -136,6 +253,7 @@ impl Scheme for UncompressedScheme { fn compress( &self, + _btr_blocks_compressor: &BtrBlocksCompressor, stats: &Self::StatsType, _is_sample: bool, _allowed_cascading: usize, @@ -155,6 +273,7 @@ impl Scheme for ConstantScheme { fn expected_compression_ratio( &self, + _btr_blocks_compressor: &BtrBlocksCompressor, stats: &Self::StatsType, is_sample: bool, _allowed_cascading: usize, @@ -179,6 +298,7 @@ impl Scheme for ConstantScheme { fn compress( &self, + _btr_blocks_compressor: &BtrBlocksCompressor, stats: &Self::StatsType, _is_sample: bool, _allowed_cascading: usize, @@ -216,6 +336,7 @@ impl Scheme for ALPScheme { fn expected_compression_ratio( &self, + compressor: &BtrBlocksCompressor, stats: &Self::StatsType, is_sample: bool, allowed_cascading: usize, @@ -234,6 +355,7 @@ impl Scheme for ALPScheme { estimate_compression_ratio_with_sampling( self, + compressor, stats, is_sample, allowed_cascading, @@ -243,6 +365,7 @@ impl Scheme for ALPScheme { fn compress( &self, + compressor: &BtrBlocksCompressor, stats: &FloatStats, is_sample: bool, allowed_cascading: usize, @@ -263,8 +386,19 @@ impl Scheme for ALPScheme { int_excludes.push(integer::RunEndScheme.code()); } - let compressed_alp_ints = - IntCompressor::compress(&alp_ints, is_sample, allowed_cascading - 1, &int_excludes)?; + let compressed_alp_ints = compressor.compress_canonical( + Canonical::Primitive(alp_ints), + is_sample, + allowed_cascading - 1, + Excludes::float_only(excludes), + )?; + + // let compressed_alp_ints = IntCompressor::compress_static( + // &alp_ints, + // is_sample, + // allowed_cascading - 1, + // &int_excludes, + // )?; let patches = alp.patches().map(compress_patches).transpose()?; @@ -282,6 +416,7 @@ impl Scheme for ALPRDScheme { fn expected_compression_ratio( &self, + compressor: &BtrBlocksCompressor, stats: &Self::StatsType, is_sample: bool, allowed_cascading: usize, @@ -293,6 +428,7 @@ impl Scheme for ALPRDScheme { estimate_compression_ratio_with_sampling( self, + compressor, stats, is_sample, allowed_cascading, @@ -302,6 +438,7 @@ impl Scheme for ALPRDScheme { fn compress( &self, + compressor: &BtrBlocksCompressor, stats: &Self::StatsType, _is_sample: bool, _allowed_cascading: usize, @@ -335,6 +472,7 @@ impl Scheme for DictScheme { fn expected_compression_ratio( &self, + compressor: &BtrBlocksCompressor, stats: &Self::StatsType, is_sample: bool, allowed_cascading: usize, @@ -352,6 +490,7 @@ impl Scheme for DictScheme { // Take a sample and run compression on the sample to determine before/after size. estimate_compression_ratio_with_sampling( self, + compressor, stats, is_sample, allowed_cascading, @@ -361,45 +500,59 @@ impl Scheme for DictScheme { fn compress( &self, + compressor: &BtrBlocksCompressor, stats: &Self::StatsType, is_sample: bool, allowed_cascading: usize, - _excludes: &[FloatCode], + excludes: &[Self::CodeType], ) -> VortexResult { - let dict_array = dictionary_encode(stats); + let dict = dictionary_encode(stats); + let has_all_values_referenced = dict.has_all_values_referenced(); + let DictArrayParts { codes, values, .. } = dict.into_parts(); // Only compress the codes. 
- let codes_stats = IntegerStats::generate_opts( - &dict_array.codes().to_primitive().narrow()?, - GenerateStatsOptions { - count_distinct_values: false, - }, - ); - let codes_scheme = IntCompressor::choose_scheme( - &codes_stats, - is_sample, - allowed_cascading - 1, - &[integer::DictScheme.code(), integer::SequenceScheme.code()], - )?; - let compressed_codes = codes_scheme.compress( - &codes_stats, + // let codes_stats = IntegerStats::generate_opts( + // &codes.to_primitive().narrow()?, + // GenerateStatsOptions { + // count_distinct_values: false, + // }, + // ); + // + // let codes_excludes = &[integer::DictScheme.code(), integer::SequenceScheme.code()]; + // let codes_compressor = IntCompressor::excluding(codes_excludes); + // let codes_scheme = codes_compressor.choose_scheme( + // &codes_stats, + // is_sample, + // allowed_cascading - 1, + // codes_excludes, + // )?; + // let compressed_codes = codes_scheme.compress( + // compressor, + // &codes_stats, + // is_sample, + // allowed_cascading - 1, + // &[integer::DictScheme.code()], + // )?; + let compressed_codes = compressor.compress_canonical( + Canonical::Primitive(codes.to_primitive()), is_sample, allowed_cascading - 1, - &[integer::DictScheme.code()], + Excludes::int_only(&[IntCode::Dict, IntCode::Sequence]), )?; - let compressed_values = FloatCompressor::compress( - &dict_array.values().to_primitive(), + assert!(values.is_canonical()); + let compressed_values = compressor.compress_canonical( + Canonical::Primitive(values.to_primitive()), is_sample, allowed_cascading - 1, - &[FloatCode::Dict], + Excludes::float_only(&[FloatCode::Dict]), )?; // SAFETY: compressing codes or values does not alter the invariants unsafe { Ok( DictArray::new_unchecked(compressed_codes, compressed_values) - .set_all_values_referenced(dict_array.has_all_values_referenced()) + .set_all_values_referenced(has_all_values_referenced) .into_array(), ) } @@ -416,6 +569,7 @@ impl Scheme for NullDominated { fn expected_compression_ratio( &self, + _compressor: &BtrBlocksCompressor, stats: &Self::StatsType, _is_sample: bool, allowed_cascading: usize, @@ -442,6 +596,7 @@ impl Scheme for NullDominated { fn compress( &self, + compressor: &BtrBlocksCompressor, stats: &Self::StatsType, is_sample: bool, allowed_cascading: usize, @@ -454,17 +609,23 @@ impl Scheme for NullDominated { if let Some(sparse) = sparse_encoded.as_opt::() { // Compress the values - let new_excludes = vec![integer::SparseScheme.code()]; + let new_excludes = [integer::SparseScheme.code()]; // Don't attempt to compress the non-null values let indices = sparse.patches().indices().to_primitive().narrow()?; - let compressed_indices = IntCompressor::compress_no_dict( - &indices, + let compressed_indices = compressor.compress_canonical( + Canonical::Primitive(indices.to_primitive()), is_sample, allowed_cascading - 1, - &new_excludes, + Excludes::int_only(&new_excludes), )?; + // let compressed_indices = IntCompressor::compress_no_dict_static( + // &indices, + // is_sample, + // allowed_cascading - 1, + // &new_excludes, + // )?; SparseArray::try_new( compressed_indices, @@ -478,173 +639,173 @@ impl Scheme for NullDominated { } } } - -#[cfg(test)] -mod tests { - - use std::iter; - - use vortex_array::Array; - use vortex_array::IntoArray; - use vortex_array::ToCanonical; - use vortex_array::arrays::PrimitiveArray; - use vortex_array::assert_arrays_eq; - use vortex_array::builders::ArrayBuilder; - use vortex_array::builders::PrimitiveBuilder; - use vortex_array::display::DisplayOptions; - use 
vortex_array::validity::Validity; - use vortex_buffer::Buffer; - use vortex_buffer::buffer_mut; - use vortex_dtype::Nullability; - use vortex_error::VortexResult; - - use crate::Compressor; - use crate::CompressorStats; - use crate::MAX_CASCADE; - use crate::Scheme; - use crate::float::FloatCompressor; - use crate::float::RLE_FLOAT_SCHEME; - - #[test] - fn test_empty() -> VortexResult<()> { - // Make sure empty array compression does not fail - let result = FloatCompressor::compress( - &PrimitiveArray::new(Buffer::::empty(), Validity::NonNullable), - false, - 3, - &[], - )?; - - assert!(result.is_empty()); - Ok(()) - } - - #[test] - fn test_compress() -> VortexResult<()> { - let mut values = buffer_mut![1.0f32; 1024]; - // Sprinkle some other values in. - for i in 0..1024 { - // Insert 2.0 at all odd positions. - // This should force dictionary encoding and exclude run-end due to the - // average run length being 1. - values[i] = (i % 50) as f32; - } - - let floats = values.into_array().to_primitive(); - let compressed = FloatCompressor::compress(&floats, false, MAX_CASCADE, &[])?; - assert_eq!(compressed.len(), 1024); - - let display = compressed - .display_as(DisplayOptions::MetadataOnly) - .to_string() - .to_lowercase(); - assert_eq!(display, "vortex.dict(f32, len=1024)"); - - Ok(()) - } - - #[test] - fn test_rle_compression() -> VortexResult<()> { - let mut values = Vec::new(); - values.extend(iter::repeat_n(1.5f32, 100)); - values.extend(iter::repeat_n(2.7f32, 200)); - values.extend(iter::repeat_n(3.15f32, 150)); - - let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); - let stats = crate::float::FloatStats::generate(&array); - let compressed = RLE_FLOAT_SCHEME.compress(&stats, false, 3, &[])?; - - let decoded = compressed; - let expected = Buffer::copy_from(&values).into_array(); - assert_arrays_eq!(decoded.as_ref(), expected.as_ref()); - Ok(()) - } - - #[test] - fn test_sparse_compression() -> VortexResult<()> { - let mut array = PrimitiveBuilder::::with_capacity(Nullability::Nullable, 100); - array.append_value(f32::NAN); - array.append_value(-f32::NAN); - array.append_value(f32::INFINITY); - array.append_value(-f32::INFINITY); - array.append_value(0.0f32); - array.append_value(-0.0f32); - array.append_nulls(90); - - let floats = array.finish_into_primitive(); - - let compressed = FloatCompressor::compress(&floats, false, MAX_CASCADE, &[])?; - assert_eq!(compressed.len(), 96); - - let display = compressed - .display_as(DisplayOptions::MetadataOnly) - .to_string() - .to_lowercase(); - assert_eq!(display, "vortex.sparse(f32?, len=96)"); - - Ok(()) - } -} - -/// Tests to verify that each float compression scheme produces the expected encoding. 
-#[cfg(test)] -mod scheme_selection_tests { - - use vortex_alp::ALPVTable; - use vortex_array::arrays::ConstantVTable; - use vortex_array::arrays::DictVTable; - use vortex_array::arrays::PrimitiveArray; - use vortex_array::builders::ArrayBuilder; - use vortex_array::builders::PrimitiveBuilder; - use vortex_array::validity::Validity; - use vortex_buffer::Buffer; - use vortex_dtype::Nullability; - use vortex_error::VortexResult; - - use crate::Compressor; - use crate::float::FloatCompressor; - - #[test] - fn test_constant_compressed() -> VortexResult<()> { - let values: Vec = vec![42.5; 100]; - let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); - let compressed = FloatCompressor::compress(&array, false, 3, &[])?; - assert!(compressed.is::()); - Ok(()) - } - - #[test] - fn test_alp_compressed() -> VortexResult<()> { - let values: Vec = (0..1000).map(|i| (i as f64) * 0.01).collect(); - let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); - let compressed = FloatCompressor::compress(&array, false, 3, &[])?; - assert!(compressed.is::()); - Ok(()) - } - - #[test] - fn test_dict_compressed() -> VortexResult<()> { - let distinct_values = [1.1, 2.2, 3.3, 4.4, 5.5]; - let values: Vec = (0..1000) - .map(|i| distinct_values[i % distinct_values.len()]) - .collect(); - let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); - let compressed = FloatCompressor::compress(&array, false, 3, &[])?; - assert!(compressed.is::()); - Ok(()) - } - - #[test] - fn test_null_dominated_compressed() -> VortexResult<()> { - let mut builder = PrimitiveBuilder::::with_capacity(Nullability::Nullable, 100); - for i in 0..5 { - builder.append_value(i as f64); - } - builder.append_nulls(95); - let array = builder.finish_into_primitive(); - let compressed = FloatCompressor::compress(&array, false, 3, &[])?; - // Verify the compressed array preserves values. - assert_eq!(compressed.len(), 100); - Ok(()) - } -} +// +// #[cfg(test)] +// mod tests { +// +// use std::iter; +// +// use vortex_array::Array; +// use vortex_array::IntoArray; +// use vortex_array::ToCanonical; +// use vortex_array::arrays::PrimitiveArray; +// use vortex_array::assert_arrays_eq; +// use vortex_array::builders::ArrayBuilder; +// use vortex_array::builders::PrimitiveBuilder; +// use vortex_array::display::DisplayOptions; +// use vortex_array::validity::Validity; +// use vortex_buffer::Buffer; +// use vortex_buffer::buffer_mut; +// use vortex_dtype::Nullability; +// use vortex_error::VortexResult; +// +// use crate::{compress, Compressor}; +// use crate::CompressorStats; +// use crate::MAX_CASCADE; +// use crate::Scheme; +// use crate::float::FloatCompressor; +// use crate::float::RLE_FLOAT_SCHEME; +// +// #[test] +// fn test_empty() -> VortexResult<()> { +// // Make sure empty array compression does not fail +// +// let result = FloatCompressor::default().compress( +// &PrimitiveArray::new(Buffer::::empty(), Validity::NonNullable), +// false, +// 3, +// )?; +// +// assert!(result.is_empty()); +// Ok(()) +// } +// +// #[test] +// fn test_compress() -> VortexResult<()> { +// let mut values = buffer_mut![1.0f32; 1024]; +// // Sprinkle some other values in. +// for i in 0..1024 { +// // Insert 2.0 at all odd positions. +// // This should force dictionary encoding and exclude run-end due to the +// // average run length being 1. 
+// values[i] = (i % 50) as f32; +// } +// +// let floats = values.into_array().to_primitive(); +// let compressed = FloatCompressor::default().compress(&floats, false, MAX_CASCADE)?; +// assert_eq!(compressed.len(), 1024); +// +// let display = compressed +// .display_as(DisplayOptions::MetadataOnly) +// .to_string() +// .to_lowercase(); +// assert_eq!(display, "vortex.dict(f32, len=1024)"); +// +// Ok(()) +// } +// +// #[test] +// fn test_rle_compression() -> VortexResult<()> { +// let mut values = Vec::new(); +// values.extend(iter::repeat_n(1.5f32, 100)); +// values.extend(iter::repeat_n(2.7f32, 200)); +// values.extend(iter::repeat_n(3.15f32, 150)); +// +// let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); +// let stats = crate::float::FloatStats::generate(&array); +// let compressed = RLE_FLOAT_SCHEME.compress(&stats, false, 3, &[])?; +// +// let decoded = compressed; +// let expected = Buffer::copy_from(&values).into_array(); +// assert_arrays_eq!(decoded.as_ref(), expected.as_ref()); +// Ok(()) +// } +// +// #[test] +// fn test_sparse_compression() -> VortexResult<()> { +// let mut array = PrimitiveBuilder::::with_capacity(Nullability::Nullable, 100); +// array.append_value(f32::NAN); +// array.append_value(-f32::NAN); +// array.append_value(f32::INFINITY); +// array.append_value(-f32::INFINITY); +// array.append_value(0.0f32); +// array.append_value(-0.0f32); +// array.append_nulls(90); +// +// let floats = array.finish_into_primitive(); +// +// let compressed = FloatCompressor::default().compress(&floats, false, MAX_CASCADE)?; +// assert_eq!(compressed.len(), 96); +// +// let display = compressed +// .display_as(DisplayOptions::MetadataOnly) +// .to_string() +// .to_lowercase(); +// assert_eq!(display, "vortex.sparse(f32?, len=96)"); +// +// Ok(()) +// } +// } +// +// /// Tests to verify that each float compression scheme produces the expected encoding. 
+// #[cfg(test)] +// mod scheme_selection_tests { +// +// use vortex_alp::ALPVTable; +// use vortex_array::arrays::ConstantVTable; +// use vortex_array::arrays::DictVTable; +// use vortex_array::arrays::PrimitiveArray; +// use vortex_array::builders::ArrayBuilder; +// use vortex_array::builders::PrimitiveBuilder; +// use vortex_array::validity::Validity; +// use vortex_buffer::Buffer; +// use vortex_dtype::Nullability; +// use vortex_error::VortexResult; +// +// use crate::Compressor; +// use crate::float::FloatCompressor; +// +// #[test] +// fn test_constant_compressed() -> VortexResult<()> { +// let values: Vec = vec![42.5; 100]; +// let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); +// let compressed = FloatCompressor::default().compress(&array, false, 3)?; +// assert!(compressed.is::()); +// Ok(()) +// } +// +// #[test] +// fn test_alp_compressed() -> VortexResult<()> { +// let values: Vec = (0..1000).map(|i| (i as f64) * 0.01).collect(); +// let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); +// let compressed = FloatCompressor::default().compress(&array, false, 3)?; +// assert!(compressed.is::()); +// Ok(()) +// } +// +// #[test] +// fn test_dict_compressed() -> VortexResult<()> { +// let distinct_values = [1.1, 2.2, 3.3, 4.4, 5.5]; +// let values: Vec = (0..1000) +// .map(|i| distinct_values[i % distinct_values.len()]) +// .collect(); +// let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); +// let compressed = FloatCompressor::default().compress(&array, false, 3)?; +// assert!(compressed.is::()); +// Ok(()) +// } +// +// #[test] +// fn test_null_dominated_compressed() -> VortexResult<()> { +// let mut builder = PrimitiveBuilder::::with_capacity(Nullability::Nullable, 100); +// for i in 0..5 { +// builder.append_value(i as f64); +// } +// builder.append_nulls(95); +// let array = builder.finish_into_primitive(); +// let compressed = FloatCompressor::default().compress(&array, false, 3)?; +// // Verify the compressed array preserves values. +// assert_eq!(compressed.len(), 100); +// Ok(()) +// } +// } diff --git a/vortex-btrblocks/src/integer.rs b/vortex-btrblocks/src/integer.rs index 53fcf06aa9f..99f883a02bd 100644 --- a/vortex-btrblocks/src/integer.rs +++ b/vortex-btrblocks/src/integer.rs @@ -4,9 +4,13 @@ pub mod dictionary; mod stats; +use std::hash::Hash; +use std::hash::Hasher; + use enum_iterator::Sequence; pub use stats::IntegerStats; use vortex_array::ArrayRef; +use vortex_array::Canonical; use vortex_array::IntoArray; use vortex_array::ToCanonical; use vortex_array::arrays::ConstantArray; @@ -14,6 +18,7 @@ use vortex_array::arrays::DictArray; use vortex_array::arrays::MaskedArray; use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::PrimitiveVTable; +use vortex_array::vtable::VTable; use vortex_array::vtable::ValidityHelper; use vortex_error::VortexExpect; use vortex_error::VortexResult; @@ -32,76 +37,174 @@ use vortex_sparse::SparseVTable; use vortex_zigzag::ZigZagArray; use vortex_zigzag::zigzag_encode; +use crate::BtrBlocksCompressor; +use crate::CanonicalCompressor; use crate::Compressor; use crate::CompressorStats; +use crate::Excludes; use crate::GenerateStatsOptions; use crate::Scheme; +use crate::compress; use crate::estimate_compression_ratio_with_sampling; use crate::integer::dictionary::dictionary_encode; use crate::patches::compress_patches; +use crate::rle; use crate::rle::RLEScheme; +/// All available integer compression schemes. 
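+///
+/// # Example
+///
+/// A minimal sketch of filtering this list into a custom [`IntCompressor`]; the
+/// `vortex_btrblocks::integer` import path and item visibility are assumed here for
+/// illustration only.
+///
+/// ```rust,ignore
+/// use vortex_btrblocks::integer::{ALL_INT_SCHEMES, IntCompressor};
+/// use vortex_btrblocks::IntCode;
+///
+/// // Keep every integer scheme except dictionary encoding.
+/// let schemes = ALL_INT_SCHEMES
+///     .iter()
+///     .filter(|s| s.code() != IntCode::Dict)
+///     .copied()
+///     .collect();
+/// let compressor = IntCompressor::with_schemes(schemes);
+/// ```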
+pub const ALL_INT_SCHEMES: &[&dyn IntegerScheme] = &[ + &ConstantScheme, + &FORScheme, + &ZigZagScheme, + &BitPackingScheme, + &SparseScheme, + &DictScheme, + &RunEndScheme, + &SequenceScheme, + &RLE_INTEGER_SCHEME, +]; + /// [`Compressor`] for signed and unsigned integers. -pub struct IntCompressor; +#[derive(Clone)] +pub struct IntCompressor { + schemes: Vec<&'static dyn IntegerScheme>, + default: &'static dyn IntegerScheme, +} -impl Compressor for IntCompressor { - type ArrayVTable = PrimitiveVTable; - type SchemeType = dyn IntegerScheme; - type StatsType = IntegerStats; +impl Default for IntCompressor { + fn default() -> Self { + Self { + schemes: ALL_INT_SCHEMES.to_vec(), + default: &UncompressedScheme, + } + } +} - fn schemes() -> &'static [&'static dyn IntegerScheme] { - &[ - &ConstantScheme, - &FORScheme, - &ZigZagScheme, - &BitPackingScheme, - &SparseScheme, - &DictScheme, - &RunEndScheme, - &SequenceScheme, - &RLE_INTEGER_SCHEME, - ] +impl IntCompressor { + /// Creates a new compressor with all schemes enabled. + pub fn new() -> Self { + Self::default() } - fn default_scheme() -> &'static Self::SchemeType { - &UncompressedScheme + /// Creates a compressor with only the specified schemes. + pub fn with_schemes(schemes: Vec<&'static dyn IntegerScheme>) -> Self { + Self { + schemes, + default: &UncompressedScheme, + } } - fn dict_scheme_code() -> IntCode { - IntCode::Dict + /// Creates a compressor excluding schemes with the given codes. + pub fn excluding(excludes: &[IntCode]) -> Self { + Self::with_schemes( + ALL_INT_SCHEMES + .iter() + .filter(|s| !excludes.contains(&s.code())) + .copied() + .collect(), + ) } -} -impl IntCompressor { - pub(crate) fn compress_no_dict( + /// Creates a compressor without dictionary encoding. + pub fn no_dict() -> Self { + Self::excluding(&[IntCode::Dict]) + } + + /// Compress without dictionary encoding (static helper for internal use). + // pub(crate) fn compress_no_dict_static( + // array: &PrimitiveArray, + // is_sample: bool, + // allowed_cascading: usize, + // excludes: &[IntCode], + // ) -> VortexResult { + // let compressor = Self::excluding(&[IntCode::Dict]); + // let compressor = if excludes.is_empty() { + // compressor + // } else { + // Self::with_schemes( + // compressor + // .schemes + // .iter() + // .filter(|s| !excludes.contains(&s.code())) + // .copied() + // .collect(), + // ) + // }; + // compress(&compressor, array, is_sample, allowed_cascading, excludes) + // } + + /// Compress with default settings (static helper for internal use). 
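+    ///
+    /// # Example
+    ///
+    /// Illustrative sketch only: this helper is crate-private, and `primitive` stands in
+    /// for any integer `PrimitiveArray`.
+    ///
+    /// ```rust,ignore
+    /// let compressed = IntCompressor::compress_static(&primitive, false, MAX_CASCADE, &[])?;
+    /// ```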
+ pub(crate) fn compress_static( array: &PrimitiveArray, is_sample: bool, allowed_cascading: usize, excludes: &[IntCode], ) -> VortexResult { - let stats = IntegerStats::generate_opts( - array, - GenerateStatsOptions { - count_distinct_values: false, - }, - ); + let compressor = if excludes.is_empty() { + Self::default() + } else { + Self::excluding(excludes) + }; + compress(&compressor, array, is_sample, allowed_cascading, excludes) + } +} + +impl Compressor for IntCompressor { + type ArrayVTable = PrimitiveVTable; + type SchemeType = dyn IntegerScheme; + type StatsType = IntegerStats; - let scheme = Self::choose_scheme(&stats, is_sample, allowed_cascading, excludes)?; - let output = scheme.compress(&stats, is_sample, allowed_cascading, excludes)?; + fn schemes(&self) -> &[&'static dyn IntegerScheme] { + &self.schemes + } - if output.nbytes() < array.nbytes() { - Ok(output) + fn default_scheme(&self) -> &'static Self::SchemeType { + self.default + } + + fn gen_stats(&self, array: &::Array) -> Self::StatsType { + if self.schemes.iter().any(|s| s.code() == IntCode::Dict) { + IntegerStats::generate_opts( + array, + GenerateStatsOptions { + count_distinct_values: true, + }, + ) } else { - tracing::debug!("resulting tree too large: {}", output.display_tree()); - Ok(array.to_array()) + IntegerStats::generate_opts( + array, + GenerateStatsOptions { + count_distinct_values: false, + }, + ) } } } -pub trait IntegerScheme: Scheme {} +pub trait IntegerScheme: + Scheme + Send + Sync +{ +} // Auto-impl -impl IntegerScheme for T where T: Scheme {} +impl IntegerScheme for T where + T: Scheme + Send + Sync +{ +} + +impl PartialEq for dyn IntegerScheme { + fn eq(&self, other: &Self) -> bool { + self.code() == other.code() + } +} + +impl Eq for dyn IntegerScheme {} + +impl Hash for dyn IntegerScheme { + fn hash(&self, state: &mut H) { + self.code().hash(state) + } +} /// Unique identifier for integer compression schemes. #[derive(Debug, Copy, Clone, Eq, PartialEq, Hash, Sequence)] @@ -128,42 +231,61 @@ pub enum IntCode { Rle, } -#[derive(Debug, Copy, Clone)] +#[derive(Debug, Copy, Clone, PartialEq, Eq)] + pub struct UncompressedScheme; -#[derive(Debug, Copy, Clone)] +#[derive(Debug, Copy, Clone, PartialEq, Eq)] + pub struct ConstantScheme; -#[derive(Debug, Copy, Clone)] +#[derive(Debug, Copy, Clone, PartialEq, Eq)] + pub struct FORScheme; -#[derive(Debug, Copy, Clone)] +#[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct ZigZagScheme; -#[derive(Debug, Copy, Clone)] +#[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct BitPackingScheme; -#[derive(Debug, Copy, Clone)] +#[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct SparseScheme; -#[derive(Debug, Copy, Clone)] +#[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct DictScheme; -#[derive(Debug, Copy, Clone)] +#[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct RunEndScheme; -#[derive(Debug, Copy, Clone)] +#[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct SequenceScheme; /// Threshold for the average run length in an array before we consider run-end encoding. const RUN_END_THRESHOLD: u32 = 4; -pub const RLE_INTEGER_SCHEME: RLEScheme = RLEScheme::new( - IntCode::Rle, - |values, is_sample, allowed_cascading, excludes| { - IntCompressor::compress_no_dict(values, is_sample, allowed_cascading, excludes) - }, -); +/// Configuration for integer RLE compression. 
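+///
+/// # Example
+///
+/// A sketch showing how this config parameterises the generic [`RLEScheme`]; the
+/// `vortex_btrblocks::rle` and `vortex_btrblocks::integer` import paths are assumed for
+/// illustration only.
+///
+/// ```rust,ignore
+/// use vortex_btrblocks::integer::{IntRLEConfig, RLE_INTEGER_SCHEME};
+/// use vortex_btrblocks::rle::RLEScheme;
+///
+/// // The integer RLE scheme is the zero-sized RLEScheme instantiated with this config.
+/// let scheme: RLEScheme<IntRLEConfig> = RLEScheme::new();
+/// assert_eq!(scheme, RLE_INTEGER_SCHEME);
+/// ```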
+#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct IntRLEConfig; + +impl rle::RLEConfig for IntRLEConfig { + type Stats = IntegerStats; + type Code = IntCode; + + const CODE: IntCode = IntCode::Rle; + + fn compress_values( + values: &PrimitiveArray, + is_sample: bool, + allowed_cascading: usize, + excludes: &[IntCode], + ) -> VortexResult { + IntCompressor::compress_no_dict_static(values, is_sample, allowed_cascading, excludes) + } +} + +/// RLE scheme for integer compression. +pub const RLE_INTEGER_SCHEME: RLEScheme = RLEScheme::new(); impl Scheme for UncompressedScheme { type StatsType = IntegerStats; @@ -175,6 +297,7 @@ impl Scheme for UncompressedScheme { fn expected_compression_ratio( &self, + _compressor: &BtrBlocksCompressor, _stats: &IntegerStats, _is_sample: bool, _allowed_cascading: usize, @@ -186,6 +309,7 @@ impl Scheme for UncompressedScheme { fn compress( &self, + _compressor: &BtrBlocksCompressor, stats: &IntegerStats, _is_sample: bool, _allowed_cascading: usize, @@ -209,6 +333,7 @@ impl Scheme for ConstantScheme { fn expected_compression_ratio( &self, + _compressor: &BtrBlocksCompressor, stats: &IntegerStats, is_sample: bool, _allowed_cascading: usize, @@ -229,6 +354,7 @@ impl Scheme for ConstantScheme { fn compress( &self, + _compressor: &BtrBlocksCompressor, stats: &IntegerStats, _is_sample: bool, _allowed_cascading: usize, @@ -266,6 +392,7 @@ impl Scheme for FORScheme { fn expected_compression_ratio( &self, + _compressor: &BtrBlocksCompressor, stats: &IntegerStats, _is_sample: bool, allowed_cascading: usize, @@ -310,6 +437,7 @@ impl Scheme for FORScheme { fn compress( &self, + compressor: &BtrBlocksCompressor, stats: &IntegerStats, is_sample: bool, _allowed_cascading: usize, @@ -328,7 +456,8 @@ impl Scheme for FORScheme { // of bitpacking. // NOTE: we could delegate in the future if we had another downstream codec that performs // as well. - let compressed = BitPackingScheme.compress(&biased_stats, is_sample, 0, excludes)?; + let compressed = + BitPackingScheme.compress(compressor, &biased_stats, is_sample, 0, excludes)?; let for_compressed = FoRArray::try_new(compressed, for_array.reference_scalar().clone())?; for_compressed @@ -349,6 +478,7 @@ impl Scheme for ZigZagScheme { fn expected_compression_ratio( &self, + compressor: &BtrBlocksCompressor, stats: &IntegerStats, is_sample: bool, allowed_cascading: usize, @@ -372,6 +502,7 @@ impl Scheme for ZigZagScheme { // Run compression on a sample to see how it performs. 
estimate_compression_ratio_with_sampling( self, + compressor, stats, is_sample, allowed_cascading, @@ -381,6 +512,7 @@ impl Scheme for ZigZagScheme { fn compress( &self, + compressor: &BtrBlocksCompressor, stats: &IntegerStats, is_sample: bool, allowed_cascading: usize, @@ -400,8 +532,18 @@ impl Scheme for ZigZagScheme { ]; new_excludes.extend_from_slice(excludes); - let compressed = - IntCompressor::compress(&encoded, is_sample, allowed_cascading - 1, &new_excludes)?; + let compressed = compressor.compress_canonical( + Canonical::Primitive(encoded), + is_sample, + allowed_cascading - 1, + Excludes::int_only(&new_excludes), + )?; + // let compressed = IntCompressor::compress_static( + // &encoded, + // is_sample, + // allowed_cascading - 1, + // &new_excludes, + // )?; tracing::debug!("zigzag output: {}", compressed.display_tree()); @@ -419,6 +561,7 @@ impl Scheme for BitPackingScheme { fn expected_compression_ratio( &self, + compressor: &BtrBlocksCompressor, stats: &IntegerStats, is_sample: bool, allowed_cascading: usize, @@ -436,6 +579,7 @@ impl Scheme for BitPackingScheme { estimate_compression_ratio_with_sampling( self, + compressor, stats, is_sample, allowed_cascading, @@ -445,6 +589,7 @@ impl Scheme for BitPackingScheme { fn compress( &self, + _compressor: &BtrBlocksCompressor, stats: &IntegerStats, _is_sample: bool, _allowed_cascading: usize, @@ -476,6 +621,7 @@ impl Scheme for SparseScheme { // We can avoid asserting the encoding tree instead. fn expected_compression_ratio( &self, + compressor: &BtrBlocksCompressor, stats: &IntegerStats, _is_sample: bool, allowed_cascading: usize, @@ -515,6 +661,7 @@ impl Scheme for SparseScheme { fn compress( &self, + compressor: &BtrBlocksCompressor, stats: &IntegerStats, is_sample: bool, allowed_cascading: usize, @@ -549,7 +696,7 @@ impl Scheme for SparseScheme { let mut new_excludes = vec![SparseScheme.code()]; new_excludes.extend_from_slice(excludes); - let compressed_values = IntCompressor::compress_no_dict( + let compressed_values = IntCompressor::compress_no_dict_static( &sparse.patches().values().to_primitive(), is_sample, allowed_cascading - 1, @@ -558,7 +705,7 @@ impl Scheme for SparseScheme { let indices = sparse.patches().indices().to_primitive().narrow()?; - let compressed_indices = IntCompressor::compress_no_dict( + let compressed_indices = IntCompressor::compress_no_dict_static( &indices, is_sample, allowed_cascading - 1, @@ -588,6 +735,7 @@ impl Scheme for DictScheme { fn expected_compression_ratio( &self, + compressor: &BtrBlocksCompressor, stats: &IntegerStats, _is_sample: bool, allowed_cascading: usize, @@ -628,6 +776,7 @@ impl Scheme for DictScheme { fn compress( &self, + compressor: &BtrBlocksCompressor, stats: &IntegerStats, is_sample: bool, allowed_cascading: usize, @@ -645,7 +794,7 @@ impl Scheme for DictScheme { let mut new_excludes = vec![IntCode::Dict, IntCode::Sequence]; new_excludes.extend_from_slice(excludes); - let compressed_codes = IntCompressor::compress_no_dict( + let compressed_codes = IntCompressor::compress_no_dict_static( &dict.codes().to_primitive().narrow()?, is_sample, allowed_cascading - 1, @@ -673,6 +822,7 @@ impl Scheme for RunEndScheme { fn expected_compression_ratio( &self, + compressor: &BtrBlocksCompressor, stats: &IntegerStats, is_sample: bool, allowed_cascading: usize, @@ -690,6 +840,7 @@ impl Scheme for RunEndScheme { // Run compression on a sample, see how it performs. 
estimate_compression_ratio_with_sampling( self, + compressor, stats, is_sample, allowed_cascading, @@ -699,6 +850,7 @@ impl Scheme for RunEndScheme { fn compress( &self, + compressor: &BtrBlocksCompressor, stats: &IntegerStats, is_sample: bool, allowed_cascading: usize, @@ -718,16 +870,23 @@ impl Scheme for RunEndScheme { count_distinct_values: false, }, ); - let ends_scheme = IntCompressor::choose_scheme( + let ends_compressor = IntCompressor::excluding(&new_excludes); + let ends_scheme = ends_compressor.choose_scheme( + compressor, + &ends_stats, + is_sample, + allowed_cascading - 1, + &new_excludes, + )?; + let compressed_ends = ends_scheme.compress( + compressor, &ends_stats, is_sample, allowed_cascading - 1, &new_excludes, )?; - let compressed_ends = - ends_scheme.compress(&ends_stats, is_sample, allowed_cascading - 1, &new_excludes)?; - let compressed_values = IntCompressor::compress_no_dict( + let compressed_values = IntCompressor::compress_no_dict_static( &values.to_primitive(), is_sample, allowed_cascading - 1, @@ -754,6 +913,7 @@ impl Scheme for SequenceScheme { fn expected_compression_ratio( &self, + _compressor: &BtrBlocksCompressor, stats: &Self::StatsType, _is_sample: bool, _allowed_cascading: usize, @@ -762,6 +922,12 @@ impl Scheme for SequenceScheme { if stats.null_count > 0 { return Ok(0.0); } + + // All values in a seq are unique. + if stats.distinct_values_count as usize != stats.src.len() { + return Ok(0.0); + } + // Since two values are required to store base and multiplier the // compression ratio is divided by 2. Ok(sequence_encode(&stats.src)? @@ -771,6 +937,7 @@ impl Scheme for SequenceScheme { fn compress( &self, + _compressor: &BtrBlocksCompressor, stats: &Self::StatsType, _is_sample: bool, _allowed_cascading: usize, @@ -782,275 +949,290 @@ impl Scheme for SequenceScheme { sequence_encode(&stats.src)?.ok_or_else(|| vortex_err!("cannot sequence encode array")) } } - -#[cfg(test)] -mod tests { - use std::iter; - - use itertools::Itertools; - use rand::RngCore; - use rand::SeedableRng; - use rand::rngs::StdRng; - use vortex_array::Array; - use vortex_array::IntoArray; - use vortex_array::ToCanonical; - use vortex_array::arrays::DictVTable; - use vortex_array::arrays::PrimitiveArray; - use vortex_array::assert_arrays_eq; - use vortex_array::validity::Validity; - use vortex_array::vtable::ValidityHelper; - use vortex_buffer::Buffer; - use vortex_buffer::BufferMut; - use vortex_buffer::buffer; - use vortex_error::VortexResult; - use vortex_sequence::SequenceVTable; - use vortex_sparse::SparseVTable; - - use crate::Compressor; - use crate::CompressorStats; - use crate::FloatCompressor; - use crate::Scheme; - use crate::integer::IntCompressor; - use crate::integer::IntegerStats; - use crate::integer::RLE_INTEGER_SCHEME; - use crate::integer::SequenceScheme; - use crate::integer::SparseScheme; - - #[test] - fn test_empty() { - // Make sure empty array compression does not fail - let result = IntCompressor::compress( - &PrimitiveArray::new(Buffer::::empty(), Validity::NonNullable), - false, - 3, - &[], - ) - .unwrap(); - - assert!(result.is_empty()); - } - - #[test] - fn test_dict_encodable() -> VortexResult<()> { - let mut codes = BufferMut::::with_capacity(65_535); - // Write some runs of length 3 of a handful of different values. Interrupted by some - // one-off values. 
- - let numbers = [0, 10, 50, 100, 1000, 3000] - .into_iter() - .map(|i| 12340 * i) // must be big enough to not prefer fastlanes.bitpacked - .collect_vec(); - - let mut rng = StdRng::seed_from_u64(1u64); - while codes.len() < 64000 { - let run_length = rng.next_u32() % 5; - let value = numbers[rng.next_u32() as usize % numbers.len()]; - for _ in 0..run_length { - codes.push(value); - } - } - - let primitive = codes.freeze().into_array().to_primitive(); - let compressed = IntCompressor::compress(&primitive, false, 3, &[])?; - assert!(compressed.is::()); - Ok(()) - } - - #[test] - fn sparse_with_nulls() -> VortexResult<()> { - let array = PrimitiveArray::new( - buffer![189u8, 189, 189, 0, 46], - Validity::from_iter(vec![true, true, true, true, false]), - ); - let compressed = SparseScheme.compress(&IntegerStats::generate(&array), false, 3, &[])?; - assert!(compressed.is::()); - let decoded = compressed.clone(); - let expected = - PrimitiveArray::new(buffer![189u8, 189, 189, 0, 0], array.validity().clone()) - .into_array(); - assert_arrays_eq!(decoded.as_ref(), expected.as_ref()); - Ok(()) - } - - #[test] - fn sparse_mostly_nulls() -> VortexResult<()> { - let array = PrimitiveArray::new( - buffer![189u8, 189, 189, 189, 189, 189, 189, 189, 189, 0, 46], - Validity::from_iter(vec![ - false, false, false, false, false, false, false, false, false, false, true, - ]), - ); - let compressed = SparseScheme.compress(&IntegerStats::generate(&array), false, 3, &[])?; - assert!(compressed.is::()); - let decoded = compressed.clone(); - let expected = PrimitiveArray::new( - buffer![0u8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 46], - array.validity().clone(), - ) - .into_array(); - assert_arrays_eq!(decoded.as_ref(), expected.as_ref()); - Ok(()) - } - - #[test] - fn nullable_sequence() -> VortexResult<()> { - let values = (0i32..20).step_by(7).collect_vec(); - let array = PrimitiveArray::from_option_iter(values.clone().into_iter().map(Some)); - let compressed = SequenceScheme.compress(&IntegerStats::generate(&array), false, 3, &[])?; - assert!(compressed.is::()); - let decoded = compressed; - let expected = PrimitiveArray::from_option_iter(values.into_iter().map(Some)).into_array(); - assert_arrays_eq!(decoded.as_ref(), expected.as_ref()); - Ok(()) - } - - #[test] - fn test_rle_compression() -> VortexResult<()> { - let mut values = Vec::new(); - values.extend(iter::repeat_n(42i32, 100)); - values.extend(iter::repeat_n(123i32, 200)); - values.extend(iter::repeat_n(987i32, 150)); - - let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); - let compressed = - RLE_INTEGER_SCHEME.compress(&IntegerStats::generate(&array), false, 3, &[])?; - - let decoded = compressed; - let expected = Buffer::copy_from(&values).into_array(); - assert_arrays_eq!(decoded.as_ref(), expected.as_ref()); - Ok(()) - } - - #[test_with::env(CI)] - #[test_with::no_env(VORTEX_SKIP_SLOW_TESTS)] - fn compress_large_int() -> VortexResult<()> { - const NUM_LISTS: usize = 10_000; - const ELEMENTS_PER_LIST: usize = 5_000; - - let prim = (0..NUM_LISTS) - .flat_map(|list_idx| { - (0..ELEMENTS_PER_LIST).map(move |elem_idx| (list_idx * 1000 + elem_idx) as f64) - }) - .collect::(); - - drop(FloatCompressor::compress(&prim, false, 3, &[])?); - - Ok(()) - } -} - -/// Tests to verify that each integer compression scheme produces the expected encoding. 
-#[cfg(test)] -mod scheme_selection_tests { - use std::iter; - - use vortex_array::arrays::ConstantVTable; - use vortex_array::arrays::DictVTable; - use vortex_array::arrays::PrimitiveArray; - use vortex_array::validity::Validity; - use vortex_buffer::Buffer; - use vortex_fastlanes::BitPackedVTable; - use vortex_fastlanes::FoRVTable; - use vortex_fastlanes::RLEVTable; - use vortex_runend::RunEndVTable; - use vortex_sequence::SequenceVTable; - use vortex_sparse::SparseVTable; - - use crate::Compressor; - use crate::integer::IntCompressor; - - #[test] - fn test_constant_compressed() { - let values: Vec = iter::repeat_n(42, 100).collect(); - let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); - let compressed = IntCompressor::compress(&array, false, 3, &[]).unwrap(); - assert!(compressed.is::()); - } - - #[test] - fn test_for_compressed() { - let values: Vec = (0..1000).map(|i| 1_000_000 + ((i * 37) % 100)).collect(); - let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); - let compressed = IntCompressor::compress(&array, false, 3, &[]).unwrap(); - assert!(compressed.is::()); - } - - #[test] - fn test_bitpacking_compressed() { - let values: Vec = (0..1000).map(|i| i % 16).collect(); - let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); - let compressed = IntCompressor::compress(&array, false, 3, &[]).unwrap(); - assert!(compressed.is::()); - } - - #[test] - fn test_sparse_compressed() { - let mut values: Vec = Vec::new(); - for i in 0..1000 { - if i % 20 == 0 { - values.push(2_000_000 + (i * 7) % 1000); - } else { - values.push(1_000_000); - } - } - let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); - let compressed = IntCompressor::compress(&array, false, 3, &[]).unwrap(); - assert!(compressed.is::()); - } - - #[test] - fn test_dict_compressed() { - use rand::RngCore; - use rand::SeedableRng; - use rand::rngs::StdRng; - - let mut codes = Vec::with_capacity(65_535); - let numbers: Vec = [0, 10, 50, 100, 1000, 3000] - .into_iter() - .map(|i| 12340 * i) // must be big enough to not prefer fastlanes.bitpacked - .collect(); - - let mut rng = StdRng::seed_from_u64(1u64); - while codes.len() < 64000 { - let run_length = rng.next_u32() % 5; - let value = numbers[rng.next_u32() as usize % numbers.len()]; - for _ in 0..run_length { - codes.push(value); - } - } - - let array = PrimitiveArray::new(Buffer::copy_from(&codes), Validity::NonNullable); - let compressed = IntCompressor::compress(&array, false, 3, &[]).unwrap(); - assert!(compressed.is::()); - } - - #[test] - fn test_runend_compressed() { - let mut values: Vec = Vec::new(); - for i in 0..100 { - values.extend(iter::repeat_n((i32::MAX - 50).wrapping_add(i), 10)); - } - let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); - let compressed = IntCompressor::compress(&array, false, 3, &[]).unwrap(); - assert!(compressed.is::()); - } - - #[test] - fn test_sequence_compressed() { - let values: Vec = (0..1000).map(|i| i * 7).collect(); - let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); - let compressed = IntCompressor::compress(&array, false, 3, &[]).unwrap(); - assert!(compressed.is::()); - } - - #[test] - fn test_rle_compressed() { - let mut values: Vec = Vec::new(); - for i in 0..10 { - values.extend(iter::repeat_n(i, 100)); - } - let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); - let compressed = 
IntCompressor::compress(&array, false, 3, &[]).unwrap(); - assert!(compressed.is::()); - } -} +// +// #[cfg(test)] +// mod tests { +// use std::iter; +// +// use itertools::Itertools; +// use rand::RngCore; +// use rand::SeedableRng; +// use rand::rngs::StdRng; +// use vortex_array::Array; +// use vortex_array::IntoArray; +// use vortex_array::ToCanonical; +// use vortex_array::arrays::DictVTable; +// use vortex_array::arrays::PrimitiveArray; +// use vortex_array::assert_arrays_eq; +// use vortex_array::validity::Validity; +// use vortex_array::vtable::ValidityHelper; +// use vortex_buffer::Buffer; +// use vortex_buffer::BufferMut; +// use vortex_buffer::buffer; +// use vortex_error::VortexResult; +// use vortex_sequence::SequenceVTable; +// use vortex_sparse::SparseVTable; +// +// use crate::Compressor; +// use crate::CompressorStats; +// use crate::FloatCompressor; +// use crate::Scheme; +// use crate::integer::IntCompressor; +// use crate::integer::IntegerStats; +// use crate::integer::RLE_INTEGER_SCHEME; +// use crate::integer::SequenceScheme; +// use crate::integer::SparseScheme; +// +// #[test] +// fn test_empty() { +// // Make sure empty array compression does not fail +// let result = IntCompressor::default() +// .compress( +// &PrimitiveArray::new(Buffer::::empty(), Validity::NonNullable), +// false, +// 3, +// ) +// .unwrap(); +// +// assert!(result.is_empty()); +// } +// +// #[test] +// fn test_dict_encodable() -> VortexResult<()> { +// let mut codes = BufferMut::::with_capacity(65_535); +// // Write some runs of length 3 of a handful of different values. Interrupted by some +// // one-off values. +// +// let numbers = [0, 10, 50, 100, 1000, 3000] +// .into_iter() +// .map(|i| 12340 * i) // must be big enough to not prefer fastlanes.bitpacked +// .collect_vec(); +// +// let mut rng = StdRng::seed_from_u64(1u64); +// while codes.len() < 64000 { +// let run_length = rng.next_u32() % 5; +// let value = numbers[rng.next_u32() as usize % numbers.len()]; +// for _ in 0..run_length { +// codes.push(value); +// } +// } +// +// let primitive = codes.freeze().into_array().to_primitive(); +// let compressed = IntCompressor::default().compress(&primitive, false, 3)?; +// assert!(compressed.is::()); +// Ok(()) +// } +// +// #[test] +// fn sparse_with_nulls() -> VortexResult<()> { +// let array = PrimitiveArray::new( +// buffer![189u8, 189, 189, 0, 46], +// Validity::from_iter(vec![true, true, true, true, false]), +// ); +// let compressed = SparseScheme.compress(&IntegerStats::generate(&array), false, 3, &[])?; +// assert!(compressed.is::()); +// let decoded = compressed.clone(); +// let expected = +// PrimitiveArray::new(buffer![189u8, 189, 189, 0, 0], array.validity().clone()) +// .into_array(); +// assert_arrays_eq!(decoded.as_ref(), expected.as_ref()); +// Ok(()) +// } +// +// #[test] +// fn sparse_mostly_nulls() -> VortexResult<()> { +// let array = PrimitiveArray::new( +// buffer![189u8, 189, 189, 189, 189, 189, 189, 189, 189, 0, 46], +// Validity::from_iter(vec![ +// false, false, false, false, false, false, false, false, false, false, true, +// ]), +// ); +// let compressed = SparseScheme.compress(&IntegerStats::generate(&array), false, 3, &[])?; +// assert!(compressed.is::()); +// let decoded = compressed.clone(); +// let expected = PrimitiveArray::new( +// buffer![0u8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 46], +// array.validity().clone(), +// ) +// .into_array(); +// assert_arrays_eq!(decoded.as_ref(), expected.as_ref()); +// Ok(()) +// } +// +// #[test] +// fn nullable_sequence() -> 
VortexResult<()> { +// let values = (0i32..20).step_by(7).collect_vec(); +// let array = PrimitiveArray::from_option_iter(values.clone().into_iter().map(Some)); +// let compressed = SequenceScheme.compress(&IntegerStats::generate(&array), false, 3, &[])?; +// assert!(compressed.is::()); +// let decoded = compressed; +// let expected = PrimitiveArray::from_option_iter(values.into_iter().map(Some)).into_array(); +// assert_arrays_eq!(decoded.as_ref(), expected.as_ref()); +// Ok(()) +// } +// +// #[test] +// fn test_rle_compression() -> VortexResult<()> { +// let mut values = Vec::new(); +// values.extend(iter::repeat_n(42i32, 100)); +// values.extend(iter::repeat_n(123i32, 200)); +// values.extend(iter::repeat_n(987i32, 150)); +// +// let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); +// let compressed = +// RLE_INTEGER_SCHEME.compress(&IntegerStats::generate(&array), false, 3, &[])?; +// +// let decoded = compressed; +// let expected = Buffer::copy_from(&values).into_array(); +// assert_arrays_eq!(decoded.as_ref(), expected.as_ref()); +// Ok(()) +// } +// +// #[test_with::env(CI)] +// #[test_with::no_env(VORTEX_SKIP_SLOW_TESTS)] +// fn compress_large_int() -> VortexResult<()> { +// const NUM_LISTS: usize = 10_000; +// const ELEMENTS_PER_LIST: usize = 5_000; +// +// let prim = (0..NUM_LISTS) +// .flat_map(|list_idx| { +// (0..ELEMENTS_PER_LIST).map(move |elem_idx| (list_idx * 1000 + elem_idx) as f64) +// }) +// .collect::(); +// +// drop(FloatCompressor::compress_static(&prim, false, 3, &[])?); +// +// Ok(()) +// } +// } +// +// /// Tests to verify that each integer compression scheme produces the expected encoding. +// #[cfg(test)] +// mod scheme_selection_tests { +// use std::iter; +// +// use vortex_array::arrays::ConstantVTable; +// use vortex_array::arrays::DictVTable; +// use vortex_array::arrays::PrimitiveArray; +// use vortex_array::validity::Validity; +// use vortex_buffer::Buffer; +// use vortex_error::VortexResult; +// use vortex_fastlanes::BitPackedVTable; +// use vortex_fastlanes::FoRVTable; +// use vortex_fastlanes::RLEVTable; +// use vortex_runend::RunEndVTable; +// use vortex_sequence::SequenceVTable; +// use vortex_sparse::SparseVTable; +// +// use crate::Compressor; +// use crate::integer::IntCompressor; +// +// #[test] +// fn test_constant_compressed() { +// let values: Vec = iter::repeat_n(42, 100).collect(); +// let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); +// let compressed = IntCompressor::default().compress(&array, false, 3).unwrap(); +// assert!(compressed.is::()); +// } +// +// #[test] +// fn test_for_compressed() { +// let values: Vec = (0..1000).map(|i| 1_000_000 + ((i * 37) % 100)).collect(); +// let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); +// let compressed = IntCompressor::default().compress(&array, false, 3).unwrap(); +// assert!(compressed.is::()); +// } +// +// #[test] +// fn test_bitpacking_compressed() { +// let values: Vec = (0..1000).map(|i| i % 16).collect(); +// let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); +// let compressed = IntCompressor::default().compress(&array, false, 3).unwrap(); +// assert!(compressed.is::()); +// } +// +// #[test] +// fn test_sparse_compressed() { +// let mut values: Vec = Vec::new(); +// for i in 0..1000 { +// if i % 20 == 0 { +// values.push(2_000_000 + (i * 7) % 1000); +// } else { +// values.push(1_000_000); +// } +// } +// let array = 
PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); +// let compressed = IntCompressor::default().compress(&array, false, 3).unwrap(); +// assert!(compressed.is::()); +// } +// +// #[test] +// fn test_dict_compressed() { +// use rand::RngCore; +// use rand::SeedableRng; +// use rand::rngs::StdRng; +// +// let mut codes = Vec::with_capacity(65_535); +// let numbers: Vec = [0, 10, 50, 100, 1000, 3000] +// .into_iter() +// .map(|i| 12340 * i) // must be big enough to not prefer fastlanes.bitpacked +// .collect(); +// +// let mut rng = StdRng::seed_from_u64(1u64); +// while codes.len() < 64000 { +// let run_length = rng.next_u32() % 5; +// let value = numbers[rng.next_u32() as usize % numbers.len()]; +// for _ in 0..run_length { +// codes.push(value); +// } +// } +// +// let array = PrimitiveArray::new(Buffer::copy_from(&codes), Validity::NonNullable); +// let compressed = IntCompressor::default().compress(&array, false, 3).unwrap(); +// assert!(compressed.is::()); +// } +// +// #[test] +// fn test_runend_compressed() { +// let mut values: Vec = Vec::new(); +// for i in 0..100 { +// values.extend(iter::repeat_n((i32::MAX - 50).wrapping_add(i), 10)); +// } +// let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); +// let compressed = IntCompressor::default().compress(&array, false, 3).unwrap(); +// assert!(compressed.is::()); +// } +// +// #[test] +// fn test_sequence_compressed() { +// let values: Vec = (0..1000).map(|i| i * 7).collect(); +// let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); +// let compressed = IntCompressor::default().compress(&array, false, 3).unwrap(); +// assert!(compressed.is::()); +// } +// +// #[test] +// fn test_rle_compressed() { +// let mut values: Vec = Vec::new(); +// for i in 0..10 { +// values.extend(iter::repeat_n(i, 100)); +// } +// let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); +// let compressed = IntCompressor::default().compress(&array, false, 3).unwrap(); +// assert!(compressed.is::()); +// } +// +// #[test] +// fn test_prim_constant() -> VortexResult<()> { +// tracing_subscriber::fmt() +// .with_max_level(tracing::Level::TRACE) +// .init(); +// +// let prim = (0..1000).map(|_x| 40).collect::(); +// let comp = IntCompressor::default(); +// let resul = comp.compress(&prim, false, 2)?; +// println!("res {}", resul); +// +// Ok(()) +// } +// } diff --git a/vortex-btrblocks/src/lib.rs b/vortex-btrblocks/src/lib.rs index 2b50589dcdc..58df412786e 100644 --- a/vortex-btrblocks/src/lib.rs +++ b/vortex-btrblocks/src/lib.rs @@ -37,9 +37,8 @@ use std::fmt::Debug; use std::hash::Hash; -use std::sync::Arc; +use std::hash::Hasher; -use enum_iterator::all; use vortex_array::Array; use vortex_array::ArrayRef; use vortex_array::Canonical; @@ -62,7 +61,6 @@ use vortex_dtype::Nullability; use vortex_dtype::datetime::Timestamp; use vortex_error::VortexExpect; use vortex_error::VortexResult; -use vortex_utils::aliases::hash_set::HashSet; use crate::decimal::compress_decimal; pub use crate::float::FloatCode; @@ -90,6 +88,62 @@ mod temporal; pub use builder::BtrBlocksCompressorBuilder; +use crate::float::FloatScheme; +use crate::integer::IntegerScheme; +use crate::string::StringScheme; + +/// Holds references to exclude lists for each compression code type. +/// +/// This struct is passed through recursive compression calls to specify +/// which schemes should be excluded at each level. 
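+///
+/// # Example
+///
+/// A minimal sketch of constructing exclude lists; the specific codes excluded here are
+/// chosen purely for illustration.
+///
+/// ```rust
+/// use vortex_btrblocks::{Excludes, IntCode};
+///
+/// // No exclusions at the top level.
+/// let none = Excludes::none();
+/// assert!(none.int.is_empty());
+///
+/// // Exclude dictionary and RLE integer schemes, leaving float and string schemes untouched.
+/// let ints_only = Excludes::int_only(&[IntCode::Dict, IntCode::Rle]);
+/// assert!(ints_only.float.is_empty() && ints_only.string.is_empty());
+/// ```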
+#[derive(Debug, Clone, Copy, Default)] +pub struct Excludes<'a> { + /// Integer schemes to exclude. + pub int: &'a [IntCode], + /// Float schemes to exclude. + pub float: &'a [FloatCode], + /// String schemes to exclude. + pub string: &'a [StringCode], +} + +impl<'a> Excludes<'a> { + /// Creates an empty excludes (no exclusions). + pub const fn none() -> Self { + Self { + int: &[], + float: &[], + string: &[], + } + } + + /// Creates excludes with only integer exclusions. + pub const fn int_only(int: &'a [IntCode]) -> Self { + Self { + int, + float: &[], + string: &[], + } + } + + /// Creates excludes with only float exclusions. + pub const fn float_only(float: &'a [FloatCode]) -> Self { + Self { + int: &[], + float, + string: &[], + } + } + + /// Creates excludes with only string exclusions. + pub const fn string_only(string: &'a [StringCode]) -> Self { + Self { + int: &[], + float: &[], + string, + } + } +} + /// Configures how stats are generated. pub struct GenerateStatsOptions { /// Should distinct values should be counted during stats generation. @@ -171,6 +225,7 @@ pub trait Scheme: Debug { /// Returns the estimated compression ratio as well as the tree of compressors to use. fn expected_compression_ratio( &self, + compressor: &BtrBlocksCompressor, stats: &Self::StatsType, is_sample: bool, allowed_cascading: usize, @@ -178,6 +233,7 @@ pub trait Scheme: Debug { ) -> VortexResult { estimate_compression_ratio_with_sampling( self, + compressor, stats, is_sample, allowed_cascading, @@ -188,6 +244,7 @@ pub trait Scheme: Debug { /// Compress the input with this scheme, yielding a new array. fn compress( &self, + compressor: &BtrBlocksCompressor, stats: &Self::StatsType, is_sample: bool, allowed_cascading: usize, @@ -195,8 +252,21 @@ pub trait Scheme: Debug { ) -> VortexResult; } +impl PartialEq for dyn Scheme { + fn eq(&self, other: &Self) -> bool { + self.code() == other.code() + } +} +impl Eq for dyn Scheme {} +impl Hash for dyn Scheme { + fn hash(&self, state: &mut H) { + self.code().hash(state) + } +} + fn estimate_compression_ratio_with_sampling( - compressor: &T, + scheme: &T, + btr_blocks_compressor: &BtrBlocksCompressor, stats: &T::StatsType, is_sample: bool, allowed_cascading: usize, @@ -230,13 +300,19 @@ fn estimate_compression_ratio_with_sampling( stats.sample(SAMPLE_SIZE, sample_count) }; - let after = compressor - .compress(&sample, true, allowed_cascading, excludes)? + let after = scheme + .compress( + btr_blocks_compressor, + &sample, + true, + allowed_cascading, + excludes, + )? .nbytes(); let before = sample.source().nbytes(); tracing::debug!( - "estimate_compression_ratio_with_sampling(compressor={compressor:#?} is_sample={is_sample}, allowed_cascading={allowed_cascading}) = {}", + "estimate_compression_ratio_with_sampling(compressor={scheme:#?} is_sample={is_sample}, allowed_cascading={allowed_cascading}) = {}", before as f64 / after as f64 ); @@ -262,52 +338,13 @@ pub trait Compressor { /// The statistics type used to analyze arrays for compression. type StatsType: CompressorStats; + /// Generates statistics for the given array to guide compression scheme selection. + fn gen_stats(&self, array: &::Array) -> Self::StatsType; + /// Returns all available compression schemes for this compressor. - fn schemes() -> &'static [&'static Self::SchemeType]; + fn schemes(&self) -> &[&'static Self::SchemeType]; /// Returns the default fallback compression scheme. - fn default_scheme() -> &'static Self::SchemeType; - /// Returns the scheme code for dictionary compression. 
- fn dict_scheme_code() -> ::CodeType; - - /// Compresses an array using the optimal compression scheme. - /// - /// Generates statistics on the input array, selects the best compression scheme, - /// and applies it. Returns the original array if compression would increase size. - fn compress( - array: &::Array, - is_sample: bool, - allowed_cascading: usize, - excludes: &[::CodeType], - ) -> VortexResult - where - Self::SchemeType: 'static, - { - // Avoid compressing empty arrays. - if array.is_empty() { - return Ok(array.to_array()); - } - - // Generate stats on the array directly. - let stats = if excludes.contains(&Self::dict_scheme_code()) { - Self::StatsType::generate_opts( - array, - GenerateStatsOptions { - count_distinct_values: false, - }, - ) - } else { - Self::StatsType::generate(array) - }; - let best_scheme = Self::choose_scheme(&stats, is_sample, allowed_cascading, excludes)?; - - let output = best_scheme.compress(&stats, is_sample, allowed_cascading, excludes)?; - if output.nbytes() < array.nbytes() { - Ok(output) - } else { - tracing::debug!("resulting tree too large: {}", output.display_tree()); - Ok(array.to_array()) - } - } + fn default_scheme(&self) -> &'static Self::SchemeType; /// Selects the best compression scheme based on expected compression ratios. /// @@ -316,6 +353,8 @@ pub trait Compressor { /// if no scheme provides compression benefits. #[allow(clippy::cognitive_complexity)] fn choose_scheme( + &self, + compressor: &BtrBlocksCompressor, stats: &Self::StatsType, is_sample: bool, allowed_cascading: usize, @@ -327,7 +366,8 @@ pub trait Compressor { // logging helpers let depth = MAX_CASCADE - allowed_cascading; - for scheme in Self::schemes().iter() { + for scheme in self.schemes().iter() { + // Skip excluded schemes if excludes.contains(&scheme.code()) { continue; } @@ -337,10 +377,21 @@ pub trait Compressor { continue; } - tracing::trace!(is_sample, depth, ?scheme, "Trying compression scheme"); + tracing::trace!( + is_sample, + depth, + is_constant = scheme.is_constant(), + ?scheme, + "Trying compression scheme" + ); - let ratio = - scheme.expected_compression_ratio(stats, is_sample, allowed_cascading, excludes)?; + let ratio = scheme.expected_compression_ratio( + compressor, + stats, + is_sample, + allowed_cascading, + excludes, + )?; tracing::trace!( is_sample, depth, @@ -366,52 +417,54 @@ pub trait Compressor { if let Some(best) = best_scheme { Ok(best) } else { - Ok(Self::default_scheme()) + Ok(self.default_scheme()) } } } -/// Configuration for allowed compression schemes. +/// Compresses an array using the given compressor. /// -/// This is immutable after construction and can be shared across multiple compression calls. -/// Use [`BtrBlocksCompressorBuilder`] to create a custom configuration. -#[derive(Debug, Clone)] -pub struct BtrBlocksCompressorConfig { - /// Allowed integer compression schemes. - int_schemes: HashSet, - - /// Allowed float compression schemes. - float_schemes: HashSet, - - /// Allowed string compression schemes. - string_schemes: HashSet, -} +/// Generates statistics on the input array, selects the best compression scheme, +/// and applies it. Returns the original array if compression would increase size. +pub fn compress( + c: &C, + compressor: &BtrBlocksCompressor, + array: &<::ArrayVTable as VTable>::Array, + is_sample: bool, + allowed_cascading: usize, + excludes: &[::CodeType], +) -> VortexResult +where + ::SchemeType: 'static, +{ + // Avoid compressing empty arrays. 
+ if array.is_empty() { + return Ok(array.to_array()); + } -impl Default for BtrBlocksCompressorConfig { - fn default() -> Self { - Self { - int_schemes: all::().collect(), - float_schemes: all::().collect(), - string_schemes: all::().collect(), - } + // Generate stats on the array directly. + let stats = c.gen_stats(array); + let best_scheme = + c.choose_scheme(compressor, &stats, is_sample, allowed_cascading, excludes)?; + + let output = + best_scheme.compress(compressor, &stats, is_sample, allowed_cascading, excludes)?; + if output.nbytes() < array.nbytes() { + Ok(output) + } else { + tracing::debug!("resulting tree too large: {}", output.display_tree()); + Ok(array.to_array()) } } -impl BtrBlocksCompressorConfig { - /// Creates a config from the given allowed schemes. - /// - /// This is used by [`BtrBlocksCompressorBuilder::build`]. - pub(crate) fn from_schemes( - int_schemes: HashSet, - float_schemes: HashSet, - string_schemes: HashSet, - ) -> Self { - Self { - int_schemes, - float_schemes, - string_schemes, - } - } +trait CanonicalCompressor { + fn compress_canonical( + &self, + array: Canonical, + is_sample: bool, + allowed_cascading: usize, + excludes: Excludes, + ) -> VortexResult; } /// The main compressor type implementing BtrBlocks-inspired compression. @@ -441,30 +494,18 @@ impl BtrBlocksCompressorConfig { /// .exclude_int([IntCode::Dict]) /// .build(); /// ``` -#[derive(Debug, Clone)] +#[derive(Clone, Default)] pub struct BtrBlocksCompressor { - /// Immutable configuration for allowed schemes. - config: Arc, - - /// Runtime integer excludes used during recursion. - int_excludes: Vec, + /// Integer compressor with configured schemes. + pub int_schemes: Vec<&'static dyn IntegerScheme>, - /// Runtime float excludes used during recursion. - float_excludes: Vec, + /// Float compressor with configured schemes. + // float_compressor: FloatCompressor, + pub float_schemes: Vec<&'static dyn FloatScheme>, - /// Runtime string excludes used during recursion. - string_excludes: Vec, -} - -impl Default for BtrBlocksCompressor { - fn default() -> Self { - Self { - config: Arc::new(BtrBlocksCompressorConfig::default()), - int_excludes: Vec::new(), - float_excludes: Vec::new(), - string_excludes: Vec::new(), - } - } + /// String compressor with configured schemes. + pub string_schemes: Vec<&'static dyn StringScheme>, + // string_compressor: StringCompressor, } impl BtrBlocksCompressor { @@ -473,75 +514,6 @@ impl BtrBlocksCompressor { Self::default() } - /// Creates a compressor from a config. - /// - /// This is used by [`BtrBlocksCompressorBuilder::build`]. - pub(crate) fn from_config(config: BtrBlocksCompressorConfig) -> Self { - Self { - config: Arc::new(config), - int_excludes: Vec::new(), - float_excludes: Vec::new(), - string_excludes: Vec::new(), - } - } - - fn int_excludes(&self) -> Vec { - all::() - .filter(|c| !self.config.int_schemes.contains(c) || self.int_excludes.contains(c)) - .collect() - } - - fn float_excludes(&self) -> Vec { - all::() - .filter(|c| !self.config.float_schemes.contains(c) || self.float_excludes.contains(c)) - .collect() - } - - fn string_excludes(&self) -> Vec { - all::() - .filter(|c| !self.config.string_schemes.contains(c) || self.string_excludes.contains(c)) - .collect() - } - - /// Returns a new compressor with additional runtime int excludes for recursion. 
- #[allow(dead_code)] - fn with_int_excludes(&self, excludes: impl IntoIterator) -> Self { - let mut int_excludes = self.int_excludes.clone(); - int_excludes.extend(excludes); - Self { - config: Arc::clone(&self.config), - int_excludes, - float_excludes: self.float_excludes.clone(), - string_excludes: self.string_excludes.clone(), - } - } - - /// Returns a new compressor with additional runtime float excludes for recursion. - #[allow(dead_code)] - fn with_float_excludes(&self, excludes: impl IntoIterator) -> Self { - let mut float_excludes = self.float_excludes.clone(); - float_excludes.extend(excludes); - Self { - config: Arc::clone(&self.config), - int_excludes: self.int_excludes.clone(), - float_excludes, - string_excludes: self.string_excludes.clone(), - } - } - - /// Returns a new compressor with additional runtime string excludes for recursion. - #[allow(dead_code)] - fn with_string_excludes(&self, excludes: impl IntoIterator) -> Self { - let mut string_excludes = self.string_excludes.clone(); - string_excludes.extend(excludes); - Self { - config: Arc::clone(&self.config), - int_excludes: self.int_excludes.clone(), - float_excludes: self.float_excludes.clone(), - string_excludes, - } - } - /// Compresses an array using BtrBlocks-inspired compression. /// /// First canonicalizes and compacts the array, then applies optimal compression schemes. @@ -552,30 +524,48 @@ impl BtrBlocksCompressor { // Compact it, removing any wasted space before we attempt to compress it let compact = canonical.compact()?; - self.compress_canonical(compact) + self.compress_canonical(compact, Excludes::none()) } +} +impl CanonicalCompressor for BtrBlocksCompressor { /// Compresses a canonical array by dispatching to type-specific compressors. /// /// Recursively compresses nested structures and applies optimal schemes for each data type. - pub fn compress_canonical(&self, array: Canonical) -> VortexResult { + fn compress_canonical<'a>( + &self, + array: Canonical, + is_sample: bool, + allowed_cascading: usize, + excludes: Excludes<'a>, + ) -> VortexResult { match array { Canonical::Null(null_array) => Ok(null_array.into_array()), // TODO(aduffy): Sparse, other bool compressors. Canonical::Bool(bool_array) => Ok(bool_array.into_array()), Canonical::Primitive(primitive) => { if primitive.ptype().is_int() { - IntCompressor::compress(&primitive, false, MAX_CASCADE, &self.int_excludes()) + // compress( + // &self.int_schemes, + // &primitive, + // false, + // MAX_CASCADE, + // excludes.int, + // ) + todo!() } else { - FloatCompressor::compress( + compress( + &FloatCompressor { + btr_blocks_compressor: &self, + }, &primitive, false, MAX_CASCADE, - &self.float_excludes(), + excludes.float, ) } } - Canonical::Decimal(decimal) => compress_decimal(&decimal), + Canonical::Decimal(decimal) => compress_decimal(self, &decimal), Canonical::Struct(struct_array) => { let fields = struct_array .unmasked_fields() @@ -607,7 +597,7 @@ impl BtrBlocksCompressor { // we guarantee above that all elements are referenced by offsets, we may narrow the // widths. 
- let compressed_offsets = IntCompressor::compress_no_dict( + let compressed_offsets = IntCompressor::compress_no_dict_static( &list_array.offsets().to_primitive().narrow()?, false, MAX_CASCADE, @@ -637,12 +627,14 @@ impl BtrBlocksCompressor { .dtype() .eq_ignore_nullability(&DType::Utf8(Nullability::NonNullable)) { - StringCompressor::compress( - &strings, - false, - MAX_CASCADE, - &self.string_excludes(), - ) + // compress( + // &self.string_schemes, + // &strings, + // false, + // MAX_CASCADE, + // excludes.string, + // ) + todo!() } else { // Binary arrays do not compress Ok(strings.into_array()) @@ -681,3 +673,53 @@ impl BtrBlocksCompressor { } } } + +/// Context passed through recursive compression calls. +#[derive(Debug, Clone, Copy)] +pub struct CompressorContext<'a> { + /// Whether we're compressing a sample (for ratio estimation). + pub is_sample: bool, + /// Remaining cascade depth allowed. + pub allowed_cascading: usize, + /// Schemes to exclude at this level. + pub excludes: Excludes<'a>, +} + +impl<'a> CompressorContext<'a> { + /// Creates a new context for top-level compression. + pub fn new(allowed_cascading: usize) -> Self { + Self { + is_sample: false, + allowed_cascading, + excludes: Excludes::none(), + } + } + + /// Creates a context for sample-based compression ratio estimation. + pub fn for_sample(allowed_cascading: usize) -> Self { + Self { + is_sample: true, + allowed_cascading, + excludes: Excludes::none(), + } + } + + /// Returns a new context with decremented cascade depth. + pub fn decrement_cascade(self) -> Self { + Self { + allowed_cascading: self.allowed_cascading.saturating_sub(1), + ..self + } + } + + /// Returns a new context with additional integer excludes. + pub fn with_int_excludes(self, int: &'a [IntCode]) -> Self { + Self { + excludes: Excludes { + int, + ..self.excludes + }, + ..self + } + } +} diff --git a/vortex-btrblocks/src/patches.rs b/vortex-btrblocks/src/patches.rs index 9890ab7bd07..d9bee570824 100644 --- a/vortex-btrblocks/src/patches.rs +++ b/vortex-btrblocks/src/patches.rs @@ -8,6 +8,8 @@ use vortex_array::arrays::ConstantArray; use vortex_array::patches::Patches; use vortex_error::VortexResult; +use crate::BtrBlocksCompressor; + /// Compresses the given patches by downscaling integers and checking for constant values. pub fn compress_patches(patches: &Patches) -> VortexResult { // Downscale the patch indices. diff --git a/vortex-btrblocks/src/rle.rs b/vortex-btrblocks/src/rle.rs index 507e8cf8c0e..a66397ab896 100644 --- a/vortex-btrblocks/src/rle.rs +++ b/vortex-btrblocks/src/rle.rs @@ -3,6 +3,7 @@ use std::fmt::Debug; use std::hash::Hash; +use std::marker::PhantomData; use vortex_array::ArrayRef; use vortex_array::IntoArray; @@ -11,6 +12,7 @@ use vortex_array::arrays::PrimitiveArray; use vortex_error::VortexResult; use vortex_fastlanes::RLEArray; +use crate::BtrBlocksCompressor; use crate::CompressorStats; use crate::Scheme; use crate::estimate_compression_ratio_with_sampling; @@ -19,53 +21,69 @@ use crate::integer::IntCompressor; /// Threshold for the average run length in an array before we consider run-length encoding. pub const RUN_LENGTH_THRESHOLD: u32 = 4; +/// Trait for accessing RLE-specific statistics. pub trait RLEStats { fn value_count(&self) -> u32; fn average_run_length(&self) -> u32; fn source(&self) -> &PrimitiveArray; } -/// RLE scheme that is generic over stats and code. 
-#[derive(Debug, Clone, Copy)] -pub struct RLEScheme { - pub code: Code, - /// Function to compress values - pub compress_values_fn: fn(&PrimitiveArray, bool, usize, &[Code]) -> VortexResult, - /// Phantom data to tie the scheme to specific stats type - _phantom: std::marker::PhantomData, +/// Configuration trait for RLE schemes. +/// +/// Implement this trait to define the behavior of an RLE scheme for a specific +/// stats and code type combination. +pub trait RLEConfig: Debug + Send + Sync + 'static { + /// The statistics type used by this RLE scheme. + type Stats: RLEStats + CompressorStats; + /// The code type used to identify schemes. + type Code: Copy + Clone + Debug + Hash + PartialEq + Eq; + + /// The unique code identifying this RLE scheme. + const CODE: Self::Code; + + /// Compress the values array after RLE encoding. + fn compress_values( + values: &PrimitiveArray, + is_sample: bool, + allowed_cascading: usize, + excludes: &[Self::Code], + ) -> VortexResult; } -impl RLEScheme { - pub const fn new( - code: C, - compress_values_fn: fn(&PrimitiveArray, bool, usize, &[C]) -> VortexResult, - ) -> Self { - Self { - code, - compress_values_fn, - _phantom: std::marker::PhantomData, - } +/// RLE scheme that is generic over a configuration type. +/// +/// This is a ZST (zero-sized type) - all behavior is defined by the `RLEConfig` trait. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct RLEScheme(PhantomData); + +impl RLEScheme { + /// Creates a new RLE scheme. + pub const fn new() -> Self { + Self(PhantomData) } } -impl Scheme for RLEScheme -where - S: RLEStats + CompressorStats, - C: Copy + Clone + Debug + Hash + PartialEq + Eq, -{ - type StatsType = S; - type CodeType = C; +impl Default for RLEScheme { + fn default() -> Self { + Self::new() + } +} - fn code(&self) -> C { - self.code +impl Scheme for RLEScheme { + type StatsType = C::Stats; + type CodeType = C::Code; + + fn code(&self) -> C::Code { + C::CODE } fn expected_compression_ratio( &self, + compressor: &BtrBlocksCompressor, stats: &Self::StatsType, is_sample: bool, allowed_cascading: usize, - excludes: &[C], + excludes: &[C::Code], ) -> VortexResult { // RLE is only useful when we cascade it with another encoding. if allowed_cascading == 0 { @@ -85,6 +103,7 @@ where // Run compression on a sample to see how it performs. estimate_compression_ratio_with_sampling( self, + compressor, stats, is_sample, allowed_cascading, @@ -94,10 +113,11 @@ where fn compress( &self, + compressor: &BtrBlocksCompressor, stats: &Self::StatsType, is_sample: bool, allowed_cascading: usize, - excludes: &[C], + excludes: &[C::Code], ) -> VortexResult { let rle_array = RLEArray::encode(RLEStats::source(stats))?; @@ -109,34 +129,21 @@ where let mut new_excludes = vec![self.code()]; new_excludes.extend_from_slice(excludes); - let compressed_values = (self.compress_values_fn)( + let compressed_values = C::compress_values( &rle_array.values().to_primitive(), is_sample, allowed_cascading - 1, &new_excludes, )?; - // NOTE(aduffy): this encoding appears to be faulty, and was causing Undefined Behavior - // checks to trigger in the gharchive benchmark dataset decompression. - // Delta in an unstable encoding, once we deem it stable we can switch over to this always. - // #[cfg(feature = "unstable_encodings")] - // // For indices and offsets, we always use integer compression without dictionary encoding. 
- // let compressed_indices = try_compress_delta( - // &rle_array.indices().to_primitive().narrow()?, - // is_sample, - // allowed_cascading - 1, - // &[], - // )?; - - // #[cfg(not(feature = "unstable_encodings"))] - let compressed_indices = IntCompressor::compress_no_dict( + let compressed_indices = IntCompressor::compress_no_dict_static( &rle_array.indices().to_primitive().narrow()?, is_sample, allowed_cascading - 1, &[], )?; - let compressed_offsets = IntCompressor::compress_no_dict( + let compressed_offsets = IntCompressor::compress_no_dict_static( &rle_array.values_idx_offsets().to_primitive().narrow()?, is_sample, allowed_cascading - 1, @@ -157,21 +164,3 @@ where } } } - -// #[cfg(feature = "unstable_encodings")] -// fn try_compress_delta( -// primitive_array: &PrimitiveArray, -// is_sample: bool, -// allowed_cascading: usize, -// excludes: &[IntCode], -// ) -> VortexResult { -// use vortex_fastlanes::{DeltaArray, delta_compress}; -// -// let (bases, deltas) = delta_compress(primitive_array)?; -// let compressed_bases = IntCompressor::compress(&bases, is_sample, allowed_cascading, excludes)?; -// let compressed_deltas = -// IntCompressor::compress_no_dict(&deltas, is_sample, allowed_cascading, excludes)?; -// -// DeltaArray::try_from_delta_compress_parts(compressed_bases, compressed_deltas) -// .map(DeltaArray::into_array) -// } diff --git a/vortex-btrblocks/src/string.rs b/vortex-btrblocks/src/string.rs index 560003be95c..67f86204516 100644 --- a/vortex-btrblocks/src/string.rs +++ b/vortex-btrblocks/src/string.rs @@ -1,8 +1,12 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +use std::hash::Hash; +use std::hash::Hasher; + use enum_iterator::Sequence; use vortex_array::ArrayRef; +use vortex_array::Canonical; use vortex_array::IntoArray; use vortex_array::ToCanonical; use vortex_array::arrays::ConstantArray; @@ -13,6 +17,7 @@ use vortex_array::arrays::VarBinViewArray; use vortex_array::arrays::VarBinViewVTable; use vortex_array::builders::dict::dict_encode; use vortex_array::compute::is_constant; +use vortex_array::vtable::VTable; use vortex_array::vtable::ValidityHelper; use vortex_error::VortexExpect; use vortex_error::VortexResult; @@ -25,10 +30,14 @@ use vortex_sparse::SparseArray; use vortex_sparse::SparseVTable; use vortex_utils::aliases::hash_set::HashSet; +use crate::BtrBlocksCompressor; +use crate::CanonicalCompressor; use crate::Compressor; use crate::CompressorStats; +use crate::Excludes; use crate::GenerateStatsOptions; use crate::Scheme; +use crate::compress; use crate::estimate_compression_ratio_with_sampling; use crate::integer; use crate::integer::IntCompressor; @@ -106,50 +115,136 @@ impl CompressorStats for StringStats { } } +/// All available string compression schemes. +pub const ALL_STRING_SCHEMES: &[&dyn StringScheme] = &[ + &UncompressedScheme, + &DictScheme, + &FSSTScheme, + &ConstantScheme, + &NullDominated, +]; + /// [`Compressor`] for strings. -pub struct StringCompressor; +#[derive(Clone)] +pub struct StringCompressor { + schemes: Vec<&'static dyn StringScheme>, +} + +impl Default for StringCompressor { + fn default() -> Self { + Self { + schemes: ALL_STRING_SCHEMES.to_vec(), + } + } +} + +impl StringCompressor { + /// Creates a new compressor with all schemes enabled. + pub fn new() -> Self { + Self::default() + } + + /// Creates a compressor with only the specified schemes. 
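+    ///
+    /// # Example
+    ///
+    /// A sketch restricting string compression to dictionary and FSST encoding; the
+    /// `vortex_btrblocks::string` import path and scheme visibility are assumed for
+    /// illustration only.
+    ///
+    /// ```rust,ignore
+    /// use vortex_btrblocks::string::{DictScheme, FSSTScheme, StringCompressor, StringScheme};
+    ///
+    /// let schemes: Vec<&'static dyn StringScheme> = vec![&DictScheme, &FSSTScheme];
+    /// let compressor = StringCompressor::with_schemes(schemes);
+    /// ```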
+ pub fn with_schemes(schemes: Vec<&'static dyn StringScheme>) -> Self { + Self { schemes } + } + + /// Creates a compressor excluding schemes with the given codes. + pub fn excluding(excludes: &[StringCode]) -> Self { + Self { + schemes: ALL_STRING_SCHEMES + .iter() + .filter(|s| !excludes.contains(&s.code())) + .copied() + .collect(), + } + } + + /// Compress with default settings (static helper for internal use). + pub(crate) fn compress_static( + array: &VarBinViewArray, + is_sample: bool, + allowed_cascading: usize, + excludes: &[StringCode], + ) -> VortexResult { + let compressor = if excludes.is_empty() { + Self::default() + } else { + Self::excluding(excludes) + }; + compress(&compressor, array, is_sample, allowed_cascading, excludes) + } +} impl Compressor for StringCompressor { type ArrayVTable = VarBinViewVTable; type SchemeType = dyn StringScheme; type StatsType = StringStats; - fn schemes() -> &'static [&'static Self::SchemeType] { - &[ - &UncompressedScheme, - &DictScheme, - &FSSTScheme, - &ConstantScheme, - &NullDominated, - ] + fn gen_stats(&self, array: &::Array) -> Self::StatsType { + if self.schemes.iter().any(|s| s.code() == DictScheme.code()) { + StringStats::generate_opts( + array, + GenerateStatsOptions { + count_distinct_values: true, + }, + ) + } else { + StringStats::generate_opts( + array, + GenerateStatsOptions { + count_distinct_values: false, + }, + ) + } + } + + fn schemes(&self) -> &[&'static dyn StringScheme] { + &self.schemes } - fn default_scheme() -> &'static Self::SchemeType { + fn default_scheme(&self) -> &'static Self::SchemeType { &UncompressedScheme } +} - fn dict_scheme_code() -> StringCode { - StringCode::Dict +pub trait StringScheme: + Scheme + Send + Sync +{ +} + +impl StringScheme for T where + T: Scheme + Send + Sync +{ +} + +impl PartialEq for dyn StringScheme { + fn eq(&self, other: &Self) -> bool { + self.code() == other.code() } } -pub trait StringScheme: Scheme {} +impl Eq for dyn StringScheme {} -impl StringScheme for T where T: Scheme {} +impl Hash for dyn StringScheme { + fn hash(&self, state: &mut H) { + self.code().hash(state) + } +} -#[derive(Debug, Copy, Clone)] +#[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct UncompressedScheme; -#[derive(Debug, Copy, Clone)] +#[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct DictScheme; -#[derive(Debug, Copy, Clone)] +#[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct FSSTScheme; -#[derive(Debug, Copy, Clone)] +#[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct ConstantScheme; -#[derive(Debug, Copy, Clone)] +#[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct NullDominated; /// Unique identifier for string compression schemes. 
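The `PartialEq` and `Hash` impls on `dyn StringScheme` delegate to `code()`, so scheme trait objects can be filtered and compared by code; `excluding` relies on exactly that. A small crate-internal sketch (the `Compressor` trait must be in scope for `schemes()`, and the length assertion assumes `ALL_STRING_SCHEMES` contains no duplicate codes):

```rust
// Sketch: drop dictionary and FSST encoding for a particular column.
let plain = StringCompressor::excluding(&[StringCode::Dict, StringCode::Fsst]);
assert_eq!(plain.schemes().len(), ALL_STRING_SCHEMES.len() - 2);

// Equality on the trait objects is equality of codes.
let a: &dyn StringScheme = &DictScheme;
let b: &dyn StringScheme = &DictScheme;
assert!(a == b);
```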
@@ -177,6 +272,7 @@ impl Scheme for UncompressedScheme { fn expected_compression_ratio( &self, + compressor: &BtrBlocksCompressor, _stats: &Self::StatsType, _is_sample: bool, _allowed_cascading: usize, @@ -187,6 +283,7 @@ impl Scheme for UncompressedScheme { fn compress( &self, + compressor: &BtrBlocksCompressor, stats: &Self::StatsType, _is_sample: bool, _allowed_cascading: usize, @@ -206,6 +303,7 @@ impl Scheme for DictScheme { fn expected_compression_ratio( &self, + compressor: &BtrBlocksCompressor, stats: &Self::StatsType, is_sample: bool, allowed_cascading: usize, @@ -223,6 +321,7 @@ impl Scheme for DictScheme { estimate_compression_ratio_with_sampling( self, + compressor, stats, is_sample, allowed_cascading, @@ -232,6 +331,7 @@ impl Scheme for DictScheme { fn compress( &self, + compressor: &BtrBlocksCompressor, stats: &Self::StatsType, is_sample: bool, allowed_cascading: usize, @@ -245,20 +345,20 @@ impl Scheme for DictScheme { } // Find best compressor for codes and values separately - let compressed_codes = IntCompressor::compress( - &dict.codes().to_primitive(), + let compressed_codes = compressor.compress_canonical( + Canonical::Primitive(dict.codes().to_primitive()), is_sample, allowed_cascading - 1, - &[integer::DictScheme.code(), integer::SequenceScheme.code()], + Excludes::int_only(&[integer::DictScheme.code(), integer::SequenceScheme.code()]), )?; // Attempt to compress the values with non-Dict compression. // Currently this will only be FSST. - let compressed_values = StringCompressor::compress( - &dict.values().to_varbinview(), + let compressed_values = compressor.compress_canonical( + Canonical::VarBinView(dict.values().to_varbinview()), is_sample, allowed_cascading - 1, - &[DictScheme.code()], + Excludes::string_only(&[DictScheme.code()]), )?; // SAFETY: compressing codes or values does not alter the invariants @@ -282,26 +382,29 @@ impl Scheme for FSSTScheme { fn compress( &self, + compressor: &BtrBlocksCompressor, stats: &Self::StatsType, is_sample: bool, allowed_cascading: usize, _excludes: &[StringCode], ) -> VortexResult { - let compressor = fsst_train_compressor(&stats.src); - let fsst = fsst_compress(&stats.src, &compressor); + let fsst = { + let compressor = fsst_train_compressor(&stats.src); + fsst_compress(&stats.src, &compressor) + }; - let compressed_original_lengths = IntCompressor::compress( - &fsst.uncompressed_lengths().to_primitive().narrow()?, + let compressed_original_lengths = compressor.compress_canonical( + Canonical::Primitive(fsst.uncompressed_lengths().to_primitive().narrow()?), is_sample, allowed_cascading, - &[], + Excludes::int_only(&[]), )?; - let compressed_codes_offsets = IntCompressor::compress( - &fsst.codes().offsets().to_primitive().narrow()?, + let compressed_codes_offsets = compressor.compress_canonical( + Canonical::Primitive(fsst.codes().offsets().to_primitive().narrow()?), is_sample, allowed_cascading, - &[], + Excludes::int_only(&[]), )?; let compressed_codes = VarBinArray::try_new( compressed_codes_offsets, @@ -336,6 +439,7 @@ impl Scheme for ConstantScheme { fn expected_compression_ratio( &self, + compressor: &BtrBlocksCompressor, stats: &Self::StatsType, is_sample: bool, _allowed_cascading: usize, @@ -356,6 +460,7 @@ impl Scheme for ConstantScheme { fn compress( &self, + compressor: &BtrBlocksCompressor, stats: &Self::StatsType, _is_sample: bool, _allowed_cascading: usize, @@ -393,6 +498,7 @@ impl Scheme for NullDominated { fn expected_compression_ratio( &self, + compressor: &BtrBlocksCompressor, stats: &Self::StatsType, 
_is_sample: bool, allowed_cascading: usize, @@ -419,6 +525,7 @@ impl Scheme for NullDominated { fn compress( &self, + compressor: &BtrBlocksCompressor, stats: &Self::StatsType, is_sample: bool, allowed_cascading: usize, @@ -435,7 +542,7 @@ impl Scheme for NullDominated { // Don't attempt to compress the non-null values let indices = sparse.patches().indices().to_primitive().narrow()?; - let compressed_indices = IntCompressor::compress_no_dict( + let compressed_indices = IntCompressor::compress_no_dict_static( &indices, is_sample, allowed_cascading - 1, @@ -465,7 +572,6 @@ mod tests { use vortex_dtype::Nullability; use vortex_error::VortexResult; - use crate::Compressor; use crate::MAX_CASCADE; use crate::string::StringCompressor; @@ -480,7 +586,7 @@ mod tests { } let strings = VarBinViewArray::from_iter(strings, DType::Utf8(Nullability::NonNullable)); - let compressed = StringCompressor::compress(&strings, false, 3, &[])?; + let compressed = StringCompressor::compress_static(&strings, false, 3, &[])?; assert_eq!(compressed.len(), 2048); let display = compressed @@ -501,7 +607,7 @@ mod tests { let strings = strings.finish_into_varbinview(); - let compressed = StringCompressor::compress(&strings, false, MAX_CASCADE, &[])?; + let compressed = StringCompressor::compress_static(&strings, false, MAX_CASCADE, &[])?; assert_eq!(compressed.len(), 100); let display = compressed @@ -525,14 +631,13 @@ mod scheme_selection_tests { use vortex_error::VortexResult; use vortex_fsst::FSSTVTable; - use crate::Compressor; use crate::string::StringCompressor; #[test] fn test_constant_compressed() -> VortexResult<()> { let strings: Vec> = vec![Some("constant_value"); 100]; let array = VarBinViewArray::from_iter(strings, DType::Utf8(Nullability::NonNullable)); - let compressed = StringCompressor::compress(&array, false, 3, &[])?; + let compressed = StringCompressor::compress_static(&array, false, 3, &[])?; assert!(compressed.is::()); Ok(()) } @@ -545,7 +650,7 @@ mod scheme_selection_tests { strings.push(Some(distinct_values[i % 3])); } let array = VarBinViewArray::from_iter(strings, DType::Utf8(Nullability::NonNullable)); - let compressed = StringCompressor::compress(&array, false, 3, &[])?; + let compressed = StringCompressor::compress_static(&array, false, 3, &[])?; assert!(compressed.is::()); Ok(()) } @@ -559,7 +664,7 @@ mod scheme_selection_tests { ))); } let array = VarBinViewArray::from_iter(strings, DType::Utf8(Nullability::NonNullable)); - let compressed = StringCompressor::compress(&array, false, 3, &[])?; + let compressed = StringCompressor::compress_static(&array, false, 3, &[])?; assert!(compressed.is::()); Ok(()) } diff --git a/vortex-btrblocks/src/temporal.rs b/vortex-btrblocks/src/temporal.rs index 67468c1b311..d5d688a237d 100644 --- a/vortex-btrblocks/src/temporal.rs +++ b/vortex-btrblocks/src/temporal.rs @@ -4,6 +4,7 @@ //! Specialized compressor for DateTimeParts metadata. use vortex_array::ArrayRef; +use vortex_array::Canonical; use vortex_array::IntoArray; use vortex_array::ToCanonical; use vortex_array::arrays::TemporalArray; @@ -12,12 +13,17 @@ use vortex_datetime_parts::TemporalParts; use vortex_datetime_parts::split_temporal; use vortex_error::VortexResult; -use crate::Compressor; +use crate::BtrBlocksCompressor; +use crate::CanonicalCompressor; +use crate::Excludes; use crate::MAX_CASCADE; use crate::integer::IntCompressor; /// Compress a temporal array into a `DateTimePartsArray`. 
-pub fn compress_temporal(array: TemporalArray) -> VortexResult { +pub fn compress_temporal( + compressor: &BtrBlocksCompressor, + array: TemporalArray, +) -> VortexResult { let dtype = array.dtype().clone(); let TemporalParts { days, @@ -25,15 +31,19 @@ pub fn compress_temporal(array: TemporalArray) -> VortexResult { subseconds, } = split_temporal(array)?; - let days = - IntCompressor::compress(&days.to_primitive().narrow()?, false, MAX_CASCADE - 1, &[])?; - let seconds = IntCompressor::compress( + let days = compressor.compress_canonical( + Canonical::Primitive(days.to_primitive().narrow()?), + false, + MAX_CASCADE - 1, + Excludes::int_only(&[]), + )?; + let seconds = IntCompressor::compress_static( &seconds.to_primitive().narrow()?, false, MAX_CASCADE - 1, &[], )?; - let subseconds = IntCompressor::compress( + let subseconds = IntCompressor::compress_static( &subseconds.to_primitive().narrow()?, false, MAX_CASCADE - 1, From 6db850c4dc8f7f87ffdf6180cd2137e269d3051c Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Fri, 30 Jan 2026 16:01:48 +0000 Subject: [PATCH 03/14] wip Signed-off-by: Joe Isaacs --- vortex-btrblocks/benches/compress.rs | 6 +- vortex-btrblocks/src/builder.rs | 3 - vortex-btrblocks/src/decimal.rs | 1 - vortex-btrblocks/src/float.rs | 56 ++-------- vortex-btrblocks/src/integer.rs | 159 +++++++-------------------- vortex-btrblocks/src/lib.rs | 110 +++++++++++++----- vortex-btrblocks/src/patches.rs | 2 - vortex-btrblocks/src/rle.rs | 19 ++-- vortex-btrblocks/src/string.rs | 103 +++++------------ vortex-btrblocks/src/temporal.rs | 13 +-- 10 files changed, 177 insertions(+), 295 deletions(-) diff --git a/vortex-btrblocks/benches/compress.rs b/vortex-btrblocks/benches/compress.rs index d352aa6064b..75139361ed5 100644 --- a/vortex-btrblocks/benches/compress.rs +++ b/vortex-btrblocks/benches/compress.rs @@ -15,8 +15,7 @@ mod benchmarks { use vortex_array::ArrayRef; use vortex_array::IntoArray; use vortex_array::ToCanonical; - use vortex_btrblocks::IntCompressor; - use vortex_btrblocks::compress; + use vortex_btrblocks::BtrBlocksCompressor; use vortex_buffer::buffer_mut; use vortex_utils::aliases::hash_set::HashSet; @@ -42,11 +41,12 @@ mod benchmarks { #[divan::bench] fn btrblocks(bencher: Bencher) { let array = make_clickbench_window_name().to_primitive(); + let compressor = BtrBlocksCompressor::default(); bencher .with_inputs(|| &array) .input_counter(|array| ItemsCount::new(array.len())) .input_counter(|array| BytesCount::of_many::(array.len())) - .bench_refs(|array| compress(&IntCompressor::default(), array, false, 3, &[]).unwrap()); + .bench_refs(|array| compressor.compress(array.as_ref()).unwrap()); } } diff --git a/vortex-btrblocks/src/builder.rs b/vortex-btrblocks/src/builder.rs index 06f04bc9086..9f6f06a24ea 100644 --- a/vortex-btrblocks/src/builder.rs +++ b/vortex-btrblocks/src/builder.rs @@ -8,11 +8,8 @@ use vortex_utils::aliases::hash_set::HashSet; use crate::BtrBlocksCompressor; use crate::FloatCode; -use crate::FloatCompressor; use crate::IntCode; -use crate::IntCompressor; use crate::StringCode; -use crate::StringCompressor; use crate::float::ALL_FLOAT_SCHEMES; use crate::float::FloatScheme; use crate::integer::ALL_INT_SCHEMES; diff --git a/vortex-btrblocks/src/decimal.rs b/vortex-btrblocks/src/decimal.rs index 431ffe57bf4..479b0d1c35c 100644 --- a/vortex-btrblocks/src/decimal.rs +++ b/vortex-btrblocks/src/decimal.rs @@ -14,7 +14,6 @@ use vortex_scalar::DecimalType; use crate::BtrBlocksCompressor; use crate::CanonicalCompressor; use crate::Excludes; -use 
crate::IntCompressor; use crate::MAX_CASCADE; // TODO(joe): add support splitting i128/256 buffers into chunks primitive values for compression. diff --git a/vortex-btrblocks/src/float.rs b/vortex-btrblocks/src/float.rs index b2f2eaff340..b303a51e435 100644 --- a/vortex-btrblocks/src/float.rs +++ b/vortex-btrblocks/src/float.rs @@ -20,7 +20,6 @@ use vortex_array::arrays::ConstantArray; use vortex_array::arrays::DictArray; use vortex_array::arrays::DictArrayParts; use vortex_array::arrays::MaskedArray; -use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::PrimitiveVTable; use vortex_array::vtable::VTable; use vortex_array::vtable::ValidityHelper; @@ -40,15 +39,10 @@ use crate::Excludes; use crate::GenerateStatsOptions; use crate::IntCode; use crate::Scheme; -use crate::compress; use crate::estimate_compression_ratio_with_sampling; use crate::float::dictionary::dictionary_encode; use crate::integer; -use crate::integer::IntCompressor; -use crate::integer::IntegerStats; use crate::patches::compress_patches; -use crate::rle; -use crate::rle::RLEScheme; pub trait FloatScheme: Scheme + Send + Sync {} @@ -81,10 +75,10 @@ pub const ALL_FLOAT_SCHEMES: &[&dyn FloatScheme] = &[ ]; /// [`Compressor`] for floating-point numbers. -#[derive(Clone)] +#[derive(Clone, Copy)] pub struct FloatCompressor<'a> { - /// tmp - pub btr_blocks_compressor: &'a BtrBlocksCompressor, // schemes: Vec<&'static dyn FloatScheme>, + /// Reference to the parent compressor. + pub btr_blocks_compressor: &'a dyn CanonicalCompressor, } // impl Default for FloatCompressor { @@ -141,7 +135,7 @@ impl<'a> Compressor for FloatCompressor<'a> { fn gen_stats(&self, array: &::Array) -> Self::StatsType { if self .btr_blocks_compressor - .float_schemes + .float_schemes() .iter() .any(|s| s.code() == DictScheme.code()) { @@ -162,7 +156,7 @@ impl<'a> Compressor for FloatCompressor<'a> { } fn schemes(&self) -> &[&'static dyn FloatScheme] { - &self.btr_blocks_compressor.float_schemes + self.btr_blocks_compressor.float_schemes() } fn default_scheme(&self) -> &'static Self::SchemeType { @@ -209,10 +203,6 @@ struct DictScheme; #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct NullDominated; -/// Configuration for float RLE compression. -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub struct FloatRLEConfig; - // impl rle::RLEConfig for FloatRLEConfig { // type Stats = FloatStats; // type Code = FloatCode; @@ -390,16 +380,9 @@ impl Scheme for ALPScheme { Canonical::Primitive(alp_ints), is_sample, allowed_cascading - 1, - Excludes::float_only(excludes), + Excludes::int_only(&int_excludes), )?; - // let compressed_alp_ints = IntCompressor::compress_static( - // &alp_ints, - // is_sample, - // allowed_cascading - 1, - // &int_excludes, - // )?; - let patches = alp.patches().map(compress_patches).transpose()?; Ok(ALPArray::new(compressed_alp_ints, alp.exponents(), patches).into_array()) @@ -438,7 +421,7 @@ impl Scheme for ALPRDScheme { fn compress( &self, - compressor: &BtrBlocksCompressor, + _compressor: &BtrBlocksCompressor, stats: &Self::StatsType, _is_sample: bool, _allowed_cascading: usize, @@ -504,35 +487,12 @@ impl Scheme for DictScheme { stats: &Self::StatsType, is_sample: bool, allowed_cascading: usize, - excludes: &[Self::CodeType], + _excludes: &[Self::CodeType], ) -> VortexResult { let dict = dictionary_encode(stats); let has_all_values_referenced = dict.has_all_values_referenced(); let DictArrayParts { codes, values, .. } = dict.into_parts(); - // Only compress the codes. 
- // let codes_stats = IntegerStats::generate_opts( - // &codes.to_primitive().narrow()?, - // GenerateStatsOptions { - // count_distinct_values: false, - // }, - // ); - // - // let codes_excludes = &[integer::DictScheme.code(), integer::SequenceScheme.code()]; - // let codes_compressor = IntCompressor::excluding(codes_excludes); - // let codes_scheme = codes_compressor.choose_scheme( - // &codes_stats, - // is_sample, - // allowed_cascading - 1, - // codes_excludes, - // )?; - // let compressed_codes = codes_scheme.compress( - // compressor, - // &codes_stats, - // is_sample, - // allowed_cascading - 1, - // &[integer::DictScheme.code()], - // )?; let compressed_codes = compressor.compress_canonical( Canonical::Primitive(codes.to_primitive()), is_sample, diff --git a/vortex-btrblocks/src/integer.rs b/vortex-btrblocks/src/integer.rs index 99f883a02bd..b379e5ed6a6 100644 --- a/vortex-btrblocks/src/integer.rs +++ b/vortex-btrblocks/src/integer.rs @@ -44,7 +44,6 @@ use crate::CompressorStats; use crate::Excludes; use crate::GenerateStatsOptions; use crate::Scheme; -use crate::compress; use crate::estimate_compression_ratio_with_sampling; use crate::integer::dictionary::dictionary_encode; use crate::patches::compress_patches; @@ -65,105 +64,32 @@ pub const ALL_INT_SCHEMES: &[&dyn IntegerScheme] = &[ ]; /// [`Compressor`] for signed and unsigned integers. -#[derive(Clone)] -pub struct IntCompressor { - schemes: Vec<&'static dyn IntegerScheme>, - default: &'static dyn IntegerScheme, +#[derive(Clone, Copy)] +pub struct IntCompressor<'a> { + /// Reference to the parent compressor. + pub btr_blocks_compressor: &'a dyn CanonicalCompressor, } -impl Default for IntCompressor { - fn default() -> Self { - Self { - schemes: ALL_INT_SCHEMES.to_vec(), - default: &UncompressedScheme, - } - } -} - -impl IntCompressor { - /// Creates a new compressor with all schemes enabled. - pub fn new() -> Self { - Self::default() - } - - /// Creates a compressor with only the specified schemes. - pub fn with_schemes(schemes: Vec<&'static dyn IntegerScheme>) -> Self { - Self { - schemes, - default: &UncompressedScheme, - } - } - - /// Creates a compressor excluding schemes with the given codes. - pub fn excluding(excludes: &[IntCode]) -> Self { - Self::with_schemes( - ALL_INT_SCHEMES - .iter() - .filter(|s| !excludes.contains(&s.code())) - .copied() - .collect(), - ) - } - - /// Creates a compressor without dictionary encoding. - pub fn no_dict() -> Self { - Self::excluding(&[IntCode::Dict]) - } - - /// Compress without dictionary encoding (static helper for internal use). - // pub(crate) fn compress_no_dict_static( - // array: &PrimitiveArray, - // is_sample: bool, - // allowed_cascading: usize, - // excludes: &[IntCode], - // ) -> VortexResult { - // let compressor = Self::excluding(&[IntCode::Dict]); - // let compressor = if excludes.is_empty() { - // compressor - // } else { - // Self::with_schemes( - // compressor - // .schemes - // .iter() - // .filter(|s| !excludes.contains(&s.code())) - // .copied() - // .collect(), - // ) - // }; - // compress(&compressor, array, is_sample, allowed_cascading, excludes) - // } - - /// Compress with default settings (static helper for internal use). 
- pub(crate) fn compress_static( - array: &PrimitiveArray, - is_sample: bool, - allowed_cascading: usize, - excludes: &[IntCode], - ) -> VortexResult { - let compressor = if excludes.is_empty() { - Self::default() - } else { - Self::excluding(excludes) - }; - compress(&compressor, array, is_sample, allowed_cascading, excludes) - } -} - -impl Compressor for IntCompressor { +impl<'a> Compressor for IntCompressor<'a> { type ArrayVTable = PrimitiveVTable; type SchemeType = dyn IntegerScheme; type StatsType = IntegerStats; fn schemes(&self) -> &[&'static dyn IntegerScheme] { - &self.schemes + self.btr_blocks_compressor.int_schemes() } fn default_scheme(&self) -> &'static Self::SchemeType { - self.default + &UncompressedScheme } fn gen_stats(&self, array: &::Array) -> Self::StatsType { - if self.schemes.iter().any(|s| s.code() == IntCode::Dict) { + if self + .btr_blocks_compressor + .int_schemes() + .iter() + .any(|s| s.code() == IntCode::Dict) + { IntegerStats::generate_opts( array, GenerateStatsOptions { @@ -275,12 +201,18 @@ impl rle::RLEConfig for IntRLEConfig { const CODE: IntCode = IntCode::Rle; fn compress_values( + compressor: &BtrBlocksCompressor, values: &PrimitiveArray, is_sample: bool, allowed_cascading: usize, excludes: &[IntCode], ) -> VortexResult { - IntCompressor::compress_no_dict_static(values, is_sample, allowed_cascading, excludes) + compressor.compress_canonical( + Canonical::Primitive(values.clone()), + is_sample, + allowed_cascading, + Excludes::int_only(excludes), + ) } } @@ -621,7 +553,7 @@ impl Scheme for SparseScheme { // We can avoid asserting the encoding tree instead. fn expected_compression_ratio( &self, - compressor: &BtrBlocksCompressor, + _compressor: &BtrBlocksCompressor, stats: &IntegerStats, _is_sample: bool, allowed_cascading: usize, @@ -693,23 +625,23 @@ impl Scheme for SparseScheme { if let Some(sparse) = sparse_encoded.as_opt::() { // Compress the values - let mut new_excludes = vec![SparseScheme.code()]; + let mut new_excludes = vec![SparseScheme.code(), IntCode::Dict]; new_excludes.extend_from_slice(excludes); - let compressed_values = IntCompressor::compress_no_dict_static( - &sparse.patches().values().to_primitive(), + let compressed_values = compressor.compress_canonical( + Canonical::Primitive(sparse.patches().values().to_primitive()), is_sample, allowed_cascading - 1, - &new_excludes, + Excludes::int_only(&new_excludes), )?; let indices = sparse.patches().indices().to_primitive().narrow()?; - let compressed_indices = IntCompressor::compress_no_dict_static( - &indices, + let compressed_indices = compressor.compress_canonical( + Canonical::Primitive(indices), is_sample, allowed_cascading - 1, - &new_excludes, + Excludes::int_only(&new_excludes), )?; SparseArray::try_new( @@ -735,7 +667,7 @@ impl Scheme for DictScheme { fn expected_compression_ratio( &self, - compressor: &BtrBlocksCompressor, + _compressor: &BtrBlocksCompressor, stats: &IntegerStats, _is_sample: bool, allowed_cascading: usize, @@ -794,11 +726,11 @@ impl Scheme for DictScheme { let mut new_excludes = vec![IntCode::Dict, IntCode::Sequence]; new_excludes.extend_from_slice(excludes); - let compressed_codes = IntCompressor::compress_no_dict_static( - &dict.codes().to_primitive().narrow()?, + let compressed_codes = compressor.compress_canonical( + Canonical::Primitive(dict.codes().to_primitive().narrow()?), is_sample, allowed_cascading - 1, - &new_excludes, + Excludes::int_only(&new_excludes), )?; // SAFETY: compressing codes does not change their values @@ -864,33 +796,18 @@ impl Scheme 
for RunEndScheme { let mut new_excludes = vec![RunEndScheme.code(), DictScheme.code()]; new_excludes.extend_from_slice(excludes); - let ends_stats = IntegerStats::generate_opts( - &ends.to_primitive(), - GenerateStatsOptions { - count_distinct_values: false, - }, - ); - let ends_compressor = IntCompressor::excluding(&new_excludes); - let ends_scheme = ends_compressor.choose_scheme( - compressor, - &ends_stats, - is_sample, - allowed_cascading - 1, - &new_excludes, - )?; - let compressed_ends = ends_scheme.compress( - compressor, - &ends_stats, + let compressed_ends = compressor.compress_canonical( + Canonical::Primitive(ends.to_primitive()), is_sample, allowed_cascading - 1, - &new_excludes, + Excludes::int_only(&new_excludes), )?; - let compressed_values = IntCompressor::compress_no_dict_static( - &values.to_primitive(), + let compressed_values = compressor.compress_canonical( + Canonical::Primitive(values.to_primitive()), is_sample, allowed_cascading - 1, - &new_excludes, + Excludes::int_only(&new_excludes), )?; // SAFETY: compression doesn't affect invariants diff --git a/vortex-btrblocks/src/lib.rs b/vortex-btrblocks/src/lib.rs index 58df412786e..823bdf47aba 100644 --- a/vortex-btrblocks/src/lib.rs +++ b/vortex-btrblocks/src/lib.rs @@ -457,7 +457,12 @@ where } } -trait CanonicalCompressor { +/// Trait for compressors that can compress canonical arrays. +/// +/// Provides access to configured compression schemes and the ability to +/// compress canonical arrays recursively. +pub trait CanonicalCompressor { + /// Compresses a canonical array with the specified options. fn compress_canonical( &self, array: Canonical, @@ -465,6 +470,15 @@ trait CanonicalCompressor { allowed_cascading: usize, excludes: Excludes, ) -> VortexResult; + + /// Returns the enabled integer compression schemes. + fn int_schemes(&self) -> &[&'static dyn IntegerScheme]; + + /// Returns the enabled float compression schemes. + fn float_schemes(&self) -> &[&'static dyn FloatScheme]; + + /// Returns the enabled string compression schemes. + fn string_schemes(&self) -> &[&'static dyn StringScheme]; } /// The main compressor type implementing BtrBlocks-inspired compression. @@ -494,18 +508,22 @@ trait CanonicalCompressor { /// .exclude_int([IntCode::Dict]) /// .build(); /// ``` -#[derive(Clone, Default)] +#[derive(Clone)] pub struct BtrBlocksCompressor { /// Integer compressor with configured schemes. pub int_schemes: Vec<&'static dyn IntegerScheme>, /// Float compressor with configured schemes. - // float_compressor: FloatCompressor, pub float_schemes: Vec<&'static dyn FloatScheme>, /// String compressor with configured schemes. pub string_schemes: Vec<&'static dyn StringScheme>, - // string_compressor: StringCompressor, +} + +impl Default for BtrBlocksCompressor { + fn default() -> Self { + BtrBlocksCompressorBuilder::new().build() + } } impl BtrBlocksCompressor { @@ -514,6 +532,21 @@ impl BtrBlocksCompressor { Self::default() } + /// Returns an iterator over the enabled integer compression scheme codes. + pub fn int_codes(&self) -> impl Iterator + '_ { + self.int_schemes.iter().map(|s| s.code()) + } + + /// Returns an iterator over the enabled float compression scheme codes. + pub fn float_codes(&self) -> impl Iterator + '_ { + self.float_schemes.iter().map(|s| s.code()) + } + + /// Returns an iterator over the enabled string compression scheme codes. + pub fn string_codes(&self) -> impl Iterator + '_ { + self.string_schemes.iter().map(|s| s.code()) + } + /// Compresses an array using BtrBlocks-inspired compression. 
/// /// First canonicalizes and compacts the array, then applies optimal compression schemes. @@ -524,7 +557,7 @@ impl BtrBlocksCompressor { // Compact it, removing any wasted space before we attempt to compress it let compact = canonical.compact()?; - self.compress_canonical(compact, Excludes::none()) + self.compress_canonical(compact, false, MAX_CASCADE, Excludes::none()) } } @@ -545,22 +578,25 @@ impl CanonicalCompressor for BtrBlocksCompressor { Canonical::Bool(bool_array) => Ok(bool_array.into_array()), Canonical::Primitive(primitive) => { if primitive.ptype().is_int() { - // compress( - // &self.int_schemes, - // &primitive, - // false, - // MAX_CASCADE, - // excludes.int, - // ) - todo!() + compress( + &IntCompressor { + btr_blocks_compressor: self, + }, + self, + &primitive, + is_sample, + allowed_cascading, + excludes.int, + ) } else { compress( &FloatCompressor { - btr_blocks_compressor: &self, + btr_blocks_compressor: self, }, + self, &primitive, - false, - MAX_CASCADE, + is_sample, + allowed_cascading, excludes.float, ) } @@ -597,11 +633,11 @@ impl CanonicalCompressor for BtrBlocksCompressor { // we guarantee above that all elements are referenced by offsets, we may narrow the // widths. - let compressed_offsets = IntCompressor::compress_no_dict_static( - &list_array.offsets().to_primitive().narrow()?, - false, - MAX_CASCADE, - &[], + let compressed_offsets = self.compress_canonical( + Canonical::Primitive(list_array.offsets().to_primitive().narrow()?), + is_sample, + allowed_cascading, + Excludes::int_only(&[IntCode::Dict]), )?; Ok(ListArray::try_new( @@ -627,14 +663,16 @@ impl CanonicalCompressor for BtrBlocksCompressor { .dtype() .eq_ignore_nullability(&DType::Utf8(Nullability::NonNullable)) { - // compress( - // &self.string_schemes, - // &strings, - // false, - // MAX_CASCADE, - // excludes.string, - // ) - todo!() + compress( + &StringCompressor { + btr_blocks_compressor: self, + }, + self, + &strings, + is_sample, + allowed_cascading, + excludes.string, + ) } else { // Binary arrays do not compress Ok(strings.into_array()) @@ -659,7 +697,7 @@ impl CanonicalCompressor for BtrBlocksCompressor { } let temporal_array = TemporalArray::try_from(ext_array)?; - return compress_temporal(temporal_array); + return compress_temporal(self, temporal_array); } // Compress the underlying storage array. @@ -672,6 +710,18 @@ impl CanonicalCompressor for BtrBlocksCompressor { } } } + + fn int_schemes(&self) -> &[&'static dyn IntegerScheme] { + &self.int_schemes + } + + fn float_schemes(&self) -> &[&'static dyn FloatScheme] { + &self.float_schemes + } + + fn string_schemes(&self) -> &[&'static dyn StringScheme] { + &self.string_schemes + } } /// Context passed through recursive compression calls. diff --git a/vortex-btrblocks/src/patches.rs b/vortex-btrblocks/src/patches.rs index d9bee570824..9890ab7bd07 100644 --- a/vortex-btrblocks/src/patches.rs +++ b/vortex-btrblocks/src/patches.rs @@ -8,8 +8,6 @@ use vortex_array::arrays::ConstantArray; use vortex_array::patches::Patches; use vortex_error::VortexResult; -use crate::BtrBlocksCompressor; - /// Compresses the given patches by downscaling integers and checking for constant values. pub fn compress_patches(patches: &Patches) -> VortexResult { // Downscale the patch indices. 
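Taken together, the flow at this point in the series looks roughly like the sketch below. This is not a doc-test: `array` and `primitive` stand in for caller-provided arrays, and a later patch in the series folds `is_sample`/`allowed_cascading` into a `CompressorContext`.

```rust
// Top-level: configure once, then compress whole arrays.
let compressor = BtrBlocksCompressorBuilder::default()
    .exclude_int([IntCode::Dict])
    .build();
let compressed = compressor.compress(array.as_ref())?;

// Internally, children are compressed through the CanonicalCompressor trait
// object, carrying the remaining cascade budget and per-type excludes.
let child = compressor.compress_canonical(
    Canonical::Primitive(primitive),
    /* is_sample */ false,
    MAX_CASCADE - 1,
    Excludes::int_only(&[IntCode::Dict]),
)?;
```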
diff --git a/vortex-btrblocks/src/rle.rs b/vortex-btrblocks/src/rle.rs index a66397ab896..d97ba4f8d42 100644 --- a/vortex-btrblocks/src/rle.rs +++ b/vortex-btrblocks/src/rle.rs @@ -6,6 +6,7 @@ use std::hash::Hash; use std::marker::PhantomData; use vortex_array::ArrayRef; +use vortex_array::Canonical; use vortex_array::IntoArray; use vortex_array::ToCanonical; use vortex_array::arrays::PrimitiveArray; @@ -13,10 +14,12 @@ use vortex_error::VortexResult; use vortex_fastlanes::RLEArray; use crate::BtrBlocksCompressor; +use crate::CanonicalCompressor; use crate::CompressorStats; +use crate::Excludes; +use crate::IntCode; use crate::Scheme; use crate::estimate_compression_ratio_with_sampling; -use crate::integer::IntCompressor; /// Threshold for the average run length in an array before we consider run-length encoding. pub const RUN_LENGTH_THRESHOLD: u32 = 4; @@ -43,6 +46,7 @@ pub trait RLEConfig: Debug + Send + Sync + 'static { /// Compress the values array after RLE encoding. fn compress_values( + compressor: &BtrBlocksCompressor, values: &PrimitiveArray, is_sample: bool, allowed_cascading: usize, @@ -130,24 +134,25 @@ impl Scheme for RLEScheme { new_excludes.extend_from_slice(excludes); let compressed_values = C::compress_values( + compressor, &rle_array.values().to_primitive(), is_sample, allowed_cascading - 1, &new_excludes, )?; - let compressed_indices = IntCompressor::compress_no_dict_static( - &rle_array.indices().to_primitive().narrow()?, + let compressed_indices = compressor.compress_canonical( + Canonical::Primitive(rle_array.indices().to_primitive().narrow()?), is_sample, allowed_cascading - 1, - &[], + Excludes::int_only(&[IntCode::Dict]), )?; - let compressed_offsets = IntCompressor::compress_no_dict_static( - &rle_array.values_idx_offsets().to_primitive().narrow()?, + let compressed_offsets = compressor.compress_canonical( + Canonical::Primitive(rle_array.values_idx_offsets().to_primitive().narrow()?), is_sample, allowed_cascading - 1, - &[], + Excludes::int_only(&[IntCode::Dict]), )?; // SAFETY: Recursive compression doesn't affect the invariants. diff --git a/vortex-btrblocks/src/string.rs b/vortex-btrblocks/src/string.rs index 67f86204516..2fb3bb617f8 100644 --- a/vortex-btrblocks/src/string.rs +++ b/vortex-btrblocks/src/string.rs @@ -36,11 +36,10 @@ use crate::Compressor; use crate::CompressorStats; use crate::Excludes; use crate::GenerateStatsOptions; +use crate::IntCode; use crate::Scheme; -use crate::compress; use crate::estimate_compression_ratio_with_sampling; use crate::integer; -use crate::integer::IntCompressor; use crate::sample::sample; /// Array of variable-length byte arrays, and relevant stats for compression. @@ -125,64 +124,24 @@ pub const ALL_STRING_SCHEMES: &[&dyn StringScheme] = &[ ]; /// [`Compressor`] for strings. -#[derive(Clone)] -pub struct StringCompressor { - schemes: Vec<&'static dyn StringScheme>, +#[derive(Clone, Copy)] +pub struct StringCompressor<'a> { + /// Reference to the parent compressor. + pub btr_blocks_compressor: &'a dyn CanonicalCompressor, } -impl Default for StringCompressor { - fn default() -> Self { - Self { - schemes: ALL_STRING_SCHEMES.to_vec(), - } - } -} - -impl StringCompressor { - /// Creates a new compressor with all schemes enabled. - pub fn new() -> Self { - Self::default() - } - - /// Creates a compressor with only the specified schemes. - pub fn with_schemes(schemes: Vec<&'static dyn StringScheme>) -> Self { - Self { schemes } - } - - /// Creates a compressor excluding schemes with the given codes. 
- pub fn excluding(excludes: &[StringCode]) -> Self { - Self { - schemes: ALL_STRING_SCHEMES - .iter() - .filter(|s| !excludes.contains(&s.code())) - .copied() - .collect(), - } - } - - /// Compress with default settings (static helper for internal use). - pub(crate) fn compress_static( - array: &VarBinViewArray, - is_sample: bool, - allowed_cascading: usize, - excludes: &[StringCode], - ) -> VortexResult { - let compressor = if excludes.is_empty() { - Self::default() - } else { - Self::excluding(excludes) - }; - compress(&compressor, array, is_sample, allowed_cascading, excludes) - } -} - -impl Compressor for StringCompressor { +impl<'a> Compressor for StringCompressor<'a> { type ArrayVTable = VarBinViewVTable; type SchemeType = dyn StringScheme; type StatsType = StringStats; fn gen_stats(&self, array: &::Array) -> Self::StatsType { - if self.schemes.iter().any(|s| s.code() == DictScheme.code()) { + if self + .btr_blocks_compressor + .string_schemes() + .iter() + .any(|s| s.code() == DictScheme.code()) + { StringStats::generate_opts( array, GenerateStatsOptions { @@ -200,7 +159,7 @@ impl Compressor for StringCompressor { } fn schemes(&self) -> &[&'static dyn StringScheme] { - &self.schemes + self.btr_blocks_compressor.string_schemes() } fn default_scheme(&self) -> &'static Self::SchemeType { @@ -272,7 +231,7 @@ impl Scheme for UncompressedScheme { fn expected_compression_ratio( &self, - compressor: &BtrBlocksCompressor, + _compressor: &BtrBlocksCompressor, _stats: &Self::StatsType, _is_sample: bool, _allowed_cascading: usize, @@ -283,7 +242,7 @@ impl Scheme for UncompressedScheme { fn compress( &self, - compressor: &BtrBlocksCompressor, + _compressor: &BtrBlocksCompressor, stats: &Self::StatsType, _is_sample: bool, _allowed_cascading: usize, @@ -439,7 +398,7 @@ impl Scheme for ConstantScheme { fn expected_compression_ratio( &self, - compressor: &BtrBlocksCompressor, + _compressor: &BtrBlocksCompressor, stats: &Self::StatsType, is_sample: bool, _allowed_cascading: usize, @@ -460,7 +419,7 @@ impl Scheme for ConstantScheme { fn compress( &self, - compressor: &BtrBlocksCompressor, + _compressor: &BtrBlocksCompressor, stats: &Self::StatsType, _is_sample: bool, _allowed_cascading: usize, @@ -498,7 +457,7 @@ impl Scheme for NullDominated { fn expected_compression_ratio( &self, - compressor: &BtrBlocksCompressor, + _compressor: &BtrBlocksCompressor, stats: &Self::StatsType, _is_sample: bool, allowed_cascading: usize, @@ -537,16 +496,15 @@ impl Scheme for NullDominated { let sparse_encoded = SparseArray::encode(stats.src.as_ref(), None)?; if let Some(sparse) = sparse_encoded.as_opt::() { - // Compress the values - let new_excludes = vec![integer::SparseScheme.code()]; + // Compress the indices only (not the values for strings) + let new_excludes = vec![integer::SparseScheme.code(), IntCode::Dict]; - // Don't attempt to compress the non-null values let indices = sparse.patches().indices().to_primitive().narrow()?; - let compressed_indices = IntCompressor::compress_no_dict_static( - &indices, + let compressed_indices = compressor.compress_canonical( + Canonical::Primitive(indices), is_sample, allowed_cascading - 1, - &new_excludes, + Excludes::int_only(&new_excludes), )?; SparseArray::try_new( @@ -572,8 +530,7 @@ mod tests { use vortex_dtype::Nullability; use vortex_error::VortexResult; - use crate::MAX_CASCADE; - use crate::string::StringCompressor; + use crate::BtrBlocksCompressor; #[test] fn test_strings() -> VortexResult<()> { @@ -586,7 +543,7 @@ mod tests { } let strings = 
VarBinViewArray::from_iter(strings, DType::Utf8(Nullability::NonNullable)); - let compressed = StringCompressor::compress_static(&strings, false, 3, &[])?; + let compressed = BtrBlocksCompressor::default().compress(strings.as_ref())?; assert_eq!(compressed.len(), 2048); let display = compressed @@ -607,7 +564,7 @@ mod tests { let strings = strings.finish_into_varbinview(); - let compressed = StringCompressor::compress_static(&strings, false, MAX_CASCADE, &[])?; + let compressed = BtrBlocksCompressor::default().compress(strings.as_ref())?; assert_eq!(compressed.len(), 100); let display = compressed @@ -631,13 +588,13 @@ mod scheme_selection_tests { use vortex_error::VortexResult; use vortex_fsst::FSSTVTable; - use crate::string::StringCompressor; + use crate::BtrBlocksCompressor; #[test] fn test_constant_compressed() -> VortexResult<()> { let strings: Vec> = vec![Some("constant_value"); 100]; let array = VarBinViewArray::from_iter(strings, DType::Utf8(Nullability::NonNullable)); - let compressed = StringCompressor::compress_static(&array, false, 3, &[])?; + let compressed = BtrBlocksCompressor::default().compress(array.as_ref())?; assert!(compressed.is::()); Ok(()) } @@ -650,7 +607,7 @@ mod scheme_selection_tests { strings.push(Some(distinct_values[i % 3])); } let array = VarBinViewArray::from_iter(strings, DType::Utf8(Nullability::NonNullable)); - let compressed = StringCompressor::compress_static(&array, false, 3, &[])?; + let compressed = BtrBlocksCompressor::default().compress(array.as_ref())?; assert!(compressed.is::()); Ok(()) } @@ -664,7 +621,7 @@ mod scheme_selection_tests { ))); } let array = VarBinViewArray::from_iter(strings, DType::Utf8(Nullability::NonNullable)); - let compressed = StringCompressor::compress_static(&array, false, 3, &[])?; + let compressed = BtrBlocksCompressor::default().compress(array.as_ref())?; assert!(compressed.is::()); Ok(()) } diff --git a/vortex-btrblocks/src/temporal.rs b/vortex-btrblocks/src/temporal.rs index d5d688a237d..12fe24926da 100644 --- a/vortex-btrblocks/src/temporal.rs +++ b/vortex-btrblocks/src/temporal.rs @@ -17,7 +17,6 @@ use crate::BtrBlocksCompressor; use crate::CanonicalCompressor; use crate::Excludes; use crate::MAX_CASCADE; -use crate::integer::IntCompressor; /// Compress a temporal array into a `DateTimePartsArray`. 
pub fn compress_temporal( @@ -37,17 +36,17 @@ pub fn compress_temporal( MAX_CASCADE - 1, Excludes::int_only(&[]), )?; - let seconds = IntCompressor::compress_static( - &seconds.to_primitive().narrow()?, + let seconds = compressor.compress_canonical( + Canonical::Primitive(seconds.to_primitive().narrow()?), false, MAX_CASCADE - 1, - &[], + Excludes::int_only(&[]), )?; - let subseconds = IntCompressor::compress_static( - &subseconds.to_primitive().narrow()?, + let subseconds = compressor.compress_canonical( + Canonical::Primitive(subseconds.to_primitive().narrow()?), false, MAX_CASCADE - 1, - &[], + Excludes::int_only(&[]), )?; Ok(DateTimePartsArray::try_new(dtype, days, seconds, subseconds)?.into_array()) From ad1cd568fa85fd5fc9ac6fc7da8e89489a5da6ec Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Fri, 30 Jan 2026 16:09:45 +0000 Subject: [PATCH 04/14] wip Signed-off-by: Joe Isaacs --- vortex-btrblocks/src/float.rs | 98 ++++++++++++----------------------- 1 file changed, 32 insertions(+), 66 deletions(-) diff --git a/vortex-btrblocks/src/float.rs b/vortex-btrblocks/src/float.rs index b303a51e435..bfff723da2f 100644 --- a/vortex-btrblocks/src/float.rs +++ b/vortex-btrblocks/src/float.rs @@ -43,6 +43,8 @@ use crate::estimate_compression_ratio_with_sampling; use crate::float::dictionary::dictionary_encode; use crate::integer; use crate::patches::compress_patches; +use crate::rle; +use crate::rle::RLEScheme; pub trait FloatScheme: Scheme + Send + Sync {} @@ -69,9 +71,9 @@ pub const ALL_FLOAT_SCHEMES: &[&dyn FloatScheme] = &[ &ConstantScheme, &ALPScheme, &ALPRDScheme, - // &DictScheme, + &DictScheme, &NullDominated, - // &RLE_FLOAT_SCHEME, + &RLE_FLOAT_SCHEME, ]; /// [`Compressor`] for floating-point numbers. @@ -81,52 +83,6 @@ pub struct FloatCompressor<'a> { pub btr_blocks_compressor: &'a dyn CanonicalCompressor, } -// impl Default for FloatCompressor { -// fn default() -> Self { -// Self { -// schemes: ALL_FLOAT_SCHEMES.to_vec(), -// } -// } -// } - -// impl<'a> FloatCompressor<'a> { -// /// Creates a new compressor with all schemes enabled. -// // pub fn new() -> Self { -// // Self:: -// // } -// -// /// Creates a compressor with only the specified schemes. -// pub fn with_schemes(schemes: Vec<&'static dyn FloatScheme>) -> Self { -// Self { schemes } -// } -// -// /// Creates a compressor excluding schemes with the given codes. -// pub fn excluding(excludes: &[FloatCode]) -> Self { -// Self { -// schemes: ALL_FLOAT_SCHEMES -// .iter() -// .filter(|s| !excludes.contains(&s.code())) -// .copied() -// .collect(), -// } -// } -// -// /// Compress with default settings (static helper for internal use). 
-// pub(crate) fn compress_static( -// array: &PrimitiveArray, -// is_sample: bool, -// allowed_cascading: usize, -// excludes: &[FloatCode], -// ) -> VortexResult { -// let compressor = if excludes.is_empty() { -// Self::default() -// } else { -// Self::excluding(excludes) -// }; -// compress(&compressor, array, is_sample, allowed_cascading, excludes) -// } -// } - impl<'a> Compressor for FloatCompressor<'a> { type ArrayVTable = PrimitiveVTable; type SchemeType = dyn FloatScheme; @@ -203,24 +159,34 @@ struct DictScheme; #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct NullDominated; -// impl rle::RLEConfig for FloatRLEConfig { -// type Stats = FloatStats; -// type Code = FloatCode; -// -// const CODE: FloatCode = FloatCode::Rle; -// -// fn compress_values( -// values: &PrimitiveArray, -// is_sample: bool, -// allowed_cascading: usize, -// excludes: &[FloatCode], -// ) -> VortexResult { -// -// } -// } -// -// /// RLE scheme for float compression. -// pub const RLE_FLOAT_SCHEME: RLEScheme = RLEScheme::new(); +/// Configuration for float RLE compression. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct FloatRLEConfig; + +impl rle::RLEConfig for FloatRLEConfig { + type Stats = FloatStats; + type Code = FloatCode; + + const CODE: FloatCode = FloatCode::Rle; + + fn compress_values( + compressor: &BtrBlocksCompressor, + values: &vortex_array::arrays::PrimitiveArray, + is_sample: bool, + allowed_cascading: usize, + excludes: &[FloatCode], + ) -> VortexResult { + compressor.compress_canonical( + Canonical::Primitive(values.clone()), + is_sample, + allowed_cascading, + Excludes::float_only(excludes), + ) + } +} + +/// RLE scheme for float compression. +pub const RLE_FLOAT_SCHEME: RLEScheme = RLEScheme::new(); impl Scheme for UncompressedScheme { type StatsType = FloatStats; From 55ea45c078ef97c3fdf6bd00a4f0aa2955fc4cb4 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Fri, 30 Jan 2026 17:18:25 +0000 Subject: [PATCH 05/14] wip Signed-off-by: Joe Isaacs --- vortex-btrblocks/src/builder.rs | 90 +-- vortex-btrblocks/src/canonical_compressor.rs | 265 +++++++ vortex-btrblocks/src/compressor.rs | 148 ++++ vortex-btrblocks/src/ctx.rs | 136 ++++ vortex-btrblocks/src/decimal.rs | 5 +- vortex-btrblocks/src/float.rs | 98 +-- vortex-btrblocks/src/integer.rs | 138 ++-- vortex-btrblocks/src/lib.rs | 744 +------------------ vortex-btrblocks/src/rle.rs | 34 +- vortex-btrblocks/src/scheme.rs | 132 ++++ vortex-btrblocks/src/stats.rs | 63 ++ vortex-btrblocks/src/string.rs | 62 +- vortex-btrblocks/src/temporal.rs | 13 +- vortex-layout/src/layouts/compressed.rs | 2 +- 14 files changed, 888 insertions(+), 1042 deletions(-) create mode 100644 vortex-btrblocks/src/canonical_compressor.rs create mode 100644 vortex-btrblocks/src/compressor.rs create mode 100644 vortex-btrblocks/src/ctx.rs create mode 100644 vortex-btrblocks/src/scheme.rs create mode 100644 vortex-btrblocks/src/stats.rs diff --git a/vortex-btrblocks/src/builder.rs b/vortex-btrblocks/src/builder.rs index 9f6f06a24ea..9df7575b252 100644 --- a/vortex-btrblocks/src/builder.rs +++ b/vortex-btrblocks/src/builder.rs @@ -28,15 +28,15 @@ use crate::string::StringScheme; /// use vortex_btrblocks::{BtrBlocksCompressorBuilder, IntCode, FloatCode}; /// /// // Default compressor - all schemes allowed -/// let compressor = BtrBlocksCompressorBuilder::new().build(); +/// let compressor = BtrBlocksCompressorBuilder::default().build(); /// /// // Exclude specific schemes -/// let compressor = BtrBlocksCompressorBuilder::new() +/// let compressor = 
BtrBlocksCompressorBuilder::default() /// .exclude_int([IntCode::Dict]) /// .build(); /// /// // Exclude then re-include -/// let compressor = BtrBlocksCompressorBuilder::new() +/// let compressor = BtrBlocksCompressorBuilder::default() /// .exclude_int([IntCode::Dict, IntCode::Rle]) /// .include_int([IntCode::Dict]) /// .build(); @@ -50,83 +50,37 @@ pub struct BtrBlocksCompressorBuilder { impl Default for BtrBlocksCompressorBuilder { fn default() -> Self { - Self::new() - } -} - -impl BtrBlocksCompressorBuilder { - /// Creates a new builder with all schemes enabled. - pub fn new() -> Self { Self { int_schemes: ALL_INT_SCHEMES.iter().copied().collect(), float_schemes: ALL_FLOAT_SCHEMES.iter().copied().collect(), string_schemes: ALL_STRING_SCHEMES.iter().copied().collect(), } } +} - /// Excludes the specified integer compression schemes (set difference). - /// - /// # Example - /// - /// ```rust - /// use vortex_btrblocks::{BtrBlocksCompressorBuilder, IntCode}; - /// - /// let compressor = BtrBlocksCompressorBuilder::new() - /// .exclude_int([IntCode::Dict, IntCode::Rle]) - /// .build(); - /// ``` +impl BtrBlocksCompressorBuilder { + /// Excludes the specified integer compression schemes. pub fn exclude_int(mut self, codes: impl IntoIterator) -> Self { let codes: HashSet<_> = codes.into_iter().collect(); self.int_schemes.retain(|s| !codes.contains(&s.code())); self } - /// Excludes the specified float compression schemes (set difference). - /// - /// # Example - /// - /// ```rust - /// use vortex_btrblocks::{BtrBlocksCompressorBuilder, FloatCode}; - /// - /// let compressor = BtrBlocksCompressorBuilder::new() - /// .exclude_float([FloatCode::Dict, FloatCode::Alp]) - /// .build(); - /// ``` + /// Excludes the specified float compression schemes. pub fn exclude_float(mut self, codes: impl IntoIterator) -> Self { let codes: HashSet<_> = codes.into_iter().collect(); self.float_schemes.retain(|s| !codes.contains(&s.code())); self } - /// Excludes the specified string compression schemes (set difference). - /// - /// # Example - /// - /// ```rust - /// use vortex_btrblocks::{BtrBlocksCompressorBuilder, StringCode}; - /// - /// let compressor = BtrBlocksCompressorBuilder::new() - /// .exclude_string([StringCode::Dict, StringCode::Fsst]) - /// .build(); - /// ``` + /// Excludes the specified string compression schemes. pub fn exclude_string(mut self, codes: impl IntoIterator) -> Self { let codes: HashSet<_> = codes.into_iter().collect(); self.string_schemes.retain(|s| !codes.contains(&s.code())); self } - /// Includes the specified integer compression schemes (set union). - /// - /// # Example - /// - /// ```rust - /// use vortex_btrblocks::{BtrBlocksCompressorBuilder, IntCode}; - /// - /// let compressor = BtrBlocksCompressorBuilder::new() - /// .exclude_int([IntCode::Dict, IntCode::Rle]) - /// .include_int([IntCode::Dict]) // re-enables Dict - /// .build(); - /// ``` + /// Includes the specified integer compression schemes. pub fn include_int(mut self, codes: impl IntoIterator) -> Self { let codes: HashSet<_> = codes.into_iter().collect(); for scheme in ALL_INT_SCHEMES { @@ -137,18 +91,7 @@ impl BtrBlocksCompressorBuilder { self } - /// Includes the specified float compression schemes (set union). 
- /// - /// # Example - /// - /// ```rust - /// use vortex_btrblocks::{BtrBlocksCompressorBuilder, FloatCode}; - /// - /// let compressor = BtrBlocksCompressorBuilder::new() - /// .exclude_float([FloatCode::Alp, FloatCode::AlpRd]) - /// .include_float([FloatCode::Alp]) // re-enables Alp - /// .build(); - /// ``` + /// Includes the specified float compression schemes. pub fn include_float(mut self, codes: impl IntoIterator) -> Self { let codes: HashSet<_> = codes.into_iter().collect(); for scheme in ALL_FLOAT_SCHEMES { @@ -159,18 +102,7 @@ impl BtrBlocksCompressorBuilder { self } - /// Includes the specified string compression schemes (set union). - /// - /// # Example - /// - /// ```rust - /// use vortex_btrblocks::{BtrBlocksCompressorBuilder, StringCode}; - /// - /// let compressor = BtrBlocksCompressorBuilder::new() - /// .exclude_string([StringCode::Dict, StringCode::Fsst]) - /// .include_string([StringCode::Dict]) // re-enables Dict - /// .build(); - /// ``` + /// Includes the specified string compression schemes. pub fn include_string(mut self, codes: impl IntoIterator) -> Self { let codes: HashSet<_> = codes.into_iter().collect(); for scheme in ALL_STRING_SCHEMES { diff --git a/vortex-btrblocks/src/canonical_compressor.rs b/vortex-btrblocks/src/canonical_compressor.rs new file mode 100644 index 00000000000..5a3e6c7beaa --- /dev/null +++ b/vortex-btrblocks/src/canonical_compressor.rs @@ -0,0 +1,265 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Canonical array compression implementation. + +use vortex_array::Array; +use vortex_array::ArrayRef; +use vortex_array::Canonical; +use vortex_array::IntoArray; +use vortex_array::ToCanonical; +use vortex_array::arrays::ConstantArray; +use vortex_array::arrays::ExtensionArray; +use vortex_array::arrays::FixedSizeListArray; +use vortex_array::arrays::ListArray; +use vortex_array::arrays::StructArray; +use vortex_array::arrays::TemporalArray; +use vortex_array::arrays::list_from_list_view; +use vortex_array::compute::Cost; +use vortex_array::compute::IsConstantOpts; +use vortex_array::compute::is_constant_opts; +use vortex_array::vtable::ValidityHelper; +use vortex_dtype::DType; +use vortex_dtype::Nullability; +use vortex_dtype::datetime::TemporalMetadata; +use vortex_error::VortexResult; + +use crate::BtrBlocksCompressorBuilder; +use crate::CompressorContext; +use crate::CompressorExt; +use crate::Excludes; +use crate::FloatCompressor; +use crate::IntCode; +use crate::IntCompressor; +use crate::StringCompressor; +use crate::decimal::compress_decimal; +use crate::float::FloatScheme; +use crate::integer::IntegerScheme; +use crate::string::StringScheme; +use crate::temporal::compress_temporal; + +/// Trait for compressors that can compress canonical arrays. +/// +/// Provides access to configured compression schemes and the ability to +/// compress canonical arrays recursively. +pub trait CanonicalCompressor { + /// Compresses a canonical array with the specified options. + fn compress_canonical( + &self, + array: Canonical, + ctx: CompressorContext, + excludes: Excludes, + ) -> VortexResult; + + /// Returns the enabled integer compression schemes. + fn int_schemes(&self) -> &[&'static dyn IntegerScheme]; + + /// Returns the enabled float compression schemes. + fn float_schemes(&self) -> &[&'static dyn FloatScheme]; + + /// Returns the enabled string compression schemes. 
+ fn string_schemes(&self) -> &[&'static dyn StringScheme]; +} + +/// The main compressor type implementing BtrBlocks-inspired compression. +/// +/// This compressor applies adaptive compression schemes to arrays based on their data types +/// and characteristics. It recursively compresses nested structures like structs and lists, +/// and chooses optimal compression schemes for primitive types. +/// +/// The compressor works by: +/// 1. Canonicalizing input arrays to a standard representation +/// 2. Analyzing data characteristics to choose optimal compression schemes +/// 3. Recursively compressing nested structures +/// 4. Applying type-specific compression for primitives, strings, and temporal data +/// +/// Use [`BtrBlocksCompressorBuilder`] to configure which compression schemes are enabled. +/// +/// # Examples +/// +/// ```rust +/// use vortex_btrblocks::{BtrBlocksCompressor, BtrBlocksCompressorBuilder, IntCode}; +/// +/// // Default compressor - all schemes allowed +/// let compressor = BtrBlocksCompressor::default(); +/// +/// // Exclude specific schemes using the builder +/// let compressor = BtrBlocksCompressorBuilder::default() +/// .exclude_int([IntCode::Dict]) +/// .build(); +/// ``` +#[derive(Clone)] +pub struct BtrBlocksCompressor { + /// Integer compressor with configured schemes. + pub int_schemes: Vec<&'static dyn IntegerScheme>, + + /// Float compressor with configured schemes. + pub float_schemes: Vec<&'static dyn FloatScheme>, + + /// String compressor with configured schemes. + pub string_schemes: Vec<&'static dyn StringScheme>, +} + +impl Default for BtrBlocksCompressor { + fn default() -> Self { + BtrBlocksCompressorBuilder::default().build() + } +} + +impl BtrBlocksCompressor { + /// Compresses an array using BtrBlocks-inspired compression. + /// + /// First canonicalizes and compacts the array, then applies optimal compression schemes. + pub fn compress(&self, array: &dyn Array) -> VortexResult { + // Canonicalize the array + let canonical = array.to_canonical()?; + + // Compact it, removing any wasted space before we attempt to compress it + let compact = canonical.compact()?; + + self.compress_canonical(compact, CompressorContext::default(), Excludes::none()) + } +} + +impl CanonicalCompressor for BtrBlocksCompressor { + /// Compresses a canonical array by dispatching to type-specific compressors. + /// + /// Recursively compresses nested structures and applies optimal schemes for each data type. + fn compress_canonical( + &self, + array: Canonical, + ctx: CompressorContext, + excludes: Excludes, + ) -> VortexResult { + match array { + Canonical::Null(null_array) => Ok(null_array.into_array()), + // TODO(aduffy): Sparse, other bool compressors. + Canonical::Bool(bool_array) => Ok(bool_array.into_array()), + Canonical::Primitive(primitive) => { + if primitive.ptype().is_int() { + IntCompressor { + btr_blocks_compressor: self, + } + .compress(self, &primitive, ctx, excludes.int) + } else { + FloatCompressor { + btr_blocks_compressor: self, + } + .compress(self, &primitive, ctx, excludes.float) + } + } + Canonical::Decimal(decimal) => compress_decimal(self, &decimal), + Canonical::Struct(struct_array) => { + let fields = struct_array + .unmasked_fields() + .iter() + .map(|field| self.compress(field)) + .collect::, _>>()?; + + Ok(StructArray::try_new( + struct_array.names().clone(), + fields, + struct_array.len(), + struct_array.validity().clone(), + )? 
+ .into_array()) + } + Canonical::List(list_view_array) => { + // TODO(joe): We might want to write list views in the future and chose between + // list and list view. + let list_array = list_from_list_view(list_view_array)?; + + // Reset the offsets to remove garbage data that might prevent us from narrowing our + // offsets (there could be a large amount of trailing garbage data that the current + // views do not reference at all). + let list_array = list_array.reset_offsets(true)?; + + let compressed_elems = self.compress(list_array.elements())?; + + // Note that since the type of our offsets are not encoded in our `DType`, and since + // we guarantee above that all elements are referenced by offsets, we may narrow the + // widths. + + let compressed_offsets = self.compress_canonical( + Canonical::Primitive(list_array.offsets().to_primitive().narrow()?), + ctx, + Excludes::int_only(&[IntCode::Dict]), + )?; + + Ok(ListArray::try_new( + compressed_elems, + compressed_offsets, + list_array.validity().clone(), + )? + .into_array()) + } + Canonical::FixedSizeList(fsl_array) => { + let compressed_elems = self.compress(fsl_array.elements())?; + + Ok(FixedSizeListArray::try_new( + compressed_elems, + fsl_array.list_size(), + fsl_array.validity().clone(), + fsl_array.len(), + )? + .into_array()) + } + Canonical::VarBinView(strings) => { + if strings + .dtype() + .eq_ignore_nullability(&DType::Utf8(Nullability::NonNullable)) + { + StringCompressor { + btr_blocks_compressor: self, + } + .compress(self, &strings, ctx, excludes.string) + } else { + // Binary arrays do not compress + Ok(strings.into_array()) + } + } + Canonical::Extension(ext_array) => { + // We compress Timestamp-level arrays with DateTimeParts compression + if let Ok(temporal_array) = TemporalArray::try_from(ext_array.to_array()) + && let TemporalMetadata::Timestamp(..) = temporal_array.temporal_metadata() + { + if is_constant_opts( + temporal_array.as_ref(), + &IsConstantOpts { + cost: Cost::Canonicalize, + }, + )? + .unwrap_or_default() + { + return Ok(ConstantArray::new( + temporal_array.as_ref().scalar_at(0)?, + ext_array.len(), + ) + .into_array()); + } + return compress_temporal(self, temporal_array); + } + + // Compress the underlying storage array. + let compressed_storage = self.compress(ext_array.storage())?; + + Ok( + ExtensionArray::new(ext_array.ext_dtype().clone(), compressed_storage) + .into_array(), + ) + } + } + } + + fn int_schemes(&self) -> &[&'static dyn IntegerScheme] { + &self.int_schemes + } + + fn float_schemes(&self) -> &[&'static dyn FloatScheme] { + &self.float_schemes + } + + fn string_schemes(&self) -> &[&'static dyn StringScheme] { + &self.string_schemes + } +} diff --git a/vortex-btrblocks/src/compressor.rs b/vortex-btrblocks/src/compressor.rs new file mode 100644 index 00000000000..15ca78cfcf0 --- /dev/null +++ b/vortex-btrblocks/src/compressor.rs @@ -0,0 +1,148 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Compressor traits for type-specific compression. + +use vortex_array::ArrayRef; +use vortex_array::vtable::VTable; +use vortex_error::VortexResult; + +use crate::BtrBlocksCompressor; +use crate::CompressorContext; +use crate::CompressorStats; +use crate::Scheme; + +/// Maximum cascade depth for compression. +pub(crate) const MAX_CASCADE: usize = 3; + +/// A compressor for a particular input type. 
+/// +/// This trait defines the interface for type-specific compressors that can adaptively +/// choose and apply compression schemes based on data characteristics. Compressors +/// analyze input arrays, select optimal compression schemes, and handle cascading +/// compression with multiple encoding layers. +/// +/// The compressor works by generating statistics on the input data, evaluating +/// available compression schemes, and selecting the one with the best compression ratio. +pub trait Compressor { + /// The VTable type for arrays this compressor operates on. + type ArrayVTable: VTable; + /// The compression scheme type used by this compressor. + type SchemeType: Scheme + ?Sized; + /// The statistics type used to analyze arrays for compression. + type StatsType: CompressorStats; + + /// Generates statistics for the given array to guide compression scheme selection. + fn gen_stats(&self, array: &::Array) -> Self::StatsType; + + /// Returns all available compression schemes for this compressor. + fn schemes(&self) -> &[&'static Self::SchemeType]; + /// Returns the default fallback compression scheme. + fn default_scheme(&self) -> &'static Self::SchemeType; +} + +/// Extension trait providing scheme selection and compression for compressors. +pub trait CompressorExt: Compressor +where + Self::SchemeType: 'static, +{ + /// Selects the best compression scheme based on expected compression ratios. + /// + /// Evaluates all available schemes against the provided statistics and returns + /// the one with the highest compression ratio. Falls back to the default scheme + /// if no scheme provides compression benefits. + #[allow(clippy::cognitive_complexity)] + fn choose_scheme( + &self, + compressor: &BtrBlocksCompressor, + stats: &Self::StatsType, + ctx: CompressorContext, + excludes: &[::CodeType], + ) -> VortexResult<&'static Self::SchemeType> { + let mut best_ratio = 1.0; + let mut best_scheme: Option<&'static Self::SchemeType> = None; + + // logging helpers + let depth = MAX_CASCADE - ctx.allowed_cascading; + + for scheme in self.schemes().iter() { + // Skip excluded schemes + if excludes.contains(&scheme.code()) { + continue; + } + + // We never choose Constant for a sample + if ctx.is_sample && scheme.is_constant() { + continue; + } + + tracing::trace!( + is_sample = ctx.is_sample, + depth, + is_constant = scheme.is_constant(), + ?scheme, + "Trying compression scheme" + ); + + let ratio = scheme.expected_compression_ratio(compressor, stats, ctx, excludes)?; + tracing::trace!( + is_sample = ctx.is_sample, + depth, + ratio, + ?scheme, + "Expected compression result" + ); + + if !(ratio.is_subnormal() || ratio.is_infinite() || ratio.is_nan()) { + if ratio > best_ratio { + best_ratio = ratio; + best_scheme = Some(*scheme); + } + } else { + tracing::trace!( + "Calculated invalid compression ratio {ratio} for scheme: {scheme:?}. Must not be sub-normal, infinite or nan." + ); + } + } + + tracing::trace!(depth, scheme = ?best_scheme, ratio = best_ratio, "best scheme found"); + + if let Some(best) = best_scheme { + Ok(best) + } else { + Ok(self.default_scheme()) + } + } + + /// Compresses an array using this compressor. + /// + /// Generates statistics on the input array, selects the best compression scheme, + /// and applies it. Returns the original array if compression would increase size. 
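+    ///
+    /// # Example
+    ///
+    /// A rough sketch of how the crate drives this method internally (mirroring the
+    /// dispatch in `compress_canonical`). `IntCompressor` and its `btr_blocks_compressor`
+    /// field are crate-private details of this patch, so this is illustrative only:
+    ///
+    /// ```rust,ignore
+    /// let compressor = BtrBlocksCompressor::default();
+    /// let int_compressor = IntCompressor { btr_blocks_compressor: &compressor };
+    /// let compressed = int_compressor.compress(
+    ///     &compressor,
+    ///     &primitive,                   // a canonical PrimitiveArray of integers
+    ///     CompressorContext::default(), // top level: not a sample, full cascade budget
+    ///     &[],                          // no excluded IntCodes
+    /// )?;
+    /// ```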
+ fn compress( + &self, + btr_blocks_compressor: &BtrBlocksCompressor, + array: &<::ArrayVTable as VTable>::Array, + ctx: CompressorContext, + excludes: &[::CodeType], + ) -> VortexResult { + // Avoid compressing empty arrays. + if array.is_empty() { + return Ok(array.to_array()); + } + + // Generate stats on the array directly. + let stats = self.gen_stats(array); + let best_scheme = self.choose_scheme(btr_blocks_compressor, &stats, ctx, excludes)?; + + let output = best_scheme.compress(btr_blocks_compressor, &stats, ctx, excludes)?; + if output.nbytes() < array.nbytes() { + Ok(output) + } else { + tracing::debug!("resulting tree too large: {}", output.display_tree()); + Ok(array.to_array()) + } + } +} + +// Blanket implementation for all Compressor types with 'static SchemeType +impl CompressorExt for T where T::SchemeType: 'static {} diff --git a/vortex-btrblocks/src/ctx.rs b/vortex-btrblocks/src/ctx.rs new file mode 100644 index 00000000000..f2cb6a37102 --- /dev/null +++ b/vortex-btrblocks/src/ctx.rs @@ -0,0 +1,136 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Compression context types for recursive compression. + +use crate::FloatCode; +use crate::IntCode; +use crate::MAX_CASCADE; +use crate::StringCode; + +/// Holds references to exclude lists for each compression code type. +/// +/// This struct is passed through recursive compression calls to specify +/// which schemes should be excluded at each level. +#[derive(Debug, Clone, Copy, Default)] +pub struct Excludes<'a> { + /// Integer schemes to exclude. + pub int: &'a [IntCode], + /// Float schemes to exclude. + pub float: &'a [FloatCode], + /// String schemes to exclude. + pub string: &'a [StringCode], +} + +impl<'a> Excludes<'a> { + /// Creates an empty excludes (no exclusions). + pub const fn none() -> Self { + Self { + int: &[], + float: &[], + string: &[], + } + } + + /// Creates excludes with only integer exclusions. + pub const fn int_only(int: &'a [IntCode]) -> Self { + Self { + int, + float: &[], + string: &[], + } + } + + /// Creates excludes with only float exclusions. + pub const fn float_only(float: &'a [FloatCode]) -> Self { + Self { + int: &[], + float, + string: &[], + } + } + + /// Creates excludes with only string exclusions. + pub const fn string_only(string: &'a [StringCode]) -> Self { + Self { + int: &[], + float: &[], + string, + } + } +} + +impl<'a> From<&'a [IntCode]> for Excludes<'a> { + fn from(int: &'a [IntCode]) -> Self { + Self::int_only(int) + } +} + +impl<'a, const N: usize> From<&'a [IntCode; N]> for Excludes<'a> { + fn from(int: &'a [IntCode; N]) -> Self { + Self::int_only(int) + } +} + +impl<'a> From<&'a [FloatCode]> for Excludes<'a> { + fn from(float: &'a [FloatCode]) -> Self { + Self::float_only(float) + } +} + +impl<'a, const N: usize> From<&'a [FloatCode; N]> for Excludes<'a> { + fn from(float: &'a [FloatCode; N]) -> Self { + Self::float_only(float) + } +} + +impl<'a> From<&'a [StringCode]> for Excludes<'a> { + fn from(string: &'a [StringCode]) -> Self { + Self::string_only(string) + } +} + +impl<'a, const N: usize> From<&'a [StringCode; N]> for Excludes<'a> { + fn from(string: &'a [StringCode; N]) -> Self { + Self::string_only(string) + } +} + +/// Context passed through recursive compression calls. +/// +/// Bundles `is_sample` and `allowed_cascading` which always travel together. +/// Excludes are passed separately since they're type-specific. 
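+///
+/// # Example
+///
+/// A small sketch of how the context evolves down an encoding tree (the numbers assume
+/// the crate's `MAX_CASCADE` of 3):
+///
+/// ```rust,ignore
+/// let ctx = CompressorContext::default(); // is_sample = false, allowed_cascading = 3
+/// let child = ctx.descend();              // allowed_cascading = 2
+/// let sample = child.as_sample();         // is_sample = true, depth unchanged
+/// assert_eq!(sample.allowed_cascading, 2);
+/// ```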
+#[derive(Debug, Clone, Copy)] +pub struct CompressorContext { + /// Whether we're compressing a sample (for ratio estimation). + pub is_sample: bool, + /// Remaining cascade depth allowed. + pub allowed_cascading: usize, +} + +impl Default for CompressorContext { + fn default() -> Self { + Self { + is_sample: false, + allowed_cascading: MAX_CASCADE, + } + } +} + +impl CompressorContext { + /// Descend one level in the cascade (decrements `allowed_cascading`). + pub fn descend(self) -> Self { + Self { + allowed_cascading: self.allowed_cascading.saturating_sub(1), + ..self + } + } + + /// Returns a context marked as sample compression (for ratio estimation). + pub fn as_sample(self) -> Self { + Self { + is_sample: true, + ..self + } + } +} diff --git a/vortex-btrblocks/src/decimal.rs b/vortex-btrblocks/src/decimal.rs index 479b0d1c35c..5170405d10c 100644 --- a/vortex-btrblocks/src/decimal.rs +++ b/vortex-btrblocks/src/decimal.rs @@ -13,8 +13,8 @@ use vortex_scalar::DecimalType; use crate::BtrBlocksCompressor; use crate::CanonicalCompressor; +use crate::CompressorContext; use crate::Excludes; -use crate::MAX_CASCADE; // TODO(joe): add support splitting i128/256 buffers into chunks primitive values for compression. // 2 for i128 and 4 for i256 @@ -34,8 +34,7 @@ pub fn compress_decimal( let compressed = compressor.compress_canonical( Canonical::Primitive(prim), - false, - MAX_CASCADE, + CompressorContext::default(), Excludes::none(), )?; diff --git a/vortex-btrblocks/src/float.rs b/vortex-btrblocks/src/float.rs index bfff723da2f..69c298cd4e3 100644 --- a/vortex-btrblocks/src/float.rs +++ b/vortex-btrblocks/src/float.rs @@ -34,12 +34,13 @@ pub use self::stats::FloatStats; use crate::BtrBlocksCompressor; use crate::CanonicalCompressor; use crate::Compressor; +use crate::CompressorContext; use crate::CompressorStats; use crate::Excludes; use crate::GenerateStatsOptions; use crate::IntCode; use crate::Scheme; -use crate::estimate_compression_ratio_with_sampling; +use crate::SchemeExt; use crate::float::dictionary::dictionary_encode; use crate::integer; use crate::patches::compress_patches; @@ -172,14 +173,12 @@ impl rle::RLEConfig for FloatRLEConfig { fn compress_values( compressor: &BtrBlocksCompressor, values: &vortex_array::arrays::PrimitiveArray, - is_sample: bool, - allowed_cascading: usize, + ctx: CompressorContext, excludes: &[FloatCode], ) -> VortexResult { compressor.compress_canonical( Canonical::Primitive(values.clone()), - is_sample, - allowed_cascading, + ctx, Excludes::float_only(excludes), ) } @@ -200,8 +199,7 @@ impl Scheme for UncompressedScheme { &self, _compressor: &BtrBlocksCompressor, _stats: &Self::StatsType, - _is_sample: bool, - _allowed_cascading: usize, + _ctx: CompressorContext, _excludes: &[FloatCode], ) -> VortexResult { Ok(1.0) @@ -211,8 +209,7 @@ impl Scheme for UncompressedScheme { &self, _btr_blocks_compressor: &BtrBlocksCompressor, stats: &Self::StatsType, - _is_sample: bool, - _allowed_cascading: usize, + _ctx: CompressorContext, _excludes: &[FloatCode], ) -> VortexResult { Ok(stats.source().to_array()) @@ -231,12 +228,11 @@ impl Scheme for ConstantScheme { &self, _btr_blocks_compressor: &BtrBlocksCompressor, stats: &Self::StatsType, - is_sample: bool, - _allowed_cascading: usize, + ctx: CompressorContext, _excludes: &[FloatCode], ) -> VortexResult { // Never select Constant when sampling - if is_sample { + if ctx.is_sample { return Ok(0.0); } @@ -256,8 +252,7 @@ impl Scheme for ConstantScheme { &self, _btr_blocks_compressor: &BtrBlocksCompressor, stats: 
&Self::StatsType, - _is_sample: bool, - _allowed_cascading: usize, + _ctx: CompressorContext, _excludes: &[FloatCode], ) -> VortexResult { let scalar_idx = @@ -294,8 +289,7 @@ impl Scheme for ALPScheme { &self, compressor: &BtrBlocksCompressor, stats: &Self::StatsType, - is_sample: bool, - allowed_cascading: usize, + ctx: CompressorContext, excludes: &[FloatCode], ) -> VortexResult { // We don't support ALP for f16 @@ -303,28 +297,20 @@ impl Scheme for ALPScheme { return Ok(0.0); } - if allowed_cascading == 0 { + if ctx.allowed_cascading == 0 { // ALP does not compress on its own, we need to be able to cascade it with // an integer compressor. return Ok(0.0); } - estimate_compression_ratio_with_sampling( - self, - compressor, - stats, - is_sample, - allowed_cascading, - excludes, - ) + self.estimate_compression_ratio_with_sampling(compressor, stats, ctx, excludes) } fn compress( &self, compressor: &BtrBlocksCompressor, stats: &FloatStats, - is_sample: bool, - allowed_cascading: usize, + ctx: CompressorContext, excludes: &[FloatCode], ) -> VortexResult { let alp_encoded = alp_encode(&stats.source().to_primitive(), None)?; @@ -344,8 +330,7 @@ impl Scheme for ALPScheme { let compressed_alp_ints = compressor.compress_canonical( Canonical::Primitive(alp_ints), - is_sample, - allowed_cascading - 1, + ctx.descend(), Excludes::int_only(&int_excludes), )?; @@ -367,30 +352,21 @@ impl Scheme for ALPRDScheme { &self, compressor: &BtrBlocksCompressor, stats: &Self::StatsType, - is_sample: bool, - allowed_cascading: usize, + ctx: CompressorContext, excludes: &[FloatCode], ) -> VortexResult { if stats.source().ptype() == PType::F16 { return Ok(0.0); } - estimate_compression_ratio_with_sampling( - self, - compressor, - stats, - is_sample, - allowed_cascading, - excludes, - ) + self.estimate_compression_ratio_with_sampling(compressor, stats, ctx, excludes) } fn compress( &self, _compressor: &BtrBlocksCompressor, stats: &Self::StatsType, - _is_sample: bool, - _allowed_cascading: usize, + _ctx: CompressorContext, _excludes: &[FloatCode], ) -> VortexResult { let encoder = match stats.source().ptype() { @@ -423,8 +399,7 @@ impl Scheme for DictScheme { &self, compressor: &BtrBlocksCompressor, stats: &Self::StatsType, - is_sample: bool, - allowed_cascading: usize, + ctx: CompressorContext, excludes: &[FloatCode], ) -> VortexResult { if stats.value_count == 0 { @@ -437,22 +412,14 @@ impl Scheme for DictScheme { } // Take a sample and run compression on the sample to determine before/after size. 
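+        // Worked example of the sample sizing used below (values assume this crate's
+        // SAMPLE_SIZE = 64 and SAMPLE_COUNT = 16): for 1_000_000 rows,
+        // 1_000_000 / 100 / 64 = 156 runs, rounded up to a multiple of 16 = 160 runs,
+        // i.e. 160 * 64 = 10_240 sampled values (~1% of the data).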
- estimate_compression_ratio_with_sampling( - self, - compressor, - stats, - is_sample, - allowed_cascading, - excludes, - ) + self.estimate_compression_ratio_with_sampling(compressor, stats, ctx, excludes) } fn compress( &self, compressor: &BtrBlocksCompressor, stats: &Self::StatsType, - is_sample: bool, - allowed_cascading: usize, + ctx: CompressorContext, _excludes: &[Self::CodeType], ) -> VortexResult { let dict = dictionary_encode(stats); @@ -461,16 +428,14 @@ impl Scheme for DictScheme { let compressed_codes = compressor.compress_canonical( Canonical::Primitive(codes.to_primitive()), - is_sample, - allowed_cascading - 1, + ctx.descend(), Excludes::int_only(&[IntCode::Dict, IntCode::Sequence]), )?; assert!(values.is_canonical()); let compressed_values = compressor.compress_canonical( Canonical::Primitive(values.to_primitive()), - is_sample, - allowed_cascading - 1, + ctx.descend(), Excludes::float_only(&[FloatCode::Dict]), )?; @@ -497,12 +462,11 @@ impl Scheme for NullDominated { &self, _compressor: &BtrBlocksCompressor, stats: &Self::StatsType, - _is_sample: bool, - allowed_cascading: usize, + ctx: CompressorContext, _excludes: &[Self::CodeType], ) -> VortexResult { // Only use `SparseScheme` if we can cascade. - if allowed_cascading == 0 { + if ctx.allowed_cascading == 0 { return Ok(0.0); } @@ -524,11 +488,10 @@ impl Scheme for NullDominated { &self, compressor: &BtrBlocksCompressor, stats: &Self::StatsType, - is_sample: bool, - allowed_cascading: usize, + ctx: CompressorContext, _excludes: &[Self::CodeType], ) -> VortexResult { - assert!(allowed_cascading > 0); + assert!(ctx.allowed_cascading > 0); // We pass None as we only run this pathway for NULL-dominated float arrays let sparse_encoded = SparseArray::encode(stats.src.as_ref(), None)?; @@ -542,16 +505,9 @@ impl Scheme for NullDominated { let indices = sparse.patches().indices().to_primitive().narrow()?; let compressed_indices = compressor.compress_canonical( Canonical::Primitive(indices.to_primitive()), - is_sample, - allowed_cascading - 1, + ctx.descend(), Excludes::int_only(&new_excludes), )?; - // let compressed_indices = IntCompressor::compress_no_dict_static( - // &indices, - // is_sample, - // allowed_cascading - 1, - // &new_excludes, - // )?; SparseArray::try_new( compressed_indices, diff --git a/vortex-btrblocks/src/integer.rs b/vortex-btrblocks/src/integer.rs index b379e5ed6a6..c92867028b9 100644 --- a/vortex-btrblocks/src/integer.rs +++ b/vortex-btrblocks/src/integer.rs @@ -40,11 +40,12 @@ use vortex_zigzag::zigzag_encode; use crate::BtrBlocksCompressor; use crate::CanonicalCompressor; use crate::Compressor; +use crate::CompressorContext; use crate::CompressorStats; use crate::Excludes; use crate::GenerateStatsOptions; use crate::Scheme; -use crate::estimate_compression_ratio_with_sampling; +use crate::SchemeExt; use crate::integer::dictionary::dictionary_encode; use crate::patches::compress_patches; use crate::rle; @@ -203,14 +204,12 @@ impl rle::RLEConfig for IntRLEConfig { fn compress_values( compressor: &BtrBlocksCompressor, values: &PrimitiveArray, - is_sample: bool, - allowed_cascading: usize, + ctx: CompressorContext, excludes: &[IntCode], ) -> VortexResult { compressor.compress_canonical( Canonical::Primitive(values.clone()), - is_sample, - allowed_cascading, + ctx, Excludes::int_only(excludes), ) } @@ -231,8 +230,7 @@ impl Scheme for UncompressedScheme { &self, _compressor: &BtrBlocksCompressor, _stats: &IntegerStats, - _is_sample: bool, - _allowed_cascading: usize, + _ctx: CompressorContext, _excludes: 
&[IntCode], ) -> VortexResult { // no compression @@ -243,8 +241,7 @@ impl Scheme for UncompressedScheme { &self, _compressor: &BtrBlocksCompressor, stats: &IntegerStats, - _is_sample: bool, - _allowed_cascading: usize, + _ctx: CompressorContext, _excludes: &[IntCode], ) -> VortexResult { Ok(stats.source().to_array()) @@ -267,12 +264,11 @@ impl Scheme for ConstantScheme { &self, _compressor: &BtrBlocksCompressor, stats: &IntegerStats, - is_sample: bool, - _allowed_cascading: usize, + ctx: CompressorContext, _excludes: &[IntCode], ) -> VortexResult { // Never yield ConstantScheme for a sample, it could be a false-positive. - if is_sample { + if ctx.is_sample { return Ok(0.0); } @@ -288,8 +284,7 @@ impl Scheme for ConstantScheme { &self, _compressor: &BtrBlocksCompressor, stats: &IntegerStats, - _is_sample: bool, - _allowed_cascading: usize, + _ctx: CompressorContext, _excludes: &[IntCode], ) -> VortexResult { let scalar_idx = @@ -326,12 +321,11 @@ impl Scheme for FORScheme { &self, _compressor: &BtrBlocksCompressor, stats: &IntegerStats, - _is_sample: bool, - allowed_cascading: usize, + ctx: CompressorContext, _excludes: &[IntCode], ) -> VortexResult { // Only apply if we are not at the leaf - if allowed_cascading == 0 { + if ctx.allowed_cascading == 0 { return Ok(0.0); } @@ -371,8 +365,7 @@ impl Scheme for FORScheme { &self, compressor: &BtrBlocksCompressor, stats: &IntegerStats, - is_sample: bool, - _allowed_cascading: usize, + ctx: CompressorContext, excludes: &[IntCode], ) -> VortexResult { let for_array = FoRArray::encode(stats.src.clone())?; @@ -388,8 +381,12 @@ impl Scheme for FORScheme { // of bitpacking. // NOTE: we could delegate in the future if we had another downstream codec that performs // as well. + let leaf_ctx = CompressorContext { + is_sample: ctx.is_sample, + allowed_cascading: 0, + }; let compressed = - BitPackingScheme.compress(compressor, &biased_stats, is_sample, 0, excludes)?; + BitPackingScheme.compress(compressor, &biased_stats, leaf_ctx, excludes)?; let for_compressed = FoRArray::try_new(compressed, for_array.reference_scalar().clone())?; for_compressed @@ -412,12 +409,11 @@ impl Scheme for ZigZagScheme { &self, compressor: &BtrBlocksCompressor, stats: &IntegerStats, - is_sample: bool, - allowed_cascading: usize, + ctx: CompressorContext, excludes: &[IntCode], ) -> VortexResult { // ZigZag is only useful when we cascade it with another encoding - if allowed_cascading == 0 { + if ctx.allowed_cascading == 0 { return Ok(0.0); } @@ -432,22 +428,14 @@ impl Scheme for ZigZagScheme { } // Run compression on a sample to see how it performs. - estimate_compression_ratio_with_sampling( - self, - compressor, - stats, - is_sample, - allowed_cascading, - excludes, - ) + self.estimate_compression_ratio_with_sampling(compressor, stats, ctx, excludes) } fn compress( &self, compressor: &BtrBlocksCompressor, stats: &IntegerStats, - is_sample: bool, - allowed_cascading: usize, + ctx: CompressorContext, excludes: &[IntCode], ) -> VortexResult { // Zigzag encode the values, then recursively compress the inner values. 
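+        // For reference, zigzag interleaves signed values into unsigned ones so that
+        // small magnitudes stay small (0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, ...). A
+        // standalone sketch for i32 (the actual vortex-zigzag API may differ):
+        //
+        //     fn zigzag_encode(n: i32) -> u32 {
+        //         ((n << 1) ^ (n >> 31)) as u32
+        //     }
+        //     fn zigzag_decode(u: u32) -> i32 {
+        //         ((u >> 1) as i32) ^ -((u & 1) as i32)
+        //     }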
@@ -466,16 +454,9 @@ impl Scheme for ZigZagScheme { let compressed = compressor.compress_canonical( Canonical::Primitive(encoded), - is_sample, - allowed_cascading - 1, + ctx.descend(), Excludes::int_only(&new_excludes), )?; - // let compressed = IntCompressor::compress_static( - // &encoded, - // is_sample, - // allowed_cascading - 1, - // &new_excludes, - // )?; tracing::debug!("zigzag output: {}", compressed.display_tree()); @@ -495,8 +476,7 @@ impl Scheme for BitPackingScheme { &self, compressor: &BtrBlocksCompressor, stats: &IntegerStats, - is_sample: bool, - allowed_cascading: usize, + ctx: CompressorContext, excludes: &[IntCode], ) -> VortexResult { // BitPacking only works for non-negative values @@ -509,22 +489,14 @@ impl Scheme for BitPackingScheme { return Ok(0.0); } - estimate_compression_ratio_with_sampling( - self, - compressor, - stats, - is_sample, - allowed_cascading, - excludes, - ) + self.estimate_compression_ratio_with_sampling(compressor, stats, ctx, excludes) } fn compress( &self, _compressor: &BtrBlocksCompressor, stats: &IntegerStats, - _is_sample: bool, - _allowed_cascading: usize, + _ctx: CompressorContext, _excludes: &[IntCode], ) -> VortexResult { let histogram = bit_width_histogram(stats.source())?; @@ -555,12 +527,11 @@ impl Scheme for SparseScheme { &self, _compressor: &BtrBlocksCompressor, stats: &IntegerStats, - _is_sample: bool, - allowed_cascading: usize, + ctx: CompressorContext, _excludes: &[IntCode], ) -> VortexResult { // Only use `SparseScheme` if we can cascade. - if allowed_cascading == 0 { + if ctx.allowed_cascading == 0 { return Ok(0.0); } @@ -595,11 +566,10 @@ impl Scheme for SparseScheme { &self, compressor: &BtrBlocksCompressor, stats: &IntegerStats, - is_sample: bool, - allowed_cascading: usize, + ctx: CompressorContext, excludes: &[IntCode], ) -> VortexResult { - assert!(allowed_cascading > 0); + assert!(ctx.allowed_cascading > 0); let (top_pvalue, top_count) = stats.typed.top_value_and_count(); if top_count as usize == stats.src.len() { // top_value is the only value, use ConstantScheme @@ -630,8 +600,7 @@ impl Scheme for SparseScheme { let compressed_values = compressor.compress_canonical( Canonical::Primitive(sparse.patches().values().to_primitive()), - is_sample, - allowed_cascading - 1, + ctx.descend(), Excludes::int_only(&new_excludes), )?; @@ -639,8 +608,7 @@ impl Scheme for SparseScheme { let compressed_indices = compressor.compress_canonical( Canonical::Primitive(indices), - is_sample, - allowed_cascading - 1, + ctx.descend(), Excludes::int_only(&new_excludes), )?; @@ -669,12 +637,11 @@ impl Scheme for DictScheme { &self, _compressor: &BtrBlocksCompressor, stats: &IntegerStats, - _is_sample: bool, - allowed_cascading: usize, + ctx: CompressorContext, _excludes: &[IntCode], ) -> VortexResult { // Dict should not be terminal. - if allowed_cascading == 0 { + if ctx.allowed_cascading == 0 { return Ok(0.0); } @@ -710,11 +677,10 @@ impl Scheme for DictScheme { &self, compressor: &BtrBlocksCompressor, stats: &IntegerStats, - is_sample: bool, - allowed_cascading: usize, + ctx: CompressorContext, excludes: &[IntCode], ) -> VortexResult { - assert!(allowed_cascading > 0); + assert!(ctx.allowed_cascading > 0); // TODO(aduffy): we can be more prescriptive: we know that codes will EITHER be // RLE or FOR + BP. Cascading probably wastes some time here. 
@@ -728,8 +694,7 @@ impl Scheme for DictScheme { let compressed_codes = compressor.compress_canonical( Canonical::Primitive(dict.codes().to_primitive().narrow()?), - is_sample, - allowed_cascading - 1, + ctx.descend(), Excludes::int_only(&new_excludes), )?; @@ -756,8 +721,7 @@ impl Scheme for RunEndScheme { &self, compressor: &BtrBlocksCompressor, stats: &IntegerStats, - is_sample: bool, - allowed_cascading: usize, + ctx: CompressorContext, excludes: &[IntCode], ) -> VortexResult { // If the run length is below the threshold, drop it. @@ -765,30 +729,22 @@ impl Scheme for RunEndScheme { return Ok(0.0); } - if allowed_cascading == 0 { + if ctx.allowed_cascading == 0 { return Ok(0.0); } // Run compression on a sample, see how it performs. - estimate_compression_ratio_with_sampling( - self, - compressor, - stats, - is_sample, - allowed_cascading, - excludes, - ) + self.estimate_compression_ratio_with_sampling(compressor, stats, ctx, excludes) } fn compress( &self, compressor: &BtrBlocksCompressor, stats: &IntegerStats, - is_sample: bool, - allowed_cascading: usize, + ctx: CompressorContext, excludes: &[IntCode], ) -> VortexResult { - assert!(allowed_cascading > 0); + assert!(ctx.allowed_cascading > 0); // run-end encode the ends let (ends, values) = runend_encode(&stats.src); @@ -798,15 +754,13 @@ impl Scheme for RunEndScheme { let compressed_ends = compressor.compress_canonical( Canonical::Primitive(ends.to_primitive()), - is_sample, - allowed_cascading - 1, + ctx.descend(), Excludes::int_only(&new_excludes), )?; let compressed_values = compressor.compress_canonical( Canonical::Primitive(values.to_primitive()), - is_sample, - allowed_cascading - 1, + ctx.descend(), Excludes::int_only(&new_excludes), )?; @@ -832,8 +786,7 @@ impl Scheme for SequenceScheme { &self, _compressor: &BtrBlocksCompressor, stats: &Self::StatsType, - _is_sample: bool, - _allowed_cascading: usize, + _ctx: CompressorContext, _excludes: &[Self::CodeType], ) -> VortexResult { if stats.null_count > 0 { @@ -856,8 +809,7 @@ impl Scheme for SequenceScheme { &self, _compressor: &BtrBlocksCompressor, stats: &Self::StatsType, - _is_sample: bool, - _allowed_cascading: usize, + _ctx: CompressorContext, _excludes: &[Self::CodeType], ) -> VortexResult { if stats.null_count > 0 { diff --git a/vortex-btrblocks/src/lib.rs b/vortex-btrblocks/src/lib.rs index 823bdf47aba..dcff5504a24 100644 --- a/vortex-btrblocks/src/lib.rs +++ b/vortex-btrblocks/src/lib.rs @@ -28,748 +28,44 @@ //! let compressor = BtrBlocksCompressor::default(); //! //! // Configure with builder to exclude specific schemes -//! let compressor = BtrBlocksCompressorBuilder::new() +//! let compressor = BtrBlocksCompressorBuilder::default() //! .exclude_int([IntCode::Dict]) //! .build(); //! ``` //! //! 
[BtrBlocks]: https://www.cs.cit.tum.de/fileadmin/w00cfj/dis/papers/btrblocks.pdf -use std::fmt::Debug; -use std::hash::Hash; -use std::hash::Hasher; - -use vortex_array::Array; -use vortex_array::ArrayRef; -use vortex_array::Canonical; -use vortex_array::IntoArray; -use vortex_array::ToCanonical; -use vortex_array::arrays::ConstantArray; -use vortex_array::arrays::ExtensionArray; -use vortex_array::arrays::FixedSizeListArray; -use vortex_array::arrays::ListArray; -use vortex_array::arrays::StructArray; -use vortex_array::arrays::TemporalArray; -use vortex_array::arrays::list_from_list_view; -use vortex_array::compute::Cost; -use vortex_array::compute::IsConstantOpts; -use vortex_array::compute::is_constant_opts; -use vortex_array::vtable::VTable; -use vortex_array::vtable::ValidityHelper; -use vortex_dtype::DType; -use vortex_dtype::Nullability; -use vortex_dtype::datetime::Timestamp; -use vortex_error::VortexExpect; -use vortex_error::VortexResult; - -use crate::decimal::compress_decimal; pub use crate::float::FloatCode; -pub use crate::float::FloatCompressor; -pub use crate::float::FloatStats; -pub use crate::float::dictionary::dictionary_encode as float_dictionary_encode; +use crate::float::FloatCompressor; pub use crate::integer::IntCode; -pub use crate::integer::IntCompressor; -pub use crate::integer::IntegerStats; -pub use crate::integer::dictionary::dictionary_encode as integer_dictionary_encode; +use crate::integer::IntCompressor; pub use crate::string::StringCode; -pub use crate::string::StringCompressor; -pub use crate::string::StringStats; -pub use crate::temporal::compress_temporal; +use crate::string::StringCompressor; mod builder; +mod canonical_compressor; +mod compressor; +mod ctx; mod decimal; mod float; mod integer; mod patches; mod rle; mod sample; +mod scheme; +mod stats; mod string; mod temporal; pub use builder::BtrBlocksCompressorBuilder; - -use crate::float::FloatScheme; -use crate::integer::IntegerScheme; -use crate::string::StringScheme; - -/// Holds references to exclude lists for each compression code type. -/// -/// This struct is passed through recursive compression calls to specify -/// which schemes should be excluded at each level. -#[derive(Debug, Clone, Copy, Default)] -pub struct Excludes<'a> { - /// Integer schemes to exclude. - pub int: &'a [IntCode], - /// Float schemes to exclude. - pub float: &'a [FloatCode], - /// String schemes to exclude. - pub string: &'a [StringCode], -} - -impl<'a> Excludes<'a> { - /// Creates an empty excludes (no exclusions). - pub const fn none() -> Self { - Self { - int: &[], - float: &[], - string: &[], - } - } - - /// Creates excludes with only integer exclusions. - pub const fn int_only(int: &'a [IntCode]) -> Self { - Self { - int, - float: &[], - string: &[], - } - } - - /// Creates excludes with only float exclusions. - pub const fn float_only(float: &'a [FloatCode]) -> Self { - Self { - int: &[], - float, - string: &[], - } - } - - /// Creates excludes with only string exclusions. - pub const fn string_only(string: &'a [StringCode]) -> Self { - Self { - int: &[], - float: &[], - string, - } - } -} - -/// Configures how stats are generated. -pub struct GenerateStatsOptions { - /// Should distinct values should be counted during stats generation. - pub count_distinct_values: bool, - // pub count_runs: bool, - // should this be scheme-specific? 
-} - -impl Default for GenerateStatsOptions { - fn default() -> Self { - Self { - count_distinct_values: true, - // count_runs: true, - } - } -} - -/// The size of each sampled run. -const SAMPLE_SIZE: u32 = 64; -/// The number of sampled runs. -/// -/// # Warning -/// -/// The product of SAMPLE_SIZE and SAMPLE_COUNT should be (roughly) a multiple of 1024 so that -/// fastlanes bitpacking of sampled vectors does not introduce (large amounts of) padding. -const SAMPLE_COUNT: u32 = 16; - -/// Stats for the compressor. -pub trait CompressorStats: Debug + Clone { - /// The type of the underlying source array vtable. - type ArrayVTable: VTable; - - /// Generates stats with default options. - fn generate(input: &::Array) -> Self { - Self::generate_opts(input, GenerateStatsOptions::default()) - } - - /// Generates stats with provided options. - fn generate_opts( - input: &::Array, - opts: GenerateStatsOptions, - ) -> Self; - - /// Returns the underlying source array that statistics were generated from. - fn source(&self) -> &::Array; - - /// Sample the array with default options. - fn sample(&self, sample_size: u32, sample_count: u32) -> Self { - self.sample_opts(sample_size, sample_count, GenerateStatsOptions::default()) - } - - /// Sample the array with provided options. - fn sample_opts(&self, sample_size: u32, sample_count: u32, opts: GenerateStatsOptions) -> Self; -} - -/// Top-level compression scheme trait. -/// -/// Variants are specialized for each data type, e.g. see `IntegerScheme`, `FloatScheme`, etc. -pub trait Scheme: Debug { - /// Type of the stats generated by the compression scheme. - type StatsType: CompressorStats; - /// Type of the code used to uniquely identify the compression scheme. - type CodeType: Copy + Eq + Hash; - - /// Scheme unique identifier. - fn code(&self) -> Self::CodeType; - - /// True if this is the singular Constant scheme for this data type. - fn is_constant(&self) -> bool { - false - } - - /// Estimate the compression ratio for running this scheme (and its children) - /// for the given input. - /// - /// Depth is the depth in the encoding tree we've already reached before considering this - /// scheme. - /// - /// Returns the estimated compression ratio as well as the tree of compressors to use. - fn expected_compression_ratio( - &self, - compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, - is_sample: bool, - allowed_cascading: usize, - excludes: &[Self::CodeType], - ) -> VortexResult { - estimate_compression_ratio_with_sampling( - self, - compressor, - stats, - is_sample, - allowed_cascading, - excludes, - ) - } - - /// Compress the input with this scheme, yielding a new array. - fn compress( - &self, - compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, - is_sample: bool, - allowed_cascading: usize, - excludes: &[Self::CodeType], - ) -> VortexResult; -} - -impl PartialEq for dyn Scheme { - fn eq(&self, other: &Self) -> bool { - self.code() == other.code() - } -} -impl Eq for dyn Scheme {} -impl Hash for dyn Scheme { - fn hash(&self, state: &mut H) { - self.code().hash(state) - } -} - -fn estimate_compression_ratio_with_sampling( - scheme: &T, - btr_blocks_compressor: &BtrBlocksCompressor, - stats: &T::StatsType, - is_sample: bool, - allowed_cascading: usize, - excludes: &[T::CodeType], -) -> VortexResult { - let sample = if is_sample { - stats.clone() - } else { - // We want to sample about 1% of data - let source_len = stats.source().len(); - - // We want to sample about 1% of data, while keeping a minimal sample of 1024 values. 
- let approximately_one_percent = (source_len / 100) - / usize::try_from(SAMPLE_SIZE).vortex_expect("SAMPLE_SIZE must fit in usize"); - let sample_count = u32::max( - u32::next_multiple_of( - approximately_one_percent - .try_into() - .vortex_expect("sample count must fit in u32"), - 16, - ), - SAMPLE_COUNT, - ); - - tracing::trace!( - "Sampling {} values out of {}", - SAMPLE_SIZE as u64 * sample_count as u64, - source_len - ); - - stats.sample(SAMPLE_SIZE, sample_count) - }; - - let after = scheme - .compress( - btr_blocks_compressor, - &sample, - true, - allowed_cascading, - excludes, - )? - .nbytes(); - let before = sample.source().nbytes(); - - tracing::debug!( - "estimate_compression_ratio_with_sampling(compressor={scheme:#?} is_sample={is_sample}, allowed_cascading={allowed_cascading}) = {}", - before as f64 / after as f64 - ); - - Ok(before as f64 / after as f64) -} - -const MAX_CASCADE: usize = 3; - -/// A compressor for a particular input type. -/// -/// This trait defines the interface for type-specific compressors that can adaptively -/// choose and apply compression schemes based on data characteristics. Compressors -/// analyze input arrays, select optimal compression schemes, and handle cascading -/// compression with multiple encoding layers. -/// -/// The compressor works by generating statistics on the input data, evaluating -/// available compression schemes, and selecting the one with the best compression ratio. -pub trait Compressor { - /// The VTable type for arrays this compressor operates on. - type ArrayVTable: VTable; - /// The compression scheme type used by this compressor. - type SchemeType: Scheme + ?Sized; - /// The statistics type used to analyze arrays for compression. - type StatsType: CompressorStats; - - /// Generates statistics for the given array to guide compression scheme selection. - fn gen_stats(&self, array: &::Array) -> Self::StatsType; - - /// Returns all available compression schemes for this compressor. - fn schemes(&self) -> &[&'static Self::SchemeType]; - /// Returns the default fallback compression scheme. - fn default_scheme(&self) -> &'static Self::SchemeType; - - /// Selects the best compression scheme based on expected compression ratios. - /// - /// Evaluates all available schemes against the provided statistics and returns - /// the one with the highest compression ratio. Falls back to the default scheme - /// if no scheme provides compression benefits. 
- #[allow(clippy::cognitive_complexity)] - fn choose_scheme( - &self, - compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, - is_sample: bool, - allowed_cascading: usize, - excludes: &[::CodeType], - ) -> VortexResult<&'static Self::SchemeType> { - let mut best_ratio = 1.0; - let mut best_scheme: Option<&'static Self::SchemeType> = None; - - // logging helpers - let depth = MAX_CASCADE - allowed_cascading; - - for scheme in self.schemes().iter() { - // Skip excluded schemes - if excludes.contains(&scheme.code()) { - continue; - } - - // We never choose Constant for a sample - if is_sample && scheme.is_constant() { - continue; - } - - tracing::trace!( - is_sample, - depth, - is_constant = scheme.is_constant(), - ?scheme, - "Trying compression scheme" - ); - - let ratio = scheme.expected_compression_ratio( - compressor, - stats, - is_sample, - allowed_cascading, - excludes, - )?; - tracing::trace!( - is_sample, - depth, - ratio, - ?scheme, - "Expected compression result" - ); - - if !(ratio.is_subnormal() || ratio.is_infinite() || ratio.is_nan()) { - if ratio > best_ratio { - best_ratio = ratio; - best_scheme = Some(*scheme); - } - } else { - tracing::trace!( - "Calculated invalid compression ratio {ratio} for scheme: {scheme:?}. Must not be sub-normal, infinite or nan." - ); - } - } - - tracing::trace!(depth, scheme = ?best_scheme, ratio = best_ratio, "best scheme found"); - - if let Some(best) = best_scheme { - Ok(best) - } else { - Ok(self.default_scheme()) - } - } -} - -/// Compresses an array using the given compressor. -/// -/// Generates statistics on the input array, selects the best compression scheme, -/// and applies it. Returns the original array if compression would increase size. -pub fn compress( - c: &C, - compressor: &BtrBlocksCompressor, - array: &<::ArrayVTable as VTable>::Array, - is_sample: bool, - allowed_cascading: usize, - excludes: &[::CodeType], -) -> VortexResult -where - ::SchemeType: 'static, -{ - // Avoid compressing empty arrays. - if array.is_empty() { - return Ok(array.to_array()); - } - - // Generate stats on the array directly. - let stats = c.gen_stats(array); - let best_scheme = - c.choose_scheme(compressor, &stats, is_sample, allowed_cascading, excludes)?; - - let output = - best_scheme.compress(compressor, &stats, is_sample, allowed_cascading, excludes)?; - if output.nbytes() < array.nbytes() { - Ok(output) - } else { - tracing::debug!("resulting tree too large: {}", output.display_tree()); - Ok(array.to_array()) - } -} - -/// Trait for compressors that can compress canonical arrays. -/// -/// Provides access to configured compression schemes and the ability to -/// compress canonical arrays recursively. -pub trait CanonicalCompressor { - /// Compresses a canonical array with the specified options. - fn compress_canonical( - &self, - array: Canonical, - is_sample: bool, - allowed_cascading: usize, - excludes: Excludes, - ) -> VortexResult; - - /// Returns the enabled integer compression schemes. - fn int_schemes(&self) -> &[&'static dyn IntegerScheme]; - - /// Returns the enabled float compression schemes. - fn float_schemes(&self) -> &[&'static dyn FloatScheme]; - - /// Returns the enabled string compression schemes. - fn string_schemes(&self) -> &[&'static dyn StringScheme]; -} - -/// The main compressor type implementing BtrBlocks-inspired compression. -/// -/// This compressor applies adaptive compression schemes to arrays based on their data types -/// and characteristics. 
It recursively compresses nested structures like structs and lists, -/// and chooses optimal compression schemes for primitive types. -/// -/// The compressor works by: -/// 1. Canonicalizing input arrays to a standard representation -/// 2. Analyzing data characteristics to choose optimal compression schemes -/// 3. Recursively compressing nested structures -/// 4. Applying type-specific compression for primitives, strings, and temporal data -/// -/// Use [`BtrBlocksCompressorBuilder`] to configure which compression schemes are enabled. -/// -/// # Examples -/// -/// ```rust -/// use vortex_btrblocks::{BtrBlocksCompressor, BtrBlocksCompressorBuilder, IntCode}; -/// -/// // Default compressor - all schemes allowed -/// let compressor = BtrBlocksCompressor::default(); -/// -/// // Exclude specific schemes using the builder -/// let compressor = BtrBlocksCompressorBuilder::new() -/// .exclude_int([IntCode::Dict]) -/// .build(); -/// ``` -#[derive(Clone)] -pub struct BtrBlocksCompressor { - /// Integer compressor with configured schemes. - pub int_schemes: Vec<&'static dyn IntegerScheme>, - - /// Float compressor with configured schemes. - pub float_schemes: Vec<&'static dyn FloatScheme>, - - /// String compressor with configured schemes. - pub string_schemes: Vec<&'static dyn StringScheme>, -} - -impl Default for BtrBlocksCompressor { - fn default() -> Self { - BtrBlocksCompressorBuilder::new().build() - } -} - -impl BtrBlocksCompressor { - /// Creates a new compressor with default settings (all schemes allowed). - pub fn new() -> Self { - Self::default() - } - - /// Returns an iterator over the enabled integer compression scheme codes. - pub fn int_codes(&self) -> impl Iterator + '_ { - self.int_schemes.iter().map(|s| s.code()) - } - - /// Returns an iterator over the enabled float compression scheme codes. - pub fn float_codes(&self) -> impl Iterator + '_ { - self.float_schemes.iter().map(|s| s.code()) - } - - /// Returns an iterator over the enabled string compression scheme codes. - pub fn string_codes(&self) -> impl Iterator + '_ { - self.string_schemes.iter().map(|s| s.code()) - } - - /// Compresses an array using BtrBlocks-inspired compression. - /// - /// First canonicalizes and compacts the array, then applies optimal compression schemes. - pub fn compress(&self, array: &dyn Array) -> VortexResult { - // Canonicalize the array - let canonical = array.to_canonical()?; - - // Compact it, removing any wasted space before we attempt to compress it - let compact = canonical.compact()?; - - self.compress_canonical(compact, false, MAX_CASCADE, Excludes::none()) - } -} - -impl CanonicalCompressor for BtrBlocksCompressor { - /// Compresses a canonical array by dispatching to type-specific compressors. - /// - /// Recursively compresses nested structures and applies optimal schemes for each data type. - fn compress_canonical<'a>( - &self, - array: Canonical, - is_sample: bool, - allowed_cascading: usize, - excludes: Excludes<'a>, - ) -> VortexResult { - match array { - Canonical::Null(null_array) => Ok(null_array.into_array()), - // TODO(aduffy): Sparse, other bool compressors. 
- Canonical::Bool(bool_array) => Ok(bool_array.into_array()), - Canonical::Primitive(primitive) => { - if primitive.ptype().is_int() { - compress( - &IntCompressor { - btr_blocks_compressor: self, - }, - self, - &primitive, - is_sample, - allowed_cascading, - excludes.int, - ) - } else { - compress( - &FloatCompressor { - btr_blocks_compressor: self, - }, - self, - &primitive, - is_sample, - allowed_cascading, - excludes.float, - ) - } - } - Canonical::Decimal(decimal) => compress_decimal(self, &decimal), - Canonical::Struct(struct_array) => { - let fields = struct_array - .unmasked_fields() - .iter() - .map(|field| self.compress(field)) - .collect::, _>>()?; - - Ok(StructArray::try_new( - struct_array.names().clone(), - fields, - struct_array.len(), - struct_array.validity().clone(), - )? - .into_array()) - } - Canonical::List(list_view_array) => { - // TODO(joe): We might want to write list views in the future and chose between - // list and list view. - let list_array = list_from_list_view(list_view_array)?; - - // Reset the offsets to remove garbage data that might prevent us from narrowing our - // offsets (there could be a large amount of trailing garbage data that the current - // views do not reference at all). - let list_array = list_array.reset_offsets(true)?; - - let compressed_elems = self.compress(list_array.elements())?; - - // Note that since the type of our offsets are not encoded in our `DType`, and since - // we guarantee above that all elements are referenced by offsets, we may narrow the - // widths. - - let compressed_offsets = self.compress_canonical( - Canonical::Primitive(list_array.offsets().to_primitive().narrow()?), - is_sample, - allowed_cascading, - Excludes::int_only(&[IntCode::Dict]), - )?; - - Ok(ListArray::try_new( - compressed_elems, - compressed_offsets, - list_array.validity().clone(), - )? - .into_array()) - } - Canonical::FixedSizeList(fsl_array) => { - let compressed_elems = self.compress(fsl_array.elements())?; - - Ok(FixedSizeListArray::try_new( - compressed_elems, - fsl_array.list_size(), - fsl_array.validity().clone(), - fsl_array.len(), - )? - .into_array()) - } - Canonical::VarBinView(strings) => { - if strings - .dtype() - .eq_ignore_nullability(&DType::Utf8(Nullability::NonNullable)) - { - compress( - &StringCompressor { - btr_blocks_compressor: self, - }, - self, - &strings, - is_sample, - allowed_cascading, - excludes.string, - ) - } else { - // Binary arrays do not compress - Ok(strings.into_array()) - } - } - Canonical::Extension(ext_array) => { - // We compress Timestamp-level arrays with DateTimeParts compression - if ext_array.ext_dtype().is::() { - if is_constant_opts( - ext_array.as_ref(), - &IsConstantOpts { - cost: Cost::Canonicalize, - }, - )? - .unwrap_or_default() - { - return Ok(ConstantArray::new( - ext_array.as_ref().scalar_at(0)?, - ext_array.len(), - ) - .into_array()); - } - - let temporal_array = TemporalArray::try_from(ext_array)?; - return compress_temporal(self, temporal_array); - } - - // Compress the underlying storage array. - let compressed_storage = self.compress(ext_array.storage())?; - - Ok( - ExtensionArray::new(ext_array.ext_dtype().clone(), compressed_storage) - .into_array(), - ) - } - } - } - - fn int_schemes(&self) -> &[&'static dyn IntegerScheme] { - &self.int_schemes - } - - fn float_schemes(&self) -> &[&'static dyn FloatScheme] { - &self.float_schemes - } - - fn string_schemes(&self) -> &[&'static dyn StringScheme] { - &self.string_schemes - } -} - -/// Context passed through recursive compression calls. 
-#[derive(Debug, Clone, Copy)] -pub struct CompressorContext<'a> { - /// Whether we're compressing a sample (for ratio estimation). - pub is_sample: bool, - /// Remaining cascade depth allowed. - pub allowed_cascading: usize, - /// Schemes to exclude at this level. - pub excludes: Excludes<'a>, -} - -impl<'a> CompressorContext<'a> { - /// Creates a new context for top-level compression. - pub fn new(allowed_cascading: usize) -> Self { - Self { - is_sample: false, - allowed_cascading, - excludes: Excludes::none(), - } - } - - /// Creates a context for sample-based compression ratio estimation. - pub fn for_sample(allowed_cascading: usize) -> Self { - Self { - is_sample: true, - allowed_cascading, - excludes: Excludes::none(), - } - } - - /// Returns a new context with decremented cascade depth. - pub fn decrement_cascade(self) -> Self { - Self { - allowed_cascading: self.allowed_cascading.saturating_sub(1), - ..self - } - } - - /// Returns a new context with additional integer excludes. - pub fn with_int_excludes(self, int: &'a [IntCode]) -> Self { - Self { - excludes: Excludes { - int, - ..self.excludes - }, - ..self - } - } -} +pub use canonical_compressor::BtrBlocksCompressor; +pub use canonical_compressor::CanonicalCompressor; +pub use compressor::Compressor; +pub use compressor::CompressorExt; +pub(crate) use compressor::MAX_CASCADE; +pub use ctx::CompressorContext; +pub use ctx::Excludes; +pub use scheme::Scheme; +pub use scheme::SchemeExt; +pub use stats::CompressorStats; +pub use stats::GenerateStatsOptions; diff --git a/vortex-btrblocks/src/rle.rs b/vortex-btrblocks/src/rle.rs index d97ba4f8d42..eb73f08368d 100644 --- a/vortex-btrblocks/src/rle.rs +++ b/vortex-btrblocks/src/rle.rs @@ -15,11 +15,12 @@ use vortex_fastlanes::RLEArray; use crate::BtrBlocksCompressor; use crate::CanonicalCompressor; +use crate::CompressorContext; use crate::CompressorStats; use crate::Excludes; use crate::IntCode; use crate::Scheme; -use crate::estimate_compression_ratio_with_sampling; +use crate::SchemeExt; /// Threshold for the average run length in an array before we consider run-length encoding. pub const RUN_LENGTH_THRESHOLD: u32 = 4; @@ -48,8 +49,7 @@ pub trait RLEConfig: Debug + Send + Sync + 'static { fn compress_values( compressor: &BtrBlocksCompressor, values: &PrimitiveArray, - is_sample: bool, - allowed_cascading: usize, + ctx: CompressorContext, excludes: &[Self::Code], ) -> VortexResult; } @@ -85,12 +85,11 @@ impl Scheme for RLEScheme { &self, compressor: &BtrBlocksCompressor, stats: &Self::StatsType, - is_sample: bool, - allowed_cascading: usize, + ctx: CompressorContext, excludes: &[C::Code], ) -> VortexResult { // RLE is only useful when we cascade it with another encoding. - if allowed_cascading == 0 { + if ctx.allowed_cascading == 0 { return Ok(0.0); } @@ -105,27 +104,19 @@ impl Scheme for RLEScheme { } // Run compression on a sample to see how it performs. 
- estimate_compression_ratio_with_sampling( - self, - compressor, - stats, - is_sample, - allowed_cascading, - excludes, - ) + self.estimate_compression_ratio_with_sampling(compressor, stats, ctx, excludes) } fn compress( &self, compressor: &BtrBlocksCompressor, stats: &Self::StatsType, - is_sample: bool, - allowed_cascading: usize, + ctx: CompressorContext, excludes: &[C::Code], ) -> VortexResult { let rle_array = RLEArray::encode(RLEStats::source(stats))?; - if allowed_cascading == 0 { + if ctx.allowed_cascading == 0 { return Ok(rle_array.into_array()); } @@ -136,22 +127,19 @@ impl Scheme for RLEScheme { let compressed_values = C::compress_values( compressor, &rle_array.values().to_primitive(), - is_sample, - allowed_cascading - 1, + ctx.descend(), &new_excludes, )?; let compressed_indices = compressor.compress_canonical( Canonical::Primitive(rle_array.indices().to_primitive().narrow()?), - is_sample, - allowed_cascading - 1, + ctx.descend(), Excludes::int_only(&[IntCode::Dict]), )?; let compressed_offsets = compressor.compress_canonical( Canonical::Primitive(rle_array.values_idx_offsets().to_primitive().narrow()?), - is_sample, - allowed_cascading - 1, + ctx.descend(), Excludes::int_only(&[IntCode::Dict]), )?; diff --git a/vortex-btrblocks/src/scheme.rs b/vortex-btrblocks/src/scheme.rs new file mode 100644 index 00000000000..8dbd4d1a010 --- /dev/null +++ b/vortex-btrblocks/src/scheme.rs @@ -0,0 +1,132 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Compression scheme traits. + +use std::fmt::Debug; +use std::hash::Hash; +use std::hash::Hasher; + +use vortex_array::ArrayRef; +use vortex_error::VortexExpect; +use vortex_error::VortexResult; + +use crate::BtrBlocksCompressor; +use crate::CompressorContext; +use crate::CompressorStats; +use crate::stats::SAMPLE_COUNT; +use crate::stats::SAMPLE_SIZE; + +/// Top-level compression scheme trait. +/// +/// Variants are specialized for each data type, e.g. see `IntegerScheme`, `FloatScheme`, etc. +pub trait Scheme: Debug { + /// Type of the stats generated by the compression scheme. + type StatsType: CompressorStats; + /// Type of the code used to uniquely identify the compression scheme. + type CodeType: Copy + Eq + Hash; + + /// Scheme unique identifier. + fn code(&self) -> Self::CodeType; + + /// True if this is the singular Constant scheme for this data type. + fn is_constant(&self) -> bool { + false + } + + /// Estimate the compression ratio for running this scheme (and its children) + /// for the given input. + /// + /// Depth is the depth in the encoding tree we've already reached before considering this + /// scheme. + /// + /// Returns the estimated compression ratio as well as the tree of compressors to use. + fn expected_compression_ratio( + &self, + compressor: &BtrBlocksCompressor, + stats: &Self::StatsType, + ctx: CompressorContext, + excludes: &[Self::CodeType], + ) -> VortexResult { + self.estimate_compression_ratio_with_sampling(compressor, stats, ctx, excludes) + } + + /// Compress the input with this scheme, yielding a new array. 
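+    ///
+    /// # Example
+    ///
+    /// A sketch of the typical call sequence (the concrete scheme, stats and context
+    /// values are placeholders; real callers go through `CompressorExt::choose_scheme`):
+    ///
+    /// ```rust,ignore
+    /// let ratio = scheme.expected_compression_ratio(&compressor, &stats, ctx, &[])?;
+    /// if ratio > 1.0 {
+    ///     // Only apply the scheme when it is expected to shrink the data.
+    ///     let compressed = scheme.compress(&compressor, &stats, ctx, &[])?;
+    /// }
+    /// ```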
+ fn compress( + &self, + compressor: &BtrBlocksCompressor, + stats: &Self::StatsType, + ctx: CompressorContext, + excludes: &[Self::CodeType], + ) -> VortexResult; +} + +impl PartialEq for dyn Scheme { + fn eq(&self, other: &Self) -> bool { + self.code() == other.code() + } +} +impl Eq for dyn Scheme {} +impl Hash for dyn Scheme { + fn hash(&self, state: &mut H) { + self.code().hash(state) + } +} + +/// Extension trait providing sampling-based compression ratio estimation for schemes. +pub trait SchemeExt: Scheme { + /// Estimates compression ratio by compressing a sample of the data. + /// + /// This method samples approximately 1% of the data (with a minimum of 1024 values) + /// and compresses it to estimate the overall compression ratio. + fn estimate_compression_ratio_with_sampling( + &self, + btr_blocks_compressor: &BtrBlocksCompressor, + stats: &Self::StatsType, + ctx: CompressorContext, + excludes: &[Self::CodeType], + ) -> VortexResult { + let sample = if ctx.is_sample { + stats.clone() + } else { + // We want to sample about 1% of data + let source_len = stats.source().len(); + + // We want to sample about 1% of data, while keeping a minimal sample of 1024 values. + let approximately_one_percent = (source_len / 100) + / usize::try_from(SAMPLE_SIZE).vortex_expect("SAMPLE_SIZE must fit in usize"); + let sample_count = u32::max( + u32::next_multiple_of( + approximately_one_percent + .try_into() + .vortex_expect("sample count must fit in u32"), + 16, + ), + SAMPLE_COUNT, + ); + + tracing::trace!( + "Sampling {} values out of {}", + SAMPLE_SIZE as u64 * sample_count as u64, + source_len + ); + + stats.sample(SAMPLE_SIZE, sample_count) + }; + + let after = self + .compress(btr_blocks_compressor, &sample, ctx.as_sample(), excludes)? + .nbytes(); + let before = sample.source().nbytes(); + + tracing::debug!( + "estimate_compression_ratio_with_sampling(compressor={self:#?} ctx={ctx:?}) = {}", + before as f64 / after as f64 + ); + + Ok(before as f64 / after as f64) + } +} + +// Blanket implementation for all Scheme types +impl SchemeExt for T {} diff --git a/vortex-btrblocks/src/stats.rs b/vortex-btrblocks/src/stats.rs new file mode 100644 index 00000000000..b3e25cfb8d6 --- /dev/null +++ b/vortex-btrblocks/src/stats.rs @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Compression statistics types. + +use std::fmt::Debug; + +use vortex_array::vtable::VTable; + +/// Configures how stats are generated. +pub struct GenerateStatsOptions { + /// Should distinct values should be counted during stats generation. + pub count_distinct_values: bool, + // pub count_runs: bool, + // should this be scheme-specific? +} + +impl Default for GenerateStatsOptions { + fn default() -> Self { + Self { + count_distinct_values: true, + // count_runs: true, + } + } +} + +/// The size of each sampled run. +pub(crate) const SAMPLE_SIZE: u32 = 64; +/// The number of sampled runs. +/// +/// # Warning +/// +/// The product of SAMPLE_SIZE and SAMPLE_COUNT should be (roughly) a multiple of 1024 so that +/// fastlanes bitpacking of sampled vectors does not introduce (large amounts of) padding. +pub(crate) const SAMPLE_COUNT: u32 = 16; + +/// Stats for the compressor. +pub trait CompressorStats: Debug + Clone { + /// The type of the underlying source array vtable. + type ArrayVTable: VTable; + + /// Generates stats with default options. 
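+    ///
+    /// # Example
+    ///
+    /// A sketch of the intended call pattern, using `IntegerStats` as a representative
+    /// implementation (its visibility outside the crate is not guaranteed here):
+    ///
+    /// ```rust,ignore
+    /// let stats = IntegerStats::generate(&primitive_array);
+    /// // Estimate compression ratios against a smaller sample:
+    /// // 16 runs of 64 values = 1024 values, i.e. SAMPLE_SIZE * SAMPLE_COUNT.
+    /// let sampled = stats.sample(64, 16);
+    /// assert_eq!(stats.source().len(), primitive_array.len());
+    /// ```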
+ fn generate(input: &::Array) -> Self { + Self::generate_opts(input, GenerateStatsOptions::default()) + } + + /// Generates stats with provided options. + fn generate_opts( + input: &::Array, + opts: GenerateStatsOptions, + ) -> Self; + + /// Returns the underlying source array that statistics were generated from. + fn source(&self) -> &::Array; + + /// Sample the array with default options. + fn sample(&self, sample_size: u32, sample_count: u32) -> Self { + self.sample_opts(sample_size, sample_count, GenerateStatsOptions::default()) + } + + /// Sample the array with provided options. + fn sample_opts(&self, sample_size: u32, sample_count: u32, opts: GenerateStatsOptions) -> Self; +} diff --git a/vortex-btrblocks/src/string.rs b/vortex-btrblocks/src/string.rs index 2fb3bb617f8..358a085f35b 100644 --- a/vortex-btrblocks/src/string.rs +++ b/vortex-btrblocks/src/string.rs @@ -33,12 +33,13 @@ use vortex_utils::aliases::hash_set::HashSet; use crate::BtrBlocksCompressor; use crate::CanonicalCompressor; use crate::Compressor; +use crate::CompressorContext; use crate::CompressorStats; use crate::Excludes; use crate::GenerateStatsOptions; use crate::IntCode; use crate::Scheme; -use crate::estimate_compression_ratio_with_sampling; +use crate::SchemeExt; use crate::integer; use crate::sample::sample; @@ -233,8 +234,7 @@ impl Scheme for UncompressedScheme { &self, _compressor: &BtrBlocksCompressor, _stats: &Self::StatsType, - _is_sample: bool, - _allowed_cascading: usize, + _ctx: CompressorContext, _excludes: &[StringCode], ) -> VortexResult { Ok(1.0) @@ -244,8 +244,7 @@ impl Scheme for UncompressedScheme { &self, _compressor: &BtrBlocksCompressor, stats: &Self::StatsType, - _is_sample: bool, - _allowed_cascading: usize, + _ctx: CompressorContext, _excludes: &[StringCode], ) -> VortexResult { Ok(stats.source().to_array()) @@ -264,8 +263,7 @@ impl Scheme for DictScheme { &self, compressor: &BtrBlocksCompressor, stats: &Self::StatsType, - is_sample: bool, - allowed_cascading: usize, + ctx: CompressorContext, excludes: &[StringCode], ) -> VortexResult { // If we don't have a sufficiently high number of distinct values, do not attempt Dict. @@ -278,36 +276,27 @@ impl Scheme for DictScheme { return Ok(0.0); } - estimate_compression_ratio_with_sampling( - self, - compressor, - stats, - is_sample, - allowed_cascading, - excludes, - ) + self.estimate_compression_ratio_with_sampling(compressor, stats, ctx, excludes) } fn compress( &self, compressor: &BtrBlocksCompressor, stats: &Self::StatsType, - is_sample: bool, - allowed_cascading: usize, + ctx: CompressorContext, _excludes: &[StringCode], ) -> VortexResult { let dict = dict_encode(&stats.source().clone().into_array())?; // If we are not allowed to cascade, do not attempt codes or values compression. - if allowed_cascading == 0 { + if ctx.allowed_cascading == 0 { return Ok(dict.into_array()); } // Find best compressor for codes and values separately let compressed_codes = compressor.compress_canonical( Canonical::Primitive(dict.codes().to_primitive()), - is_sample, - allowed_cascading - 1, + ctx.descend(), Excludes::int_only(&[integer::DictScheme.code(), integer::SequenceScheme.code()]), )?; @@ -315,8 +304,7 @@ impl Scheme for DictScheme { // Currently this will only be FSST. 
let compressed_values = compressor.compress_canonical( Canonical::VarBinView(dict.values().to_varbinview()), - is_sample, - allowed_cascading - 1, + ctx.descend(), Excludes::string_only(&[DictScheme.code()]), )?; @@ -343,8 +331,7 @@ impl Scheme for FSSTScheme { &self, compressor: &BtrBlocksCompressor, stats: &Self::StatsType, - is_sample: bool, - allowed_cascading: usize, + ctx: CompressorContext, _excludes: &[StringCode], ) -> VortexResult { let fsst = { @@ -354,15 +341,13 @@ impl Scheme for FSSTScheme { let compressed_original_lengths = compressor.compress_canonical( Canonical::Primitive(fsst.uncompressed_lengths().to_primitive().narrow()?), - is_sample, - allowed_cascading, + ctx, Excludes::int_only(&[]), )?; let compressed_codes_offsets = compressor.compress_canonical( Canonical::Primitive(fsst.codes().offsets().to_primitive().narrow()?), - is_sample, - allowed_cascading, + ctx, Excludes::int_only(&[]), )?; let compressed_codes = VarBinArray::try_new( @@ -400,11 +385,10 @@ impl Scheme for ConstantScheme { &self, _compressor: &BtrBlocksCompressor, stats: &Self::StatsType, - is_sample: bool, - _allowed_cascading: usize, + ctx: CompressorContext, _excludes: &[Self::CodeType], ) -> VortexResult { - if is_sample { + if ctx.is_sample { return Ok(0.0); } @@ -421,8 +405,7 @@ impl Scheme for ConstantScheme { &self, _compressor: &BtrBlocksCompressor, stats: &Self::StatsType, - _is_sample: bool, - _allowed_cascading: usize, + _ctx: CompressorContext, _excludes: &[Self::CodeType], ) -> VortexResult { let scalar_idx = @@ -459,12 +442,11 @@ impl Scheme for NullDominated { &self, _compressor: &BtrBlocksCompressor, stats: &Self::StatsType, - _is_sample: bool, - allowed_cascading: usize, + ctx: CompressorContext, _excludes: &[Self::CodeType], ) -> VortexResult { // Only use `SparseScheme` if we can cascade. - if allowed_cascading == 0 { + if ctx.allowed_cascading == 0 { return Ok(0.0); } @@ -486,11 +468,10 @@ impl Scheme for NullDominated { &self, compressor: &BtrBlocksCompressor, stats: &Self::StatsType, - is_sample: bool, - allowed_cascading: usize, + ctx: CompressorContext, _excludes: &[Self::CodeType], ) -> VortexResult { - assert!(allowed_cascading > 0); + assert!(ctx.allowed_cascading > 0); // We pass None as we only run this pathway for NULL-dominated string arrays let sparse_encoded = SparseArray::encode(stats.src.as_ref(), None)?; @@ -502,8 +483,7 @@ impl Scheme for NullDominated { let indices = sparse.patches().indices().to_primitive().narrow()?; let compressed_indices = compressor.compress_canonical( Canonical::Primitive(indices), - is_sample, - allowed_cascading - 1, + ctx.descend(), Excludes::int_only(&new_excludes), )?; diff --git a/vortex-btrblocks/src/temporal.rs b/vortex-btrblocks/src/temporal.rs index 12fe24926da..dad004b97a9 100644 --- a/vortex-btrblocks/src/temporal.rs +++ b/vortex-btrblocks/src/temporal.rs @@ -15,8 +15,8 @@ use vortex_error::VortexResult; use crate::BtrBlocksCompressor; use crate::CanonicalCompressor; +use crate::CompressorContext; use crate::Excludes; -use crate::MAX_CASCADE; /// Compress a temporal array into a `DateTimePartsArray`. 
pub fn compress_temporal( @@ -30,22 +30,21 @@ pub fn compress_temporal( subseconds, } = split_temporal(array)?; + let ctx = CompressorContext::default().descend(); + let days = compressor.compress_canonical( Canonical::Primitive(days.to_primitive().narrow()?), - false, - MAX_CASCADE - 1, + ctx, Excludes::int_only(&[]), )?; let seconds = compressor.compress_canonical( Canonical::Primitive(seconds.to_primitive().narrow()?), - false, - MAX_CASCADE - 1, + ctx, Excludes::int_only(&[]), )?; let subseconds = compressor.compress_canonical( Canonical::Primitive(subseconds.to_primitive().narrow()?), - false, - MAX_CASCADE - 1, + ctx, Excludes::int_only(&[]), )?; diff --git a/vortex-layout/src/layouts/compressed.rs b/vortex-layout/src/layouts/compressed.rs index 83e5e93f001..9f45b5b25df 100644 --- a/vortex-layout/src/layouts/compressed.rs +++ b/vortex-layout/src/layouts/compressed.rs @@ -78,7 +78,7 @@ impl CompressingStrategy { /// which is useful when compressing dictionary codes to avoid recursive dictionary encoding. pub fn new_btrblocks(child: S, exclude_int_dict_encoding: bool) -> Self { let compressor = if exclude_int_dict_encoding { - BtrBlocksCompressorBuilder::new() + BtrBlocksCompressorBuilder::default() .exclude_int([IntCode::Dict]) .build() } else { From 1536f8afd2896be54a8a22704554c722c520eb0b Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Fri, 30 Jan 2026 17:56:26 +0000 Subject: [PATCH 06/14] wip Signed-off-by: Joe Isaacs --- vortex-btrblocks/src/builder.rs | 12 +- vortex-btrblocks/src/canonical_compressor.rs | 49 +- .../src/{ => compressor}/decimal.rs | 0 .../src/{ => compressor}/float/dictionary.rs | 8 +- .../src/{float.rs => compressor/float/mod.rs} | 381 ++++++----- .../src/{ => compressor}/float/stats.rs | 16 +- .../{ => compressor}/integer/dictionary.rs | 8 +- .../{integer.rs => compressor/integer/mod.rs} | 637 ++++++++++-------- .../src/{ => compressor}/integer/stats.rs | 6 +- .../src/{compressor.rs => compressor/mod.rs} | 8 + .../src/{ => compressor}/patches.rs | 0 vortex-btrblocks/src/{ => compressor}/rle.rs | 0 .../src/{ => compressor}/string.rs | 8 +- .../src/{ => compressor}/temporal.rs | 0 vortex-btrblocks/src/lib.rs | 22 +- 15 files changed, 625 insertions(+), 530 deletions(-) rename vortex-btrblocks/src/{ => compressor}/decimal.rs (100%) rename vortex-btrblocks/src/{ => compressor}/float/dictionary.rs (96%) rename vortex-btrblocks/src/{float.rs => compressor/float/mod.rs} (67%) rename vortex-btrblocks/src/{ => compressor}/float/stats.rs (96%) rename vortex-btrblocks/src/{ => compressor}/integer/dictionary.rs (97%) rename vortex-btrblocks/src/{integer.rs => compressor/integer/mod.rs} (66%) rename vortex-btrblocks/src/{ => compressor}/integer/stats.rs (99%) rename vortex-btrblocks/src/{compressor.rs => compressor/mod.rs} (97%) rename vortex-btrblocks/src/{ => compressor}/patches.rs (100%) rename vortex-btrblocks/src/{ => compressor}/rle.rs (100%) rename vortex-btrblocks/src/{ => compressor}/string.rs (98%) rename vortex-btrblocks/src/{ => compressor}/temporal.rs (100%) diff --git a/vortex-btrblocks/src/builder.rs b/vortex-btrblocks/src/builder.rs index 9df7575b252..201104f9b85 100644 --- a/vortex-btrblocks/src/builder.rs +++ b/vortex-btrblocks/src/builder.rs @@ -10,12 +10,12 @@ use crate::BtrBlocksCompressor; use crate::FloatCode; use crate::IntCode; use crate::StringCode; -use crate::float::ALL_FLOAT_SCHEMES; -use crate::float::FloatScheme; -use crate::integer::ALL_INT_SCHEMES; -use crate::integer::IntegerScheme; -use crate::string::ALL_STRING_SCHEMES; -use 
crate::string::StringScheme; +use crate::compressor::float::ALL_FLOAT_SCHEMES; +use crate::compressor::float::FloatScheme; +use crate::compressor::integer::ALL_INT_SCHEMES; +use crate::compressor::integer::IntegerScheme; +use crate::compressor::string::ALL_STRING_SCHEMES; +use crate::compressor::string::StringScheme; /// Builder for creating configured [`BtrBlocksCompressor`] instances. /// diff --git a/vortex-btrblocks/src/canonical_compressor.rs b/vortex-btrblocks/src/canonical_compressor.rs index 5a3e6c7beaa..78cf2883eef 100644 --- a/vortex-btrblocks/src/canonical_compressor.rs +++ b/vortex-btrblocks/src/canonical_compressor.rs @@ -32,11 +32,11 @@ use crate::FloatCompressor; use crate::IntCode; use crate::IntCompressor; use crate::StringCompressor; -use crate::decimal::compress_decimal; -use crate::float::FloatScheme; -use crate::integer::IntegerScheme; -use crate::string::StringScheme; -use crate::temporal::compress_temporal; +use crate::compressor::decimal::compress_decimal; +use crate::compressor::float::FloatScheme; +use crate::compressor::integer::IntegerScheme; +use crate::compressor::string::StringScheme; +use crate::compressor::temporal::compress_temporal; /// Trait for compressors that can compress canonical arrays. /// @@ -119,6 +119,27 @@ impl BtrBlocksCompressor { self.compress_canonical(compact, CompressorContext::default(), Excludes::none()) } + + /// Creates an integer compressor using this compressor's configuration. + pub fn integer_compressor(&self) -> IntCompressor<'_> { + IntCompressor { + btr_blocks_compressor: self, + } + } + + /// Creates a float compressor using this compressor's configuration. + pub fn float_compressor(&self) -> FloatCompressor<'_> { + FloatCompressor { + btr_blocks_compressor: self, + } + } + + /// Creates a string compressor using this compressor's configuration. 
+ pub fn string_compressor(&self) -> StringCompressor<'_> { + StringCompressor { + btr_blocks_compressor: self, + } + } } impl CanonicalCompressor for BtrBlocksCompressor { @@ -137,15 +158,11 @@ impl CanonicalCompressor for BtrBlocksCompressor { Canonical::Bool(bool_array) => Ok(bool_array.into_array()), Canonical::Primitive(primitive) => { if primitive.ptype().is_int() { - IntCompressor { - btr_blocks_compressor: self, - } - .compress(self, &primitive, ctx, excludes.int) + self.integer_compressor() + .compress(self, &primitive, ctx, excludes.int) } else { - FloatCompressor { - btr_blocks_compressor: self, - } - .compress(self, &primitive, ctx, excludes.float) + self.float_compressor() + .compress(self, &primitive, ctx, excludes.float) } } Canonical::Decimal(decimal) => compress_decimal(self, &decimal), @@ -209,10 +226,8 @@ impl CanonicalCompressor for BtrBlocksCompressor { .dtype() .eq_ignore_nullability(&DType::Utf8(Nullability::NonNullable)) { - StringCompressor { - btr_blocks_compressor: self, - } - .compress(self, &strings, ctx, excludes.string) + self.string_compressor() + .compress(self, &strings, ctx, excludes.string) } else { // Binary arrays do not compress Ok(strings.into_array()) diff --git a/vortex-btrblocks/src/decimal.rs b/vortex-btrblocks/src/compressor/decimal.rs similarity index 100% rename from vortex-btrblocks/src/decimal.rs rename to vortex-btrblocks/src/compressor/decimal.rs diff --git a/vortex-btrblocks/src/float/dictionary.rs b/vortex-btrblocks/src/compressor/float/dictionary.rs similarity index 96% rename from vortex-btrblocks/src/float/dictionary.rs rename to vortex-btrblocks/src/compressor/float/dictionary.rs index 5b9bfc331cc..9a5955a6daf 100644 --- a/vortex-btrblocks/src/float/dictionary.rs +++ b/vortex-btrblocks/src/compressor/float/dictionary.rs @@ -11,8 +11,8 @@ use vortex_array::vtable::ValidityHelper; use vortex_buffer::Buffer; use vortex_dtype::half::f16; -use crate::float::stats::ErasedDistinctValues; -use crate::float::stats::FloatStats; +use super::stats::ErasedDistinctValues; +use super::stats::FloatStats; macro_rules! 
typed_encode { ($stats:ident, $typed:ident, $validity:ident, $typ:ty) => {{ @@ -104,9 +104,9 @@ mod tests { use vortex_array::validity::Validity; use vortex_buffer::buffer; + use super::super::FloatStats; use crate::CompressorStats; - use crate::float::dictionary::dictionary_encode; - use crate::float::stats::FloatStats; + use crate::compressor::float::dictionary::dictionary_encode; #[test] fn test_float_dict_encode() { diff --git a/vortex-btrblocks/src/float.rs b/vortex-btrblocks/src/compressor/float/mod.rs similarity index 67% rename from vortex-btrblocks/src/float.rs rename to vortex-btrblocks/src/compressor/float/mod.rs index 69c298cd4e3..14d9fd00889 100644 --- a/vortex-btrblocks/src/float.rs +++ b/vortex-btrblocks/src/compressor/float/mod.rs @@ -2,7 +2,7 @@ // SPDX-FileCopyrightText: Copyright the Vortex contributors pub(crate) mod dictionary; -mod stats; +pub(super) mod stats; use std::hash::Hash; use std::hash::Hasher; @@ -30,7 +30,11 @@ use vortex_scalar::Scalar; use vortex_sparse::SparseArray; use vortex_sparse::SparseVTable; +use self::dictionary::dictionary_encode; pub use self::stats::FloatStats; +use super::integer::DictScheme as IntDictScheme; +use super::integer::RunEndScheme as IntRunEndScheme; +use super::integer::SparseScheme as IntSparseScheme; use crate::BtrBlocksCompressor; use crate::CanonicalCompressor; use crate::Compressor; @@ -41,11 +45,9 @@ use crate::GenerateStatsOptions; use crate::IntCode; use crate::Scheme; use crate::SchemeExt; -use crate::float::dictionary::dictionary_encode; -use crate::integer; -use crate::patches::compress_patches; -use crate::rle; -use crate::rle::RLEScheme; +use crate::compressor::patches::compress_patches; +use crate::compressor::rle; +use crate::compressor::rle::RLEScheme; pub trait FloatScheme: Scheme + Send + Sync {} @@ -322,10 +324,10 @@ impl Scheme for ALPScheme { // to keep them linear for easy indexing. 
let mut int_excludes = Vec::new(); if excludes.contains(&FloatCode::Dict) { - int_excludes.push(integer::DictScheme.code()); + int_excludes.push(IntDictScheme.code()); } if excludes.contains(&FloatCode::RunEnd) { - int_excludes.push(integer::RunEndScheme.code()); + int_excludes.push(IntRunEndScheme.code()); } let compressed_alp_ints = compressor.compress_canonical( @@ -498,7 +500,7 @@ impl Scheme for NullDominated { if let Some(sparse) = sparse_encoded.as_opt::() { // Compress the values - let new_excludes = [integer::SparseScheme.code()]; + let new_excludes = [IntSparseScheme.code()]; // Don't attempt to compress the non-null values @@ -521,173 +523,194 @@ impl Scheme for NullDominated { } } } -// -// #[cfg(test)] -// mod tests { -// -// use std::iter; -// -// use vortex_array::Array; -// use vortex_array::IntoArray; -// use vortex_array::ToCanonical; -// use vortex_array::arrays::PrimitiveArray; -// use vortex_array::assert_arrays_eq; -// use vortex_array::builders::ArrayBuilder; -// use vortex_array::builders::PrimitiveBuilder; -// use vortex_array::display::DisplayOptions; -// use vortex_array::validity::Validity; -// use vortex_buffer::Buffer; -// use vortex_buffer::buffer_mut; -// use vortex_dtype::Nullability; -// use vortex_error::VortexResult; -// -// use crate::{compress, Compressor}; -// use crate::CompressorStats; -// use crate::MAX_CASCADE; -// use crate::Scheme; -// use crate::float::FloatCompressor; -// use crate::float::RLE_FLOAT_SCHEME; -// -// #[test] -// fn test_empty() -> VortexResult<()> { -// // Make sure empty array compression does not fail -// -// let result = FloatCompressor::default().compress( -// &PrimitiveArray::new(Buffer::::empty(), Validity::NonNullable), -// false, -// 3, -// )?; -// -// assert!(result.is_empty()); -// Ok(()) -// } -// -// #[test] -// fn test_compress() -> VortexResult<()> { -// let mut values = buffer_mut![1.0f32; 1024]; -// // Sprinkle some other values in. -// for i in 0..1024 { -// // Insert 2.0 at all odd positions. -// // This should force dictionary encoding and exclude run-end due to the -// // average run length being 1. 
-// values[i] = (i % 50) as f32; -// } -// -// let floats = values.into_array().to_primitive(); -// let compressed = FloatCompressor::default().compress(&floats, false, MAX_CASCADE)?; -// assert_eq!(compressed.len(), 1024); -// -// let display = compressed -// .display_as(DisplayOptions::MetadataOnly) -// .to_string() -// .to_lowercase(); -// assert_eq!(display, "vortex.dict(f32, len=1024)"); -// -// Ok(()) -// } -// -// #[test] -// fn test_rle_compression() -> VortexResult<()> { -// let mut values = Vec::new(); -// values.extend(iter::repeat_n(1.5f32, 100)); -// values.extend(iter::repeat_n(2.7f32, 200)); -// values.extend(iter::repeat_n(3.15f32, 150)); -// -// let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); -// let stats = crate::float::FloatStats::generate(&array); -// let compressed = RLE_FLOAT_SCHEME.compress(&stats, false, 3, &[])?; -// -// let decoded = compressed; -// let expected = Buffer::copy_from(&values).into_array(); -// assert_arrays_eq!(decoded.as_ref(), expected.as_ref()); -// Ok(()) -// } -// -// #[test] -// fn test_sparse_compression() -> VortexResult<()> { -// let mut array = PrimitiveBuilder::::with_capacity(Nullability::Nullable, 100); -// array.append_value(f32::NAN); -// array.append_value(-f32::NAN); -// array.append_value(f32::INFINITY); -// array.append_value(-f32::INFINITY); -// array.append_value(0.0f32); -// array.append_value(-0.0f32); -// array.append_nulls(90); -// -// let floats = array.finish_into_primitive(); -// -// let compressed = FloatCompressor::default().compress(&floats, false, MAX_CASCADE)?; -// assert_eq!(compressed.len(), 96); -// -// let display = compressed -// .display_as(DisplayOptions::MetadataOnly) -// .to_string() -// .to_lowercase(); -// assert_eq!(display, "vortex.sparse(f32?, len=96)"); -// -// Ok(()) -// } -// } -// -// /// Tests to verify that each float compression scheme produces the expected encoding. 
-// #[cfg(test)] -// mod scheme_selection_tests { -// -// use vortex_alp::ALPVTable; -// use vortex_array::arrays::ConstantVTable; -// use vortex_array::arrays::DictVTable; -// use vortex_array::arrays::PrimitiveArray; -// use vortex_array::builders::ArrayBuilder; -// use vortex_array::builders::PrimitiveBuilder; -// use vortex_array::validity::Validity; -// use vortex_buffer::Buffer; -// use vortex_dtype::Nullability; -// use vortex_error::VortexResult; -// -// use crate::Compressor; -// use crate::float::FloatCompressor; -// -// #[test] -// fn test_constant_compressed() -> VortexResult<()> { -// let values: Vec = vec![42.5; 100]; -// let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); -// let compressed = FloatCompressor::default().compress(&array, false, 3)?; -// assert!(compressed.is::()); -// Ok(()) -// } -// -// #[test] -// fn test_alp_compressed() -> VortexResult<()> { -// let values: Vec = (0..1000).map(|i| (i as f64) * 0.01).collect(); -// let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); -// let compressed = FloatCompressor::default().compress(&array, false, 3)?; -// assert!(compressed.is::()); -// Ok(()) -// } -// -// #[test] -// fn test_dict_compressed() -> VortexResult<()> { -// let distinct_values = [1.1, 2.2, 3.3, 4.4, 5.5]; -// let values: Vec = (0..1000) -// .map(|i| distinct_values[i % distinct_values.len()]) -// .collect(); -// let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); -// let compressed = FloatCompressor::default().compress(&array, false, 3)?; -// assert!(compressed.is::()); -// Ok(()) -// } -// -// #[test] -// fn test_null_dominated_compressed() -> VortexResult<()> { -// let mut builder = PrimitiveBuilder::::with_capacity(Nullability::Nullable, 100); -// for i in 0..5 { -// builder.append_value(i as f64); -// } -// builder.append_nulls(95); -// let array = builder.finish_into_primitive(); -// let compressed = FloatCompressor::default().compress(&array, false, 3)?; -// // Verify the compressed array preserves values. -// assert_eq!(compressed.len(), 100); -// Ok(()) -// } -// } + +#[cfg(test)] +mod tests { + + use std::iter; + + use vortex_array::Array; + use vortex_array::IntoArray; + use vortex_array::ToCanonical; + use vortex_array::arrays::PrimitiveArray; + use vortex_array::assert_arrays_eq; + use vortex_array::builders::ArrayBuilder; + use vortex_array::builders::PrimitiveBuilder; + use vortex_array::display::DisplayOptions; + use vortex_array::validity::Validity; + use vortex_buffer::Buffer; + use vortex_buffer::buffer_mut; + use vortex_dtype::Nullability; + use vortex_error::VortexResult; + + use super::RLE_FLOAT_SCHEME; + use crate::BtrBlocksCompressor; + use crate::CompressorContext; + use crate::CompressorExt; + use crate::CompressorStats; + use crate::Scheme; + + #[test] + fn test_empty() -> VortexResult<()> { + // Make sure empty array compression does not fail + let btr = BtrBlocksCompressor::default(); + let result = btr.float_compressor().compress( + &btr, + &PrimitiveArray::new(Buffer::::empty(), Validity::NonNullable), + CompressorContext::default(), + &[], + )?; + + assert!(result.is_empty()); + Ok(()) + } + + #[test] + fn test_compress() -> VortexResult<()> { + let mut values = buffer_mut![1.0f32; 1024]; + // Sprinkle some other values in. + for i in 0..1024 { + // Insert 2.0 at all odd positions. + // This should force dictionary encoding and exclude run-end due to the + // average run length being 1. 
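+            // Each index gets (i % 50), giving 50 distinct values and runs of length 1.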
+ values[i] = (i % 50) as f32; + } + + let floats = values.into_array().to_primitive(); + let btr = BtrBlocksCompressor::default(); + let compressed = + btr.float_compressor() + .compress(&btr, &floats, CompressorContext::default(), &[])?; + assert_eq!(compressed.len(), 1024); + + let display = compressed + .display_as(DisplayOptions::MetadataOnly) + .to_string() + .to_lowercase(); + assert_eq!(display, "vortex.dict(f32, len=1024)"); + + Ok(()) + } + + #[test] + fn test_rle_compression() -> VortexResult<()> { + let mut values = Vec::new(); + values.extend(iter::repeat_n(1.5f32, 100)); + values.extend(iter::repeat_n(2.7f32, 200)); + values.extend(iter::repeat_n(3.15f32, 150)); + + let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); + let stats = super::FloatStats::generate(&array); + let btr = BtrBlocksCompressor::default(); + let compressed = + RLE_FLOAT_SCHEME.compress(&btr, &stats, CompressorContext::default(), &[])?; + + let decoded = compressed; + let expected = Buffer::copy_from(&values).into_array(); + assert_arrays_eq!(decoded.as_ref(), expected.as_ref()); + Ok(()) + } + + #[test] + fn test_sparse_compression() -> VortexResult<()> { + let mut array = PrimitiveBuilder::::with_capacity(Nullability::Nullable, 100); + array.append_value(f32::NAN); + array.append_value(-f32::NAN); + array.append_value(f32::INFINITY); + array.append_value(-f32::INFINITY); + array.append_value(0.0f32); + array.append_value(-0.0f32); + array.append_nulls(90); + + let floats = array.finish_into_primitive(); + let btr = BtrBlocksCompressor::default(); + let compressed = + btr.float_compressor() + .compress(&btr, &floats, CompressorContext::default(), &[])?; + assert_eq!(compressed.len(), 96); + + let display = compressed + .display_as(DisplayOptions::MetadataOnly) + .to_string() + .to_lowercase(); + assert_eq!(display, "vortex.sparse(f32?, len=96)"); + + Ok(()) + } +} + +/// Tests to verify that each float compression scheme produces the expected encoding. 
+#[cfg(test)] +mod scheme_selection_tests { + + use vortex_alp::ALPVTable; + use vortex_array::arrays::ConstantVTable; + use vortex_array::arrays::DictVTable; + use vortex_array::arrays::PrimitiveArray; + use vortex_array::builders::ArrayBuilder; + use vortex_array::builders::PrimitiveBuilder; + use vortex_array::validity::Validity; + use vortex_buffer::Buffer; + use vortex_dtype::Nullability; + use vortex_error::VortexResult; + + use crate::BtrBlocksCompressor; + use crate::CompressorContext; + use crate::CompressorExt; + + #[test] + fn test_constant_compressed() -> VortexResult<()> { + let values: Vec = vec![42.5; 100]; + let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); + let btr = BtrBlocksCompressor::default(); + let compressed = + btr.float_compressor() + .compress(&btr, &array, CompressorContext::default(), &[])?; + assert!(compressed.is::()); + Ok(()) + } + + #[test] + fn test_alp_compressed() -> VortexResult<()> { + let values: Vec = (0..1000).map(|i| (i as f64) * 0.01).collect(); + let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); + let btr = BtrBlocksCompressor::default(); + let compressed = + btr.float_compressor() + .compress(&btr, &array, CompressorContext::default(), &[])?; + assert!(compressed.is::()); + Ok(()) + } + + #[test] + fn test_dict_compressed() -> VortexResult<()> { + let distinct_values = [1.1, 2.2, 3.3, 4.4, 5.5]; + let values: Vec = (0..1000) + .map(|i| distinct_values[i % distinct_values.len()]) + .collect(); + let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); + let btr = BtrBlocksCompressor::default(); + let compressed = + btr.float_compressor() + .compress(&btr, &array, CompressorContext::default(), &[])?; + assert!(compressed.is::()); + Ok(()) + } + + #[test] + fn test_null_dominated_compressed() -> VortexResult<()> { + let mut builder = PrimitiveBuilder::::with_capacity(Nullability::Nullable, 100); + for i in 0..5 { + builder.append_value(i as f64); + } + builder.append_nulls(95); + let array = builder.finish_into_primitive(); + let btr = BtrBlocksCompressor::default(); + let compressed = + btr.float_compressor() + .compress(&btr, &array, CompressorContext::default(), &[])?; + // Verify the compressed array preserves values. + assert_eq!(compressed.len(), 100); + Ok(()) + } +} diff --git a/vortex-btrblocks/src/float/stats.rs b/vortex-btrblocks/src/compressor/float/stats.rs similarity index 96% rename from vortex-btrblocks/src/float/stats.rs rename to vortex-btrblocks/src/compressor/float/stats.rs index a1dcaa35d98..eb9c337bc6a 100644 --- a/vortex-btrblocks/src/float/stats.rs +++ b/vortex-btrblocks/src/compressor/float/stats.rs @@ -22,7 +22,7 @@ use vortex_utils::aliases::hash_set::HashSet; use crate::CompressorStats; use crate::GenerateStatsOptions; -use crate::rle::RLEStats; +use crate::compressor::rle::RLEStats; use crate::sample::sample; #[derive(Debug, Clone)] @@ -54,15 +54,15 @@ impl_from_typed!(f64, ErasedDistinctValues::F64); /// Array of floating-point numbers and relevant stats for compression. 
#[derive(Debug, Clone)] pub struct FloatStats { - pub(super) src: PrimitiveArray, + pub(crate) src: PrimitiveArray, // cache for validity.false_count() - pub(super) null_count: u32, + pub(crate) null_count: u32, // cache for validity.true_count() - pub(super) value_count: u32, + pub(crate) value_count: u32, #[allow(dead_code)] - pub(super) average_run_length: u32, - pub(super) distinct_values: ErasedDistinctValues, - pub(super) distinct_values_count: u32, + pub(crate) average_run_length: u32, + pub(crate) distinct_values: ErasedDistinctValues, + pub(crate) distinct_values_count: u32, } impl FloatStats { @@ -233,8 +233,8 @@ mod tests { use vortex_array::validity::Validity; use vortex_buffer::buffer; + use super::FloatStats; use crate::CompressorStats; - use crate::float::stats::FloatStats; #[test] fn test_float_stats() { diff --git a/vortex-btrblocks/src/integer/dictionary.rs b/vortex-btrblocks/src/compressor/integer/dictionary.rs similarity index 97% rename from vortex-btrblocks/src/integer/dictionary.rs rename to vortex-btrblocks/src/compressor/integer/dictionary.rs index b441240ece0..681bf6c5811 100644 --- a/vortex-btrblocks/src/integer/dictionary.rs +++ b/vortex-btrblocks/src/compressor/integer/dictionary.rs @@ -10,8 +10,8 @@ use vortex_array::validity::Validity; use vortex_array::vtable::ValidityHelper; use vortex_buffer::Buffer; -use crate::integer::IntegerStats; -use crate::integer::stats::ErasedStats; +use super::IntegerStats; +use super::stats::ErasedStats; macro_rules! typed_encode { ($stats:ident, $typed:ident, $validity:ident, $typ:ty) => {{ @@ -120,9 +120,9 @@ mod tests { use vortex_array::validity::Validity; use vortex_buffer::buffer; + use super::IntegerStats; + use super::dictionary_encode; use crate::CompressorStats; - use crate::integer::IntegerStats; - use crate::integer::dictionary::dictionary_encode; #[test] fn test_dict_encode_integer_stats() { diff --git a/vortex-btrblocks/src/integer.rs b/vortex-btrblocks/src/compressor/integer/mod.rs similarity index 66% rename from vortex-btrblocks/src/integer.rs rename to vortex-btrblocks/src/compressor/integer/mod.rs index c92867028b9..0e804b5689e 100644 --- a/vortex-btrblocks/src/integer.rs +++ b/vortex-btrblocks/src/compressor/integer/mod.rs @@ -1,8 +1,8 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -pub mod dictionary; -mod stats; +pub(crate) mod dictionary; +pub(super) mod stats; use std::hash::Hash; use std::hash::Hasher; @@ -37,6 +37,7 @@ use vortex_sparse::SparseVTable; use vortex_zigzag::ZigZagArray; use vortex_zigzag::zigzag_encode; +use self::dictionary::dictionary_encode; use crate::BtrBlocksCompressor; use crate::CanonicalCompressor; use crate::Compressor; @@ -46,10 +47,9 @@ use crate::Excludes; use crate::GenerateStatsOptions; use crate::Scheme; use crate::SchemeExt; -use crate::integer::dictionary::dictionary_encode; -use crate::patches::compress_patches; -use crate::rle; -use crate::rle::RLEScheme; +use crate::compressor::patches::compress_patches; +use crate::compressor::rle; +use crate::compressor::rle::RLEScheme; /// All available integer compression schemes. 
pub const ALL_INT_SCHEMES: &[&dyn IntegerScheme] = &[ @@ -818,290 +818,341 @@ impl Scheme for SequenceScheme { sequence_encode(&stats.src)?.ok_or_else(|| vortex_err!("cannot sequence encode array")) } } -// -// #[cfg(test)] -// mod tests { -// use std::iter; -// -// use itertools::Itertools; -// use rand::RngCore; -// use rand::SeedableRng; -// use rand::rngs::StdRng; -// use vortex_array::Array; -// use vortex_array::IntoArray; -// use vortex_array::ToCanonical; -// use vortex_array::arrays::DictVTable; -// use vortex_array::arrays::PrimitiveArray; -// use vortex_array::assert_arrays_eq; -// use vortex_array::validity::Validity; -// use vortex_array::vtable::ValidityHelper; -// use vortex_buffer::Buffer; -// use vortex_buffer::BufferMut; -// use vortex_buffer::buffer; -// use vortex_error::VortexResult; -// use vortex_sequence::SequenceVTable; -// use vortex_sparse::SparseVTable; -// -// use crate::Compressor; -// use crate::CompressorStats; -// use crate::FloatCompressor; -// use crate::Scheme; -// use crate::integer::IntCompressor; -// use crate::integer::IntegerStats; -// use crate::integer::RLE_INTEGER_SCHEME; -// use crate::integer::SequenceScheme; -// use crate::integer::SparseScheme; -// -// #[test] -// fn test_empty() { -// // Make sure empty array compression does not fail -// let result = IntCompressor::default() -// .compress( -// &PrimitiveArray::new(Buffer::::empty(), Validity::NonNullable), -// false, -// 3, -// ) -// .unwrap(); -// -// assert!(result.is_empty()); -// } -// -// #[test] -// fn test_dict_encodable() -> VortexResult<()> { -// let mut codes = BufferMut::::with_capacity(65_535); -// // Write some runs of length 3 of a handful of different values. Interrupted by some -// // one-off values. -// -// let numbers = [0, 10, 50, 100, 1000, 3000] -// .into_iter() -// .map(|i| 12340 * i) // must be big enough to not prefer fastlanes.bitpacked -// .collect_vec(); -// -// let mut rng = StdRng::seed_from_u64(1u64); -// while codes.len() < 64000 { -// let run_length = rng.next_u32() % 5; -// let value = numbers[rng.next_u32() as usize % numbers.len()]; -// for _ in 0..run_length { -// codes.push(value); -// } -// } -// -// let primitive = codes.freeze().into_array().to_primitive(); -// let compressed = IntCompressor::default().compress(&primitive, false, 3)?; -// assert!(compressed.is::()); -// Ok(()) -// } -// -// #[test] -// fn sparse_with_nulls() -> VortexResult<()> { -// let array = PrimitiveArray::new( -// buffer![189u8, 189, 189, 0, 46], -// Validity::from_iter(vec![true, true, true, true, false]), -// ); -// let compressed = SparseScheme.compress(&IntegerStats::generate(&array), false, 3, &[])?; -// assert!(compressed.is::()); -// let decoded = compressed.clone(); -// let expected = -// PrimitiveArray::new(buffer![189u8, 189, 189, 0, 0], array.validity().clone()) -// .into_array(); -// assert_arrays_eq!(decoded.as_ref(), expected.as_ref()); -// Ok(()) -// } -// -// #[test] -// fn sparse_mostly_nulls() -> VortexResult<()> { -// let array = PrimitiveArray::new( -// buffer![189u8, 189, 189, 189, 189, 189, 189, 189, 189, 0, 46], -// Validity::from_iter(vec![ -// false, false, false, false, false, false, false, false, false, false, true, -// ]), -// ); -// let compressed = SparseScheme.compress(&IntegerStats::generate(&array), false, 3, &[])?; -// assert!(compressed.is::()); -// let decoded = compressed.clone(); -// let expected = PrimitiveArray::new( -// buffer![0u8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 46], -// array.validity().clone(), -// ) -// .into_array(); -// 
assert_arrays_eq!(decoded.as_ref(), expected.as_ref()); -// Ok(()) -// } -// -// #[test] -// fn nullable_sequence() -> VortexResult<()> { -// let values = (0i32..20).step_by(7).collect_vec(); -// let array = PrimitiveArray::from_option_iter(values.clone().into_iter().map(Some)); -// let compressed = SequenceScheme.compress(&IntegerStats::generate(&array), false, 3, &[])?; -// assert!(compressed.is::()); -// let decoded = compressed; -// let expected = PrimitiveArray::from_option_iter(values.into_iter().map(Some)).into_array(); -// assert_arrays_eq!(decoded.as_ref(), expected.as_ref()); -// Ok(()) -// } -// -// #[test] -// fn test_rle_compression() -> VortexResult<()> { -// let mut values = Vec::new(); -// values.extend(iter::repeat_n(42i32, 100)); -// values.extend(iter::repeat_n(123i32, 200)); -// values.extend(iter::repeat_n(987i32, 150)); -// -// let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); -// let compressed = -// RLE_INTEGER_SCHEME.compress(&IntegerStats::generate(&array), false, 3, &[])?; -// -// let decoded = compressed; -// let expected = Buffer::copy_from(&values).into_array(); -// assert_arrays_eq!(decoded.as_ref(), expected.as_ref()); -// Ok(()) -// } -// -// #[test_with::env(CI)] -// #[test_with::no_env(VORTEX_SKIP_SLOW_TESTS)] -// fn compress_large_int() -> VortexResult<()> { -// const NUM_LISTS: usize = 10_000; -// const ELEMENTS_PER_LIST: usize = 5_000; -// -// let prim = (0..NUM_LISTS) -// .flat_map(|list_idx| { -// (0..ELEMENTS_PER_LIST).map(move |elem_idx| (list_idx * 1000 + elem_idx) as f64) -// }) -// .collect::(); -// -// drop(FloatCompressor::compress_static(&prim, false, 3, &[])?); -// -// Ok(()) -// } -// } -// -// /// Tests to verify that each integer compression scheme produces the expected encoding. 
-// #[cfg(test)] -// mod scheme_selection_tests { -// use std::iter; -// -// use vortex_array::arrays::ConstantVTable; -// use vortex_array::arrays::DictVTable; -// use vortex_array::arrays::PrimitiveArray; -// use vortex_array::validity::Validity; -// use vortex_buffer::Buffer; -// use vortex_error::VortexResult; -// use vortex_fastlanes::BitPackedVTable; -// use vortex_fastlanes::FoRVTable; -// use vortex_fastlanes::RLEVTable; -// use vortex_runend::RunEndVTable; -// use vortex_sequence::SequenceVTable; -// use vortex_sparse::SparseVTable; -// -// use crate::Compressor; -// use crate::integer::IntCompressor; -// -// #[test] -// fn test_constant_compressed() { -// let values: Vec = iter::repeat_n(42, 100).collect(); -// let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); -// let compressed = IntCompressor::default().compress(&array, false, 3).unwrap(); -// assert!(compressed.is::()); -// } -// -// #[test] -// fn test_for_compressed() { -// let values: Vec = (0..1000).map(|i| 1_000_000 + ((i * 37) % 100)).collect(); -// let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); -// let compressed = IntCompressor::default().compress(&array, false, 3).unwrap(); -// assert!(compressed.is::()); -// } -// -// #[test] -// fn test_bitpacking_compressed() { -// let values: Vec = (0..1000).map(|i| i % 16).collect(); -// let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); -// let compressed = IntCompressor::default().compress(&array, false, 3).unwrap(); -// assert!(compressed.is::()); -// } -// -// #[test] -// fn test_sparse_compressed() { -// let mut values: Vec = Vec::new(); -// for i in 0..1000 { -// if i % 20 == 0 { -// values.push(2_000_000 + (i * 7) % 1000); -// } else { -// values.push(1_000_000); -// } -// } -// let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); -// let compressed = IntCompressor::default().compress(&array, false, 3).unwrap(); -// assert!(compressed.is::()); -// } -// -// #[test] -// fn test_dict_compressed() { -// use rand::RngCore; -// use rand::SeedableRng; -// use rand::rngs::StdRng; -// -// let mut codes = Vec::with_capacity(65_535); -// let numbers: Vec = [0, 10, 50, 100, 1000, 3000] -// .into_iter() -// .map(|i| 12340 * i) // must be big enough to not prefer fastlanes.bitpacked -// .collect(); -// -// let mut rng = StdRng::seed_from_u64(1u64); -// while codes.len() < 64000 { -// let run_length = rng.next_u32() % 5; -// let value = numbers[rng.next_u32() as usize % numbers.len()]; -// for _ in 0..run_length { -// codes.push(value); -// } -// } -// -// let array = PrimitiveArray::new(Buffer::copy_from(&codes), Validity::NonNullable); -// let compressed = IntCompressor::default().compress(&array, false, 3).unwrap(); -// assert!(compressed.is::()); -// } -// -// #[test] -// fn test_runend_compressed() { -// let mut values: Vec = Vec::new(); -// for i in 0..100 { -// values.extend(iter::repeat_n((i32::MAX - 50).wrapping_add(i), 10)); -// } -// let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); -// let compressed = IntCompressor::default().compress(&array, false, 3).unwrap(); -// assert!(compressed.is::()); -// } -// -// #[test] -// fn test_sequence_compressed() { -// let values: Vec = (0..1000).map(|i| i * 7).collect(); -// let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); -// let compressed = IntCompressor::default().compress(&array, false, 3).unwrap(); -// assert!(compressed.is::()); 
-// } -// -// #[test] -// fn test_rle_compressed() { -// let mut values: Vec = Vec::new(); -// for i in 0..10 { -// values.extend(iter::repeat_n(i, 100)); -// } -// let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); -// let compressed = IntCompressor::default().compress(&array, false, 3).unwrap(); -// assert!(compressed.is::()); -// } -// -// #[test] -// fn test_prim_constant() -> VortexResult<()> { -// tracing_subscriber::fmt() -// .with_max_level(tracing::Level::TRACE) -// .init(); -// -// let prim = (0..1000).map(|_x| 40).collect::(); -// let comp = IntCompressor::default(); -// let resul = comp.compress(&prim, false, 2)?; -// println!("res {}", resul); -// -// Ok(()) -// } -// } + +#[cfg(test)] +mod tests { + use std::iter; + + use itertools::Itertools; + use rand::RngCore; + use rand::SeedableRng; + use rand::rngs::StdRng; + use vortex_array::Array; + use vortex_array::IntoArray; + use vortex_array::ToCanonical; + use vortex_array::arrays::DictVTable; + use vortex_array::arrays::PrimitiveArray; + use vortex_array::assert_arrays_eq; + use vortex_array::validity::Validity; + use vortex_array::vtable::ValidityHelper; + use vortex_buffer::Buffer; + use vortex_buffer::BufferMut; + use vortex_buffer::buffer; + use vortex_error::VortexResult; + use vortex_sequence::SequenceVTable; + use vortex_sparse::SparseVTable; + + use super::IntegerStats; + use super::RLE_INTEGER_SCHEME; + use super::SequenceScheme; + use super::SparseScheme; + use crate::BtrBlocksCompressor; + use crate::CompressorContext; + use crate::CompressorExt; + use crate::CompressorStats; + use crate::Scheme; + + #[test] + fn test_empty() -> VortexResult<()> { + // Make sure empty array compression does not fail + let btr = BtrBlocksCompressor::default(); + let result = btr.integer_compressor().compress( + &btr, + &PrimitiveArray::new(Buffer::::empty(), Validity::NonNullable), + CompressorContext::default(), + &[], + )?; + + assert!(result.is_empty()); + Ok(()) + } + + #[test] + fn test_dict_encodable() -> VortexResult<()> { + let mut codes = BufferMut::::with_capacity(65_535); + // Write some runs of length 3 of a handful of different values. Interrupted by some + // one-off values. 
+ + let numbers = [0, 10, 50, 100, 1000, 3000] + .into_iter() + .map(|i| 12340 * i) // must be big enough to not prefer fastlanes.bitpacked + .collect_vec(); + + let mut rng = StdRng::seed_from_u64(1u64); + while codes.len() < 64000 { + let run_length = rng.next_u32() % 5; + let value = numbers[rng.next_u32() as usize % numbers.len()]; + for _ in 0..run_length { + codes.push(value); + } + } + + let primitive = codes.freeze().into_array().to_primitive(); + let btr = BtrBlocksCompressor::default(); + let compressed = btr.integer_compressor().compress( + &btr, + &primitive, + CompressorContext::default(), + &[], + )?; + assert!(compressed.is::()); + Ok(()) + } + + #[test] + fn sparse_with_nulls() -> VortexResult<()> { + let array = PrimitiveArray::new( + buffer![189u8, 189, 189, 0, 46], + Validity::from_iter(vec![true, true, true, true, false]), + ); + let btr = BtrBlocksCompressor::default(); + let compressed = SparseScheme.compress( + &btr, + &IntegerStats::generate(&array), + CompressorContext::default(), + &[], + )?; + assert!(compressed.is::()); + let decoded = compressed.clone(); + let expected = + PrimitiveArray::new(buffer![189u8, 189, 189, 0, 0], array.validity().clone()) + .into_array(); + assert_arrays_eq!(decoded.as_ref(), expected.as_ref()); + Ok(()) + } + + #[test] + fn sparse_mostly_nulls() -> VortexResult<()> { + let array = PrimitiveArray::new( + buffer![189u8, 189, 189, 189, 189, 189, 189, 189, 189, 0, 46], + Validity::from_iter(vec![ + false, false, false, false, false, false, false, false, false, false, true, + ]), + ); + let btr = BtrBlocksCompressor::default(); + let compressed = SparseScheme.compress( + &btr, + &IntegerStats::generate(&array), + CompressorContext::default(), + &[], + )?; + assert!(compressed.is::()); + let decoded = compressed.clone(); + let expected = PrimitiveArray::new( + buffer![0u8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 46], + array.validity().clone(), + ) + .into_array(); + assert_arrays_eq!(decoded.as_ref(), expected.as_ref()); + Ok(()) + } + + #[test] + fn nullable_sequence() -> VortexResult<()> { + let values = (0i32..20).step_by(7).collect_vec(); + let array = PrimitiveArray::from_option_iter(values.clone().into_iter().map(Some)); + let btr = BtrBlocksCompressor::default(); + let compressed = SequenceScheme.compress( + &btr, + &IntegerStats::generate(&array), + CompressorContext::default(), + &[], + )?; + assert!(compressed.is::()); + let decoded = compressed; + let expected = PrimitiveArray::from_option_iter(values.into_iter().map(Some)).into_array(); + assert_arrays_eq!(decoded.as_ref(), expected.as_ref()); + Ok(()) + } + + #[test] + fn test_rle_compression() -> VortexResult<()> { + let mut values = Vec::new(); + values.extend(iter::repeat_n(42i32, 100)); + values.extend(iter::repeat_n(123i32, 200)); + values.extend(iter::repeat_n(987i32, 150)); + + let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); + let btr = BtrBlocksCompressor::default(); + let compressed = RLE_INTEGER_SCHEME.compress( + &btr, + &IntegerStats::generate(&array), + CompressorContext::default(), + &[], + )?; + + let decoded = compressed; + let expected = Buffer::copy_from(&values).into_array(); + assert_arrays_eq!(decoded.as_ref(), expected.as_ref()); + Ok(()) + } + + #[test_with::env(CI)] + #[test_with::no_env(VORTEX_SKIP_SLOW_TESTS)] + fn compress_large_int() -> VortexResult<()> { + const NUM_LISTS: usize = 10_000; + const ELEMENTS_PER_LIST: usize = 5_000; + + let prim = (0..NUM_LISTS) + .flat_map(|list_idx| { + (0..ELEMENTS_PER_LIST).map(move 
|elem_idx| (list_idx * 1000 + elem_idx) as f64) + }) + .collect::() + .into_array(); + + let btr = BtrBlocksCompressor::default(); + drop(btr.compress(prim.as_ref())?); + + Ok(()) + } +} + +/// Tests to verify that each integer compression scheme produces the expected encoding. +#[cfg(test)] +mod scheme_selection_tests { + use std::iter; + + use vortex_array::arrays::ConstantVTable; + use vortex_array::arrays::DictVTable; + use vortex_array::arrays::PrimitiveArray; + use vortex_array::validity::Validity; + use vortex_buffer::Buffer; + use vortex_error::VortexResult; + use vortex_fastlanes::BitPackedVTable; + use vortex_fastlanes::FoRVTable; + use vortex_fastlanes::RLEVTable; + use vortex_runend::RunEndVTable; + use vortex_sequence::SequenceVTable; + use vortex_sparse::SparseVTable; + + use crate::BtrBlocksCompressor; + use crate::CompressorContext; + use crate::CompressorExt; + + #[test] + fn test_constant_compressed() -> VortexResult<()> { + let values: Vec = iter::repeat_n(42, 100).collect(); + let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); + let btr = BtrBlocksCompressor::default(); + let compressed = + btr.integer_compressor() + .compress(&btr, &array, CompressorContext::default(), &[])?; + assert!(compressed.is::()); + Ok(()) + } + + #[test] + fn test_for_compressed() -> VortexResult<()> { + let values: Vec = (0..1000).map(|i| 1_000_000 + ((i * 37) % 100)).collect(); + let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); + let btr = BtrBlocksCompressor::default(); + let compressed = + btr.integer_compressor() + .compress(&btr, &array, CompressorContext::default(), &[])?; + assert!(compressed.is::()); + Ok(()) + } + + #[test] + fn test_bitpacking_compressed() -> VortexResult<()> { + let values: Vec = (0..1000).map(|i| i % 16).collect(); + let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); + let btr = BtrBlocksCompressor::default(); + let compressed = + btr.integer_compressor() + .compress(&btr, &array, CompressorContext::default(), &[])?; + assert!(compressed.is::()); + Ok(()) + } + + #[test] + fn test_sparse_compressed() -> VortexResult<()> { + let mut values: Vec = Vec::new(); + for i in 0..1000 { + if i % 20 == 0 { + values.push(2_000_000 + (i * 7) % 1000); + } else { + values.push(1_000_000); + } + } + let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); + let btr = BtrBlocksCompressor::default(); + let compressed = + btr.integer_compressor() + .compress(&btr, &array, CompressorContext::default(), &[])?; + assert!(compressed.is::()); + Ok(()) + } + + #[test] + fn test_dict_compressed() -> VortexResult<()> { + use rand::RngCore; + use rand::SeedableRng; + use rand::rngs::StdRng; + + let mut codes = Vec::with_capacity(65_535); + let numbers: Vec = [0, 10, 50, 100, 1000, 3000] + .into_iter() + .map(|i| 12340 * i) // must be big enough to not prefer fastlanes.bitpacked + .collect(); + + let mut rng = StdRng::seed_from_u64(1u64); + while codes.len() < 64000 { + let run_length = rng.next_u32() % 5; + let value = numbers[rng.next_u32() as usize % numbers.len()]; + for _ in 0..run_length { + codes.push(value); + } + } + + let array = PrimitiveArray::new(Buffer::copy_from(&codes), Validity::NonNullable); + let btr = BtrBlocksCompressor::default(); + let compressed = + btr.integer_compressor() + .compress(&btr, &array, CompressorContext::default(), &[])?; + assert!(compressed.is::()); + Ok(()) + } + + #[test] + fn test_runend_compressed() -> VortexResult<()> { + 
let mut values: Vec = Vec::new(); + for i in 0..100 { + values.extend(iter::repeat_n((i32::MAX - 50).wrapping_add(i), 10)); + } + let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); + let btr = BtrBlocksCompressor::default(); + let compressed = + btr.integer_compressor() + .compress(&btr, &array, CompressorContext::default(), &[])?; + assert!(compressed.is::()); + Ok(()) + } + + #[test] + fn test_sequence_compressed() -> VortexResult<()> { + let values: Vec = (0..1000).map(|i| i * 7).collect(); + let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); + let btr = BtrBlocksCompressor::default(); + let compressed = + btr.integer_compressor() + .compress(&btr, &array, CompressorContext::default(), &[])?; + assert!(compressed.is::()); + Ok(()) + } + + #[test] + fn test_rle_compressed() -> VortexResult<()> { + let mut values: Vec = Vec::new(); + for i in 0..10 { + values.extend(iter::repeat_n(i, 100)); + } + let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); + let btr = BtrBlocksCompressor::default(); + let compressed = + btr.integer_compressor() + .compress(&btr, &array, CompressorContext::default(), &[])?; + assert!(compressed.is::()); + Ok(()) + } +} diff --git a/vortex-btrblocks/src/integer/stats.rs b/vortex-btrblocks/src/compressor/integer/stats.rs similarity index 99% rename from vortex-btrblocks/src/integer/stats.rs rename to vortex-btrblocks/src/compressor/integer/stats.rs index 954e100d114..cff8fcf8901 100644 --- a/vortex-btrblocks/src/integer/stats.rs +++ b/vortex-btrblocks/src/compressor/integer/stats.rs @@ -23,7 +23,7 @@ use vortex_utils::aliases::hash_map::HashMap; use crate::CompressorStats; use crate::GenerateStatsOptions; -use crate::rle::RLEStats; +use crate::compressor::rle::RLEStats; use crate::sample::sample; #[derive(Clone, Debug)] @@ -437,9 +437,9 @@ mod tests { use vortex_buffer::buffer; use vortex_error::VortexResult; + use super::IntegerStats; + use super::typed_int_stats; use crate::CompressorStats; - use crate::integer::IntegerStats; - use crate::integer::stats::typed_int_stats; #[test] fn test_naive_count_distinct_values() -> VortexResult<()> { diff --git a/vortex-btrblocks/src/compressor.rs b/vortex-btrblocks/src/compressor/mod.rs similarity index 97% rename from vortex-btrblocks/src/compressor.rs rename to vortex-btrblocks/src/compressor/mod.rs index 15ca78cfcf0..1e0c7983dc4 100644 --- a/vortex-btrblocks/src/compressor.rs +++ b/vortex-btrblocks/src/compressor/mod.rs @@ -12,6 +12,14 @@ use crate::CompressorContext; use crate::CompressorStats; use crate::Scheme; +pub(crate) mod decimal; +pub(crate) mod float; +pub(crate) mod integer; +mod patches; +mod rle; +pub(crate) mod string; +pub(crate) mod temporal; + /// Maximum cascade depth for compression. 
pub(crate) const MAX_CASCADE: usize = 3; diff --git a/vortex-btrblocks/src/patches.rs b/vortex-btrblocks/src/compressor/patches.rs similarity index 100% rename from vortex-btrblocks/src/patches.rs rename to vortex-btrblocks/src/compressor/patches.rs diff --git a/vortex-btrblocks/src/rle.rs b/vortex-btrblocks/src/compressor/rle.rs similarity index 100% rename from vortex-btrblocks/src/rle.rs rename to vortex-btrblocks/src/compressor/rle.rs diff --git a/vortex-btrblocks/src/string.rs b/vortex-btrblocks/src/compressor/string.rs similarity index 98% rename from vortex-btrblocks/src/string.rs rename to vortex-btrblocks/src/compressor/string.rs index 358a085f35b..5aeaf11ffd3 100644 --- a/vortex-btrblocks/src/string.rs +++ b/vortex-btrblocks/src/compressor/string.rs @@ -30,6 +30,9 @@ use vortex_sparse::SparseArray; use vortex_sparse::SparseVTable; use vortex_utils::aliases::hash_set::HashSet; +use super::integer::DictScheme as IntDictScheme; +use super::integer::SequenceScheme as IntSequenceScheme; +use super::integer::SparseScheme as IntSparseScheme; use crate::BtrBlocksCompressor; use crate::CanonicalCompressor; use crate::Compressor; @@ -40,7 +43,6 @@ use crate::GenerateStatsOptions; use crate::IntCode; use crate::Scheme; use crate::SchemeExt; -use crate::integer; use crate::sample::sample; /// Array of variable-length byte arrays, and relevant stats for compression. @@ -297,7 +299,7 @@ impl Scheme for DictScheme { let compressed_codes = compressor.compress_canonical( Canonical::Primitive(dict.codes().to_primitive()), ctx.descend(), - Excludes::int_only(&[integer::DictScheme.code(), integer::SequenceScheme.code()]), + Excludes::int_only(&[IntDictScheme.code(), IntSequenceScheme.code()]), )?; // Attempt to compress the values with non-Dict compression. @@ -478,7 +480,7 @@ impl Scheme for NullDominated { if let Some(sparse) = sparse_encoded.as_opt::() { // Compress the indices only (not the values for strings) - let new_excludes = vec![integer::SparseScheme.code(), IntCode::Dict]; + let new_excludes = vec![IntSparseScheme.code(), IntCode::Dict]; let indices = sparse.patches().indices().to_primitive().narrow()?; let compressed_indices = compressor.compress_canonical( diff --git a/vortex-btrblocks/src/temporal.rs b/vortex-btrblocks/src/compressor/temporal.rs similarity index 100% rename from vortex-btrblocks/src/temporal.rs rename to vortex-btrblocks/src/compressor/temporal.rs diff --git a/vortex-btrblocks/src/lib.rs b/vortex-btrblocks/src/lib.rs index dcff5504a24..9259039b1fe 100644 --- a/vortex-btrblocks/src/lib.rs +++ b/vortex-btrblocks/src/lib.rs @@ -35,27 +35,23 @@ //! //! [BtrBlocks]: https://www.cs.cit.tum.de/fileadmin/w00cfj/dis/papers/btrblocks.pdf -pub use crate::float::FloatCode; -use crate::float::FloatCompressor; -pub use crate::integer::IntCode; -use crate::integer::IntCompressor; -pub use crate::string::StringCode; -use crate::string::StringCompressor; +pub use compressor::float::FloatCode; +use compressor::float::FloatCompressor; +pub use compressor::integer::IntCode; +use compressor::integer::IntCompressor; +pub use compressor::integer::IntegerStats; +/// Dictionary encode an integer array using the precomputed stats. 
+pub use compressor::integer::dictionary::dictionary_encode as integer_dictionary_encode; +pub use compressor::string::StringCode; +use compressor::string::StringCompressor; mod builder; mod canonical_compressor; mod compressor; mod ctx; -mod decimal; -mod float; -mod integer; -mod patches; -mod rle; mod sample; mod scheme; mod stats; -mod string; -mod temporal; pub use builder::BtrBlocksCompressorBuilder; pub use canonical_compressor::BtrBlocksCompressor; From ccb7819dab455db0df730ab14817b2f816106efa Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Fri, 30 Jan 2026 18:19:28 +0000 Subject: [PATCH 07/14] wip Signed-off-by: Joe Isaacs --- vortex-btrblocks/src/canonical_compressor.rs | 11 ++++------ vortex-btrblocks/src/compressor/float/mod.rs | 8 ++----- .../src/compressor/integer/mod.rs | 6 +----- vortex-btrblocks/src/compressor/rle.rs | 4 ++-- vortex-btrblocks/src/compressor/string.rs | 8 +++---- vortex-btrblocks/src/compressor/temporal.rs | 6 +++--- vortex-btrblocks/src/lib.rs | 21 ++++++++----------- 7 files changed, 25 insertions(+), 39 deletions(-) diff --git a/vortex-btrblocks/src/canonical_compressor.rs b/vortex-btrblocks/src/canonical_compressor.rs index 78cf2883eef..e977680a9a2 100644 --- a/vortex-btrblocks/src/canonical_compressor.rs +++ b/vortex-btrblocks/src/canonical_compressor.rs @@ -120,22 +120,19 @@ impl BtrBlocksCompressor { self.compress_canonical(compact, CompressorContext::default(), Excludes::none()) } - /// Creates an integer compressor using this compressor's configuration. - pub fn integer_compressor(&self) -> IntCompressor<'_> { + pub(crate) fn integer_compressor(&self) -> IntCompressor<'_> { IntCompressor { btr_blocks_compressor: self, } } - /// Creates a float compressor using this compressor's configuration. - pub fn float_compressor(&self) -> FloatCompressor<'_> { + pub(crate) fn float_compressor(&self) -> FloatCompressor<'_> { FloatCompressor { btr_blocks_compressor: self, } } - /// Creates a string compressor using this compressor's configuration. 
- pub fn string_compressor(&self) -> StringCompressor<'_> { + pub(crate) fn string_compressor(&self) -> StringCompressor<'_> { StringCompressor { btr_blocks_compressor: self, } @@ -200,7 +197,7 @@ impl CanonicalCompressor for BtrBlocksCompressor { let compressed_offsets = self.compress_canonical( Canonical::Primitive(list_array.offsets().to_primitive().narrow()?), ctx, - Excludes::int_only(&[IntCode::Dict]), + Excludes::from(&[IntCode::Dict]), )?; Ok(ListArray::try_new( diff --git a/vortex-btrblocks/src/compressor/float/mod.rs b/vortex-btrblocks/src/compressor/float/mod.rs index 14d9fd00889..205bb8472b0 100644 --- a/vortex-btrblocks/src/compressor/float/mod.rs +++ b/vortex-btrblocks/src/compressor/float/mod.rs @@ -178,11 +178,7 @@ impl rle::RLEConfig for FloatRLEConfig { ctx: CompressorContext, excludes: &[FloatCode], ) -> VortexResult { - compressor.compress_canonical( - Canonical::Primitive(values.clone()), - ctx, - Excludes::float_only(excludes), - ) + compressor.compress_canonical(Canonical::Primitive(values.clone()), ctx, excludes.into()) } } @@ -438,7 +434,7 @@ impl Scheme for DictScheme { let compressed_values = compressor.compress_canonical( Canonical::Primitive(values.to_primitive()), ctx.descend(), - Excludes::float_only(&[FloatCode::Dict]), + Excludes::from(&[FloatCode::Dict]), )?; // SAFETY: compressing codes or values does not alter the invariants diff --git a/vortex-btrblocks/src/compressor/integer/mod.rs b/vortex-btrblocks/src/compressor/integer/mod.rs index 0e804b5689e..abd3f5af87c 100644 --- a/vortex-btrblocks/src/compressor/integer/mod.rs +++ b/vortex-btrblocks/src/compressor/integer/mod.rs @@ -207,11 +207,7 @@ impl rle::RLEConfig for IntRLEConfig { ctx: CompressorContext, excludes: &[IntCode], ) -> VortexResult { - compressor.compress_canonical( - Canonical::Primitive(values.clone()), - ctx, - Excludes::int_only(excludes), - ) + compressor.compress_canonical(Canonical::Primitive(values.clone()), ctx, excludes.into()) } } diff --git a/vortex-btrblocks/src/compressor/rle.rs b/vortex-btrblocks/src/compressor/rle.rs index eb73f08368d..b9e21652489 100644 --- a/vortex-btrblocks/src/compressor/rle.rs +++ b/vortex-btrblocks/src/compressor/rle.rs @@ -134,13 +134,13 @@ impl Scheme for RLEScheme { let compressed_indices = compressor.compress_canonical( Canonical::Primitive(rle_array.indices().to_primitive().narrow()?), ctx.descend(), - Excludes::int_only(&[IntCode::Dict]), + Excludes::from(&[IntCode::Dict]), )?; let compressed_offsets = compressor.compress_canonical( Canonical::Primitive(rle_array.values_idx_offsets().to_primitive().narrow()?), ctx.descend(), - Excludes::int_only(&[IntCode::Dict]), + Excludes::from(&[IntCode::Dict]), )?; // SAFETY: Recursive compression doesn't affect the invariants. diff --git a/vortex-btrblocks/src/compressor/string.rs b/vortex-btrblocks/src/compressor/string.rs index 5aeaf11ffd3..8e52c48f355 100644 --- a/vortex-btrblocks/src/compressor/string.rs +++ b/vortex-btrblocks/src/compressor/string.rs @@ -299,7 +299,7 @@ impl Scheme for DictScheme { let compressed_codes = compressor.compress_canonical( Canonical::Primitive(dict.codes().to_primitive()), ctx.descend(), - Excludes::int_only(&[IntDictScheme.code(), IntSequenceScheme.code()]), + Excludes::from(&[IntDictScheme.code(), IntSequenceScheme.code()]), )?; // Attempt to compress the values with non-Dict compression. 
@@ -307,7 +307,7 @@ impl Scheme for DictScheme { let compressed_values = compressor.compress_canonical( Canonical::VarBinView(dict.values().to_varbinview()), ctx.descend(), - Excludes::string_only(&[DictScheme.code()]), + Excludes::from(&[DictScheme.code()]), )?; // SAFETY: compressing codes or values does not alter the invariants @@ -344,13 +344,13 @@ impl Scheme for FSSTScheme { let compressed_original_lengths = compressor.compress_canonical( Canonical::Primitive(fsst.uncompressed_lengths().to_primitive().narrow()?), ctx, - Excludes::int_only(&[]), + Excludes::none(), )?; let compressed_codes_offsets = compressor.compress_canonical( Canonical::Primitive(fsst.codes().offsets().to_primitive().narrow()?), ctx, - Excludes::int_only(&[]), + Excludes::none(), )?; let compressed_codes = VarBinArray::try_new( compressed_codes_offsets, diff --git a/vortex-btrblocks/src/compressor/temporal.rs b/vortex-btrblocks/src/compressor/temporal.rs index dad004b97a9..6fb917be58d 100644 --- a/vortex-btrblocks/src/compressor/temporal.rs +++ b/vortex-btrblocks/src/compressor/temporal.rs @@ -35,17 +35,17 @@ pub fn compress_temporal( let days = compressor.compress_canonical( Canonical::Primitive(days.to_primitive().narrow()?), ctx, - Excludes::int_only(&[]), + Excludes::none(), )?; let seconds = compressor.compress_canonical( Canonical::Primitive(seconds.to_primitive().narrow()?), ctx, - Excludes::int_only(&[]), + Excludes::none(), )?; let subseconds = compressor.compress_canonical( Canonical::Primitive(subseconds.to_primitive().narrow()?), ctx, - Excludes::int_only(&[]), + Excludes::none(), )?; Ok(DateTimePartsArray::try_new(dtype, days, seconds, subseconds)?.into_array()) diff --git a/vortex-btrblocks/src/lib.rs b/vortex-btrblocks/src/lib.rs index 9259039b1fe..6e28239927e 100644 --- a/vortex-btrblocks/src/lib.rs +++ b/vortex-btrblocks/src/lib.rs @@ -39,9 +39,6 @@ pub use compressor::float::FloatCode; use compressor::float::FloatCompressor; pub use compressor::integer::IntCode; use compressor::integer::IntCompressor; -pub use compressor::integer::IntegerStats; -/// Dictionary encode an integer array using the precomputed stats. 
-pub use compressor::integer::dictionary::dictionary_encode as integer_dictionary_encode; pub use compressor::string::StringCode; use compressor::string::StringCompressor; @@ -56,12 +53,12 @@ mod stats; pub use builder::BtrBlocksCompressorBuilder; pub use canonical_compressor::BtrBlocksCompressor; pub use canonical_compressor::CanonicalCompressor; -pub use compressor::Compressor; -pub use compressor::CompressorExt; -pub(crate) use compressor::MAX_CASCADE; -pub use ctx::CompressorContext; -pub use ctx::Excludes; -pub use scheme::Scheme; -pub use scheme::SchemeExt; -pub use stats::CompressorStats; -pub use stats::GenerateStatsOptions; +use compressor::Compressor; +use compressor::CompressorExt; +use compressor::MAX_CASCADE; +use ctx::CompressorContext; +use ctx::Excludes; +use scheme::Scheme; +use scheme::SchemeExt; +use stats::CompressorStats; +use stats::GenerateStatsOptions; From c7509adf2b5c57d1e7ee3d9f134b8f415801c393 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Fri, 30 Jan 2026 18:25:55 +0000 Subject: [PATCH 08/14] wip Signed-off-by: Joe Isaacs --- vortex-btrblocks/src/lib.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vortex-btrblocks/src/lib.rs b/vortex-btrblocks/src/lib.rs index 6e28239927e..6d542352024 100644 --- a/vortex-btrblocks/src/lib.rs +++ b/vortex-btrblocks/src/lib.rs @@ -56,9 +56,11 @@ pub use canonical_compressor::CanonicalCompressor; use compressor::Compressor; use compressor::CompressorExt; use compressor::MAX_CASCADE; +pub use compressor::integer::IntegerStats; +pub use compressor::integer::dictionary::dictionary_encode as integer_dictionary_encode; use ctx::CompressorContext; use ctx::Excludes; use scheme::Scheme; use scheme::SchemeExt; -use stats::CompressorStats; -use stats::GenerateStatsOptions; +pub use stats::CompressorStats; +pub use stats::GenerateStatsOptions; From 991c508ea5e0e5cc9c47cbb7b313decdf77e7f2d Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Mon, 2 Feb 2026 16:07:15 +0000 Subject: [PATCH 09/14] wip Signed-off-by: Joe Isaacs --- vortex-btrblocks/src/compressor/integer/mod.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/vortex-btrblocks/src/compressor/integer/mod.rs b/vortex-btrblocks/src/compressor/integer/mod.rs index abd3f5af87c..85738f287d7 100644 --- a/vortex-btrblocks/src/compressor/integer/mod.rs +++ b/vortex-btrblocks/src/compressor/integer/mod.rs @@ -789,8 +789,11 @@ impl Scheme for SequenceScheme { return Ok(0.0); } - // All values in a seq are unique. - if stats.distinct_values_count as usize != stats.src.len() { + // If the distinct_values_count was computed (!= u32::MAX) + // Then all values in a sequence must be unqiue. + if stats.distinct_values_count != u32::MAX + && stats.distinct_values_count as usize != stats.src.len() + { return Ok(0.0); } From 4cf48de5d166b8c49b46208cfccd07374d6233e9 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Mon, 2 Feb 2026 17:02:24 +0000 Subject: [PATCH 10/14] wip Signed-off-by: Joe Isaacs --- vortex-btrblocks/src/compressor/integer/mod.rs | 2 +- vortex-python/src/io.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vortex-btrblocks/src/compressor/integer/mod.rs b/vortex-btrblocks/src/compressor/integer/mod.rs index 85738f287d7..bbc96ab362b 100644 --- a/vortex-btrblocks/src/compressor/integer/mod.rs +++ b/vortex-btrblocks/src/compressor/integer/mod.rs @@ -790,7 +790,7 @@ impl Scheme for SequenceScheme { } // If the distinct_values_count was computed (!= u32::MAX) - // Then all values in a sequence must be unqiue. 
+ // Then all values in a sequence must be unique. if stats.distinct_values_count != u32::MAX && stats.distinct_values_count as usize != stats.src.len() { diff --git a/vortex-python/src/io.rs b/vortex-python/src/io.rs index 67bf253293a..fd2b49a7292 100644 --- a/vortex-python/src/io.rs +++ b/vortex-python/src/io.rs @@ -278,7 +278,7 @@ impl PyVortexWriteOptions { /// >>> vx.io.VortexWriteOptions.default().write(sprl, "chonky.vortex") /// >>> import os /// >>> os.path.getsize('chonky.vortex') - /// 216156 + /// 216020 /// ``` /// /// Wow, Vortex manages to use about two bytes per integer! So advanced. So tiny. From 5d2c7d0e95af47ad69a6642c61530a2b733844c1 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Mon, 2 Feb 2026 17:22:45 +0000 Subject: [PATCH 11/14] wip Signed-off-by: Joe Isaacs --- vortex-python/src/io.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vortex-python/src/io.rs b/vortex-python/src/io.rs index fd2b49a7292..08f50b5e6d3 100644 --- a/vortex-python/src/io.rs +++ b/vortex-python/src/io.rs @@ -267,6 +267,7 @@ impl PyVortexWriteOptions { /// ```python /// >>> import os /// >>> import random + /// >>> random.seed(42) /// >>> sprl = vx.array([random.randint(i, i + 10) for i in range(100_000)]) /// ``` /// @@ -278,7 +279,7 @@ impl PyVortexWriteOptions { /// >>> vx.io.VortexWriteOptions.default().write(sprl, "chonky.vortex") /// >>> import os /// >>> os.path.getsize('chonky.vortex') - /// 216020 + /// 216156 /// ``` /// /// Wow, Vortex manages to use about two bytes per integer! So advanced. So tiny. @@ -290,7 +291,7 @@ impl PyVortexWriteOptions { /// ```python /// >>> vx.io.VortexWriteOptions.compact().write(sprl, "tiny.vortex") /// >>> os.path.getsize('tiny.vortex') - /// 55116 + /// 55052 /// ``` /// /// Random numbers are not (usually) composed of random bytes! From ff53a0c0a84a76f8b1ccdf7f4c4f63f53c22a6e0 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Mon, 2 Feb 2026 17:30:38 +0000 Subject: [PATCH 12/14] wip Signed-off-by: Joe Isaacs --- vortex-python/src/io.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vortex-python/src/io.rs b/vortex-python/src/io.rs index 08f50b5e6d3..934109c9f4e 100644 --- a/vortex-python/src/io.rs +++ b/vortex-python/src/io.rs @@ -279,7 +279,7 @@ impl PyVortexWriteOptions { /// >>> vx.io.VortexWriteOptions.default().write(sprl, "chonky.vortex") /// >>> import os /// >>> os.path.getsize('chonky.vortex') - /// 216156 + /// 216020 /// ``` /// /// Wow, Vortex manages to use about two bytes per integer! So advanced. So tiny. 
From b7b710575783f690573813366b1218198901dedf Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Mon, 2 Feb 2026 17:53:38 +0000 Subject: [PATCH 13/14] wip Signed-off-by: Joe Isaacs --- vortex-python/src/io.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/vortex-python/src/io.rs b/vortex-python/src/io.rs index 934109c9f4e..234fbffea70 100644 --- a/vortex-python/src/io.rs +++ b/vortex-python/src/io.rs @@ -267,7 +267,6 @@ impl PyVortexWriteOptions { /// ```python /// >>> import os /// >>> import random - /// >>> random.seed(42) /// >>> sprl = vx.array([random.randint(i, i + 10) for i in range(100_000)]) /// ``` /// From 8cd032da5f51940b253dda582eaad3423c106e2f Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Mon, 2 Feb 2026 18:03:49 +0000 Subject: [PATCH 14/14] wip Signed-off-by: Joe Isaacs --- vortex-python/src/io.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vortex-python/src/io.rs b/vortex-python/src/io.rs index 234fbffea70..67bf253293a 100644 --- a/vortex-python/src/io.rs +++ b/vortex-python/src/io.rs @@ -278,7 +278,7 @@ impl PyVortexWriteOptions { /// >>> vx.io.VortexWriteOptions.default().write(sprl, "chonky.vortex") /// >>> import os /// >>> os.path.getsize('chonky.vortex') - /// 216020 + /// 216156 /// ``` /// /// Wow, Vortex manages to use about two bytes per integer! So advanced. So tiny. @@ -290,7 +290,7 @@ impl PyVortexWriteOptions { /// ```python /// >>> vx.io.VortexWriteOptions.compact().write(sprl, "tiny.vortex") /// >>> os.path.getsize('tiny.vortex') - /// 55052 + /// 55116 /// ``` /// /// Random numbers are not (usually) composed of random bytes!
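
The guard added in PATCH 09/14 treats `u32::MAX` as a "distinct_values_count was not computed" sentinel, and only rules the sequence scheme out when the count was actually computed and shows duplicates. A minimal standalone sketch of that logic follows; the free function `sequence_still_possible` and its signature are hypothetical illustrations, not part of the crate — the real check lives inside `SequenceScheme` in `vortex-btrblocks/src/compressor/integer/mod.rs`.

```rust
// Sketch of the PATCH 09/14 check, under the assumption that `u32::MAX` marks a
// distinct-value count that was never computed for this chunk.
fn sequence_still_possible(distinct_values_count: u32, src_len: usize) -> bool {
    // Stat missing: the scheme cannot be rejected here, fall through to later checks.
    if distinct_values_count == u32::MAX {
        return true;
    }
    // All values in a sequence are unique, so any duplicate rules the scheme out.
    distinct_values_count as usize == src_len
}

fn main() {
    assert!(sequence_still_possible(u32::MAX, 100)); // stat not computed
    assert!(sequence_still_possible(100, 100));      // all values distinct
    assert!(!sequence_still_possible(42, 100));      // duplicates present
}
```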