From e27e1f87877da962a7184c001e5e8568124c7a2c Mon Sep 17 00:00:00 2001 From: Namgung Chan <9511chn@gmail.com> Date: Wed, 1 Oct 2025 23:05:15 +0900 Subject: [PATCH 01/35] draft: using arrow-avro --- Cargo.toml | 9 +++++++- datafusion/common/Cargo.toml | 2 +- datafusion/datasource-avro/Cargo.toml | 2 +- .../datasource-avro/src/avro_to_arrow/mod.rs | 15 +++++++------ datafusion/datasource-avro/src/mod.rs | 1 - datafusion/datasource-avro/src/source.rs | 21 ++++++++++--------- 6 files changed, 28 insertions(+), 22 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index b54a75e5d4b52..254a8ce7bcfad 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -89,7 +89,7 @@ version = "50.0.0" ahash = { version = "0.8", default-features = false, features = [ "runtime-rng", ] } -apache-avro = { version = "0.20", default-features = false } +#apache-avro = { version = "0.20", default-features = false } arrow = { version = "56.2.0", features = [ "prettyprint", "chrono-tz", @@ -103,6 +103,7 @@ arrow-ipc = { version = "56.2.0", default-features = false, features = [ ] } arrow-ord = { version = "56.2.0", default-features = false } arrow-schema = { version = "56.2.0", default-features = false } +arrow-avro = { version = "56.2.0", default-features = false } async-trait = "0.1.89" bigdecimal = "0.4.8" bytes = "1.10" @@ -196,6 +197,12 @@ unexpected_cfgs = { level = "warn", check-cfg = [ ] } unused_qualifications = "deny" + +# TEMPORARY: override arrow-avro for testing +[patch.crates-io] +arrow = { git = "https://github.com/apache/arrow-rs.git", rev = "b444ea7127ebc8136564bc9af036353d0c90991b" } +arrow-avro = { git = "https://github.com/apache/arrow-rs.git", rev = "b444ea7127ebc8136564bc9af036353d0c90991b" } + # -------------------- # Compilation Profiles # -------------------- diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index f5e51cb236d47..dfe3632ef6835 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -71,7 +71,7 @@ log = { workspace = true } object_store = { workspace = true, optional = true } parquet = { workspace = true, optional = true, default-features = true } paste = "1.0.15" -pyo3 = { version = "0.25", optional = true } +pyo3 = { version = "0.26.0", optional = true } recursive = { workspace = true, optional = true } sqlparser = { workspace = true, optional = true } tokio = { workspace = true } diff --git a/datafusion/datasource-avro/Cargo.toml b/datafusion/datasource-avro/Cargo.toml index e013e8a3d0934..c5ba018bdbd96 100644 --- a/datafusion/datasource-avro/Cargo.toml +++ b/datafusion/datasource-avro/Cargo.toml @@ -31,8 +31,8 @@ version.workspace = true all-features = true [dependencies] -apache-avro = { workspace = true } arrow = { workspace = true } +arrow-avro = { workspace = true } async-trait = { workspace = true } bytes = { workspace = true } datafusion-common = { workspace = true, features = ["object_store", "avro"] } diff --git a/datafusion/datasource-avro/src/avro_to_arrow/mod.rs b/datafusion/datasource-avro/src/avro_to_arrow/mod.rs index c1530a4880205..85a0590a4ca13 100644 --- a/datafusion/datasource-avro/src/avro_to_arrow/mod.rs +++ b/datafusion/datasource-avro/src/avro_to_arrow/mod.rs @@ -19,21 +19,20 @@ //! //! [Avro]: https://avro.apache.org/docs/1.2.0/ -mod arrow_array_reader; -mod reader; -mod schema; +// mod arrow_array_reader; +// mod reader; +// mod schema; use arrow::datatypes::Schema; -pub use reader::{Reader, ReaderBuilder}; +use arrow_avro::reader::ReaderBuilder; -pub use schema::to_arrow_schema; +// pub use schema::to_arrow_schema; use std::io::Read; /// Read Avro schema given a reader pub fn read_avro_schema_from_reader( reader: &mut R, ) -> datafusion_common::Result { - let avro_reader = apache_avro::Reader::new(reader)?; - let schema = avro_reader.writer_schema(); - to_arrow_schema(schema) + let avro_reader = ReaderBuilder::new().build(reader)?; + Ok(avro_reader.schema().as_ref().clone()) } diff --git a/datafusion/datasource-avro/src/mod.rs b/datafusion/datasource-avro/src/mod.rs index ad8ebe11446f5..88b8041c03abc 100644 --- a/datafusion/datasource-avro/src/mod.rs +++ b/datafusion/datasource-avro/src/mod.rs @@ -30,5 +30,4 @@ pub mod avro_to_arrow; pub mod file_format; pub mod source; -pub use apache_avro; pub use file_format::*; diff --git a/datafusion/datasource-avro/src/source.rs b/datafusion/datasource-avro/src/source.rs index da871837cdada..1b0caf44f25b9 100644 --- a/datafusion/datasource-avro/src/source.rs +++ b/datafusion/datasource-avro/src/source.rs @@ -20,9 +20,9 @@ use std::any::Any; use std::sync::Arc; -use crate::avro_to_arrow::Reader as AvroReader; - -use arrow::datatypes::SchemaRef; +use arrow::datatypes::{Schema, SchemaRef}; +use arrow_avro::reader::{Reader, ReaderBuilder}; +use arrow_avro::schema::AvroSchema; use datafusion_common::error::Result; use datafusion_common::Statistics; use datafusion_datasource::file::FileSource; @@ -51,13 +51,14 @@ impl AvroSource { Self::default() } - fn open(&self, reader: R) -> Result> { - AvroReader::try_new( - reader, - Arc::clone(self.schema.as_ref().expect("Schema must set before open")), - self.batch_size.expect("Batch size must set before open"), - self.projection.clone(), - ) + fn open(&self, reader: R) -> Result> { + let schema: &Schema = self.schema.as_ref().expect("Schema must set before open"); + let avro_schema = AvroSchema::try_from(schema)?; + ReaderBuilder::new() + .with_reader_schema(avro_schema) + .with_batch_size(self.batch_size.expect("Batch size must set before open")) + .build(reader) + .map_err(Into::into) } } From 027e79d858dc9af4a616a37f0b8e09ed1f195d62 Mon Sep 17 00:00:00 2001 From: Namgung Chan <9511chn@gmail.com> Date: Wed, 1 Oct 2025 23:35:44 +0900 Subject: [PATCH 02/35] draft: using arrow-avro --- datafusion/datasource-avro/src/source.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/datafusion/datasource-avro/src/source.rs b/datafusion/datasource-avro/src/source.rs index 1b0caf44f25b9..b02388784b4ac 100644 --- a/datafusion/datasource-avro/src/source.rs +++ b/datafusion/datasource-avro/src/source.rs @@ -20,7 +20,7 @@ use std::any::Any; use std::sync::Arc; -use arrow::datatypes::{Schema, SchemaRef}; +use arrow::datatypes::SchemaRef; use arrow_avro::reader::{Reader, ReaderBuilder}; use arrow_avro::schema::AvroSchema; use datafusion_common::error::Result; @@ -52,8 +52,7 @@ impl AvroSource { } fn open(&self, reader: R) -> Result> { - let schema: &Schema = self.schema.as_ref().expect("Schema must set before open"); - let avro_schema = AvroSchema::try_from(schema)?; + let avro_schema = AvroSchema::try_from(self.schema.expect("Schema must set before open").as_ref())?; ReaderBuilder::new() .with_reader_schema(avro_schema) .with_batch_size(self.batch_size.expect("Batch size must set before open")) From 2c7e364c749dcd61f7a525cfe5e7a29bc239018c Mon Sep 17 00:00:00 2001 From: Namgung Chan <9511chn@gmail.com> Date: Sun, 19 Oct 2025 17:34:56 +0900 Subject: [PATCH 03/35] cargo fmt --- Cargo.toml | 6 +++--- datafusion/common/Cargo.toml | 2 +- datafusion/datasource-avro/src/source.rs | 4 +++- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index d47b31be3fb60..c0a557873fa51 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -200,10 +200,10 @@ unexpected_cfgs = { level = "warn", check-cfg = [ unused_qualifications = "deny" -# TEMPORARY: override arrow-avro for testing +# FIXME - TEMPORARY override arrow-avro for testing [patch.crates-io] -arrow = { git = "https://github.com/apache/arrow-rs.git", rev = "b444ea7127ebc8136564bc9af036353d0c90991b" } -arrow-avro = { git = "https://github.com/apache/arrow-rs.git", rev = "b444ea7127ebc8136564bc9af036353d0c90991b" } +arrow = { git = "https://github.com/apache/arrow-rs.git", rev = "d49f017fe1c6712ba32e2222c6f031278b588ca5" } +arrow-avro = { git = "https://github.com/apache/arrow-rs.git", rev = "d49f017fe1c6712ba32e2222c6f031278b588ca5" } # -------------------- # Compilation Profiles diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index dfe3632ef6835..6ea7c050b9f85 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -71,7 +71,7 @@ log = { workspace = true } object_store = { workspace = true, optional = true } parquet = { workspace = true, optional = true, default-features = true } paste = "1.0.15" -pyo3 = { version = "0.26.0", optional = true } +pyo3 = { version = "0.26.0", optional = true } # fixme - revert recursive = { workspace = true, optional = true } sqlparser = { workspace = true, optional = true } tokio = { workspace = true } diff --git a/datafusion/datasource-avro/src/source.rs b/datafusion/datasource-avro/src/source.rs index 9a794efd32723..56272b8736882 100644 --- a/datafusion/datasource-avro/src/source.rs +++ b/datafusion/datasource-avro/src/source.rs @@ -52,7 +52,9 @@ impl AvroSource { } fn open(&self, reader: R) -> Result> { - let avro_schema = AvroSchema::try_from(self.schema.expect("Schema must set before open").as_ref())?; + let avro_schema = AvroSchema::try_from( + self.schema.expect("Schema must set before open").as_ref(), + )?; ReaderBuilder::new() .with_reader_schema(avro_schema) .with_batch_size(self.batch_size.expect("Batch size must set before open")) From 7c9f976562b58f6ade76db714f31786cd5ed44d1 Mon Sep 17 00:00:00 2001 From: Namgung Chan <9511chn@gmail.com> Date: Sun, 19 Oct 2025 22:03:46 +0900 Subject: [PATCH 04/35] check public API's can be replaced --- datafusion/datasource-avro/src/file_format.rs | 2 +- datafusion/datasource-avro/src/source.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/datasource-avro/src/file_format.rs b/datafusion/datasource-avro/src/file_format.rs index 60c361b42e771..3a55d6b5ee2b1 100644 --- a/datafusion/datasource-avro/src/file_format.rs +++ b/datafusion/datasource-avro/src/file_format.rs @@ -16,7 +16,7 @@ // under the License. //! Apache Avro [`FileFormat`] abstractions - +// todo - Check if it can be replaced with arrow-avro use std::any::Any; use std::collections::HashMap; use std::fmt; diff --git a/datafusion/datasource-avro/src/source.rs b/datafusion/datasource-avro/src/source.rs index 56272b8736882..0a3c4b1071979 100644 --- a/datafusion/datasource-avro/src/source.rs +++ b/datafusion/datasource-avro/src/source.rs @@ -56,9 +56,9 @@ impl AvroSource { self.schema.expect("Schema must set before open").as_ref(), )?; ReaderBuilder::new() - .with_reader_schema(avro_schema) + .with_reader_schema(avro_schema) // Used for projection on read. .with_batch_size(self.batch_size.expect("Batch size must set before open")) - .build(reader) + .build(reader) // TODO - A File (which doesn't implement BufRead) is being passed; confirm whether this is safe. .map_err(Into::into) } } From 7bc24eaffea38fba1e4ac1f968f24c22bbe0e138 Mon Sep 17 00:00:00 2001 From: Namgung Chan <9511chn@gmail.com> Date: Mon, 27 Oct 2025 23:52:17 +0900 Subject: [PATCH 05/35] revert arrow dependency --- Cargo.lock | 221 ++++++++++++++++++++--------------- Cargo.toml | 5 - datafusion/common/Cargo.toml | 2 +- 3 files changed, 126 insertions(+), 102 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 13fe25c914102..de1081103416a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -234,9 +234,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e833808ff2d94ed40d9379848a950d995043c7fb3e81a30b383f4c6033821cc" +checksum = "4df8bb5b0bd64c0b9bc61317fcc480bad0f00e56d3bc32c69a4c8dada4786bae" dependencies = [ "arrow-arith", "arrow-array", @@ -258,23 +258,23 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad08897b81588f60ba983e3ca39bda2b179bdd84dced378e7df81a5313802ef8" +checksum = "a1a640186d3bd30a24cb42264c2dafb30e236a6f50d510e56d40b708c9582491" dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", "chrono", - "num", + "num-traits", ] [[package]] name = "arrow-array" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8548ca7c070d8db9ce7aa43f37393e4bfcf3f2d3681df278490772fd1673d08d" +checksum = "219fe420e6800979744c8393b687afb0252b3f8a89b91027d27887b72aa36d31" dependencies = [ "ahash 0.8.12", "arrow-buffer", @@ -284,25 +284,45 @@ dependencies = [ "chrono-tz", "half", "hashbrown 0.16.0", - "num", + "num-complex", + "num-integer", + "num-traits", +] + +[[package]] +name = "arrow-avro" +version = "57.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfabe428b25092f4f45f019d21a7a6b8d65b10458a120d00f8ee1843404dfe14" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-schema", + "indexmap 2.11.4", + "rand 0.9.2", + "serde", + "serde_json", + "strum_macros 0.27.2", + "uuid", ] [[package]] name = "arrow-buffer" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e003216336f70446457e280807a73899dd822feaf02087d31febca1363e2fccc" +checksum = "76885a2697a7edf6b59577f568b456afc94ce0e2edc15b784ce3685b6c3c5c27" dependencies = [ "bytes", "half", - "num", + "num-bigint", + "num-traits", ] [[package]] name = "arrow-cast" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "919418a0681298d3a77d1a315f625916cb5678ad0d74b9c60108eb15fd083023" +checksum = "9c9ebb4c987e6b3b236fb4a14b20b34835abfdd80acead3ccf1f9bf399e1f168" dependencies = [ "arrow-array", "arrow-buffer", @@ -315,15 +335,15 @@ dependencies = [ "comfy-table", "half", "lexical-core", - "num", + "num-traits", "ryu", ] [[package]] name = "arrow-csv" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfa9bf02705b5cf762b6f764c65f04ae9082c7cfc4e96e0c33548ee3f67012eb" +checksum = "92386159c8d4bce96f8bd396b0642a0d544d471bdc2ef34d631aec80db40a09c" dependencies = [ "arrow-array", "arrow-cast", @@ -336,21 +356,22 @@ dependencies = [ [[package]] name = "arrow-data" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5c64fff1d142f833d78897a772f2e5b55b36cb3e6320376f0961ab0db7bd6d0" +checksum = "727681b95de313b600eddc2a37e736dcb21980a40f640314dcf360e2f36bc89b" dependencies = [ "arrow-buffer", "arrow-schema", "half", - "num", + "num-integer", + "num-traits", ] [[package]] name = "arrow-flight" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c8b0ba0784d56bc6266b79f5de7a24b47024e7b3a0045d2ad4df3d9b686099f" +checksum = "f70bb56412a007b0cfc116d15f24dda6adeed9611a213852a004cda20085a3b9" dependencies = [ "arrow-arith", "arrow-array", @@ -368,16 +389,17 @@ dependencies = [ "futures", "once_cell", "paste", - "prost 0.13.5", - "prost-types 0.13.5", - "tonic", + "prost 0.14.1", + "prost-types 0.14.1", + "tonic 0.14.2", + "tonic-prost", ] [[package]] name = "arrow-ipc" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d3594dcddccc7f20fd069bc8e9828ce37220372680ff638c5e00dea427d88f5" +checksum = "da9ba92e3de170295c98a84e5af22e2b037f0c7b32449445e6c493b5fca27f27" dependencies = [ "arrow-array", "arrow-buffer", @@ -391,9 +413,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88cf36502b64a127dc659e3b305f1d993a544eab0d48cce704424e62074dc04b" +checksum = "b969b4a421ae83828591c6bf5450bd52e6d489584142845ad6a861f42fe35df8" dependencies = [ "arrow-array", "arrow-buffer", @@ -403,19 +425,21 @@ dependencies = [ "chrono", "half", "indexmap 2.11.4", + "itoa", "lexical-core", "memchr", - "num", - "serde", + "num-traits", + "ryu", + "serde_core", "serde_json", "simdutf8", ] [[package]] name = "arrow-ord" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c8f82583eb4f8d84d4ee55fd1cb306720cddead7596edce95b50ee418edf66f" +checksum = "141c05298b21d03e88062317a1f1a73f5ba7b6eb041b350015b1cd6aabc0519b" dependencies = [ "arrow-array", "arrow-buffer", @@ -426,9 +450,9 @@ dependencies = [ [[package]] name = "arrow-pyarrow" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d924b32e96f8bb74d94cd82bd97b313c432fcb0ea331689ef9e7c6b8be4b258" +checksum = "cfcfb2be2e9096236f449c11f425cddde18c4cc540f516d90f066f10a29ed515" dependencies = [ "arrow-array", "arrow-data", @@ -438,9 +462,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d07ba24522229d9085031df6b94605e0f4b26e099fb7cdeec37abd941a73753" +checksum = "c5f3c06a6abad6164508ed283c7a02151515cef3de4b4ff2cebbcaeb85533db2" dependencies = [ "arrow-array", "arrow-buffer", @@ -451,34 +475,35 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3aa9e59c611ebc291c28582077ef25c97f1975383f1479b12f3b9ffee2ffabe" +checksum = "9cfa7a03d1eee2a4d061476e1840ad5c9867a544ca6c4c59256496af5d0a8be5" dependencies = [ "bitflags 2.9.4", "serde", + "serde_core", "serde_json", ] [[package]] name = "arrow-select" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c41dbbd1e97bfcaee4fcb30e29105fb2c75e4d82ae4de70b792a5d3f66b2e7a" +checksum = "bafa595babaad59f2455f4957d0f26448fb472722c186739f4fac0823a1bdb47" dependencies = [ "ahash 0.8.12", "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", - "num", + "num-traits", ] [[package]] name = "arrow-string" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53f5183c150fbc619eede22b861ea7c0eebed8eaac0333eaa7f6da5205fd504d" +checksum = "32f46457dbbb99f2650ff3ac23e46a929e0ab81db809b02aa5511c258348bef2" dependencies = [ "arrow-array", "arrow-buffer", @@ -486,7 +511,7 @@ dependencies = [ "arrow-schema", "arrow-select", "memchr", - "num", + "num-traits", "regex", "regex-syntax", ] @@ -2057,8 +2082,8 @@ dependencies = [ name = "datafusion-datasource-avro" version = "50.2.0" dependencies = [ - "apache-avro", "arrow", + "arrow-avro", "async-trait", "bytes", "datafusion-common", @@ -2173,7 +2198,7 @@ dependencies = [ "tempfile", "test-utils", "tokio", - "tonic", + "tonic 0.13.1", "tracing", "tracing-subscriber", "url", @@ -4225,20 +4250,6 @@ dependencies = [ "windows-sys 0.52.0", ] -[[package]] -name = "num" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23" -dependencies = [ - "num-bigint", - "num-complex", - "num-integer", - "num-iter", - "num-rational", - "num-traits", -] - [[package]] name = "num-bigint" version = "0.4.6" @@ -4274,28 +4285,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "num-iter" -version = "0.1.45" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" -dependencies = [ - "autocfg", - "num-integer", - "num-traits", -] - -[[package]] -name = "num-rational" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" -dependencies = [ - "num-bigint", - "num-integer", - "num-traits", -] - [[package]] name = "num-traits" version = "0.2.19" @@ -4447,9 +4436,9 @@ dependencies = [ [[package]] name = "parquet" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0dbd48ad52d7dccf8ea1b90a3ddbfaea4f69878dd7683e51c507d4bc52b5b27" +checksum = "7a0f31027ef1af7549f7cec603a9a21dce706d3f8d7c2060a68f43c1773be95a" dependencies = [ "ahash 0.8.12", "arrow-array", @@ -4468,8 +4457,9 @@ dependencies = [ "half", "hashbrown 0.16.0", "lz4_flex", - "num", "num-bigint", + "num-integer", + "num-traits", "object_store", "paste", "ring", @@ -4981,9 +4971,9 @@ dependencies = [ [[package]] name = "pyo3" -version = "0.25.1" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8970a78afe0628a3e3430376fc5fd76b6b45c4d43360ffd6cdd40bdde72b682a" +checksum = "7ba0117f4212101ee6544044dae45abe1083d30ce7b29c4b5cbdfa2354e07383" dependencies = [ "indoc", "libc", @@ -4998,19 +4988,18 @@ dependencies = [ [[package]] name = "pyo3-build-config" -version = "0.25.1" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "458eb0c55e7ece017adeba38f2248ff3ac615e53660d7c71a238d7d2a01c7598" +checksum = "4fc6ddaf24947d12a9aa31ac65431fb1b851b8f4365426e182901eabfb87df5f" dependencies = [ - "once_cell", "target-lexicon", ] [[package]] name = "pyo3-ffi" -version = "0.25.1" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7114fe5457c61b276ab77c5055f206295b812608083644a5c5b2640c3102565c" +checksum = "025474d3928738efb38ac36d4744a74a400c901c7596199e20e45d98eb194105" dependencies = [ "libc", "pyo3-build-config", @@ -5018,9 +5007,9 @@ dependencies = [ [[package]] name = "pyo3-macros" -version = "0.25.1" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8725c0a622b374d6cb051d11a0983786448f7785336139c3c94f5aa6bef7e50" +checksum = "2e64eb489f22fe1c95911b77c44cc41e7c19f3082fc81cce90f657cdc42ffded" dependencies = [ "proc-macro2", "pyo3-macros-backend", @@ -5030,9 +5019,9 @@ dependencies = [ [[package]] name = "pyo3-macros-backend" -version = "0.25.1" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4109984c22491085343c05b0dbc54ddc405c3cf7b4374fc533f5c3313a572ccc" +checksum = "100246c0ecf400b475341b8455a9213344569af29a3c841d29270e53102e0fcf" dependencies = [ "heck 0.5.0", "proc-macro2", @@ -6654,6 +6643,46 @@ dependencies = [ "tracing", ] +[[package]] +name = "tonic" +version = "0.14.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb7613188ce9f7df5bfe185db26c5814347d110db17920415cf2fbcad85e7203" +dependencies = [ + "async-trait", + "axum", + "base64 0.22.1", + "bytes", + "h2", + "http 1.3.1", + "http-body 1.0.1", + "http-body-util", + "hyper", + "hyper-timeout", + "hyper-util", + "percent-encoding", + "pin-project", + "socket2 0.6.0", + "sync_wrapper", + "tokio", + "tokio-stream", + "tower", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tonic-prost" +version = "0.14.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66bd50ad6ce1252d87ef024b3d64fe4c3cf54a86fb9ef4c631fdd0ded7aeaa67" +dependencies = [ + "bytes", + "prost 0.14.1", + "tonic 0.14.2", +] + [[package]] name = "tower" version = "0.5.2" diff --git a/Cargo.toml b/Cargo.toml index c0a557873fa51..f7ddb395fe7a7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -200,11 +200,6 @@ unexpected_cfgs = { level = "warn", check-cfg = [ unused_qualifications = "deny" -# FIXME - TEMPORARY override arrow-avro for testing -[patch.crates-io] -arrow = { git = "https://github.com/apache/arrow-rs.git", rev = "d49f017fe1c6712ba32e2222c6f031278b588ca5" } -arrow-avro = { git = "https://github.com/apache/arrow-rs.git", rev = "d49f017fe1c6712ba32e2222c6f031278b588ca5" } - # -------------------- # Compilation Profiles # -------------------- diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index 6ea7c050b9f85..6fd526683f5ef 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -71,7 +71,7 @@ log = { workspace = true } object_store = { workspace = true, optional = true } parquet = { workspace = true, optional = true, default-features = true } paste = "1.0.15" -pyo3 = { version = "0.26.0", optional = true } # fixme - revert +pyo3 = { version = "0.25.0", optional = true } recursive = { workspace = true, optional = true } sqlparser = { workspace = true, optional = true } tokio = { workspace = true } From 51557feba71990c0516932de520248a8266dc918 Mon Sep 17 00:00:00 2001 From: Namgung Chan <9511chn@gmail.com> Date: Mon, 27 Oct 2025 23:58:51 +0900 Subject: [PATCH 06/35] apply arrow-avro 57.0.0 --- Cargo.lock | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index 55c334e157db4..a1663bf8a9ea7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -280,6 +280,23 @@ dependencies = [ "num-traits", ] +[[package]] +name = "arrow-avro" +version = "57.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfabe428b25092f4f45f019d21a7a6b8d65b10458a120d00f8ee1843404dfe14" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-schema", + "indexmap 2.12.0", + "rand 0.9.2", + "serde", + "serde_json", + "strum_macros 0.27.2", + "uuid", +] + [[package]] name = "arrow-buffer" version = "57.0.0" @@ -2041,8 +2058,8 @@ dependencies = [ name = "datafusion-datasource-avro" version = "50.3.0" dependencies = [ - "apache-avro", "arrow", + "arrow-avro", "async-trait", "bytes", "datafusion-common", From 8e12f472dc6ba7d4c0581646aace513dbcd8d356 Mon Sep 17 00:00:00 2001 From: Namgung Chan <9511chn@gmail.com> Date: Thu, 30 Oct 2025 00:36:09 +0900 Subject: [PATCH 07/35] BufReader --- datafusion/core/src/lib.rs | 2 +- datafusion/datasource-avro/src/avro_to_arrow/mod.rs | 4 ++-- datafusion/datasource-avro/src/mod.rs | 1 + datafusion/datasource-avro/src/source.rs | 12 ++++++++---- testing | 2 +- 5 files changed, 13 insertions(+), 8 deletions(-) diff --git a/datafusion/core/src/lib.rs b/datafusion/core/src/lib.rs index 78db28eaacc79..ded0d2863bd21 100644 --- a/datafusion/core/src/lib.rs +++ b/datafusion/core/src/lib.rs @@ -777,7 +777,7 @@ pub use object_store; pub use parquet; #[cfg(feature = "avro")] -pub use datafusion_datasource_avro::apache_avro; +pub use datafusion_datasource_avro::arrow_avro; // re-export DataFusion sub-crates at the top level. Use `pub use *` // so that the contents of the subcrates appears in rustdocs diff --git a/datafusion/datasource-avro/src/avro_to_arrow/mod.rs b/datafusion/datasource-avro/src/avro_to_arrow/mod.rs index 85a0590a4ca13..4d262a396eb3b 100644 --- a/datafusion/datasource-avro/src/avro_to_arrow/mod.rs +++ b/datafusion/datasource-avro/src/avro_to_arrow/mod.rs @@ -27,12 +27,12 @@ use arrow::datatypes::Schema; use arrow_avro::reader::ReaderBuilder; // pub use schema::to_arrow_schema; -use std::io::Read; +use std::io::{BufReader, Read}; /// Read Avro schema given a reader pub fn read_avro_schema_from_reader( reader: &mut R, ) -> datafusion_common::Result { - let avro_reader = ReaderBuilder::new().build(reader)?; + let avro_reader = ReaderBuilder::new().build(BufReader::new(reader))?; Ok(avro_reader.schema().as_ref().clone()) } diff --git a/datafusion/datasource-avro/src/mod.rs b/datafusion/datasource-avro/src/mod.rs index 88b8041c03abc..fd69a323a33d7 100644 --- a/datafusion/datasource-avro/src/mod.rs +++ b/datafusion/datasource-avro/src/mod.rs @@ -30,4 +30,5 @@ pub mod avro_to_arrow; pub mod file_format; pub mod source; +pub use arrow_avro; pub use file_format::*; diff --git a/datafusion/datasource-avro/src/source.rs b/datafusion/datasource-avro/src/source.rs index 0a3c4b1071979..45fb1672ecbad 100644 --- a/datafusion/datasource-avro/src/source.rs +++ b/datafusion/datasource-avro/src/source.rs @@ -51,9 +51,12 @@ impl AvroSource { Self::default() } - fn open(&self, reader: R) -> Result> { + fn open(&self, reader: R) -> Result> { let avro_schema = AvroSchema::try_from( - self.schema.expect("Schema must set before open").as_ref(), + self.schema + .as_ref() + .expect("Schema must set before open") + .as_ref(), )?; ReaderBuilder::new() .with_reader_schema(avro_schema) // Used for projection on read. @@ -145,6 +148,7 @@ impl FileSource for AvroSource { mod private { use super::*; + use std::io::BufReader; use bytes::Buf; use datafusion_datasource::{file_stream::FileOpenFuture, PartitionedFile}; @@ -166,14 +170,14 @@ mod private { .await?; match r.payload { GetResultPayload::File(file, _) => { - let reader = config.open(file)?; + let reader = config.open(BufReader::new(file))?; Ok(futures::stream::iter(reader) .map(|r| r.map_err(Into::into)) .boxed()) } GetResultPayload::Stream(_) => { let bytes = r.bytes().await?; - let reader = config.open(bytes.reader())?; + let reader = config.open(BufReader::new(bytes.reader()))?; Ok(futures::stream::iter(reader) .map(|r| r.map_err(Into::into)) .boxed()) diff --git a/testing b/testing index 0d60ccae40d0e..d2a1371230349 160000 --- a/testing +++ b/testing @@ -1 +1 @@ -Subproject commit 0d60ccae40d0e8f2d22c15fafb01c5d4be8c63a6 +Subproject commit d2a13712303498963395318a4eb42872e66aead7 From 83ec01dfb13f13057c1194b2099e9f7a5944af93 Mon Sep 17 00:00:00 2001 From: Namgung Chan <9511chn@gmail.com> Date: Thu, 30 Oct 2025 01:10:27 +0900 Subject: [PATCH 08/35] remove apache-avro dependencies --- Cargo.lock | 112 +++++++------------------- Cargo.toml | 5 +- datafusion/common/Cargo.toml | 7 -- datafusion/common/src/error.rs | 17 ---- datafusion/core/Cargo.toml | 2 +- datafusion/datasource-avro/Cargo.toml | 2 +- datafusion/proto/Cargo.toml | 2 +- 7 files changed, 37 insertions(+), 110 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a1663bf8a9ea7..260871e195903 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -182,35 +182,6 @@ version = "1.0.100" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61" -[[package]] -name = "apache-avro" -version = "0.20.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a033b4ced7c585199fb78ef50fca7fe2f444369ec48080c5fd072efa1a03cc7" -dependencies = [ - "bigdecimal", - "bon", - "bzip2 0.6.1", - "crc32fast", - "digest", - "log", - "miniz_oxide", - "num-bigint", - "quad-rand", - "rand 0.9.2", - "regex-lite", - "serde", - "serde_bytes", - "serde_json", - "snap", - "strum 0.27.2", - "strum_macros 0.27.2", - "thiserror", - "uuid", - "xz2", - "zstd", -] - [[package]] name = "arrayref" version = "0.3.9" @@ -289,12 +260,18 @@ dependencies = [ "arrow-array", "arrow-buffer", "arrow-schema", + "bzip2 0.6.1", + "crc", + "flate2", "indexmap 2.12.0", "rand 0.9.2", "serde", "serde_json", + "snap", "strum_macros 0.27.2", "uuid", + "xz", + "zstd", ] [[package]] @@ -1014,7 +991,6 @@ dependencies = [ "num-bigint", "num-integer", "num-traits", - "serde", ] [[package]] @@ -1142,31 +1118,6 @@ dependencies = [ "serde_with", ] -[[package]] -name = "bon" -version = "3.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2529c31017402be841eb45892278a6c21a000c0a17643af326c73a73f83f0fb" -dependencies = [ - "bon-macros", - "rustversion", -] - -[[package]] -name = "bon-macros" -version = "3.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d82020dadcb845a345591863adb65d74fa8dc5c18a0b6d408470e13b7adc7005" -dependencies = [ - "darling", - "ident_case", - "prettyplease", - "proc-macro2", - "quote", - "rustversion", - "syn 2.0.106", -] - [[package]] name = "borsh" version = "1.5.7" @@ -1481,7 +1432,7 @@ version = "7.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e0d05af1e006a2407bedef5af410552494ce5be9090444dbbcb57258c1af3d56" dependencies = [ - "strum 0.26.3", + "strum", "strum_macros 0.26.4", "unicode-width 0.2.1", ] @@ -1596,6 +1547,21 @@ dependencies = [ "libc", ] +[[package]] +name = "crc" +version = "3.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9710d3b3739c2e349eb44fe848ad0b7c8cb1e42bd87ee49371df2f7acaf3e675" +dependencies = [ + "crc-catalog", +] + +[[package]] +name = "crc-catalog" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" + [[package]] name = "crc32fast" version = "1.5.0" @@ -1965,7 +1931,6 @@ name = "datafusion-common" version = "50.3.0" dependencies = [ "ahash 0.8.12", - "apache-avro", "arrow", "arrow-ipc", "chrono", @@ -4217,7 +4182,6 @@ checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" dependencies = [ "num-integer", "num-traits", - "serde", ] [[package]] @@ -4916,12 +4880,6 @@ dependencies = [ "syn 2.0.106", ] -[[package]] -name = "quad-rand" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a651516ddc9168ebd67b24afd085a718be02f8858fe406591b013d101ce2f40" - [[package]] name = "quick-xml" version = "0.38.3" @@ -5635,16 +5593,6 @@ dependencies = [ "serde_derive", ] -[[package]] -name = "serde_bytes" -version = "0.11.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5d440709e79d88e51ac01c4b72fc6cb7314017bb7da9eeff678aa94c10e3ea8" -dependencies = [ - "serde", - "serde_core", -] - [[package]] name = "serde_core" version = "1.0.228" @@ -6021,12 +5969,6 @@ version = "0.26.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" -[[package]] -name = "strum" -version = "0.27.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af23d6f6c1a224baef9d3f61e287d2761385a5b88fdab4eb4c6f11aeb54c4bcf" - [[package]] name = "strum_macros" version = "0.26.4" @@ -6826,7 +6768,6 @@ checksum = "2f87b8aa10b915a06587d0dec516c282ff295b475d94abf425d62b57710070a2" dependencies = [ "getrandom 0.3.4", "js-sys", - "serde", "wasm-bindgen", ] @@ -7418,6 +7359,15 @@ version = "0.13.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "66fee0b777b0f5ac1c69bb06d361268faafa61cd4682ae064a171c16c433e9e4" +[[package]] +name = "xz" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c887690ff2a2e233e8e49633461521f98ec57fbff9d59a884c9a4f04ec1da34" +dependencies = [ + "xz2", +] + [[package]] name = "xz2" version = "0.1.7" diff --git a/Cargo.toml b/Cargo.toml index 5cc286f9e15b5..02cc7c3569a2c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -90,7 +90,6 @@ version = "50.3.0" ahash = { version = "0.8", default-features = false, features = [ "runtime-rng", ] } -#apache-avro = { version = "0.20", default-features = false } arrow = { version = "57.0.0", features = [ "prettyprint", "chrono-tz", @@ -104,7 +103,9 @@ arrow-ipc = { version = "57.0.0", default-features = false, features = [ ] } arrow-ord = { version = "57.0.0", default-features = false } arrow-schema = { version = "57.0.0", default-features = false } -arrow-avro = { version = "57.0.0", default-features = false } +arrow-avro = { version = "57.0.0", default-features = false, features = [ + "deflate", "snappy", "zstd", "bzip2", "xz" +] } async-trait = "0.1.89" bigdecimal = "0.4.8" bytes = "1.10" diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index abeb4e66a269f..ff0faead7fe69 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -38,7 +38,6 @@ workspace = true name = "datafusion_common" [features] -avro = ["apache-avro"] backtrace = [] parquet_encryption = [ "parquet", @@ -53,12 +52,6 @@ sql = ["sqlparser"] [dependencies] ahash = { workspace = true } -apache-avro = { version = "0.20", default-features = false, features = [ - "bzip", - "snappy", - "xz", - "zstandard", -], optional = true } arrow = { workspace = true } arrow-ipc = { workspace = true } chrono = { workspace = true } diff --git a/datafusion/common/src/error.rs b/datafusion/common/src/error.rs index 210f0442972d2..efe255c6dbae0 100644 --- a/datafusion/common/src/error.rs +++ b/datafusion/common/src/error.rs @@ -30,8 +30,6 @@ use std::sync::Arc; use crate::utils::datafusion_strsim::normalized_levenshtein; use crate::utils::quote_identifier; use crate::{Column, DFSchema, Diagnostic, TableReference}; -#[cfg(feature = "avro")] -use apache_avro::Error as AvroError; use arrow::error::ArrowError; #[cfg(feature = "parquet")] use parquet::errors::ParquetError; @@ -58,9 +56,6 @@ pub enum DataFusionError { /// Error when reading / writing Parquet data. #[cfg(feature = "parquet")] ParquetError(Box), - /// Error when reading Avro data. - #[cfg(feature = "avro")] - AvroError(Box), /// Error when reading / writing to / from an object_store (e.g. S3 or LocalFile) #[cfg(feature = "object_store")] ObjectStore(Box), @@ -310,12 +305,6 @@ impl From for DataFusionError { } } -#[cfg(feature = "avro")] -impl From for DataFusionError { - fn from(e: AvroError) -> Self { - DataFusionError::AvroError(Box::new(e)) - } -} #[cfg(feature = "object_store")] impl From for DataFusionError { @@ -367,8 +356,6 @@ impl Error for DataFusionError { DataFusionError::ArrowError(e, _) => Some(e.as_ref()), #[cfg(feature = "parquet")] DataFusionError::ParquetError(e) => Some(e.as_ref()), - #[cfg(feature = "avro")] - DataFusionError::AvroError(e) => Some(e.as_ref()), #[cfg(feature = "object_store")] DataFusionError::ObjectStore(e) => Some(e.as_ref()), DataFusionError::IoError(e) => Some(e), @@ -497,8 +484,6 @@ impl DataFusionError { DataFusionError::ArrowError(_, _) => "Arrow error: ", #[cfg(feature = "parquet")] DataFusionError::ParquetError(_) => "Parquet error: ", - #[cfg(feature = "avro")] - DataFusionError::AvroError(_) => "Avro error: ", #[cfg(feature = "object_store")] DataFusionError::ObjectStore(_) => "Object Store error: ", DataFusionError::IoError(_) => "IO error: ", @@ -537,8 +522,6 @@ impl DataFusionError { } #[cfg(feature = "parquet")] DataFusionError::ParquetError(ref desc) => Cow::Owned(desc.to_string()), - #[cfg(feature = "avro")] - DataFusionError::AvroError(ref desc) => Cow::Owned(desc.to_string()), DataFusionError::IoError(ref desc) => Cow::Owned(desc.to_string()), #[cfg(feature = "sql")] DataFusionError::SQL(ref desc, ref backtrace) => { diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index 22c9f43a902e8..f043a558f2c42 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -40,7 +40,7 @@ nested_expressions = ["datafusion-functions-nested"] # This feature is deprecated. Use the `nested_expressions` feature instead. array_expressions = ["nested_expressions"] # Used to enable the avro format -avro = ["datafusion-common/avro", "datafusion-datasource-avro"] +avro = ["datafusion-datasource-avro"] backtrace = ["datafusion-common/backtrace"] compression = [ "xz2", diff --git a/datafusion/datasource-avro/Cargo.toml b/datafusion/datasource-avro/Cargo.toml index c5ba018bdbd96..f810aa59b4442 100644 --- a/datafusion/datasource-avro/Cargo.toml +++ b/datafusion/datasource-avro/Cargo.toml @@ -35,7 +35,7 @@ arrow = { workspace = true } arrow-avro = { workspace = true } async-trait = { workspace = true } bytes = { workspace = true } -datafusion-common = { workspace = true, features = ["object_store", "avro"] } +datafusion-common = { workspace = true, features = ["object_store"] } datafusion-datasource = { workspace = true } datafusion-physical-expr-common = { workspace = true } datafusion-physical-plan = { workspace = true } diff --git a/datafusion/proto/Cargo.toml b/datafusion/proto/Cargo.toml index 920e277b8ccc0..0d38e7c14741f 100644 --- a/datafusion/proto/Cargo.toml +++ b/datafusion/proto/Cargo.toml @@ -41,7 +41,7 @@ name = "datafusion_proto" default = ["parquet"] json = ["pbjson", "serde", "serde_json", "datafusion-proto-common/json"] parquet = ["datafusion-datasource-parquet", "datafusion-common/parquet", "datafusion/parquet"] -avro = ["datafusion-datasource-avro", "datafusion-common/avro"] +avro = ["datafusion-datasource-avro"] # Note to developers: do *not* add `datafusion` as a dependency in # this crate. See https://github.com/apache/datafusion/issues/17713 From b12f563075549130ba299dc36382855148402957 Mon Sep 17 00:00:00 2001 From: Namgung Chan <9511chn@gmail.com> Date: Thu, 30 Oct 2025 01:18:36 +0900 Subject: [PATCH 09/35] remove apache-avro dependencies --- Cargo.lock | 131 ++++++++++++++++++++--------------------------------- 1 file changed, 49 insertions(+), 82 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 120dc29db223b..bc0fbdcb283b6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -182,35 +182,6 @@ version = "1.0.100" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61" -[[package]] -name = "apache-avro" -version = "0.20.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a033b4ced7c585199fb78ef50fca7fe2f444369ec48080c5fd072efa1a03cc7" -dependencies = [ - "bigdecimal", - "bon", - "bzip2 0.6.1", - "crc32fast", - "digest", - "log", - "miniz_oxide", - "num-bigint", - "quad-rand", - "rand 0.9.2", - "regex-lite", - "serde", - "serde_bytes", - "serde_json", - "snap", - "strum 0.27.2", - "strum_macros 0.27.2", - "thiserror", - "uuid", - "xz2", - "zstd", -] - [[package]] name = "arrayref" version = "0.3.9" @@ -280,6 +251,29 @@ dependencies = [ "num-traits", ] +[[package]] +name = "arrow-avro" +version = "57.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfabe428b25092f4f45f019d21a7a6b8d65b10458a120d00f8ee1843404dfe14" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-schema", + "bzip2 0.6.1", + "crc", + "flate2", + "indexmap 2.12.0", + "rand 0.9.2", + "serde", + "serde_json", + "snap", + "strum_macros 0.27.2", + "uuid", + "xz", + "zstd", +] + [[package]] name = "arrow-buffer" version = "57.0.0" @@ -1035,7 +1029,6 @@ dependencies = [ "num-bigint", "num-integer", "num-traits", - "serde", ] [[package]] @@ -1190,31 +1183,6 @@ dependencies = [ "serde_with", ] -[[package]] -name = "bon" -version = "3.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2529c31017402be841eb45892278a6c21a000c0a17643af326c73a73f83f0fb" -dependencies = [ - "bon-macros", - "rustversion", -] - -[[package]] -name = "bon-macros" -version = "3.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d82020dadcb845a345591863adb65d74fa8dc5c18a0b6d408470e13b7adc7005" -dependencies = [ - "darling", - "ident_case", - "prettyplease", - "proc-macro2", - "quote", - "rustversion", - "syn 2.0.108", -] - [[package]] name = "borsh" version = "1.5.7" @@ -1529,7 +1497,7 @@ version = "7.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e0d05af1e006a2407bedef5af410552494ce5be9090444dbbcb57258c1af3d56" dependencies = [ - "strum 0.26.3", + "strum", "strum_macros 0.26.4", "unicode-width 0.2.1", ] @@ -1644,6 +1612,21 @@ dependencies = [ "libc", ] +[[package]] +name = "crc" +version = "3.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9710d3b3739c2e349eb44fe848ad0b7c8cb1e42bd87ee49371df2f7acaf3e675" +dependencies = [ + "crc-catalog", +] + +[[package]] +name = "crc-catalog" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" + [[package]] name = "crc32fast" version = "1.5.0" @@ -2013,7 +1996,6 @@ name = "datafusion-common" version = "50.3.0" dependencies = [ "ahash 0.8.12", - "apache-avro", "arrow", "arrow-ipc", "chrono", @@ -2106,8 +2088,8 @@ dependencies = [ name = "datafusion-datasource-avro" version = "50.3.0" dependencies = [ - "apache-avro", "arrow", + "arrow-avro", "async-trait", "bytes", "datafusion-common", @@ -4280,7 +4262,6 @@ checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" dependencies = [ "num-integer", "num-traits", - "serde", ] [[package]] @@ -5001,12 +4982,6 @@ dependencies = [ "syn 2.0.108", ] -[[package]] -name = "quad-rand" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a651516ddc9168ebd67b24afd085a718be02f8858fe406591b013d101ce2f40" - [[package]] name = "quick-xml" version = "0.38.3" @@ -5712,16 +5687,6 @@ dependencies = [ "serde_derive", ] -[[package]] -name = "serde_bytes" -version = "0.11.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5d440709e79d88e51ac01c4b72fc6cb7314017bb7da9eeff678aa94c10e3ea8" -dependencies = [ - "serde", - "serde_core", -] - [[package]] name = "serde_core" version = "1.0.228" @@ -6098,12 +6063,6 @@ version = "0.26.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" -[[package]] -name = "strum" -version = "0.27.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af23d6f6c1a224baef9d3f61e287d2761385a5b88fdab4eb4c6f11aeb54c4bcf" - [[package]] name = "strum_macros" version = "0.26.4" @@ -6933,7 +6892,6 @@ checksum = "2f87b8aa10b915a06587d0dec516c282ff295b475d94abf425d62b57710070a2" dependencies = [ "getrandom 0.3.4", "js-sys", - "serde", "wasm-bindgen", ] @@ -7534,6 +7492,15 @@ version = "0.13.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "66fee0b777b0f5ac1c69bb06d361268faafa61cd4682ae064a171c16c433e9e4" +[[package]] +name = "xz" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c887690ff2a2e233e8e49633461521f98ec57fbff9d59a884c9a4f04ec1da34" +dependencies = [ + "xz2", +] + [[package]] name = "xz2" version = "0.1.7" From 581ecae9e3e601e67e11b83df43b5d84dd76428e Mon Sep 17 00:00:00 2001 From: Namgung Chan <9511chn@gmail.com> Date: Thu, 30 Oct 2025 01:26:05 +0900 Subject: [PATCH 10/35] remove apache-avro dependencies --- testing | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testing b/testing index d2a1371230349..8da05fdd62a72 160000 --- a/testing +++ b/testing @@ -1 +1 @@ -Subproject commit d2a13712303498963395318a4eb42872e66aead7 +Subproject commit 8da05fdd62a7243ef77aa9757acb62e0586a4d0c From 74792ffecd30b8791e24a3906c9d4bd2e22475f3 Mon Sep 17 00:00:00 2001 From: Namgung Chan <9511chn@gmail.com> Date: Fri, 31 Oct 2025 00:58:23 +0900 Subject: [PATCH 11/35] fix test result --- .../core/src/datasource/file_format/avro.rs | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/datafusion/core/src/datasource/file_format/avro.rs b/datafusion/core/src/datasource/file_format/avro.rs index 3428d08a6ae52..d4453592e7a1b 100644 --- a/datafusion/core/src/datasource/file_format/avro.rs +++ b/datafusion/core/src/datasource/file_format/avro.rs @@ -108,7 +108,7 @@ mod tests { "double_col: Float64", "date_string_col: Binary", "string_col: Binary", - "timestamp_col: Timestamp(Microsecond, None)", + "timestamp_col: Timestamp(Microsecond, Some(\"+00:00\"))", ], x ); @@ -117,18 +117,18 @@ mod tests { assert_eq!(batches.len(), 1); assert_snapshot!(batches_to_string(&batches),@r###" - +----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+ - | id | bool_col | tinyint_col | smallint_col | int_col | bigint_col | float_col | double_col | date_string_col | string_col | timestamp_col | - +----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+ - | 4 | true | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 30332f30312f3039 | 30 | 2009-03-01T00:00:00 | - | 5 | false | 1 | 1 | 1 | 10 | 1.1 | 10.1 | 30332f30312f3039 | 31 | 2009-03-01T00:01:00 | - | 6 | true | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 30342f30312f3039 | 30 | 2009-04-01T00:00:00 | - | 7 | false | 1 | 1 | 1 | 10 | 1.1 | 10.1 | 30342f30312f3039 | 31 | 2009-04-01T00:01:00 | - | 2 | true | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 30322f30312f3039 | 30 | 2009-02-01T00:00:00 | - | 3 | false | 1 | 1 | 1 | 10 | 1.1 | 10.1 | 30322f30312f3039 | 31 | 2009-02-01T00:01:00 | - | 0 | true | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 30312f30312f3039 | 30 | 2009-01-01T00:00:00 | - | 1 | false | 1 | 1 | 1 | 10 | 1.1 | 10.1 | 30312f30312f3039 | 31 | 2009-01-01T00:01:00 | - +----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+ + +----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+----------------------+ + | id | bool_col | tinyint_col | smallint_col | int_col | bigint_col | float_col | double_col | date_string_col | string_col | timestamp_col | + +----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+----------------------+ + | 4 | true | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 30332f30312f3039 | 30 | 2009-03-01T00:00:00Z | + | 5 | false | 1 | 1 | 1 | 10 | 1.1 | 10.1 | 30332f30312f3039 | 31 | 2009-03-01T00:01:00Z | + | 6 | true | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 30342f30312f3039 | 30 | 2009-04-01T00:00:00Z | + | 7 | false | 1 | 1 | 1 | 10 | 1.1 | 10.1 | 30342f30312f3039 | 31 | 2009-04-01T00:01:00Z | + | 2 | true | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 30322f30312f3039 | 30 | 2009-02-01T00:00:00Z | + | 3 | false | 1 | 1 | 1 | 10 | 1.1 | 10.1 | 30322f30312f3039 | 31 | 2009-02-01T00:01:00Z | + | 0 | true | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 30312f30312f3039 | 30 | 2009-01-01T00:00:00Z | + | 1 | false | 1 | 1 | 1 | 10 | 1.1 | 10.1 | 30312f30312f3039 | 31 | 2009-01-01T00:01:00Z | + +----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+----------------------+ "###); Ok(()) } From 151ea95e17ded48b0b6edeea0034f20cd4cfae1c Mon Sep 17 00:00:00 2001 From: Namgung Chan <9511chn@gmail.com> Date: Fri, 31 Oct 2025 01:34:05 +0900 Subject: [PATCH 12/35] pass projected schema to arrow-avro --- datafusion/common/src/error.rs | 1 - datafusion/datasource-avro/src/source.rs | 26 ++++++++++++++++-------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/datafusion/common/src/error.rs b/datafusion/common/src/error.rs index 7889ce145ccf0..3c979802f6708 100644 --- a/datafusion/common/src/error.rs +++ b/datafusion/common/src/error.rs @@ -305,7 +305,6 @@ impl From for DataFusionError { } } - #[cfg(feature = "object_store")] impl From for DataFusionError { fn from(e: object_store::Error) -> Self { diff --git a/datafusion/datasource-avro/src/source.rs b/datafusion/datasource-avro/src/source.rs index 45fb1672ecbad..9a0b97babdb4d 100644 --- a/datafusion/datasource-avro/src/source.rs +++ b/datafusion/datasource-avro/src/source.rs @@ -39,7 +39,7 @@ use object_store::ObjectStore; pub struct AvroSource { schema: Option, batch_size: Option, - projection: Option>, + file_projection: Option>, metrics: ExecutionPlanMetricsSet, projected_statistics: Option, schema_adapter_factory: Option>, @@ -52,16 +52,24 @@ impl AvroSource { } fn open(&self, reader: R) -> Result> { - let avro_schema = AvroSchema::try_from( - self.schema - .as_ref() - .expect("Schema must set before open") - .as_ref(), - )?; + let schema = self + .schema + .as_ref() + .expect("Schema must set before open") + .as_ref(); + + let projected_schema = if let Some(projection) = &self.file_projection { + &schema.project(projection)? + } else { + schema + }; + + let avro_schema = AvroSchema::try_from(projected_schema)?; + ReaderBuilder::new() .with_reader_schema(avro_schema) // Used for projection on read. .with_batch_size(self.batch_size.expect("Batch size must set before open")) - .build(reader) // TODO - A File (which doesn't implement BufRead) is being passed; confirm whether this is safe. + .build(reader) .map_err(Into::into) } } @@ -102,7 +110,7 @@ impl FileSource for AvroSource { fn with_projection(&self, config: &FileScanConfig) -> Arc { let mut conf = self.clone(); - conf.projection = config.projected_file_column_names(); + conf.file_projection = config.file_column_projection_indices(); Arc::new(conf) } From 78df065b21217201455fc388205cfa7e84fdac21 Mon Sep 17 00:00:00 2001 From: Namgung Chan <9511chn@gmail.com> Date: Fri, 31 Oct 2025 11:16:26 +0900 Subject: [PATCH 13/35] testing submodule checkout --- testing | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testing b/testing index 8da05fdd62a72..0d60ccae40d0e 160000 --- a/testing +++ b/testing @@ -1 +1 @@ -Subproject commit 8da05fdd62a7243ef77aa9757acb62e0586a4d0c +Subproject commit 0d60ccae40d0e8f2d22c15fafb01c5d4be8c63a6 From c7b3b6362c3051d80e34fd1cbe3b5e6589dd813f Mon Sep 17 00:00:00 2001 From: Namgung Chan <9511chn@gmail.com> Date: Fri, 31 Oct 2025 11:27:47 +0900 Subject: [PATCH 14/35] taplo format --- Cargo.toml | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 30a8d44a275f6..2bff65f957b19 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -94,6 +94,13 @@ arrow = { version = "57.0.0", features = [ "prettyprint", "chrono-tz", ] } +arrow-avro = { version = "57.0.0", default-features = false, features = [ + "deflate", + "snappy", + "zstd", + "bzip2", + "xz", +] } arrow-buffer = { version = "57.0.0", default-features = false } arrow-flight = { version = "57.0.0", features = [ "flight-sql-experimental", @@ -103,9 +110,6 @@ arrow-ipc = { version = "57.0.0", default-features = false, features = [ ] } arrow-ord = { version = "57.0.0", default-features = false } arrow-schema = { version = "57.0.0", default-features = false } -arrow-avro = { version = "57.0.0", default-features = false, features = [ - "deflate", "snappy", "zstd", "bzip2", "xz" -] } async-trait = "0.1.89" bigdecimal = "0.4.8" bytes = "1.10" From 25565911b57028c7aeeb755ba753c489707af41a Mon Sep 17 00:00:00 2001 From: Namgung Chan <9511chn@gmail.com> Date: Tue, 4 Nov 2025 23:59:40 +0900 Subject: [PATCH 15/35] replace with arrow-avro --- .../src/avro_to_arrow/arrow_array_reader.rs | 1723 ----------------- .../datasource-avro/src/avro_to_arrow/mod.rs | 38 - .../src/avro_to_arrow/reader.rs | 357 ---- .../src/avro_to_arrow/schema.rs | 523 ----- datafusion/datasource-avro/src/file_format.rs | 3 +- datafusion/datasource-avro/src/mod.rs | 12 +- datafusion/datasource-avro/src/source.rs | 192 +- datafusion/sqllogictest/test_files/avro.slt | 2 +- 8 files changed, 192 insertions(+), 2658 deletions(-) delete mode 100644 datafusion/datasource-avro/src/avro_to_arrow/arrow_array_reader.rs delete mode 100644 datafusion/datasource-avro/src/avro_to_arrow/mod.rs delete mode 100644 datafusion/datasource-avro/src/avro_to_arrow/reader.rs delete mode 100644 datafusion/datasource-avro/src/avro_to_arrow/schema.rs diff --git a/datafusion/datasource-avro/src/avro_to_arrow/arrow_array_reader.rs b/datafusion/datasource-avro/src/avro_to_arrow/arrow_array_reader.rs deleted file mode 100644 index a80f18cf818fe..0000000000000 --- a/datafusion/datasource-avro/src/avro_to_arrow/arrow_array_reader.rs +++ /dev/null @@ -1,1723 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Avro to Arrow array readers - -use apache_avro::schema::RecordSchema; -use apache_avro::{ - error::Details as AvroErrorDetails, - schema::{Schema as AvroSchema, SchemaKind}, - types::Value, - Error as AvroError, Reader as AvroReader, -}; -use arrow::array::{ - make_array, Array, ArrayBuilder, ArrayData, ArrayDataBuilder, ArrayRef, - BooleanBuilder, LargeStringArray, ListBuilder, NullArray, OffsetSizeTrait, - PrimitiveArray, StringArray, StringBuilder, StringDictionaryBuilder, -}; -use arrow::array::{BinaryArray, FixedSizeBinaryArray, GenericListArray}; -use arrow::buffer::{Buffer, MutableBuffer}; -use arrow::datatypes::{ - ArrowDictionaryKeyType, ArrowNumericType, ArrowPrimitiveType, DataType, Date32Type, - Date64Type, Field, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, - Int8Type, Time32MillisecondType, Time32SecondType, Time64MicrosecondType, - Time64NanosecondType, TimeUnit, TimestampMicrosecondType, TimestampMillisecondType, - TimestampNanosecondType, TimestampSecondType, UInt16Type, UInt32Type, UInt64Type, - UInt8Type, -}; -use arrow::datatypes::{Fields, SchemaRef}; -use arrow::error::ArrowError; -use arrow::error::ArrowError::SchemaError; -use arrow::error::Result as ArrowResult; -use arrow::record_batch::RecordBatch; -use arrow::util::bit_util; -use datafusion_common::arrow_err; -use datafusion_common::error::{DataFusionError, Result}; -use num_traits::NumCast; -use std::collections::BTreeMap; -use std::io::Read; -use std::sync::Arc; - -type RecordSlice<'a> = &'a [&'a Vec<(String, Value)>]; - -pub struct AvroArrowArrayReader<'a, R: Read> { - reader: AvroReader<'a, R>, - schema: SchemaRef, - schema_lookup: BTreeMap, -} - -impl AvroArrowArrayReader<'_, R> { - pub fn try_new(reader: R, schema: SchemaRef) -> Result { - let reader = AvroReader::new(reader)?; - let writer_schema = reader.writer_schema().clone(); - let schema_lookup = Self::schema_lookup(writer_schema)?; - Ok(Self { - reader, - schema, - schema_lookup, - }) - } - - pub fn schema_lookup(schema: AvroSchema) -> Result> { - match schema { - AvroSchema::Record(RecordSchema { - fields, mut lookup, .. - }) => { - for field in fields { - Self::child_schema_lookup(&field.name, &field.schema, &mut lookup)?; - } - Ok(lookup) - } - _ => arrow_err!(SchemaError( - "expected avro schema to be a record".to_string(), - )), - } - } - - fn child_schema_lookup<'b>( - parent_field_name: &str, - schema: &AvroSchema, - schema_lookup: &'b mut BTreeMap, - ) -> Result<&'b BTreeMap> { - match schema { - AvroSchema::Union(us) => { - let has_nullable = us - .find_schema_with_known_schemata::( - &Value::Null, - None, - &None, - ) - .is_some(); - let sub_schemas = us.variants(); - if has_nullable && sub_schemas.len() == 2 { - if let Some(sub_schema) = - sub_schemas.iter().find(|&s| !matches!(s, AvroSchema::Null)) - { - Self::child_schema_lookup( - parent_field_name, - sub_schema, - schema_lookup, - )?; - } - } - } - AvroSchema::Record(RecordSchema { fields, lookup, .. }) => { - lookup.iter().for_each(|(field_name, pos)| { - schema_lookup - .insert(format!("{parent_field_name}.{field_name}"), *pos); - }); - - for field in fields { - let sub_parent_field_name = - format!("{}.{}", parent_field_name, field.name); - Self::child_schema_lookup( - &sub_parent_field_name, - &field.schema, - schema_lookup, - )?; - } - } - AvroSchema::Array(schema) => { - let sub_parent_field_name = format!("{parent_field_name}.element"); - Self::child_schema_lookup( - &sub_parent_field_name, - &schema.items, - schema_lookup, - )?; - } - _ => (), - } - Ok(schema_lookup) - } - - /// Read the next batch of records - pub fn next_batch(&mut self, batch_size: usize) -> Option> { - let rows_result = self - .reader - .by_ref() - .take(batch_size) - .map(|value| match value { - Ok(Value::Record(v)) => Ok(v), - Err(e) => Err(ArrowError::ParseError(format!( - "Failed to parse avro value: {e}" - ))), - other => Err(ArrowError::ParseError(format!( - "Row needs to be of type object, got: {other:?}" - ))), - }) - .collect::>>>(); - - let rows = match rows_result { - // Return error early - Err(e) => return Some(Err(e)), - // No rows: return None early - Ok(rows) if rows.is_empty() => return None, - Ok(rows) => rows, - }; - - let rows = rows.iter().collect::>>(); - let arrays = self.build_struct_array(&rows, "", self.schema.fields()); - - Some(arrays.and_then(|arr| RecordBatch::try_new(Arc::clone(&self.schema), arr))) - } - - fn build_boolean_array(&self, rows: RecordSlice, col_name: &str) -> ArrayRef { - let mut builder = BooleanBuilder::with_capacity(rows.len()); - for row in rows { - if let Some(value) = self.field_lookup(col_name, row) { - if let Some(boolean) = resolve_boolean(value) { - builder.append_value(boolean) - } else { - builder.append_null(); - } - } else { - builder.append_null(); - } - } - Arc::new(builder.finish()) - } - - fn build_primitive_array(&self, rows: RecordSlice, col_name: &str) -> ArrayRef - where - T: ArrowNumericType + Resolver, - T::Native: NumCast, - { - Arc::new( - rows.iter() - .map(|row| { - self.field_lookup(col_name, row) - .and_then(|value| resolve_item::(value)) - }) - .collect::>(), - ) - } - - #[inline(always)] - fn build_string_dictionary_builder( - &self, - row_len: usize, - ) -> StringDictionaryBuilder - where - T: ArrowPrimitiveType + ArrowDictionaryKeyType, - { - StringDictionaryBuilder::with_capacity(row_len, row_len, row_len) - } - - fn build_wrapped_list_array( - &self, - rows: RecordSlice, - col_name: &str, - key_type: &DataType, - ) -> ArrowResult { - match *key_type { - DataType::Int8 => { - let dtype = DataType::Dictionary( - Box::new(DataType::Int8), - Box::new(DataType::Utf8), - ); - self.list_array_string_array_builder::(&dtype, col_name, rows) - } - DataType::Int16 => { - let dtype = DataType::Dictionary( - Box::new(DataType::Int16), - Box::new(DataType::Utf8), - ); - self.list_array_string_array_builder::(&dtype, col_name, rows) - } - DataType::Int32 => { - let dtype = DataType::Dictionary( - Box::new(DataType::Int32), - Box::new(DataType::Utf8), - ); - self.list_array_string_array_builder::(&dtype, col_name, rows) - } - DataType::Int64 => { - let dtype = DataType::Dictionary( - Box::new(DataType::Int64), - Box::new(DataType::Utf8), - ); - self.list_array_string_array_builder::(&dtype, col_name, rows) - } - DataType::UInt8 => { - let dtype = DataType::Dictionary( - Box::new(DataType::UInt8), - Box::new(DataType::Utf8), - ); - self.list_array_string_array_builder::(&dtype, col_name, rows) - } - DataType::UInt16 => { - let dtype = DataType::Dictionary( - Box::new(DataType::UInt16), - Box::new(DataType::Utf8), - ); - self.list_array_string_array_builder::(&dtype, col_name, rows) - } - DataType::UInt32 => { - let dtype = DataType::Dictionary( - Box::new(DataType::UInt32), - Box::new(DataType::Utf8), - ); - self.list_array_string_array_builder::(&dtype, col_name, rows) - } - DataType::UInt64 => { - let dtype = DataType::Dictionary( - Box::new(DataType::UInt64), - Box::new(DataType::Utf8), - ); - self.list_array_string_array_builder::(&dtype, col_name, rows) - } - ref e => Err(SchemaError(format!( - "Data type is currently not supported for dictionaries in list : {e}" - ))), - } - } - - #[inline(always)] - fn list_array_string_array_builder( - &self, - data_type: &DataType, - col_name: &str, - rows: RecordSlice, - ) -> ArrowResult - where - D: ArrowPrimitiveType + ArrowDictionaryKeyType, - { - let mut builder: Box = match data_type { - DataType::Utf8 => { - let values_builder = StringBuilder::with_capacity(rows.len(), 5); - Box::new(ListBuilder::new(values_builder)) - } - DataType::Dictionary(_, _) => { - let values_builder = - self.build_string_dictionary_builder::(rows.len() * 5); - Box::new(ListBuilder::new(values_builder)) - } - e => { - return Err(SchemaError(format!( - "Nested list data builder type is not supported: {e}" - ))) - } - }; - - for row in rows { - if let Some(value) = self.field_lookup(col_name, row) { - let value = maybe_resolve_union(value); - // value can be an array or a scalar - let vals: Vec> = if let Value::String(v) = value { - vec![Some(v.to_string())] - } else if let Value::Array(n) = value { - n.iter() - .map(resolve_string) - .collect::>>>()? - .into_iter() - .collect::>>() - } else if let Value::Null = value { - vec![None] - } else if !matches!(value, Value::Record(_)) { - vec![resolve_string(value)?] - } else { - return Err(SchemaError( - "Only scalars are currently supported in Avro arrays".to_string(), - )); - }; - - // TODO: ARROW-10335: APIs of dictionary arrays and others are different. Unify - // them. - match data_type { - DataType::Utf8 => { - let builder = builder - .as_any_mut() - .downcast_mut::>() - .ok_or_else(||SchemaError( - "Cast failed for ListBuilder during nested data parsing".to_string(), - ))?; - for val in vals { - if let Some(v) = val { - builder.values().append_value(&v) - } else { - builder.values().append_null() - }; - } - - // Append to the list - builder.append(true); - } - DataType::Dictionary(_, _) => { - let builder = builder.as_any_mut().downcast_mut::>>().ok_or_else(||SchemaError( - "Cast failed for ListBuilder during nested data parsing".to_string(), - ))?; - for val in vals { - if let Some(v) = val { - let _ = builder.values().append(&v)?; - } else { - builder.values().append_null() - }; - } - - // Append to the list - builder.append(true); - } - e => { - return Err(SchemaError(format!( - "Nested list data builder type is not supported: {e}" - ))) - } - } - } - } - - Ok(builder.finish() as ArrayRef) - } - - #[inline(always)] - fn build_dictionary_array( - &self, - rows: RecordSlice, - col_name: &str, - ) -> ArrowResult - where - T::Native: NumCast, - T: ArrowPrimitiveType + ArrowDictionaryKeyType, - { - let mut builder: StringDictionaryBuilder = - self.build_string_dictionary_builder(rows.len()); - for row in rows { - if let Some(value) = self.field_lookup(col_name, row) { - if let Ok(Some(str_v)) = resolve_string(value) { - builder.append(str_v).map(drop)? - } else { - builder.append_null() - } - } else { - builder.append_null() - } - } - Ok(Arc::new(builder.finish()) as ArrayRef) - } - - #[inline(always)] - fn build_string_dictionary_array( - &self, - rows: RecordSlice, - col_name: &str, - key_type: &DataType, - value_type: &DataType, - ) -> ArrowResult { - if let DataType::Utf8 = *value_type { - match *key_type { - DataType::Int8 => self.build_dictionary_array::(rows, col_name), - DataType::Int16 => { - self.build_dictionary_array::(rows, col_name) - } - DataType::Int32 => { - self.build_dictionary_array::(rows, col_name) - } - DataType::Int64 => { - self.build_dictionary_array::(rows, col_name) - } - DataType::UInt8 => { - self.build_dictionary_array::(rows, col_name) - } - DataType::UInt16 => { - self.build_dictionary_array::(rows, col_name) - } - DataType::UInt32 => { - self.build_dictionary_array::(rows, col_name) - } - DataType::UInt64 => { - self.build_dictionary_array::(rows, col_name) - } - _ => Err(SchemaError("unsupported dictionary key type".to_string())), - } - } else { - Err(SchemaError( - "dictionary types other than UTF-8 not yet supported".to_string(), - )) - } - } - - /// Build a nested GenericListArray from a list of unnested `Value`s - fn build_nested_list_array( - &self, - parent_field_name: &str, - rows: &[&Value], - list_field: &Field, - ) -> ArrowResult { - // build list offsets - let mut cur_offset = OffsetSize::zero(); - let list_len = rows.len(); - let num_list_bytes = bit_util::ceil(list_len, 8); - let mut offsets = Vec::with_capacity(list_len + 1); - let mut list_nulls = MutableBuffer::from_len_zeroed(num_list_bytes); - offsets.push(cur_offset); - rows.iter().enumerate().for_each(|(i, v)| { - // TODO: unboxing Union(Array(Union(...))) should probably be done earlier - let v = maybe_resolve_union(v); - if let Value::Array(a) = v { - cur_offset += OffsetSize::from_usize(a.len()).unwrap(); - bit_util::set_bit(&mut list_nulls, i); - } else if let Value::Null = v { - // value is null, not incremented - } else { - cur_offset += OffsetSize::one(); - } - offsets.push(cur_offset); - }); - let valid_len = cur_offset.to_usize().unwrap(); - let array_data = match list_field.data_type() { - DataType::Null => NullArray::new(valid_len).into_data(), - DataType::Boolean => { - let num_bytes = bit_util::ceil(valid_len, 8); - let mut bool_values = MutableBuffer::from_len_zeroed(num_bytes); - let mut bool_nulls = - MutableBuffer::new(num_bytes).with_bitset(num_bytes, true); - let mut curr_index = 0; - rows.iter().for_each(|v| { - if let Value::Array(vs) = v { - vs.iter().for_each(|value| { - if let Value::Boolean(child) = value { - // if valid boolean, append value - if *child { - bit_util::set_bit(&mut bool_values, curr_index); - } - } else { - // null slot - bit_util::unset_bit(&mut bool_nulls, curr_index); - } - curr_index += 1; - }); - } - }); - ArrayData::builder(list_field.data_type().clone()) - .len(valid_len) - .add_buffer(bool_values.into()) - .null_bit_buffer(Some(bool_nulls.into())) - .build() - .unwrap() - } - DataType::Int8 => self.read_primitive_list_values::(rows), - DataType::Int16 => self.read_primitive_list_values::(rows), - DataType::Int32 => self.read_primitive_list_values::(rows), - DataType::Int64 => self.read_primitive_list_values::(rows), - DataType::UInt8 => self.read_primitive_list_values::(rows), - DataType::UInt16 => self.read_primitive_list_values::(rows), - DataType::UInt32 => self.read_primitive_list_values::(rows), - DataType::UInt64 => self.read_primitive_list_values::(rows), - DataType::Float16 => { - return Err(SchemaError("Float16 not supported".to_string())) - } - DataType::Float32 => self.read_primitive_list_values::(rows), - DataType::Float64 => self.read_primitive_list_values::(rows), - DataType::Timestamp(_, _) - | DataType::Date32 - | DataType::Date64 - | DataType::Time32(_) - | DataType::Time64(_) => { - return Err(SchemaError( - "Temporal types are not yet supported, see ARROW-4803".to_string(), - )) - } - DataType::Utf8 => flatten_string_values(rows) - .into_iter() - .collect::() - .into_data(), - DataType::LargeUtf8 => flatten_string_values(rows) - .into_iter() - .collect::() - .into_data(), - DataType::List(field) => { - let child = self.build_nested_list_array::( - parent_field_name, - &flatten_values(rows), - field, - )?; - child.to_data() - } - DataType::LargeList(field) => { - let child = self.build_nested_list_array::( - parent_field_name, - &flatten_values(rows), - field, - )?; - child.to_data() - } - DataType::Struct(fields) => { - // extract list values, with non-lists converted to Value::Null - let array_item_count = rows - .iter() - .map(|row| match maybe_resolve_union(row) { - Value::Array(values) => values.len(), - _ => 1, - }) - .sum(); - let num_bytes = bit_util::ceil(array_item_count, 8); - let mut null_buffer = MutableBuffer::from_len_zeroed(num_bytes); - let mut struct_index = 0; - let null_struct_array = vec![("null".to_string(), Value::Null)]; - let rows: Vec<&Vec<(String, Value)>> = rows - .iter() - .map(|v| maybe_resolve_union(v)) - .flat_map(|row| { - if let Value::Array(values) = row { - values - .iter() - .map(maybe_resolve_union) - .map(|v| match v { - Value::Record(record) => { - bit_util::set_bit(&mut null_buffer, struct_index); - struct_index += 1; - record - } - Value::Null => { - struct_index += 1; - &null_struct_array - } - other => panic!("expected Record, got {other:?}"), - }) - .collect::>>() - } else { - struct_index += 1; - vec![&null_struct_array] - } - }) - .collect(); - - let sub_parent_field_name = - format!("{}.{}", parent_field_name, list_field.name()); - let arrays = - self.build_struct_array(&rows, &sub_parent_field_name, fields)?; - let data_type = DataType::Struct(fields.clone()); - ArrayDataBuilder::new(data_type) - .len(rows.len()) - .null_bit_buffer(Some(null_buffer.into())) - .child_data(arrays.into_iter().map(|a| a.to_data()).collect()) - .build() - .unwrap() - } - datatype => { - return Err(SchemaError(format!( - "Nested list of {datatype} not supported" - ))); - } - }; - // build list - let list_data = ArrayData::builder(DataType::List(Arc::new(list_field.clone()))) - .len(list_len) - .add_buffer(Buffer::from_slice_ref(&offsets)) - .add_child_data(array_data) - .null_bit_buffer(Some(list_nulls.into())) - .build() - .unwrap(); - Ok(Arc::new(GenericListArray::::from(list_data))) - } - - /// Builds the child values of a `StructArray`, falling short of constructing the StructArray. - /// The function does not construct the StructArray as some callers would want the child arrays. - /// - /// *Note*: The function is recursive, and will read nested structs. - fn build_struct_array( - &self, - rows: RecordSlice, - parent_field_name: &str, - struct_fields: &Fields, - ) -> ArrowResult> { - let arrays: ArrowResult> = struct_fields - .iter() - .map(|field| { - let field_path = if parent_field_name.is_empty() { - field.name().to_string() - } else { - format!("{}.{}", parent_field_name, field.name()) - }; - let arr = match field.data_type() { - DataType::Null => Arc::new(NullArray::new(rows.len())) as ArrayRef, - DataType::Boolean => self.build_boolean_array(rows, &field_path), - DataType::Float64 => { - self.build_primitive_array::(rows, &field_path) - } - DataType::Float32 => { - self.build_primitive_array::(rows, &field_path) - } - DataType::Int64 => { - self.build_primitive_array::(rows, &field_path) - } - DataType::Int32 => { - self.build_primitive_array::(rows, &field_path) - } - DataType::Int16 => { - self.build_primitive_array::(rows, &field_path) - } - DataType::Int8 => { - self.build_primitive_array::(rows, &field_path) - } - DataType::UInt64 => { - self.build_primitive_array::(rows, &field_path) - } - DataType::UInt32 => { - self.build_primitive_array::(rows, &field_path) - } - DataType::UInt16 => { - self.build_primitive_array::(rows, &field_path) - } - DataType::UInt8 => { - self.build_primitive_array::(rows, &field_path) - } - // TODO: this is incomplete - DataType::Timestamp(unit, _) => match unit { - TimeUnit::Second => self - .build_primitive_array::( - rows, - &field_path, - ), - TimeUnit::Microsecond => self - .build_primitive_array::( - rows, - &field_path, - ), - TimeUnit::Millisecond => self - .build_primitive_array::( - rows, - &field_path, - ), - TimeUnit::Nanosecond => self - .build_primitive_array::( - rows, - &field_path, - ), - }, - DataType::Date64 => { - self.build_primitive_array::(rows, &field_path) - } - DataType::Date32 => { - self.build_primitive_array::(rows, &field_path) - } - DataType::Time64(unit) => match unit { - TimeUnit::Microsecond => self - .build_primitive_array::( - rows, - &field_path, - ), - TimeUnit::Nanosecond => self - .build_primitive_array::( - rows, - &field_path, - ), - t => { - return Err(SchemaError(format!( - "TimeUnit {t:?} not supported with Time64" - ))) - } - }, - DataType::Time32(unit) => match unit { - TimeUnit::Second => self - .build_primitive_array::(rows, &field_path), - TimeUnit::Millisecond => self - .build_primitive_array::( - rows, - &field_path, - ), - t => { - return Err(SchemaError(format!( - "TimeUnit {t:?} not supported with Time32" - ))) - } - }, - DataType::Utf8 | DataType::LargeUtf8 => Arc::new( - rows.iter() - .map(|row| { - let maybe_value = self.field_lookup(&field_path, row); - match maybe_value { - None => Ok(None), - Some(v) => resolve_string(v), - } - }) - .collect::>()?, - ) - as ArrayRef, - DataType::Binary | DataType::LargeBinary => Arc::new( - rows.iter() - .map(|row| { - let maybe_value = self.field_lookup(&field_path, row); - maybe_value.and_then(resolve_bytes) - }) - .collect::(), - ) - as ArrayRef, - DataType::FixedSizeBinary(ref size) => { - Arc::new(FixedSizeBinaryArray::try_from_sparse_iter_with_size( - rows.iter().map(|row| { - let maybe_value = self.field_lookup(&field_path, row); - maybe_value.and_then(|v| resolve_fixed(v, *size as usize)) - }), - *size, - )?) as ArrayRef - } - DataType::List(ref list_field) => { - match list_field.data_type() { - DataType::Dictionary(ref key_ty, _) => { - self.build_wrapped_list_array(rows, &field_path, key_ty)? - } - _ => { - // extract rows by name - let extracted_rows = rows - .iter() - .map(|row| { - self.field_lookup(&field_path, row) - .unwrap_or(&Value::Null) - }) - .collect::>(); - self.build_nested_list_array::( - &field_path, - &extracted_rows, - list_field, - )? - } - } - } - DataType::Dictionary(ref key_ty, ref val_ty) => self - .build_string_dictionary_array( - rows, - &field_path, - key_ty, - val_ty, - )?, - DataType::Struct(fields) => { - let len = rows.len(); - let num_bytes = bit_util::ceil(len, 8); - let mut null_buffer = MutableBuffer::from_len_zeroed(num_bytes); - let empty_vec = vec![]; - let struct_rows = rows - .iter() - .enumerate() - .map(|(i, row)| (i, self.field_lookup(&field_path, row))) - .map(|(i, v)| { - let v = v.map(maybe_resolve_union); - match v { - Some(Value::Record(value)) => { - bit_util::set_bit(&mut null_buffer, i); - value - } - None | Some(Value::Null) => &empty_vec, - other => { - panic!("expected struct got {other:?}"); - } - } - }) - .collect::>>(); - let arrays = - self.build_struct_array(&struct_rows, &field_path, fields)?; - // construct a struct array's data in order to set null buffer - let data_type = DataType::Struct(fields.clone()); - let data = ArrayDataBuilder::new(data_type) - .len(len) - .null_bit_buffer(Some(null_buffer.into())) - .child_data(arrays.into_iter().map(|a| a.to_data()).collect()) - .build()?; - make_array(data) - } - _ => { - return Err(SchemaError(format!( - "type {} not supported", - field.data_type() - ))) - } - }; - Ok(arr) - }) - .collect(); - arrays - } - - /// Read the primitive list's values into ArrayData - fn read_primitive_list_values(&self, rows: &[&Value]) -> ArrayData - where - T: ArrowPrimitiveType + ArrowNumericType, - T::Native: NumCast, - { - let values = rows - .iter() - .flat_map(|row| { - let row = maybe_resolve_union(row); - if let Value::Array(values) = row { - values - .iter() - .map(resolve_item::) - .collect::>>() - } else if let Some(f) = resolve_item::(row) { - vec![Some(f)] - } else { - vec![] - } - }) - .collect::>>(); - let array = values.iter().collect::>(); - array.to_data() - } - - fn field_lookup<'b>( - &self, - name: &str, - row: &'b [(String, Value)], - ) -> Option<&'b Value> { - self.schema_lookup - .get(name) - .and_then(|i| row.get(*i)) - .map(|o| &o.1) - } -} - -/// Flattens a list of Avro values, by flattening lists, and treating all other values as -/// single-value lists. -/// This is used to read into nested lists (list of list, list of struct) and non-dictionary lists. -#[inline] -fn flatten_values<'a>(values: &[&'a Value]) -> Vec<&'a Value> { - values - .iter() - .flat_map(|row| { - let v = maybe_resolve_union(row); - if let Value::Array(values) = v { - values.iter().collect() - } else { - // we interpret a scalar as a single-value list to minimise data loss - vec![v] - } - }) - .collect() -} - -/// Flattens a list into string values, dropping Value::Null in the process. -/// This is useful for interpreting any Avro array as string, dropping nulls. -/// See `value_as_string`. -#[inline] -fn flatten_string_values(values: &[&Value]) -> Vec> { - values - .iter() - .flat_map(|row| { - let row = maybe_resolve_union(row); - if let Value::Array(values) = row { - values - .iter() - .map(|s| resolve_string(s).ok().flatten()) - .collect::>>() - } else if let Value::Null = row { - vec![] - } else { - vec![resolve_string(row).ok().flatten()] - } - }) - .collect::>>() -} - -/// Reads an Avro value as a string, regardless of its type. -/// This is useful if the expected datatype is a string, in which case we preserve -/// all the values regardless of they type. -fn resolve_string(v: &Value) -> ArrowResult> { - let v = if let Value::Union(_, b) = v { b } else { v }; - match v { - Value::String(s) => Ok(Some(s.clone())), - Value::Bytes(bytes) => String::from_utf8(bytes.to_vec()) - .map_err(|e| AvroError::new(AvroErrorDetails::ConvertToUtf8(e))) - .map(Some), - Value::Enum(_, s) => Ok(Some(s.clone())), - Value::Null => Ok(None), - other => Err(AvroError::new(AvroErrorDetails::GetString(other.clone()))), - } - .map_err(|e| SchemaError(format!("expected resolvable string : {e}"))) -} - -fn resolve_u8(v: &Value) -> Option { - let v = match v { - Value::Union(_, inner) => inner.as_ref(), - _ => v, - }; - - match v { - Value::Int(n) => u8::try_from(*n).ok(), - Value::Long(n) => u8::try_from(*n).ok(), - _ => None, - } -} - -fn resolve_bytes(v: &Value) -> Option> { - let v = match v { - Value::Union(_, inner) => inner.as_ref(), - _ => v, - }; - - match v { - Value::Bytes(bytes) => Some(bytes.clone()), - Value::String(s) => Some(s.as_bytes().to_vec()), - Value::Array(items) => items.iter().map(resolve_u8).collect::>>(), - _ => None, - } -} - -fn resolve_fixed(v: &Value, size: usize) -> Option> { - let v = if let Value::Union(_, b) = v { b } else { v }; - match v { - Value::Fixed(n, bytes) => { - if *n == size { - Some(bytes.clone()) - } else { - None - } - } - _ => None, - } -} - -fn resolve_boolean(value: &Value) -> Option { - let v = if let Value::Union(_, b) = value { - b - } else { - value - }; - match v { - Value::Boolean(boolean) => Some(*boolean), - _ => None, - } -} - -trait Resolver: ArrowPrimitiveType { - fn resolve(value: &Value) -> Option; -} - -fn resolve_item(value: &Value) -> Option { - T::resolve(value) -} - -fn maybe_resolve_union(value: &Value) -> &Value { - if SchemaKind::from(value) == SchemaKind::Union { - // Pull out the Union, and attempt to resolve against it. - match value { - Value::Union(_, b) => b, - _ => unreachable!(), - } - } else { - value - } -} - -impl Resolver for N -where - N: ArrowNumericType, - N::Native: NumCast, -{ - fn resolve(value: &Value) -> Option { - let value = maybe_resolve_union(value); - match value { - Value::Int(i) | Value::TimeMillis(i) | Value::Date(i) => NumCast::from(*i), - Value::Long(l) - | Value::TimeMicros(l) - | Value::TimestampMillis(l) - | Value::TimestampMicros(l) => NumCast::from(*l), - Value::Float(f) => NumCast::from(*f), - Value::Double(f) => NumCast::from(*f), - Value::Duration(_d) => unimplemented!(), // shenanigans type - Value::Null => None, - _ => unreachable!(), - } - } -} - -#[cfg(test)] -mod test { - use crate::avro_to_arrow::{Reader, ReaderBuilder}; - use arrow::array::Array; - use arrow::datatypes::DataType; - use arrow::datatypes::{Field, TimeUnit}; - use datafusion_common::assert_batches_eq; - use datafusion_common::cast::{ - as_int32_array, as_int64_array, as_list_array, as_timestamp_microsecond_array, - }; - use std::fs::File; - use std::sync::Arc; - - fn build_reader(name: &'_ str, batch_size: usize) -> Reader<'_, File> { - let testdata = datafusion_common::test_util::arrow_test_data(); - let filename = format!("{testdata}/avro/{name}"); - let builder = ReaderBuilder::new() - .read_schema() - .with_batch_size(batch_size); - builder.build(File::open(filename).unwrap()).unwrap() - } - - // TODO: Fixed, Enum, Dictionary - - #[test] - fn test_time_avro_milliseconds() { - let mut reader = build_reader("alltypes_plain.avro", 10); - let batch = reader.next().unwrap().unwrap(); - - assert_eq!(11, batch.num_columns()); - assert_eq!(8, batch.num_rows()); - - let schema = reader.schema(); - let batch_schema = batch.schema(); - assert_eq!(schema, batch_schema); - - let timestamp_col = schema.column_with_name("timestamp_col").unwrap(); - assert_eq!( - &DataType::Timestamp(TimeUnit::Microsecond, None), - timestamp_col.1.data_type() - ); - let timestamp_array = - as_timestamp_microsecond_array(batch.column(timestamp_col.0)).unwrap(); - for i in 0..timestamp_array.len() { - assert!(timestamp_array.is_valid(i)); - } - assert_eq!(1235865600000000, timestamp_array.value(0)); - assert_eq!(1235865660000000, timestamp_array.value(1)); - assert_eq!(1238544000000000, timestamp_array.value(2)); - assert_eq!(1238544060000000, timestamp_array.value(3)); - assert_eq!(1233446400000000, timestamp_array.value(4)); - assert_eq!(1233446460000000, timestamp_array.value(5)); - assert_eq!(1230768000000000, timestamp_array.value(6)); - assert_eq!(1230768060000000, timestamp_array.value(7)); - } - - #[test] - fn test_avro_read_list() { - let mut reader = build_reader("list_columns.avro", 3); - let schema = reader.schema(); - let (col_id_index, _) = schema.column_with_name("int64_list").unwrap(); - let batch = reader.next().unwrap().unwrap(); - assert_eq!(batch.num_columns(), 2); - assert_eq!(batch.num_rows(), 3); - let a_array = as_list_array(batch.column(col_id_index)).unwrap(); - assert_eq!( - *a_array.data_type(), - DataType::List(Arc::new(Field::new("element", DataType::Int64, true))) - ); - let array = a_array.value(0); - assert_eq!(*array.data_type(), DataType::Int64); - - assert_eq!( - 6, - as_int64_array(&array) - .unwrap() - .iter() - .flatten() - .sum::() - ); - } - #[test] - fn test_avro_read_nested_list() { - let mut reader = build_reader("nested_lists.snappy.avro", 3); - let batch = reader.next().unwrap().unwrap(); - assert_eq!(batch.num_columns(), 2); - assert_eq!(batch.num_rows(), 3); - } - - #[test] - fn test_complex_list() { - let schema = apache_avro::Schema::parse_str( - r#" - { - "type": "record", - "name": "r1", - "fields": [ - { - "name": "headers", - "type": ["null", { - "type": "array", - "items": ["null",{ - "name":"r2", - "type": "record", - "fields":[ - {"name":"name", "type": ["null", "string"], "default": null}, - {"name":"value", "type": ["null", "string"], "default": null} - ] - }] - }], - "default": null - } - ] - }"#, - ) - .unwrap(); - let r1 = apache_avro::to_value(serde_json::json!({ - "headers": [ - { - "name": "a", - "value": "b" - } - ] - })) - .unwrap() - .resolve(&schema) - .unwrap(); - - let mut w = apache_avro::Writer::new(&schema, vec![]); - w.append(r1).unwrap(); - let bytes = w.into_inner().unwrap(); - - let mut reader = ReaderBuilder::new() - .read_schema() - .with_batch_size(2) - .build(std::io::Cursor::new(bytes)) - .unwrap(); - - let batch = reader.next().unwrap().unwrap(); - assert_eq!(batch.num_rows(), 1); - assert_eq!(batch.num_columns(), 1); - let expected = [ - "+-----------------------+", - "| headers |", - "+-----------------------+", - "| [{name: a, value: b}] |", - "+-----------------------+", - ]; - assert_batches_eq!(expected, &[batch]); - } - - #[test] - fn test_complex_struct() { - let schema = apache_avro::Schema::parse_str( - r#" - { - "type": "record", - "name": "r1", - "fields": [ - { - "name": "dns", - "type": [ - "null", - { - "type": "record", - "name": "r13", - "fields": [ - { - "name": "answers", - "type": [ - "null", - { - "type": "array", - "items": [ - "null", - { - "type": "record", - "name": "r292", - "fields": [ - { - "name": "class", - "type": ["null", "string"], - "default": null - }, - { - "name": "data", - "type": ["null", "string"], - "default": null - }, - { - "name": "name", - "type": ["null", "string"], - "default": null - }, - { - "name": "ttl", - "type": ["null", "long"], - "default": null - }, - { - "name": "type", - "type": ["null", "string"], - "default": null - } - ] - } - ] - } - ], - "default": null - }, - { - "name": "header_flags", - "type": [ - "null", - { - "type": "array", - "items": ["null", "string"] - } - ], - "default": null - }, - { - "name": "id", - "type": ["null", "string"], - "default": null - }, - { - "name": "op_code", - "type": ["null", "string"], - "default": null - }, - { - "name": "question", - "type": [ - "null", - { - "type": "record", - "name": "r288", - "fields": [ - { - "name": "class", - "type": ["null", "string"], - "default": null - }, - { - "name": "name", - "type": ["null", "string"], - "default": null - }, - { - "name": "registered_domain", - "type": ["null", "string"], - "default": null - }, - { - "name": "subdomain", - "type": ["null", "string"], - "default": null - }, - { - "name": "top_level_domain", - "type": ["null", "string"], - "default": null - }, - { - "name": "type", - "type": ["null", "string"], - "default": null - } - ] - } - ], - "default": null - }, - { - "name": "resolved_ip", - "type": [ - "null", - { - "type": "array", - "items": ["null", "string"] - } - ], - "default": null - }, - { - "name": "response_code", - "type": ["null", "string"], - "default": null - }, - { - "name": "type", - "type": ["null", "string"], - "default": null - } - ] - } - ], - "default": null - } - ] - }"#, - ) - .unwrap(); - - let jv1 = serde_json::json!({ - "dns": { - "answers": [ - { - "data": "CHNlY3VyaXR5BnVidW50dQMjb20AAAEAAQAAAAgABLl9vic=", - "type": "1" - }, - { - "data": "CHNlY3VyaXR5BnVidW50dQNjb20AAAEAABAAAAgABLl9viQ=", - "type": "1" - }, - { - "data": "CHNlT3VyaXR5BnVidW50dQNjb20AAAEAAQAAAAgABFu9Wyc=", - "type": "1" - } - ], - "question": { - "name": "security.ubuntu.com", - "type": "A" - }, - "resolved_ip": [ - "67.43.156.1", - "67.43.156.2", - "67.43.156.3" - ], - "response_code": "0" - } - }); - let r1 = apache_avro::to_value(jv1) - .unwrap() - .resolve(&schema) - .unwrap(); - - let mut w = apache_avro::Writer::new(&schema, vec![]); - w.append(r1).unwrap(); - let bytes = w.into_inner().unwrap(); - - let mut reader = ReaderBuilder::new() - .read_schema() - .with_batch_size(1) - .build(std::io::Cursor::new(bytes)) - .unwrap(); - - let batch = reader.next().unwrap().unwrap(); - assert_eq!(batch.num_rows(), 1); - assert_eq!(batch.num_columns(), 1); - - let expected = [ - "+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+", - "| dns |", - "+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+", - "| {answers: [{class: , data: CHNlY3VyaXR5BnVidW50dQMjb20AAAEAAQAAAAgABLl9vic=, name: , ttl: , type: 1}, {class: , data: CHNlY3VyaXR5BnVidW50dQNjb20AAAEAABAAAAgABLl9viQ=, name: , ttl: , type: 1}, {class: , data: CHNlT3VyaXR5BnVidW50dQNjb20AAAEAAQAAAAgABFu9Wyc=, name: , ttl: , type: 1}], header_flags: , id: , op_code: , question: {class: , name: security.ubuntu.com, registered_domain: , subdomain: , top_level_domain: , type: A}, resolved_ip: [67.43.156.1, 67.43.156.2, 67.43.156.3], response_code: 0, type: } |", - "+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+", - ]; - assert_batches_eq!(expected, &[batch]); - } - - #[test] - fn test_deep_nullable_struct() { - let schema = apache_avro::Schema::parse_str( - r#" - { - "type": "record", - "name": "r1", - "fields": [ - { - "name": "col1", - "type": [ - "null", - { - "type": "record", - "name": "r2", - "fields": [ - { - "name": "col2", - "type": [ - "null", - { - "type": "record", - "name": "r3", - "fields": [ - { - "name": "col3", - "type": [ - "null", - { - "type": "record", - "name": "r4", - "fields": [ - { - "name": "col4", - "type": [ - "null", - { - "type": "record", - "name": "r5", - "fields": [ - { - "name": "col5", - "type": ["null", "string"] - } - ] - } - ] - } - ] - } - ] - } - ] - } - ] - } - ] - } - ] - } - ] - } - "#, - ) - .unwrap(); - let r1 = apache_avro::to_value(serde_json::json!({ - "col1": { - "col2": { - "col3": { - "col4": { - "col5": "hello" - } - } - } - } - })) - .unwrap() - .resolve(&schema) - .unwrap(); - let r2 = apache_avro::to_value(serde_json::json!({ - "col1": { - "col2": { - "col3": { - "col4": { - "col5": null - } - } - } - } - })) - .unwrap() - .resolve(&schema) - .unwrap(); - let r3 = apache_avro::to_value(serde_json::json!({ - "col1": { - "col2": { - "col3": null - } - } - })) - .unwrap() - .resolve(&schema) - .unwrap(); - let r4 = apache_avro::to_value(serde_json::json!({ "col1": null })) - .unwrap() - .resolve(&schema) - .unwrap(); - - let mut w = apache_avro::Writer::new(&schema, vec![]); - w.append(r1).unwrap(); - w.append(r2).unwrap(); - w.append(r3).unwrap(); - w.append(r4).unwrap(); - let bytes = w.into_inner().unwrap(); - - let mut reader = ReaderBuilder::new() - .read_schema() - .with_batch_size(4) - .build(std::io::Cursor::new(bytes)) - .unwrap(); - - let batch = reader.next().unwrap().unwrap(); - - let expected = [ - "+---------------------------------------+", - "| col1 |", - "+---------------------------------------+", - "| {col2: {col3: {col4: {col5: hello}}}} |", - "| {col2: {col3: {col4: {col5: }}}} |", - "| {col2: {col3: }} |", - "| |", - "+---------------------------------------+", - ]; - assert_batches_eq!(expected, &[batch]); - } - - #[test] - fn test_avro_nullable_struct() { - let schema = apache_avro::Schema::parse_str( - r#" - { - "type": "record", - "name": "r1", - "fields": [ - { - "name": "col1", - "type": [ - "null", - { - "type": "record", - "name": "r2", - "fields": [ - { - "name": "col2", - "type": ["null", "string"] - } - ] - } - ], - "default": null - } - ] - }"#, - ) - .unwrap(); - let r1 = apache_avro::to_value(serde_json::json!({ "col1": null })) - .unwrap() - .resolve(&schema) - .unwrap(); - let r2 = apache_avro::to_value(serde_json::json!({ - "col1": { - "col2": "hello" - } - })) - .unwrap() - .resolve(&schema) - .unwrap(); - let r3 = apache_avro::to_value(serde_json::json!({ - "col1": { - "col2": null - } - })) - .unwrap() - .resolve(&schema) - .unwrap(); - - let mut w = apache_avro::Writer::new(&schema, vec![]); - w.append(r1).unwrap(); - w.append(r2).unwrap(); - w.append(r3).unwrap(); - let bytes = w.into_inner().unwrap(); - - let mut reader = ReaderBuilder::new() - .read_schema() - .with_batch_size(3) - .build(std::io::Cursor::new(bytes)) - .unwrap(); - let batch = reader.next().unwrap().unwrap(); - assert_eq!(batch.num_rows(), 3); - assert_eq!(batch.num_columns(), 1); - - let expected = [ - "+---------------+", - "| col1 |", - "+---------------+", - "| |", - "| {col2: hello} |", - "| {col2: } |", - "+---------------+", - ]; - assert_batches_eq!(expected, &[batch]); - } - - #[test] - fn test_avro_nullable_struct_array() { - let schema = apache_avro::Schema::parse_str( - r#" - { - "type": "record", - "name": "r1", - "fields": [ - { - "name": "col1", - "type": [ - "null", - { - "type": "array", - "items": { - "type": [ - "null", - { - "type": "record", - "name": "Item", - "fields": [ - { - "name": "id", - "type": "long" - } - ] - } - ] - } - } - ], - "default": null - } - ] - }"#, - ) - .unwrap(); - let jv1 = serde_json::json!({ - "col1": [ - { - "id": 234 - }, - { - "id": 345 - } - ] - }); - let r1 = apache_avro::to_value(jv1) - .unwrap() - .resolve(&schema) - .unwrap(); - let r2 = apache_avro::to_value(serde_json::json!({ "col1": null })) - .unwrap() - .resolve(&schema) - .unwrap(); - - let mut w = apache_avro::Writer::new(&schema, vec![]); - for _i in 0..5 { - w.append(r1.clone()).unwrap(); - } - w.append(r2).unwrap(); - let bytes = w.into_inner().unwrap(); - - let mut reader = ReaderBuilder::new() - .read_schema() - .with_batch_size(20) - .build(std::io::Cursor::new(bytes)) - .unwrap(); - let batch = reader.next().unwrap().unwrap(); - assert_eq!(batch.num_rows(), 6); - assert_eq!(batch.num_columns(), 1); - - let expected = [ - "+------------------------+", - "| col1 |", - "+------------------------+", - "| [{id: 234}, {id: 345}] |", - "| [{id: 234}, {id: 345}] |", - "| [{id: 234}, {id: 345}] |", - "| [{id: 234}, {id: 345}] |", - "| [{id: 234}, {id: 345}] |", - "| |", - "+------------------------+", - ]; - assert_batches_eq!(expected, &[batch]); - } - - #[test] - fn test_avro_iterator() { - let reader = build_reader("alltypes_plain.avro", 5); - let schema = reader.schema(); - let (col_id_index, _) = schema.column_with_name("id").unwrap(); - - let mut sum_num_rows = 0; - let mut num_batches = 0; - let mut sum_id = 0; - for batch in reader { - let batch = batch.unwrap(); - assert_eq!(11, batch.num_columns()); - sum_num_rows += batch.num_rows(); - num_batches += 1; - let batch_schema = batch.schema(); - assert_eq!(schema, batch_schema); - let a_array = as_int32_array(batch.column(col_id_index)).unwrap(); - sum_id += (0..a_array.len()).map(|i| a_array.value(i)).sum::(); - } - assert_eq!(8, sum_num_rows); - assert_eq!(2, num_batches); - assert_eq!(28, sum_id); - } -} diff --git a/datafusion/datasource-avro/src/avro_to_arrow/mod.rs b/datafusion/datasource-avro/src/avro_to_arrow/mod.rs deleted file mode 100644 index 4d262a396eb3b..0000000000000 --- a/datafusion/datasource-avro/src/avro_to_arrow/mod.rs +++ /dev/null @@ -1,38 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! This module contains code for reading [Avro] data into `RecordBatch`es -//! -//! [Avro]: https://avro.apache.org/docs/1.2.0/ - -// mod arrow_array_reader; -// mod reader; -// mod schema; - -use arrow::datatypes::Schema; -use arrow_avro::reader::ReaderBuilder; - -// pub use schema::to_arrow_schema; -use std::io::{BufReader, Read}; - -/// Read Avro schema given a reader -pub fn read_avro_schema_from_reader( - reader: &mut R, -) -> datafusion_common::Result { - let avro_reader = ReaderBuilder::new().build(BufReader::new(reader))?; - Ok(avro_reader.schema().as_ref().clone()) -} diff --git a/datafusion/datasource-avro/src/avro_to_arrow/reader.rs b/datafusion/datasource-avro/src/avro_to_arrow/reader.rs deleted file mode 100644 index 9a4d13fc191da..0000000000000 --- a/datafusion/datasource-avro/src/avro_to_arrow/reader.rs +++ /dev/null @@ -1,357 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use super::arrow_array_reader::AvroArrowArrayReader; -use arrow::datatypes::{Fields, SchemaRef}; -use arrow::error::Result as ArrowResult; -use arrow::record_batch::RecordBatch; -use datafusion_common::Result; -use std::io::{Read, Seek}; -use std::sync::Arc; - -/// Avro file reader builder -#[derive(Debug)] -pub struct ReaderBuilder { - /// Optional schema for the Avro file - /// - /// If the schema is not supplied, the reader will try to read the schema. - schema: Option, - /// Batch size (number of records to load each time) - /// - /// The default batch size when using the `ReaderBuilder` is 1024 records - batch_size: usize, - /// Optional projection for which columns to load (zero-based column indices) - projection: Option>, -} - -impl Default for ReaderBuilder { - fn default() -> Self { - Self { - schema: None, - batch_size: 1024, - projection: None, - } - } -} - -impl ReaderBuilder { - /// Create a new builder for configuring Avro parsing options. - /// - /// To convert a builder into a reader, call `Reader::from_builder` - /// - /// # Example - /// - /// ``` - /// use std::fs::File; - /// - /// use datafusion_datasource_avro::avro_to_arrow::{Reader, ReaderBuilder}; - /// - /// fn example() -> Reader<'static, File> { - /// let file = File::open("test/data/basic.avro").unwrap(); - /// - /// // create a builder, inferring the schema with the first 100 records - /// let builder = ReaderBuilder::new() - /// .read_schema() - /// .with_batch_size(100); - /// - /// let reader = builder - /// .build::(file) - /// .unwrap(); - /// - /// reader - /// } - /// ``` - pub fn new() -> Self { - Self::default() - } - - /// Set the Avro file's schema - pub fn with_schema(mut self, schema: SchemaRef) -> Self { - self.schema = Some(schema); - self - } - - /// Set the Avro reader to infer the schema of the file - pub fn read_schema(mut self) -> Self { - // remove any schema that is set - self.schema = None; - self - } - - /// Set the batch size (number of records to load at one time) - pub fn with_batch_size(mut self, batch_size: usize) -> Self { - self.batch_size = batch_size; - self - } - - /// Set the reader's column projection - pub fn with_projection(mut self, projection: Vec) -> Self { - self.projection = Some(projection); - self - } - - /// Create a new `Reader` from the `ReaderBuilder` - pub fn build<'a, R>(self, source: R) -> Result> - where - R: Read + Seek, - { - let mut source = source; - - // check if schema should be inferred - let schema = match self.schema { - Some(schema) => schema, - None => Arc::new(super::read_avro_schema_from_reader(&mut source)?), - }; - source.rewind()?; - Reader::try_new(source, schema, self.batch_size, self.projection) - } -} - -/// Avro file record reader -pub struct Reader<'a, R: Read> { - array_reader: AvroArrowArrayReader<'a, R>, - schema: SchemaRef, - batch_size: usize, -} - -impl Reader<'_, R> { - /// Create a new Avro Reader from any value that implements the `Read` trait. - /// - /// If reading a `File`, you can customise the Reader, such as to enable schema - /// inference, use `ReaderBuilder`. - /// - /// If projection is provided, it uses a schema with only the fields in the projection, respecting their order. - /// Only the first level of projection is handled. No further projection currently occurs, but would be - /// useful if plucking values from a struct, e.g. getting `a.b.c.e` from `a.b.c.{d, e}`. - pub fn try_new( - reader: R, - schema: SchemaRef, - batch_size: usize, - projection: Option>, - ) -> Result { - let projected_schema = projection.as_ref().filter(|p| !p.is_empty()).map_or_else( - || Arc::clone(&schema), - |proj| { - Arc::new(arrow::datatypes::Schema::new( - proj.iter() - .filter_map(|name| { - schema.column_with_name(name).map(|(_, f)| f.clone()) - }) - .collect::(), - )) - }, - ); - - Ok(Self { - array_reader: AvroArrowArrayReader::try_new( - reader, - Arc::clone(&projected_schema), - )?, - schema: projected_schema, - batch_size, - }) - } - - /// Returns the schema of the reader, useful for getting the schema without reading - /// record batches - pub fn schema(&self) -> SchemaRef { - Arc::clone(&self.schema) - } -} - -impl Iterator for Reader<'_, R> { - type Item = ArrowResult; - - /// Returns the next batch of results (defined by `self.batch_size`), or `None` if there - /// are no more results. - fn next(&mut self) -> Option { - self.array_reader.next_batch(self.batch_size) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use arrow::array::*; - use arrow::array::{ - BinaryArray, BooleanArray, Float32Array, Float64Array, Int32Array, Int64Array, - TimestampMicrosecondArray, - }; - use arrow::datatypes::TimeUnit; - use arrow::datatypes::{DataType, Field}; - use std::fs::File; - - fn build_reader(name: &'_ str, projection: Option>) -> Reader<'_, File> { - let testdata = datafusion_common::test_util::arrow_test_data(); - let filename = format!("{testdata}/avro/{name}"); - let mut builder = ReaderBuilder::new().read_schema().with_batch_size(64); - if let Some(projection) = projection { - builder = builder.with_projection(projection); - } - builder.build(File::open(filename).unwrap()).unwrap() - } - - fn get_col<'a, T: 'static>( - batch: &'a RecordBatch, - col: (usize, &Field), - ) -> Option<&'a T> { - batch.column(col.0).as_any().downcast_ref::() - } - - #[test] - fn test_avro_basic() { - let mut reader = build_reader("alltypes_dictionary.avro", None); - let batch = reader.next().unwrap().unwrap(); - - assert_eq!(11, batch.num_columns()); - assert_eq!(2, batch.num_rows()); - - let schema = reader.schema(); - let batch_schema = batch.schema(); - assert_eq!(schema, batch_schema); - - let id = schema.column_with_name("id").unwrap(); - assert_eq!(0, id.0); - assert_eq!(&DataType::Int32, id.1.data_type()); - let col = get_col::(&batch, id).unwrap(); - assert_eq!(0, col.value(0)); - assert_eq!(1, col.value(1)); - let bool_col = schema.column_with_name("bool_col").unwrap(); - assert_eq!(1, bool_col.0); - assert_eq!(&DataType::Boolean, bool_col.1.data_type()); - let col = get_col::(&batch, bool_col).unwrap(); - assert!(col.value(0)); - assert!(!col.value(1)); - let tinyint_col = schema.column_with_name("tinyint_col").unwrap(); - assert_eq!(2, tinyint_col.0); - assert_eq!(&DataType::Int32, tinyint_col.1.data_type()); - let col = get_col::(&batch, tinyint_col).unwrap(); - assert_eq!(0, col.value(0)); - assert_eq!(1, col.value(1)); - let smallint_col = schema.column_with_name("smallint_col").unwrap(); - assert_eq!(3, smallint_col.0); - assert_eq!(&DataType::Int32, smallint_col.1.data_type()); - let col = get_col::(&batch, smallint_col).unwrap(); - assert_eq!(0, col.value(0)); - assert_eq!(1, col.value(1)); - let int_col = schema.column_with_name("int_col").unwrap(); - assert_eq!(4, int_col.0); - let col = get_col::(&batch, int_col).unwrap(); - assert_eq!(0, col.value(0)); - assert_eq!(1, col.value(1)); - assert_eq!(&DataType::Int32, int_col.1.data_type()); - let col = get_col::(&batch, int_col).unwrap(); - assert_eq!(0, col.value(0)); - assert_eq!(1, col.value(1)); - let bigint_col = schema.column_with_name("bigint_col").unwrap(); - assert_eq!(5, bigint_col.0); - let col = get_col::(&batch, bigint_col).unwrap(); - assert_eq!(0, col.value(0)); - assert_eq!(10, col.value(1)); - assert_eq!(&DataType::Int64, bigint_col.1.data_type()); - let float_col = schema.column_with_name("float_col").unwrap(); - assert_eq!(6, float_col.0); - let col = get_col::(&batch, float_col).unwrap(); - assert_eq!(0.0, col.value(0)); - assert_eq!(1.1, col.value(1)); - assert_eq!(&DataType::Float32, float_col.1.data_type()); - let col = get_col::(&batch, float_col).unwrap(); - assert_eq!(0.0, col.value(0)); - assert_eq!(1.1, col.value(1)); - let double_col = schema.column_with_name("double_col").unwrap(); - assert_eq!(7, double_col.0); - assert_eq!(&DataType::Float64, double_col.1.data_type()); - let col = get_col::(&batch, double_col).unwrap(); - assert_eq!(0.0, col.value(0)); - assert_eq!(10.1, col.value(1)); - let date_string_col = schema.column_with_name("date_string_col").unwrap(); - assert_eq!(8, date_string_col.0); - assert_eq!(&DataType::Binary, date_string_col.1.data_type()); - let col = get_col::(&batch, date_string_col).unwrap(); - assert_eq!("01/01/09".as_bytes(), col.value(0)); - assert_eq!("01/01/09".as_bytes(), col.value(1)); - let string_col = schema.column_with_name("string_col").unwrap(); - assert_eq!(9, string_col.0); - assert_eq!(&DataType::Binary, string_col.1.data_type()); - let col = get_col::(&batch, string_col).unwrap(); - assert_eq!("0".as_bytes(), col.value(0)); - assert_eq!("1".as_bytes(), col.value(1)); - let timestamp_col = schema.column_with_name("timestamp_col").unwrap(); - assert_eq!(10, timestamp_col.0); - assert_eq!( - &DataType::Timestamp(TimeUnit::Microsecond, None), - timestamp_col.1.data_type() - ); - let col = get_col::(&batch, timestamp_col).unwrap(); - assert_eq!(1230768000000000, col.value(0)); - assert_eq!(1230768060000000, col.value(1)); - } - - #[test] - fn test_avro_with_projection() { - // Test projection to filter and reorder columns - let projection = Some(vec![ - "string_col".to_string(), - "double_col".to_string(), - "bool_col".to_string(), - ]); - let mut reader = build_reader("alltypes_dictionary.avro", projection); - let batch = reader.next().unwrap().unwrap(); - - // Only 3 columns should be present (not all 11) - assert_eq!(3, batch.num_columns()); - assert_eq!(2, batch.num_rows()); - - let schema = reader.schema(); - let batch_schema = batch.schema(); - assert_eq!(schema, batch_schema); - - // Verify columns are in the order specified in projection - // First column should be string_col (was at index 9 in original) - assert_eq!("string_col", schema.field(0).name()); - assert_eq!(&DataType::Binary, schema.field(0).data_type()); - let col = batch - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!("0".as_bytes(), col.value(0)); - assert_eq!("1".as_bytes(), col.value(1)); - - // Second column should be double_col (was at index 7 in original) - assert_eq!("double_col", schema.field(1).name()); - assert_eq!(&DataType::Float64, schema.field(1).data_type()); - let col = batch - .column(1) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(0.0, col.value(0)); - assert_eq!(10.1, col.value(1)); - - // Third column should be bool_col (was at index 1 in original) - assert_eq!("bool_col", schema.field(2).name()); - assert_eq!(&DataType::Boolean, schema.field(2).data_type()); - let col = batch - .column(2) - .as_any() - .downcast_ref::() - .unwrap(); - assert!(col.value(0)); - assert!(!col.value(1)); - } -} diff --git a/datafusion/datasource-avro/src/avro_to_arrow/schema.rs b/datafusion/datasource-avro/src/avro_to_arrow/schema.rs deleted file mode 100644 index 3fce0d4826a22..0000000000000 --- a/datafusion/datasource-avro/src/avro_to_arrow/schema.rs +++ /dev/null @@ -1,523 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use apache_avro::schema::{ - Alias, DecimalSchema, EnumSchema, FixedSchema, Name, RecordSchema, -}; -use apache_avro::types::Value; -use apache_avro::Schema as AvroSchema; -use arrow::datatypes::{DataType, IntervalUnit, Schema, TimeUnit, UnionMode}; -use arrow::datatypes::{Field, UnionFields}; -use datafusion_common::error::Result; -use std::collections::HashMap; -use std::sync::Arc; - -/// Converts an avro schema to an arrow schema -pub fn to_arrow_schema(avro_schema: &apache_avro::Schema) -> Result { - let mut schema_fields = vec![]; - match avro_schema { - AvroSchema::Record(RecordSchema { fields, .. }) => { - for field in fields { - schema_fields.push(schema_to_field_with_props( - &field.schema, - Some(&field.name), - field.is_nullable(), - Some(external_props(&field.schema)), - )?) - } - } - schema => schema_fields.push(schema_to_field(schema, Some(""), false)?), - } - - let schema = Schema::new(schema_fields); - Ok(schema) -} - -fn schema_to_field( - schema: &apache_avro::Schema, - name: Option<&str>, - nullable: bool, -) -> Result { - schema_to_field_with_props(schema, name, nullable, Default::default()) -} - -fn schema_to_field_with_props( - schema: &AvroSchema, - name: Option<&str>, - nullable: bool, - props: Option>, -) -> Result { - let mut nullable = nullable; - let field_type: DataType = match schema { - AvroSchema::Ref { .. } => todo!("Add support for AvroSchema::Ref"), - AvroSchema::Null => DataType::Null, - AvroSchema::Boolean => DataType::Boolean, - AvroSchema::Int => DataType::Int32, - AvroSchema::Long => DataType::Int64, - AvroSchema::Float => DataType::Float32, - AvroSchema::Double => DataType::Float64, - AvroSchema::Bytes => DataType::Binary, - AvroSchema::String => DataType::Utf8, - AvroSchema::Array(item_schema) => DataType::List(Arc::new( - schema_to_field_with_props(&item_schema.items, Some("element"), false, None)?, - )), - AvroSchema::Map(value_schema) => { - let value_field = schema_to_field_with_props( - &value_schema.types, - Some("value"), - false, - None, - )?; - DataType::Dictionary( - Box::new(DataType::Utf8), - Box::new(value_field.data_type().clone()), - ) - } - AvroSchema::Union(us) => { - // If there are only two variants and one of them is null, set the other type as the field data type - let has_nullable = us - .find_schema_with_known_schemata::( - &Value::Null, - None, - &None, - ) - .is_some(); - let sub_schemas = us.variants(); - if has_nullable && sub_schemas.len() == 2 { - nullable = true; - if let Some(schema) = sub_schemas - .iter() - .find(|&schema| !matches!(schema, AvroSchema::Null)) - { - schema_to_field_with_props(schema, None, has_nullable, None)? - .data_type() - .clone() - } else { - return Err(apache_avro::Error::new( - apache_avro::error::Details::GetUnionDuplicate, - ) - .into()); - } - } else { - let fields = sub_schemas - .iter() - .map(|s| schema_to_field_with_props(s, None, has_nullable, None)) - .collect::>>()?; - let type_ids = 0_i8..fields.len() as i8; - DataType::Union(UnionFields::new(type_ids, fields), UnionMode::Dense) - } - } - AvroSchema::Record(RecordSchema { fields, .. }) => { - let fields: Result<_> = fields - .iter() - .map(|field| { - let mut props = HashMap::new(); - if let Some(doc) = &field.doc { - props.insert("avro::doc".to_string(), doc.clone()); - } - /*if let Some(aliases) = fields.aliases { - props.insert("aliases", aliases); - }*/ - schema_to_field_with_props( - &field.schema, - Some(&field.name), - false, - Some(props), - ) - }) - .collect(); - DataType::Struct(fields?) - } - AvroSchema::Enum(EnumSchema { .. }) => DataType::Utf8, - AvroSchema::Fixed(FixedSchema { size, .. }) => { - DataType::FixedSizeBinary(*size as i32) - } - AvroSchema::Decimal(DecimalSchema { - precision, scale, .. - }) => DataType::Decimal128(*precision as u8, *scale as i8), - AvroSchema::BigDecimal => DataType::LargeBinary, - AvroSchema::Uuid => DataType::FixedSizeBinary(16), - AvroSchema::Date => DataType::Date32, - AvroSchema::TimeMillis => DataType::Time32(TimeUnit::Millisecond), - AvroSchema::TimeMicros => DataType::Time64(TimeUnit::Microsecond), - AvroSchema::TimestampMillis => DataType::Timestamp(TimeUnit::Millisecond, None), - AvroSchema::TimestampMicros => DataType::Timestamp(TimeUnit::Microsecond, None), - AvroSchema::TimestampNanos => DataType::Timestamp(TimeUnit::Nanosecond, None), - AvroSchema::LocalTimestampMillis => todo!(), - AvroSchema::LocalTimestampMicros => todo!(), - AvroSchema::LocalTimestampNanos => todo!(), - AvroSchema::Duration => DataType::Duration(TimeUnit::Millisecond), - }; - - let data_type = field_type.clone(); - let name = name.unwrap_or_else(|| default_field_name(&data_type)); - - let mut field = Field::new(name, field_type, nullable); - field.set_metadata(props.unwrap_or_default()); - Ok(field) -} - -fn default_field_name(dt: &DataType) -> &str { - match dt { - DataType::Null => "null", - DataType::Boolean => "bit", - DataType::Int8 => "tinyint", - DataType::Int16 => "smallint", - DataType::Int32 => "int", - DataType::Int64 => "bigint", - DataType::UInt8 => "uint1", - DataType::UInt16 => "uint2", - DataType::UInt32 => "uint4", - DataType::UInt64 => "uint8", - DataType::Float16 => "float2", - DataType::Float32 => "float4", - DataType::Float64 => "float8", - DataType::Date32 => "dateday", - DataType::Date64 => "datemilli", - DataType::Time32(tu) | DataType::Time64(tu) => match tu { - TimeUnit::Second => "timesec", - TimeUnit::Millisecond => "timemilli", - TimeUnit::Microsecond => "timemicro", - TimeUnit::Nanosecond => "timenano", - }, - DataType::Timestamp(tu, tz) => { - if tz.is_some() { - match tu { - TimeUnit::Second => "timestampsectz", - TimeUnit::Millisecond => "timestampmillitz", - TimeUnit::Microsecond => "timestampmicrotz", - TimeUnit::Nanosecond => "timestampnanotz", - } - } else { - match tu { - TimeUnit::Second => "timestampsec", - TimeUnit::Millisecond => "timestampmilli", - TimeUnit::Microsecond => "timestampmicro", - TimeUnit::Nanosecond => "timestampnano", - } - } - } - DataType::Duration(_) => "duration", - DataType::Interval(unit) => match unit { - IntervalUnit::YearMonth => "intervalyear", - IntervalUnit::DayTime => "intervalmonth", - IntervalUnit::MonthDayNano => "intervalmonthdaynano", - }, - DataType::Binary => "varbinary", - DataType::FixedSizeBinary(_) => "fixedsizebinary", - DataType::LargeBinary => "largevarbinary", - DataType::Utf8 => "varchar", - DataType::LargeUtf8 => "largevarchar", - DataType::List(_) => "list", - DataType::FixedSizeList(_, _) => "fixed_size_list", - DataType::LargeList(_) => "largelist", - DataType::Struct(_) => "struct", - DataType::Union(_, _) => "union", - DataType::Dictionary(_, _) => "map", - DataType::Map(_, _) => unimplemented!("Map support not implemented"), - DataType::RunEndEncoded(_, _) => { - unimplemented!("RunEndEncoded support not implemented") - } - DataType::Utf8View - | DataType::BinaryView - | DataType::ListView(_) - | DataType::LargeListView(_) => { - unimplemented!("View support not implemented") - } - DataType::Decimal32(_, _) => "decimal", - DataType::Decimal64(_, _) => "decimal", - DataType::Decimal128(_, _) => "decimal", - DataType::Decimal256(_, _) => "decimal", - } -} - -fn external_props(schema: &AvroSchema) -> HashMap { - let mut props = HashMap::new(); - match &schema { - AvroSchema::Record(RecordSchema { - doc: Some(ref doc), .. - }) - | AvroSchema::Enum(EnumSchema { - doc: Some(ref doc), .. - }) - | AvroSchema::Fixed(FixedSchema { - doc: Some(ref doc), .. - }) => { - props.insert("avro::doc".to_string(), doc.clone()); - } - _ => {} - } - match &schema { - AvroSchema::Record(RecordSchema { - name: Name { namespace, .. }, - aliases: Some(aliases), - .. - }) - | AvroSchema::Enum(EnumSchema { - name: Name { namespace, .. }, - aliases: Some(aliases), - .. - }) - | AvroSchema::Fixed(FixedSchema { - name: Name { namespace, .. }, - aliases: Some(aliases), - .. - }) => { - let aliases: Vec = aliases - .iter() - .map(|alias| aliased(alias, namespace.as_deref(), None)) - .collect(); - props.insert( - "avro::aliases".to_string(), - format!("[{}]", aliases.join(",")), - ); - } - _ => {} - } - props -} - -/// Returns the fully qualified name for a field -pub fn aliased( - alias: &Alias, - namespace: Option<&str>, - default_namespace: Option<&str>, -) -> String { - if alias.namespace().is_some() { - alias.fullname(None) - } else { - let namespace = namespace.as_ref().copied().or(default_namespace); - - match namespace { - Some(ref namespace) => format!("{}.{}", namespace, alias.name()), - None => alias.fullname(None), - } - } -} - -#[cfg(test)] -mod test { - use super::{aliased, external_props, to_arrow_schema}; - use apache_avro::schema::{Alias, EnumSchema, FixedSchema, Name, RecordSchema}; - use apache_avro::Schema as AvroSchema; - use arrow::datatypes::DataType::{Binary, Float32, Float64, Timestamp, Utf8}; - use arrow::datatypes::DataType::{Boolean, Int32, Int64}; - use arrow::datatypes::TimeUnit::Microsecond; - use arrow::datatypes::{Field, Schema}; - - fn alias(name: &str) -> Alias { - Alias::new(name).unwrap() - } - - #[test] - fn test_alias() { - assert_eq!(aliased(&alias("foo.bar"), None, None), "foo.bar"); - assert_eq!(aliased(&alias("bar"), Some("foo"), None), "foo.bar"); - assert_eq!(aliased(&alias("bar"), Some("foo"), Some("cat")), "foo.bar"); - assert_eq!(aliased(&alias("bar"), None, Some("cat")), "cat.bar"); - } - - #[test] - fn test_external_props() { - let record_schema = AvroSchema::Record(RecordSchema { - name: Name { - name: "record".to_string(), - namespace: None, - }, - aliases: Some(vec![alias("fooalias"), alias("baralias")]), - doc: Some("record documentation".to_string()), - fields: vec![], - lookup: Default::default(), - attributes: Default::default(), - }); - let props = external_props(&record_schema); - assert_eq!( - props.get("avro::doc"), - Some(&"record documentation".to_string()) - ); - assert_eq!( - props.get("avro::aliases"), - Some(&"[fooalias,baralias]".to_string()) - ); - let enum_schema = AvroSchema::Enum(EnumSchema { - name: Name { - name: "enum".to_string(), - namespace: None, - }, - aliases: Some(vec![alias("fooenum"), alias("barenum")]), - doc: Some("enum documentation".to_string()), - symbols: vec![], - default: None, - attributes: Default::default(), - }); - let props = external_props(&enum_schema); - assert_eq!( - props.get("avro::doc"), - Some(&"enum documentation".to_string()) - ); - assert_eq!( - props.get("avro::aliases"), - Some(&"[fooenum,barenum]".to_string()) - ); - let fixed_schema = AvroSchema::Fixed(FixedSchema { - name: Name { - name: "fixed".to_string(), - namespace: None, - }, - aliases: Some(vec![alias("foofixed"), alias("barfixed")]), - size: 1, - doc: None, - default: None, - attributes: Default::default(), - }); - let props = external_props(&fixed_schema); - assert_eq!( - props.get("avro::aliases"), - Some(&"[foofixed,barfixed]".to_string()) - ); - } - - #[test] - fn test_invalid_avro_schema() {} - - #[test] - fn test_plain_types_schema() { - let schema = AvroSchema::parse_str( - r#" - { - "type" : "record", - "name" : "topLevelRecord", - "fields" : [ { - "name" : "id", - "type" : [ "int", "null" ] - }, { - "name" : "bool_col", - "type" : [ "boolean", "null" ] - }, { - "name" : "tinyint_col", - "type" : [ "int", "null" ] - }, { - "name" : "smallint_col", - "type" : [ "int", "null" ] - }, { - "name" : "int_col", - "type" : [ "int", "null" ] - }, { - "name" : "bigint_col", - "type" : [ "long", "null" ] - }, { - "name" : "float_col", - "type" : [ "float", "null" ] - }, { - "name" : "double_col", - "type" : [ "double", "null" ] - }, { - "name" : "date_string_col", - "type" : [ "bytes", "null" ] - }, { - "name" : "string_col", - "type" : [ "bytes", "null" ] - }, { - "name" : "timestamp_col", - "type" : [ { - "type" : "long", - "logicalType" : "timestamp-micros" - }, "null" ] - } ] - }"#, - ); - assert!(schema.is_ok(), "{schema:?}"); - let arrow_schema = to_arrow_schema(&schema.unwrap()); - assert!(arrow_schema.is_ok(), "{arrow_schema:?}"); - let expected = Schema::new(vec![ - Field::new("id", Int32, true), - Field::new("bool_col", Boolean, true), - Field::new("tinyint_col", Int32, true), - Field::new("smallint_col", Int32, true), - Field::new("int_col", Int32, true), - Field::new("bigint_col", Int64, true), - Field::new("float_col", Float32, true), - Field::new("double_col", Float64, true), - Field::new("date_string_col", Binary, true), - Field::new("string_col", Binary, true), - Field::new("timestamp_col", Timestamp(Microsecond, None), true), - ]); - assert_eq!(arrow_schema.unwrap(), expected); - } - - #[test] - fn test_nested_schema() { - let avro_schema = apache_avro::Schema::parse_str( - r#" - { - "type": "record", - "name": "r1", - "fields": [ - { - "name": "col1", - "type": [ - "null", - { - "type": "record", - "name": "r2", - "fields": [ - { - "name": "col2", - "type": "string" - }, - { - "name": "col3", - "type": ["null", "string"], - "default": null - } - ] - } - ], - "default": null - } - ] - }"#, - ) - .unwrap(); - // should not use Avro Record names. - let expected_arrow_schema = Schema::new(vec![Field::new( - "col1", - arrow::datatypes::DataType::Struct( - vec![ - Field::new("col2", Utf8, false), - Field::new("col3", Utf8, true), - ] - .into(), - ), - true, - )]); - assert_eq!( - to_arrow_schema(&avro_schema).unwrap(), - expected_arrow_schema - ); - } - - #[test] - fn test_non_record_schema() { - let arrow_schema = to_arrow_schema(&AvroSchema::String); - assert!(arrow_schema.is_ok(), "{arrow_schema:?}"); - assert_eq!( - arrow_schema.unwrap(), - Schema::new(vec![Field::new("", Utf8, false)]) - ); - } -} diff --git a/datafusion/datasource-avro/src/file_format.rs b/datafusion/datasource-avro/src/file_format.rs index 3a55d6b5ee2b1..1acf6d28b45ac 100644 --- a/datafusion/datasource-avro/src/file_format.rs +++ b/datafusion/datasource-avro/src/file_format.rs @@ -16,13 +16,12 @@ // under the License. //! Apache Avro [`FileFormat`] abstractions -// todo - Check if it can be replaced with arrow-avro use std::any::Any; use std::collections::HashMap; use std::fmt; use std::sync::Arc; -use crate::avro_to_arrow::read_avro_schema_from_reader; +use crate::read_avro_schema_from_reader; use crate::source::AvroSource; use arrow::datatypes::Schema; diff --git a/datafusion/datasource-avro/src/mod.rs b/datafusion/datasource-avro/src/mod.rs index fd69a323a33d7..461a4f5454201 100644 --- a/datafusion/datasource-avro/src/mod.rs +++ b/datafusion/datasource-avro/src/mod.rs @@ -26,9 +26,19 @@ //! An [Avro](https://avro.apache.org/) based [`FileSource`](datafusion_datasource::file::FileSource) implementation and related functionality. -pub mod avro_to_arrow; pub mod file_format; pub mod source; +use arrow::datatypes::Schema; pub use arrow_avro; +use arrow_avro::reader::ReaderBuilder; pub use file_format::*; +use std::io::{BufReader, Read}; + +/// Read Avro schema given a reader +pub fn read_avro_schema_from_reader( + reader: &mut R, +) -> datafusion_common::Result { + let avro_reader = ReaderBuilder::new().build(BufReader::new(reader))?; + Ok(avro_reader.schema().as_ref().clone()) +} diff --git a/datafusion/datasource-avro/src/source.rs b/datafusion/datasource-avro/src/source.rs index 9a0b97babdb4d..ce17ea18f9ca1 100644 --- a/datafusion/datasource-avro/src/source.rs +++ b/datafusion/datasource-avro/src/source.rs @@ -29,11 +29,9 @@ use datafusion_datasource::file::FileSource; use datafusion_datasource::file_scan_config::FileScanConfig; use datafusion_datasource::file_stream::FileOpener; use datafusion_datasource::schema_adapter::SchemaAdapterFactory; -use datafusion_physical_expr_common::sort_expr::LexOrdering; use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; use object_store::ObjectStore; - /// AvroSource holds the extra configuration that is necessary for opening avro files #[derive(Clone, Default)] pub struct AvroSource { @@ -56,7 +54,7 @@ impl AvroSource { .schema .as_ref() .expect("Schema must set before open") - .as_ref(); + .as_ref(); // todo - avro metadata loading let projected_schema = if let Some(projection) = &self.file_projection { &schema.project(projection)? @@ -129,16 +127,6 @@ impl FileSource for AvroSource { "avro" } - fn repartitioned( - &self, - _target_partitions: usize, - _repartition_file_min_size: usize, - _output_ordering: Option, - _config: &FileScanConfig, - ) -> Result> { - Ok(None) - } - fn with_schema_adapter_factory( &self, schema_adapter_factory: Arc, @@ -195,3 +183,181 @@ mod private { } } } + +#[cfg(test)] +mod tests { + use super::*; + use arrow::array::*; + use arrow::array::{ + BinaryArray, BooleanArray, Float32Array, Float64Array, Int32Array, Int64Array, + TimestampMicrosecondArray, + }; + use arrow::datatypes::{DataType, Field}; + use arrow::datatypes::{Schema, TimeUnit}; + use std::fs::File; + use std::io::BufReader; + + fn build_reader(name: &'_ str, schema: Option<&Schema>) -> Reader> { + let testdata = datafusion_common::test_util::arrow_test_data(); + let filename = format!("{testdata}/avro/{name}"); + let mut builder = ReaderBuilder::new().with_batch_size(64); + if let Some(schema) = schema { + builder = builder.with_reader_schema(AvroSchema::try_from(schema).unwrap()); + } + builder + .build(BufReader::new(File::open(filename).unwrap())) + .unwrap() + } + + fn get_col<'a, T: 'static>( + batch: &'a RecordBatch, + col: (usize, &Field), + ) -> Option<&'a T> { + batch.column(col.0).as_any().downcast_ref::() + } + + #[test] + fn test_avro_basic() { + let mut reader = build_reader("alltypes_dictionary.avro", None); + let batch = reader.next().unwrap().unwrap(); + + assert_eq!(11, batch.num_columns()); + assert_eq!(2, batch.num_rows()); + + let schema = reader.schema(); + let batch_schema = batch.schema(); + assert_eq!(schema, batch_schema); + + let id = schema.column_with_name("id").unwrap(); + assert_eq!(0, id.0); + assert_eq!(&DataType::Int32, id.1.data_type()); + let col = get_col::(&batch, id).unwrap(); + assert_eq!(0, col.value(0)); + assert_eq!(1, col.value(1)); + let bool_col = schema.column_with_name("bool_col").unwrap(); + assert_eq!(1, bool_col.0); + assert_eq!(&DataType::Boolean, bool_col.1.data_type()); + let col = get_col::(&batch, bool_col).unwrap(); + assert!(col.value(0)); + assert!(!col.value(1)); + let tinyint_col = schema.column_with_name("tinyint_col").unwrap(); + assert_eq!(2, tinyint_col.0); + assert_eq!(&DataType::Int32, tinyint_col.1.data_type()); + let col = get_col::(&batch, tinyint_col).unwrap(); + assert_eq!(0, col.value(0)); + assert_eq!(1, col.value(1)); + let smallint_col = schema.column_with_name("smallint_col").unwrap(); + assert_eq!(3, smallint_col.0); + assert_eq!(&DataType::Int32, smallint_col.1.data_type()); + let col = get_col::(&batch, smallint_col).unwrap(); + assert_eq!(0, col.value(0)); + assert_eq!(1, col.value(1)); + let int_col = schema.column_with_name("int_col").unwrap(); + assert_eq!(4, int_col.0); + let col = get_col::(&batch, int_col).unwrap(); + assert_eq!(0, col.value(0)); + assert_eq!(1, col.value(1)); + assert_eq!(&DataType::Int32, int_col.1.data_type()); + let col = get_col::(&batch, int_col).unwrap(); + assert_eq!(0, col.value(0)); + assert_eq!(1, col.value(1)); + let bigint_col = schema.column_with_name("bigint_col").unwrap(); + assert_eq!(5, bigint_col.0); + let col = get_col::(&batch, bigint_col).unwrap(); + assert_eq!(0, col.value(0)); + assert_eq!(10, col.value(1)); + assert_eq!(&DataType::Int64, bigint_col.1.data_type()); + let float_col = schema.column_with_name("float_col").unwrap(); + assert_eq!(6, float_col.0); + let col = get_col::(&batch, float_col).unwrap(); + assert_eq!(0.0, col.value(0)); + assert_eq!(1.1, col.value(1)); + assert_eq!(&DataType::Float32, float_col.1.data_type()); + let col = get_col::(&batch, float_col).unwrap(); + assert_eq!(0.0, col.value(0)); + assert_eq!(1.1, col.value(1)); + let double_col = schema.column_with_name("double_col").unwrap(); + assert_eq!(7, double_col.0); + assert_eq!(&DataType::Float64, double_col.1.data_type()); + let col = get_col::(&batch, double_col).unwrap(); + assert_eq!(0.0, col.value(0)); + assert_eq!(10.1, col.value(1)); + let date_string_col = schema.column_with_name("date_string_col").unwrap(); + assert_eq!(8, date_string_col.0); + assert_eq!(&DataType::Binary, date_string_col.1.data_type()); + let col = get_col::(&batch, date_string_col).unwrap(); + assert_eq!("01/01/09".as_bytes(), col.value(0)); + assert_eq!("01/01/09".as_bytes(), col.value(1)); + let string_col = schema.column_with_name("string_col").unwrap(); + assert_eq!(9, string_col.0); + assert_eq!(&DataType::Binary, string_col.1.data_type()); + let col = get_col::(&batch, string_col).unwrap(); + assert_eq!("0".as_bytes(), col.value(0)); + assert_eq!("1".as_bytes(), col.value(1)); + let timestamp_col = schema.column_with_name("timestamp_col").unwrap(); + assert_eq!(10, timestamp_col.0); + assert_eq!( + &DataType::Timestamp(TimeUnit::Microsecond, Some("+00:00".into())), + timestamp_col.1.data_type() + ); + let col = get_col::(&batch, timestamp_col).unwrap(); + assert_eq!(1230768000000000, col.value(0)); + assert_eq!(1230768060000000, col.value(1)); + } + + #[test] + fn test_avro_with_projection() { + // Test projection to filter and reorder columns + let projected_schema = Schema::new(vec![ + Field::new("string_col", DataType::Binary, false), + Field::new("double_col", DataType::Float64, false), + Field::new("bool_col", DataType::Boolean, false), + ]); + + let mut reader = + build_reader("alltypes_dictionary.avro", Some(&projected_schema)); + let batch = reader.next().unwrap().unwrap(); + + // Only 3 columns should be present (not all 11) + assert_eq!(3, batch.num_columns()); + assert_eq!(2, batch.num_rows()); + + let schema = reader.schema(); + let batch_schema = batch.schema(); + assert_eq!(schema, batch_schema); + + // Verify columns are in the order specified in projection + // First column should be string_col (was at index 9 in original) + assert_eq!("string_col", schema.field(0).name()); + assert_eq!(&DataType::Binary, schema.field(0).data_type()); + let col = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!("0".as_bytes(), col.value(0)); + assert_eq!("1".as_bytes(), col.value(1)); + + // Second column should be double_col (was at index 7 in original) + assert_eq!("double_col", schema.field(1).name()); + assert_eq!(&DataType::Float64, schema.field(1).data_type()); + let col = batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(0.0, col.value(0)); + assert_eq!(10.1, col.value(1)); + + // Third column should be bool_col (was at index 1 in original) + assert_eq!("bool_col", schema.field(2).name()); + assert_eq!(&DataType::Boolean, schema.field(2).data_type()); + let col = batch + .column(2) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(col.value(0)); + assert!(!col.value(1)); + } +} diff --git a/datafusion/sqllogictest/test_files/avro.slt b/datafusion/sqllogictest/test_files/avro.slt index 2ad60c0082e87..d83303e520b98 100644 --- a/datafusion/sqllogictest/test_files/avro.slt +++ b/datafusion/sqllogictest/test_files/avro.slt @@ -107,7 +107,7 @@ LOCATION '../../testing/data/avro/alltypes_plain.zstandard.avro'; statement ok CREATE EXTERNAL TABLE single_nan ( - mycol FLOAT + mycol DOUBLE ) STORED AS AVRO LOCATION '../../testing/data/avro/single_nan.avro'; From ac972458e659ce40ba2f366d7df003fa59abb32e Mon Sep 17 00:00:00 2001 From: Namgung Chan <9511chn@gmail.com> Date: Wed, 5 Nov 2025 00:07:37 +0900 Subject: [PATCH 16/35] replace with arrow-avro --- datafusion/datasource-avro/src/avro_to_arrow/reader.rs | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 datafusion/datasource-avro/src/avro_to_arrow/reader.rs diff --git a/datafusion/datasource-avro/src/avro_to_arrow/reader.rs b/datafusion/datasource-avro/src/avro_to_arrow/reader.rs deleted file mode 100644 index e69de29bb2d1d..0000000000000 From b50df22012756d270ba934fe2ef43bf50f605389 Mon Sep 17 00:00:00 2001 From: Namgung Chan <9511chn@gmail.com> Date: Wed, 5 Nov 2025 00:39:28 +0900 Subject: [PATCH 17/35] remove unused dependencies --- Cargo.lock | 3 --- datafusion/datasource-avro/Cargo.toml | 4 +--- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a9c17890fcedd..2566e3d13babf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2091,13 +2091,10 @@ dependencies = [ "bytes", "datafusion-common", "datafusion-datasource", - "datafusion-physical-expr-common", "datafusion-physical-plan", "datafusion-session", "futures", - "num-traits", "object_store", - "serde_json", ] [[package]] diff --git a/datafusion/datasource-avro/Cargo.toml b/datafusion/datasource-avro/Cargo.toml index 83a8f5ca1aedb..2e4594a92d73f 100644 --- a/datafusion/datasource-avro/Cargo.toml +++ b/datafusion/datasource-avro/Cargo.toml @@ -37,15 +37,13 @@ async-trait = { workspace = true } bytes = { workspace = true } datafusion-common = { workspace = true, features = ["object_store"] } datafusion-datasource = { workspace = true } -datafusion-physical-expr-common = { workspace = true } datafusion-physical-plan = { workspace = true } datafusion-session = { workspace = true } futures = { workspace = true } -num-traits = { workspace = true } object_store = { workspace = true } [dev-dependencies] -serde_json = { workspace = true } + [lints] workspace = true From b4c3b5c6397785d0be76541d3f03b522e964f22c Mon Sep 17 00:00:00 2001 From: Namgung Chan <9511chn@gmail.com> Date: Wed, 5 Nov 2025 00:46:38 +0900 Subject: [PATCH 18/35] parallel avro file reading --- datafusion/sqllogictest/test_files/repartition_scan.slt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/sqllogictest/test_files/repartition_scan.slt b/datafusion/sqllogictest/test_files/repartition_scan.slt index 41718b3aebc27..c6f0d10676f93 100644 --- a/datafusion/sqllogictest/test_files/repartition_scan.slt +++ b/datafusion/sqllogictest/test_files/repartition_scan.slt @@ -285,7 +285,7 @@ query TT EXPLAIN SELECT * FROM avro_table ---- logical_plan TableScan: avro_table projection=[f1, f2, f3] -physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/avro/simple_enum.avro]]}, projection=[f1, f2, f3], file_type=avro +physical_plan DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/testing/data/avro/simple_enum.avro:0..103], [WORKSPACE_ROOT/testing/data/avro/simple_enum.avro:103..206], [WORKSPACE_ROOT/testing/data/avro/simple_enum.avro:206..309], [WORKSPACE_ROOT/testing/data/avro/simple_enum.avro:309..411]]}, projection=[f1, f2, f3], file_type=avro # Cleanup statement ok From 3014b8a2165ba1e8e4670cfd7d28c834cd1541d1 Mon Sep 17 00:00:00 2001 From: Namgung Chan <9511chn@gmail.com> Date: Wed, 5 Nov 2025 00:59:12 +0900 Subject: [PATCH 19/35] parallel avro file reading --- datafusion/datasource-avro/Cargo.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/datafusion/datasource-avro/Cargo.toml b/datafusion/datasource-avro/Cargo.toml index 2e4594a92d73f..d1c0e3ca054bb 100644 --- a/datafusion/datasource-avro/Cargo.toml +++ b/datafusion/datasource-avro/Cargo.toml @@ -44,7 +44,6 @@ object_store = { workspace = true } [dev-dependencies] - [lints] workspace = true From d55f945eef0da16a7b36dfc2298774fbe6b444ce Mon Sep 17 00:00:00 2001 From: Namgung Chan <33323415+getChan@users.noreply.github.com> Date: Wed, 26 Nov 2025 11:42:39 +0900 Subject: [PATCH 20/35] Apply suggestions from code review Co-authored-by: Connor Sanders <170039284+jecsand838@users.noreply.github.com> --- datafusion/datasource-avro/src/mod.rs | 177 ++++++++++++++++++++++- datafusion/datasource-avro/src/source.rs | 99 +++++++++++-- 2 files changed, 264 insertions(+), 12 deletions(-) diff --git a/datafusion/datasource-avro/src/mod.rs b/datafusion/datasource-avro/src/mod.rs index 461a4f5454201..e324b4e09f784 100644 --- a/datafusion/datasource-avro/src/mod.rs +++ b/datafusion/datasource-avro/src/mod.rs @@ -33,12 +33,187 @@ use arrow::datatypes::Schema; pub use arrow_avro; use arrow_avro::reader::ReaderBuilder; pub use file_format::*; +use arrow_avro::schema::SCHEMA_METADATA_KEY; +use datafusion_common::DataFusionError; use std::io::{BufReader, Read}; +use std::sync::Arc; /// Read Avro schema given a reader pub fn read_avro_schema_from_reader( reader: &mut R, ) -> datafusion_common::Result { let avro_reader = ReaderBuilder::new().build(BufReader::new(reader))?; - Ok(avro_reader.schema().as_ref().clone()) + let schema_ref = avro_reader.schema(); + // Extract the raw Avro JSON schema from the OCF header. + let raw_json = avro_reader + .avro_header() + .get(SCHEMA_METADATA_KEY.as_bytes()) + .map(|bytes| { + std::str::from_utf8(bytes).map_err(|e| { + DataFusionError::Execution(format!( + "Invalid UTF-8 in Avro schema metadata ({SCHEMA_METADATA_KEY}): {e}" + )) + }) + }) + .transpose()? + .map(str::to_owned); + drop(avro_reader); + if let Some(raw_json) = raw_json { + let mut schema = Arc::unwrap_or_clone(schema_ref); + // Insert the raw Avro JSON schema using `SCHEMA_METADATA_KEY`. + // This should enable the avro schema metadata to be picked downstream. + schema + .metadata + .insert(SCHEMA_METADATA_KEY.to_string(), raw_json); + Ok(schema) + } else { + // Return error because Avro spec requires the Avro schema metadata to be present in the OCF header. + Err(DataFusionError::Execution(format!( + "Avro schema metadata ({SCHEMA_METADATA_KEY}) is missing from OCF header" + ))) + } +} + +#[cfg(test)] +mod test { + use super::*; + use arrow::array::{BinaryArray, BooleanArray, Float64Array}; + use arrow::datatypes::DataType; + use arrow_avro::reader::ReaderBuilder; + use arrow_avro::schema::{AvroSchema, SCHEMA_METADATA_KEY}; + use datafusion_common::test_util::arrow_test_data; + use datafusion_common::{DataFusionError, Result as DFResult}; + use serde_json::Value; + use std::collections::HashMap; + use std::fs::File; + use std::io::BufReader; + + fn avro_test_file(name: &str) -> String { + format!("{}/avro/{name}", arrow_test_data()) + } + + #[test] + fn read_avro_schema_includes_avro_json_metadata() -> DFResult<()> { + let path = avro_test_file("alltypes_plain.avro"); + let mut file = File::open(&path)?; + let schema = read_avro_schema_from_reader(&mut file)?; + let meta_json = schema + .metadata() + .get(SCHEMA_METADATA_KEY) + .expect("schema metadata missing avro.schema entry"); + assert!( + !meta_json.is_empty(), + "avro.schema metadata should not be empty" + ); + let mut raw = File::open(&path)?; + let avro_reader = ReaderBuilder::new().build(BufReader::new(&mut raw))?; + let header_json = avro_reader + .avro_header() + .get(SCHEMA_METADATA_KEY.as_bytes()) + .and_then(|bytes| std::str::from_utf8(bytes).ok()) + .expect("missing avro.schema metadata in OCF header"); + assert_eq!( + meta_json, header_json, + "schema metadata avro.schema should match OCF header" + ); + Ok(()) + } + + #[test] + fn read_and_project_using_schema_metadata() -> DFResult<()> { + let path = avro_test_file("alltypes_dictionary.avro"); + let mut file = File::open(&path)?; + let file_schema = read_avro_schema_from_reader(&mut file)?; + let projected_field_names = vec!["string_col", "double_col", "bool_col"]; + let avro_json = file_schema + .metadata() + .get(SCHEMA_METADATA_KEY) + .expect("schema metadata missing avro.schema entry"); + let projected_avro_schema = + build_projected_reader_schema(avro_json, &projected_field_names)?; + let mut reader = ReaderBuilder::new() + .with_reader_schema(projected_avro_schema) + .with_batch_size(64) + .build(BufReader::new(File::open(&path)?))?; + let batch = reader.next().expect("no batch produced")?; + assert_eq!(3, batch.num_columns()); + assert_eq!(2, batch.num_rows()); + let schema = batch.schema(); + assert_eq!("string_col", schema.field(0).name()); + assert_eq!(&DataType::Binary, schema.field(0).data_type()); + let col = batch + .column(0) + .as_any() + .downcast_ref::() + .expect("column 0 not BinaryArray"); + assert_eq!("0".as_bytes(), col.value(0)); + assert_eq!("1".as_bytes(), col.value(1)); + assert_eq!("double_col", schema.field(1).name()); + assert_eq!(&DataType::Float64, schema.field(1).data_type()); + let col = batch + .column(1) + .as_any() + .downcast_ref::() + .expect("column 1 not Float64Array"); + assert_eq!(0.0, col.value(0)); + assert_eq!(10.1, col.value(1)); + assert_eq!("bool_col", schema.field(2).name()); + assert_eq!(&DataType::Boolean, schema.field(2).data_type()); + let col = batch + .column(2) + .as_any() + .downcast_ref::() + .expect("column 2 not BooleanArray"); + assert!(col.value(0)); + assert!(!col.value(1)); + Ok(()) + } + + fn build_projected_reader_schema( + avro_json: &str, + projected_field_names: &[&str], + ) -> DFResult { + let mut schema_json: Value = serde_json::from_str(avro_json).map_err(|e| { + DataFusionError::Execution(format!( + "Failed to parse Avro schema JSON from metadata: {e}" + )) + })?; + let obj = schema_json.as_object_mut().ok_or_else(|| { + DataFusionError::Execution( + "Top-level Avro schema JSON is not an object".to_string(), + ) + })?; + let fields_val = obj.get_mut("fields").ok_or_else(|| { + DataFusionError::Execution( + "Top-level Avro schema JSON has no `fields` key".to_string(), + ) + })?; + let fields = fields_val.as_array_mut().ok_or_else(|| { + DataFusionError::Execution( + "Top-level Avro schema `fields` is not an array".to_string(), + ) + })?; + let mut by_name: HashMap = HashMap::new(); + for field in fields.iter() { + if let Some(name) = field.get("name").and_then(|v| v.as_str()) { + by_name.insert(name.to_string(), field.clone()); + } + } + let mut projected_fields = Vec::with_capacity(projected_field_names.len()); + for name in projected_field_names { + let Some(field) = by_name.get(*name) else { + return Err(DataFusionError::Execution(format!( + "Projected field `{name}` not found in Avro writer schema" + ))); + }; + projected_fields.push(field.clone()); + } + *fields_val = Value::Array(projected_fields); + let projected_json = serde_json::to_string(&schema_json).map_err(|e| { + DataFusionError::Execution(format!( + "Failed to serialize projected Avro schema JSON: {e}" + )) + })?; + Ok(AvroSchema::new(projected_json)) + } } diff --git a/datafusion/datasource-avro/src/source.rs b/datafusion/datasource-avro/src/source.rs index 87b52762514e7..c9e892931e192 100644 --- a/datafusion/datasource-avro/src/source.rs +++ b/datafusion/datasource-avro/src/source.rs @@ -58,22 +58,99 @@ impl AvroSource { } fn open(&self, reader: R) -> Result> { - let schema = self.table_schema.file_schema().as_ref(); // todo - avro metadata loading - - let projected_schema = if let Some(projection) = &self.file_projection { - &schema.project(projection)? - } else { - schema - }; - - let avro_schema = AvroSchema::try_from(projected_schema)?; - + // TODO: Once `ReaderBuilder::with_projection` is available, we should use it instead. + // This should be an easy change. We'd simply need to: + // 1. Use the full file schema to generate the reader `AvroSchema`. + // 2. Pass `&self.file_projection` into `ReaderBuilder::with_projection`. + // 3. Remove the `build_projected_reader_schema` methods. ReaderBuilder::new() - .with_reader_schema(avro_schema) // Used for projection on read. + .with_reader_schema(self.build_projected_reader_schema()?) .with_batch_size(self.batch_size.expect("Batch size must set before open")) .build(reader) .map_err(Into::into) } + + fn build_projected_reader_schema(&self) -> Result { + let file_schema = self.table_schema.file_schema().as_ref(); + // Fast path: no projection. If we have the original writer schema JSON + // in metadata, just reuse it as-is without parsing. + if self.file_projection.is_none() { + return if let Some(avro_json) = + file_schema.metadata().get(SCHEMA_METADATA_KEY) + { + Ok(AvroSchema::new(avro_json.clone())) + } else { + // Fall back to deriving Avro from the full Arrow file schema, should be ok + // if not using projection. + Ok(AvroSchema::try_from(file_schema) + .map_err(Into::::into)?) + }; + } + // Use the writer Avro schema JSON tagged upstream to build a projected reader schema + match file_schema.metadata().get(SCHEMA_METADATA_KEY) { + Some(avro_json) => { + let mut schema_json: Value = + serde_json::from_str(avro_json).map_err(|e| { + DataFusionError::Execution(format!( + "Failed to parse Avro schema JSON from metadata: {e}" + )) + })?; + let obj = schema_json.as_object_mut().ok_or_else(|| { + DataFusionError::Execution( + "Top-level Avro schema JSON must be an object".to_string(), + ) + })?; + let fields_val = obj.get_mut("fields").ok_or_else(|| { + DataFusionError::Execution( + "Top-level Avro schema JSON must contain a `fields` array" + .to_string(), + ) + })?; + let fields_arr = fields_val.as_array_mut().ok_or_else(|| { + DataFusionError::Execution( + "Top-level Avro schema `fields` must be an array".to_string(), + ) + })?; + // Move existing fields out so we can rebuild them in projected order. + let original_fields = std::mem::take(fields_arr); + let mut by_name: HashMap = + HashMap::with_capacity(original_fields.len()); + for field in original_fields { + if let Some(name) = field.get("name").and_then(|v| v.as_str()) { + by_name.insert(name.to_string(), field); + } + } + // Rebuild `fields` in the same order as the projected Arrow schema. + let projection = self.file_projection.as_ref().ok_or_else(|| { + DataFusionError::Internal("checked file_projection is Some above".to_string()) + })?; + let projected_schema = file_schema.project(projection)?; + let mut projected_fields = + Vec::with_capacity(projected_schema.fields().len()); + for arrow_field in projected_schema.fields() { + let name = arrow_field.name(); + let field = by_name.remove(name).ok_or_else(|| { + DataFusionError::Execution(format!( + "Projected field `{name}` not found in Avro writer schema" + )) + })?; + projected_fields.push(field); + } + *fields_val = Value::Array(projected_fields); + let projected_json = + serde_json::to_string(&schema_json).map_err(|e| { + DataFusionError::Execution(format!( + "Failed to serialize projected Avro schema JSON: {e}" + )) + })?; + Ok(AvroSchema::new(projected_json)) + } + None => Err(DataFusionError::Execution(format!( + "Avro schema metadata ({SCHEMA_METADATA_KEY}) is missing from file schema, but is required for projection" + ))), + } + } +} } impl FileSource for AvroSource { From dbc082c35198d039d57bb86b1161cef5c6ad4a5d Mon Sep 17 00:00:00 2001 From: Namgung Chan <9511chn@gmail.com> Date: Wed, 26 Nov 2025 11:54:12 +0900 Subject: [PATCH 21/35] merge suggestions --- Cargo.lock | 1 + datafusion/datasource-avro/Cargo.toml | 1 + datafusion/datasource-avro/src/source.rs | 7 ++++--- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 12afc22b7ef11..2f0072435feeb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2095,6 +2095,7 @@ dependencies = [ "datafusion-session", "futures", "object_store", + "serde_json", ] [[package]] diff --git a/datafusion/datasource-avro/Cargo.toml b/datafusion/datasource-avro/Cargo.toml index 28439c46addda..1fd74d5dfe551 100644 --- a/datafusion/datasource-avro/Cargo.toml +++ b/datafusion/datasource-avro/Cargo.toml @@ -41,6 +41,7 @@ datafusion-physical-plan = { workspace = true } datafusion-session = { workspace = true } futures = { workspace = true } object_store = { workspace = true } +serde_json = { workspace = true } [dev-dependencies] diff --git a/datafusion/datasource-avro/src/source.rs b/datafusion/datasource-avro/src/source.rs index c9e892931e192..d8f422377531a 100644 --- a/datafusion/datasource-avro/src/source.rs +++ b/datafusion/datasource-avro/src/source.rs @@ -18,12 +18,13 @@ //! Execution plan for reading line-delimited Avro files use std::any::Any; +use std::collections::HashMap; use std::sync::Arc; use arrow_avro::reader::{Reader, ReaderBuilder}; -use arrow_avro::schema::AvroSchema; +use arrow_avro::schema::{AvroSchema, SCHEMA_METADATA_KEY}; use datafusion_common::error::Result; -use datafusion_common::Statistics; +use datafusion_common::{DataFusionError, Statistics}; use datafusion_datasource::file::FileSource; use datafusion_datasource::file_scan_config::FileScanConfig; use datafusion_datasource::file_stream::FileOpener; @@ -32,6 +33,7 @@ use datafusion_datasource::TableSchema; use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; use object_store::ObjectStore; +use serde_json::Value; /// AvroSource holds the extra configuration that is necessary for opening avro files #[derive(Clone)] @@ -151,7 +153,6 @@ impl AvroSource { } } } -} impl FileSource for AvroSource { fn create_file_opener( From 3cd67125c345660cd9fba39038543070b2a8661d Mon Sep 17 00:00:00 2001 From: Namgung Chan <9511chn@gmail.com> Date: Wed, 26 Nov 2025 18:37:53 +0900 Subject: [PATCH 22/35] remove empty file --- .../datasource-avro/src/avro_to_arrow/arrow_array_reader.rs | 0 datafusion/datasource-avro/src/avro_to_arrow/reader.rs | 0 2 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 datafusion/datasource-avro/src/avro_to_arrow/arrow_array_reader.rs delete mode 100644 datafusion/datasource-avro/src/avro_to_arrow/reader.rs diff --git a/datafusion/datasource-avro/src/avro_to_arrow/arrow_array_reader.rs b/datafusion/datasource-avro/src/avro_to_arrow/arrow_array_reader.rs deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/datafusion/datasource-avro/src/avro_to_arrow/reader.rs b/datafusion/datasource-avro/src/avro_to_arrow/reader.rs deleted file mode 100644 index e69de29bb2d1d..0000000000000 From c6cebbd28902c419bec0d353abe944d34bb82408 Mon Sep 17 00:00:00 2001 From: Namgung Chan <9511chn@gmail.com> Date: Wed, 26 Nov 2025 22:25:03 +0900 Subject: [PATCH 23/35] remove empty file --- Cargo.lock | 1 + datafusion/datasource-avro/Cargo.toml | 1 + datafusion/datasource-avro/src/mod.rs | 2 +- datafusion/datasource-avro/src/source.rs | 7 ++++--- 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 46f6de17a9722..5ff1776c0e127 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2040,6 +2040,7 @@ dependencies = [ "bytes", "datafusion-common", "datafusion-datasource", + "datafusion-physical-expr-common", "datafusion-physical-plan", "datafusion-session", "futures", diff --git a/datafusion/datasource-avro/Cargo.toml b/datafusion/datasource-avro/Cargo.toml index 1fd74d5dfe551..8f9ed075f8468 100644 --- a/datafusion/datasource-avro/Cargo.toml +++ b/datafusion/datasource-avro/Cargo.toml @@ -37,6 +37,7 @@ async-trait = { workspace = true } bytes = { workspace = true } datafusion-common = { workspace = true, features = ["object_store"] } datafusion-datasource = { workspace = true } +datafusion-physical-expr-common = { workspace = true } datafusion-physical-plan = { workspace = true } datafusion-session = { workspace = true } futures = { workspace = true } diff --git a/datafusion/datasource-avro/src/mod.rs b/datafusion/datasource-avro/src/mod.rs index 4c2fc9b3c1c16..ce17f98b00ee6 100644 --- a/datafusion/datasource-avro/src/mod.rs +++ b/datafusion/datasource-avro/src/mod.rs @@ -34,9 +34,9 @@ pub mod source; use arrow::datatypes::Schema; pub use arrow_avro; use arrow_avro::reader::ReaderBuilder; -pub use file_format::*; use arrow_avro::schema::SCHEMA_METADATA_KEY; use datafusion_common::DataFusionError; +pub use file_format::*; use std::io::{BufReader, Read}; use std::sync::Arc; diff --git a/datafusion/datasource-avro/src/source.rs b/datafusion/datasource-avro/src/source.rs index 67d416b6ea4a0..febb73d761ed2 100644 --- a/datafusion/datasource-avro/src/source.rs +++ b/datafusion/datasource-avro/src/source.rs @@ -24,12 +24,13 @@ use std::sync::Arc; use arrow_avro::reader::{Reader, ReaderBuilder}; use arrow_avro::schema::{AvroSchema, SCHEMA_METADATA_KEY}; use datafusion_common::error::Result; -use datafusion_common::{DataFusionError, Statistics}; +use datafusion_common::DataFusionError; use datafusion_datasource::file::FileSource; use datafusion_datasource::file_scan_config::FileScanConfig; use datafusion_datasource::file_stream::FileOpener; use datafusion_datasource::schema_adapter::SchemaAdapterFactory; use datafusion_datasource::TableSchema; +use datafusion_physical_expr_common::sort_expr::LexOrdering; use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; use object_store::ObjectStore; @@ -40,7 +41,7 @@ use serde_json::Value; pub struct AvroSource { table_schema: TableSchema, batch_size: Option, - projection: Option>, + projection: Option>, metrics: ExecutionPlanMetricsSet, schema_adapter_factory: Option>, } @@ -181,7 +182,7 @@ impl FileSource for AvroSource { fn with_projection(&self, config: &FileScanConfig) -> Arc { let mut conf = self.clone(); - conf.projection = config.file_column_projection_indices(); + conf.projection = config.projected_file_column_names(); Arc::new(conf) } From 3e2667a5091b9c4666ab01bc475c79807adb48ff Mon Sep 17 00:00:00 2001 From: Namgung Chan <9511chn@gmail.com> Date: Wed, 26 Nov 2025 23:26:52 +0900 Subject: [PATCH 24/35] projection indices --- datafusion/datasource-avro/src/source.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/datasource-avro/src/source.rs b/datafusion/datasource-avro/src/source.rs index febb73d761ed2..c44fc310cfbaf 100644 --- a/datafusion/datasource-avro/src/source.rs +++ b/datafusion/datasource-avro/src/source.rs @@ -41,7 +41,7 @@ use serde_json::Value; pub struct AvroSource { table_schema: TableSchema, batch_size: Option, - projection: Option>, + projection: Option>, metrics: ExecutionPlanMetricsSet, schema_adapter_factory: Option>, } @@ -182,7 +182,7 @@ impl FileSource for AvroSource { fn with_projection(&self, config: &FileScanConfig) -> Arc { let mut conf = self.clone(); - conf.projection = config.projected_file_column_names(); + conf.projection = config.file_column_projection_indices(); Arc::new(conf) } From d574fe721c8cd1d97547d88a12c31f187bf1ac25 Mon Sep 17 00:00:00 2001 From: Namgung Chan <9511chn@gmail.com> Date: Wed, 3 Dec 2025 02:00:07 +0900 Subject: [PATCH 25/35] check schema should be inferred --- datafusion/datasource-avro/src/source.rs | 38 +++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/datafusion/datasource-avro/src/source.rs b/datafusion/datasource-avro/src/source.rs index c44fc310cfbaf..ad15c18d4c9e9 100644 --- a/datafusion/datasource-avro/src/source.rs +++ b/datafusion/datasource-avro/src/source.rs @@ -33,6 +33,7 @@ use datafusion_datasource::TableSchema; use datafusion_physical_expr_common::sort_expr::LexOrdering; use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; +use crate::read_avro_schema_from_reader; use object_store::ObjectStore; use serde_json::Value; @@ -235,9 +236,44 @@ mod private { impl FileOpener for AvroOpener { fn open(&self, partitioned_file: PartitionedFile) -> Result { - let config = Arc::clone(&self.config); let object_store = Arc::clone(&self.object_store); + let config = self.config.clone(); + Ok(Box::pin(async move { + // check if schema should be inferred + let r = object_store + .get(&partitioned_file.object_meta.location) + .await?; + let config = Arc::new(match r.payload { + GetResultPayload::File(mut file, _) => { + let schema = match config.table_schema.file_schema().metadata.get(SCHEMA_METADATA_KEY) { + Some(_) => config.table_schema.file_schema().clone(), + None => Arc::new(read_avro_schema_from_reader(&mut file).unwrap()), // if not inferred, read schema from file + }; + AvroSource { + table_schema: TableSchema::new(schema, config.table_schema.table_partition_cols().clone()), + batch_size: config.batch_size, + projection: config.projection.clone(), + metrics: config.metrics.clone(), + schema_adapter_factory: config.schema_adapter_factory.clone(), + } + } + GetResultPayload::Stream(_) => { + let bytes = r.bytes().await?; + let schema = match config.table_schema.file_schema().metadata.get(SCHEMA_METADATA_KEY) { + Some(_) => config.table_schema.file_schema().clone(), + None => Arc::new(read_avro_schema_from_reader(&mut bytes.reader()).unwrap()), // if not inferred, read schema from file + }; + AvroSource { + table_schema: TableSchema::new(schema, config.table_schema.table_partition_cols().clone()), + batch_size: config.batch_size, + projection: config.projection.clone(), + metrics: config.metrics.clone(), + schema_adapter_factory: config.schema_adapter_factory.clone(), + } + } + }); + let r = object_store .get(&partitioned_file.object_meta.location) .await?; From d647ec657564b7d3a2b1dfbcd69acefa2cb2176e Mon Sep 17 00:00:00 2001 From: Namgung Chan <9511chn@gmail.com> Date: Sun, 7 Dec 2025 23:13:12 +0900 Subject: [PATCH 26/35] fix test table column type --- datafusion/sqllogictest/test_files/avro.slt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/datafusion/sqllogictest/test_files/avro.slt b/datafusion/sqllogictest/test_files/avro.slt index d83303e520b98..eed8af475b406 100644 --- a/datafusion/sqllogictest/test_files/avro.slt +++ b/datafusion/sqllogictest/test_files/avro.slt @@ -31,7 +31,7 @@ CREATE EXTERNAL TABLE alltypes_plain ( float_col FLOAT NOT NULL, double_col DOUBLE NOT NULL, date_string_col BYTEA NOT NULL, - string_col VARCHAR NOT NULL, + string_col BYTEA NOT NULL, timestamp_col TIMESTAMP NOT NULL, ) STORED AS AVRO @@ -48,7 +48,7 @@ CREATE EXTERNAL TABLE alltypes_plain_snappy ( float_col FLOAT NOT NULL, double_col DOUBLE NOT NULL, date_string_col BYTEA NOT NULL, - string_col VARCHAR NOT NULL, + string_col BYTEA NOT NULL, timestamp_col TIMESTAMP NOT NULL, ) STORED AS AVRO @@ -65,7 +65,7 @@ CREATE EXTERNAL TABLE alltypes_plain_bzip2 ( float_col FLOAT NOT NULL, double_col DOUBLE NOT NULL, date_string_col BYTEA NOT NULL, - string_col VARCHAR NOT NULL, + string_col BYTEA NOT NULL, timestamp_col TIMESTAMP NOT NULL, ) STORED AS AVRO @@ -82,7 +82,7 @@ CREATE EXTERNAL TABLE alltypes_plain_xz ( float_col FLOAT NOT NULL, double_col DOUBLE NOT NULL, date_string_col BYTEA NOT NULL, - string_col VARCHAR NOT NULL, + string_col BYTEA NOT NULL, timestamp_col TIMESTAMP NOT NULL, ) STORED AS AVRO @@ -99,7 +99,7 @@ CREATE EXTERNAL TABLE alltypes_plain_zstandard ( float_col FLOAT NOT NULL, double_col DOUBLE NOT NULL, date_string_col BYTEA NOT NULL, - string_col VARCHAR NOT NULL, + string_col BYTEA NOT NULL, timestamp_col TIMESTAMP NOT NULL, ) STORED AS AVRO @@ -260,7 +260,7 @@ physical_plan # test column projection order from avro file query ITII -SELECT id, string_col, int_col, bigint_col FROM alltypes_plain ORDER BY id LIMIT 5 +SELECT id, CAST(string_col AS varchar), int_col, bigint_col FROM alltypes_plain ORDER BY id LIMIT 5 ---- 0 0 0 0 1 1 1 10 From f6d80fa1c5c0ae6b1722a0e87079ac2e367201f8 Mon Sep 17 00:00:00 2001 From: Namgung Chan <9511chn@gmail.com> Date: Tue, 16 Dec 2025 00:43:53 +0900 Subject: [PATCH 27/35] resolve conflicts --- Cargo.lock | 112 +++++++++++++----- Cargo.toml | 2 +- .../src/avro_to_arrow/arrow_array_reader.rs | 0 .../src/avro_to_arrow/schema.rs | 0 datafusion/datasource-avro/src/source.rs | 59 +++++---- 5 files changed, 120 insertions(+), 53 deletions(-) delete mode 100644 datafusion/datasource-avro/src/avro_to_arrow/arrow_array_reader.rs delete mode 100644 datafusion/datasource-avro/src/avro_to_arrow/schema.rs diff --git a/Cargo.lock b/Cargo.lock index 5ff1776c0e127..dd46b93eb0f96 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -105,6 +105,15 @@ dependencies = [ "alloc-no-stdlib", ] +[[package]] +name = "alloca" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5a7d05ea6aea7e9e64d25b9156ba2fee3fdd659e34e41063cd2fc7cd020d7f4" +dependencies = [ + "cc", +] + [[package]] name = "allocator-api2" version = "0.2.21" @@ -510,9 +519,9 @@ dependencies = [ [[package]] name = "async-compression" -version = "0.4.34" +version = "0.4.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e86f6d3dc9dc4352edeea6b8e499e13e3f5dc3b964d7ca5fd411415a3498473" +checksum = "98ec5f6c2f8bc326c994cb9e241cc257ddaba9afa8555a43cffbb5dd86efaa37" dependencies = [ "compression-codecs", "compression-core", @@ -1434,9 +1443,9 @@ dependencies = [ [[package]] name = "compression-codecs" -version = "0.4.33" +version = "0.4.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "302266479cb963552d11bd042013a58ef1adc56768016c8b82b4199488f2d4ad" +checksum = "b0f7ac3e5b97fdce45e8922fb05cae2c37f7bbd63d30dd94821dacfd8f3f2bf2" dependencies = [ "bzip2", "compression-core", @@ -1589,10 +1598,11 @@ dependencies = [ [[package]] name = "criterion" -version = "0.7.0" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1c047a62b0cc3e145fa84415a3191f628e980b194c2755aa12300a4e6cbd928" +checksum = "4d883447757bb0ee46f233e9dc22eb84d93a9508c9b868687b274fc431d886bf" dependencies = [ + "alloca", "anes", "cast", "ciborium", @@ -1602,6 +1612,7 @@ dependencies = [ "itertools 0.13.0", "num-traits", "oorandom", + "page_size", "plotters", "rayon", "regex", @@ -1614,9 +1625,9 @@ dependencies = [ [[package]] name = "criterion-plot" -version = "0.6.0" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b1bcc0dc7dfae599d84ad0b1a55f80cde8af3725da8313b528da95ef783e338" +checksum = "ed943f81ea2faa8dcecbbfa50164acf95d555afec96a27871663b300e387b2e4" dependencies = [ "cast", "itertools 0.13.0", @@ -2134,10 +2145,14 @@ dependencies = [ "bytes", "dashmap", "datafusion", + "datafusion-common", + "datafusion-expr", "datafusion-physical-expr-adapter", "datafusion-proto", + "datafusion-sql", "env_logger", "futures", + "insta", "log", "mimalloc", "nix", @@ -2145,6 +2160,8 @@ dependencies = [ "prost", "rand 0.9.2", "serde_json", + "strum", + "strum_macros", "tempfile", "test-utils", "tokio", @@ -2221,10 +2238,21 @@ dependencies = [ "async-ffi", "async-trait", "datafusion", + "datafusion-catalog", "datafusion-common", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions", + "datafusion-functions-aggregate", "datafusion-functions-aggregate-common", + "datafusion-functions-window", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", "datafusion-proto", "datafusion-proto-common", + "datafusion-session", "doc-comment", "futures", "log", @@ -2482,6 +2510,7 @@ dependencies = [ "datafusion-common-runtime", "datafusion-execution", "datafusion-expr", + "datafusion-functions", "datafusion-functions-aggregate", "datafusion-functions-aggregate-common", "datafusion-functions-window", @@ -2508,6 +2537,7 @@ name = "datafusion-proto" version = "51.0.0" dependencies = [ "arrow", + "async-trait", "chrono", "datafusion", "datafusion-catalog", @@ -2595,7 +2625,9 @@ dependencies = [ "datafusion-execution", "datafusion-expr", "datafusion-functions", + "datafusion-functions-nested", "log", + "percent-encoding", "rand 0.9.2", "sha1", "url", @@ -3811,9 +3843,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.82" +version = "0.3.83" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b011eec8cc36da2aab2d5cff675ec18454fad408585853910a202391cf9f8e65" +checksum = "464a3709c7f55f1f721e5389aa6ea4e3bc6aba669353300af094b29ffbdde1d8" dependencies = [ "once_cell", "wasm-bindgen", @@ -4324,6 +4356,16 @@ version = "4.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c6901729fa79e91a0913333229e9ca5dc725089d1c363b2f4b4760709dc4a52" +[[package]] +name = "page_size" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30d5b2194ed13191c1999ae0704b7839fb18384fa22e49b57eeaa97d79ce40da" +dependencies = [ + "libc", + "winapi", +] + [[package]] name = "parking_lot" version = "0.12.5" @@ -5892,6 +5934,12 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "strum" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af23d6f6c1a224baef9d3f61e287d2761385a5b88fdab4eb4c6f11aeb54c4bcf" + [[package]] name = "strum_macros" version = "0.27.2" @@ -6689,9 +6737,9 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "uuid" -version = "1.18.1" +version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f87b8aa10b915a06587d0dec516c282ff295b475d94abf425d62b57710070a2" +checksum = "e2e054861b4bd027cd373e18e8d8d8e6548085000e41290d95ce0c373a654b4a" dependencies = [ "getrandom 0.3.4", "js-sys", @@ -6758,9 +6806,9 @@ checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b" [[package]] name = "wasm-bindgen" -version = "0.2.105" +version = "0.2.106" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da95793dfc411fbbd93f5be7715b0578ec61fe87cb1a42b12eb625caa5c5ea60" +checksum = "0d759f433fa64a2d763d1340820e46e111a7a5ab75f993d1852d70b03dbb80fd" dependencies = [ "cfg-if", "once_cell", @@ -6771,9 +6819,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.55" +version = "0.4.56" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "551f88106c6d5e7ccc7cd9a16f312dd3b5d36ea8b4954304657d5dfba115d4a0" +checksum = "836d9622d604feee9e5de25ac10e3ea5f2d65b41eac0d9ce72eb5deae707ce7c" dependencies = [ "cfg-if", "js-sys", @@ -6784,9 +6832,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.105" +version = "0.2.106" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04264334509e04a7bf8690f2384ef5265f05143a4bff3889ab7a3269adab59c2" +checksum = "48cb0d2638f8baedbc542ed444afc0644a29166f1595371af4fecf8ce1e7eeb3" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -6794,9 +6842,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.105" +version = "0.2.106" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "420bc339d9f322e562942d52e115d57e950d12d88983a14c79b86859ee6c7ebc" +checksum = "cefb59d5cd5f92d9dcf80e4683949f15ca4b511f4ac0a6e14d4e1ac60c6ecd40" dependencies = [ "bumpalo", "proc-macro2", @@ -6807,21 +6855,29 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.105" +version = "0.2.106" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76f218a38c84bcb33c25ec7059b07847d465ce0e0a76b995e134a45adcb6af76" +checksum = "cbc538057e648b67f72a982e708d485b2efa771e1ac05fec311f9f63e5800db4" dependencies = [ "unicode-ident", ] [[package]] name = "wasm-bindgen-test" -version = "0.3.55" +version = "0.3.56" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfc379bfb624eb59050b509c13e77b4eb53150c350db69628141abce842f2373" +checksum = "25e90e66d265d3a1efc0e72a54809ab90b9c0c515915c67cdf658689d2c22c6c" dependencies = [ + "async-trait", + "cast", "js-sys", + "libm", "minicov", + "nu-ansi-term", + "num-traits", + "oorandom", + "serde", + "serde_json", "wasm-bindgen", "wasm-bindgen-futures", "wasm-bindgen-test-macro", @@ -6829,9 +6885,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-test-macro" -version = "0.3.55" +version = "0.3.56" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "085b2df989e1e6f9620c1311df6c996e83fe16f57792b272ce1e024ac16a90f1" +checksum = "7150335716dce6028bead2b848e72f47b45e7b9422f64cccdc23bedca89affc1" dependencies = [ "proc-macro2", "quote", @@ -6853,9 +6909,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.82" +version = "0.3.83" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a1f95c0d03a47f4ae1f7a64643a6bb97465d9b740f0fa8f90ea33915c99a9a1" +checksum = "9b32828d774c412041098d182a8b38b16ea816958e07cf40eec2bc080ae137ac" dependencies = [ "js-sys", "wasm-bindgen", diff --git a/Cargo.toml b/Cargo.toml index 81dd199847e20..c11d0b0c0bde4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -94,7 +94,7 @@ arrow = { version = "57.1.0", features = [ "prettyprint", "chrono-tz", ] } -arrow-avro = { version = "57.0.0", default-features = false, features = [ +arrow-avro = { version = "57.1.0", default-features = false, features = [ "deflate", "snappy", "zstd", diff --git a/datafusion/datasource-avro/src/avro_to_arrow/arrow_array_reader.rs b/datafusion/datasource-avro/src/avro_to_arrow/arrow_array_reader.rs deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/datafusion/datasource-avro/src/avro_to_arrow/schema.rs b/datafusion/datasource-avro/src/avro_to_arrow/schema.rs deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/datafusion/datasource-avro/src/source.rs b/datafusion/datasource-avro/src/source.rs index e2d5b48f43e9e..57fe9a9bd51e7 100644 --- a/datafusion/datasource-avro/src/source.rs +++ b/datafusion/datasource-avro/src/source.rs @@ -23,14 +23,14 @@ use std::sync::Arc; use arrow_avro::reader::{Reader, ReaderBuilder}; use arrow_avro::schema::{AvroSchema, SCHEMA_METADATA_KEY}; -use datafusion_common::error::Result; use datafusion_common::DataFusionError; +use datafusion_common::error::Result; +use datafusion_datasource::TableSchema; use datafusion_datasource::file::FileSource; use datafusion_datasource::file_scan_config::FileScanConfig; use datafusion_datasource::file_stream::FileOpener; use datafusion_datasource::projection::{ProjectionOpener, SplitProjection}; use datafusion_datasource::schema_adapter::SchemaAdapterFactory; -use datafusion_datasource::TableSchema; use datafusion_physical_expr_common::sort_expr::LexOrdering; use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; use datafusion_physical_plan::projection::ProjectionExprs; @@ -63,14 +63,6 @@ impl AvroSource { } fn open(&self, reader: R) -> Result> { - let file_schema = self.table_schema.file_schema(); - let projection = Some( - self.projection - .file_indices - .iter() - .map(|&idx| file_schema.field(idx).name().clone()) - .collect::>(), - ); // TODO: Once `ReaderBuilder::with_projection` is available, we should use it instead. // This should be an easy change. We'd simply need to: // 1. Use the full file schema to generate the reader `AvroSchema`. @@ -87,7 +79,7 @@ impl AvroSource { let file_schema = self.table_schema.file_schema().as_ref(); // Fast path: no projection. If we have the original writer schema JSON // in metadata, just reuse it as-is without parsing. - if self.projection.is_none() { + if self.projection.file_indices.is_empty() { return if let Some(avro_json) = file_schema.metadata().get(SCHEMA_METADATA_KEY) { @@ -134,9 +126,7 @@ impl AvroSource { } } // Rebuild `fields` in the same order as the projected Arrow schema. - let projection = self.projection.as_ref().ok_or_else(|| { - DataFusionError::Internal("checked projection is Some above".to_string()) - })?; + let projection = self.projection.file_indices.as_ref(); let projected_schema = file_schema.project(projection)?; let mut projected_fields = Vec::with_capacity(projected_schema.fields().len()); @@ -252,7 +242,7 @@ mod private { use std::io::BufReader; use bytes::Buf; - use datafusion_datasource::{file_stream::FileOpenFuture, PartitionedFile}; + use datafusion_datasource::{PartitionedFile, file_stream::FileOpenFuture}; use futures::StreamExt; use object_store::{GetResultPayload, ObjectStore}; @@ -264,7 +254,7 @@ mod private { impl FileOpener for AvroOpener { fn open(&self, partitioned_file: PartitionedFile) -> Result { let object_store = Arc::clone(&self.object_store); - let config = self.config.clone(); + let config = Arc::clone(&self.config); Ok(Box::pin(async move { // check if schema should be inferred @@ -273,12 +263,22 @@ mod private { .await?; let config = Arc::new(match r.payload { GetResultPayload::File(mut file, _) => { - let schema = match config.table_schema.file_schema().metadata.get(SCHEMA_METADATA_KEY) { - Some(_) => config.table_schema.file_schema().clone(), - None => Arc::new(read_avro_schema_from_reader(&mut file).unwrap()), // if not inferred, read schema from file + let schema = match config + .table_schema + .file_schema() + .metadata + .get(SCHEMA_METADATA_KEY) + { + Some(_) => Arc::clone(&config.table_schema.file_schema()), + None => { + Arc::new(read_avro_schema_from_reader(&mut file).unwrap()) + } // if not inferred, read schema from file }; AvroSource { - table_schema: TableSchema::new(schema, config.table_schema.table_partition_cols().clone()), + table_schema: TableSchema::new( + schema, + config.table_schema.table_partition_cols().clone(), + ), batch_size: config.batch_size, projection: config.projection.clone(), metrics: config.metrics.clone(), @@ -287,12 +287,23 @@ mod private { } GetResultPayload::Stream(_) => { let bytes = r.bytes().await?; - let schema = match config.table_schema.file_schema().metadata.get(SCHEMA_METADATA_KEY) { - Some(_) => config.table_schema.file_schema().clone(), - None => Arc::new(read_avro_schema_from_reader(&mut bytes.reader()).unwrap()), // if not inferred, read schema from file + let schema = match config + .table_schema + .file_schema() + .metadata + .get(SCHEMA_METADATA_KEY) + { + Some(_) => Arc::clone(&config.table_schema.file_schema()), + None => Arc::new( + read_avro_schema_from_reader(&mut bytes.reader()) + .unwrap(), + ), // if not inferred, read schema from file }; AvroSource { - table_schema: TableSchema::new(schema, config.table_schema.table_partition_cols().clone()), + table_schema: TableSchema::new( + schema, + config.table_schema.table_partition_cols().clone(), + ), batch_size: config.batch_size, projection: config.projection.clone(), metrics: config.metrics.clone(), From 9404bd69d298f0e2dac00cf6cae8cc3747092498 Mon Sep 17 00:00:00 2001 From: Namgung Chan <9511chn@gmail.com> Date: Tue, 16 Dec 2025 00:57:05 +0900 Subject: [PATCH 28/35] resolve conflicts --- datafusion/datasource-avro/src/source.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/datasource-avro/src/source.rs b/datafusion/datasource-avro/src/source.rs index 57fe9a9bd51e7..e63e33fe8ea75 100644 --- a/datafusion/datasource-avro/src/source.rs +++ b/datafusion/datasource-avro/src/source.rs @@ -269,7 +269,7 @@ mod private { .metadata .get(SCHEMA_METADATA_KEY) { - Some(_) => Arc::clone(&config.table_schema.file_schema()), + Some(_) => Arc::clone(config.table_schema.file_schema()), None => { Arc::new(read_avro_schema_from_reader(&mut file).unwrap()) } // if not inferred, read schema from file @@ -293,7 +293,7 @@ mod private { .metadata .get(SCHEMA_METADATA_KEY) { - Some(_) => Arc::clone(&config.table_schema.file_schema()), + Some(_) => Arc::clone(config.table_schema.file_schema()), None => Arc::new( read_avro_schema_from_reader(&mut bytes.reader()) .unwrap(), From c6b9a7c488cf03e0237eba5861f06e8c37696be7 Mon Sep 17 00:00:00 2001 From: Namgung Chan <9511chn@gmail.com> Date: Tue, 16 Dec 2025 00:59:17 +0900 Subject: [PATCH 29/35] resolve conflicts --- parquet-testing | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet-testing b/parquet-testing index 107b36603e051..a3d96a65e11e2 160000 --- a/parquet-testing +++ b/parquet-testing @@ -1 +1 @@ -Subproject commit 107b36603e051aee26bd93e04b871034f6c756c0 +Subproject commit a3d96a65e11e2bbca7d22a894e8313ede90a33a3 From c40ecb8bad79b320adbfa2fb5f3d5e60fded49cc Mon Sep 17 00:00:00 2001 From: Namgung Chan <9511chn@gmail.com> Date: Wed, 17 Dec 2025 01:03:46 +0900 Subject: [PATCH 30/35] use default repartition --- datafusion/datasource-avro/src/source.rs | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/datafusion/datasource-avro/src/source.rs b/datafusion/datasource-avro/src/source.rs index e63e33fe8ea75..7921cf2d9e053 100644 --- a/datafusion/datasource-avro/src/source.rs +++ b/datafusion/datasource-avro/src/source.rs @@ -23,15 +23,14 @@ use std::sync::Arc; use arrow_avro::reader::{Reader, ReaderBuilder}; use arrow_avro::schema::{AvroSchema, SCHEMA_METADATA_KEY}; -use datafusion_common::DataFusionError; use datafusion_common::error::Result; -use datafusion_datasource::TableSchema; +use datafusion_common::DataFusionError; use datafusion_datasource::file::FileSource; use datafusion_datasource::file_scan_config::FileScanConfig; use datafusion_datasource::file_stream::FileOpener; use datafusion_datasource::projection::{ProjectionOpener, SplitProjection}; use datafusion_datasource::schema_adapter::SchemaAdapterFactory; -use datafusion_physical_expr_common::sort_expr::LexOrdering; +use datafusion_datasource::TableSchema; use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; use datafusion_physical_plan::projection::ProjectionExprs; @@ -212,16 +211,6 @@ impl FileSource for AvroSource { "avro" } - fn repartitioned( - &self, - _target_partitions: usize, - _repartition_file_min_size: usize, - _output_ordering: Option, - _config: &FileScanConfig, - ) -> Result> { - Ok(None) - } - fn with_schema_adapter_factory( &self, schema_adapter_factory: Arc, @@ -242,7 +231,7 @@ mod private { use std::io::BufReader; use bytes::Buf; - use datafusion_datasource::{PartitionedFile, file_stream::FileOpenFuture}; + use datafusion_datasource::{file_stream::FileOpenFuture, PartitionedFile}; use futures::StreamExt; use object_store::{GetResultPayload, ObjectStore}; From d8e0828c635f6c709d4a25259e69ba4abfd16c74 Mon Sep 17 00:00:00 2001 From: Namgung Chan <9511chn@gmail.com> Date: Sat, 20 Dec 2025 16:39:27 +0900 Subject: [PATCH 31/35] use default repartition --- Cargo.lock | 114 ++++++++--------------- datafusion/datasource-avro/src/source.rs | 70 +------------- 2 files changed, 40 insertions(+), 144 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 02b88e99fd1db..541d79d903999 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -191,35 +191,6 @@ version = "1.0.100" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61" -[[package]] -name = "apache-avro" -version = "0.21.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36fa98bc79671c7981272d91a8753a928ff6a1cd8e4f20a44c45bd5d313840bf" -dependencies = [ - "bigdecimal", - "bon", - "bzip2", - "crc32fast", - "digest", - "liblzma", - "log", - "miniz_oxide", - "num-bigint", - "quad-rand", - "rand 0.9.2", - "regex-lite", - "serde", - "serde_bytes", - "serde_json", - "snap", - "strum 0.27.2", - "strum_macros 0.27.2", - "thiserror", - "uuid", - "zstd", -] - [[package]] name = "arrayref" version = "0.3.9" @@ -288,6 +259,28 @@ dependencies = [ "num-traits", ] +[[package]] +name = "arrow-avro" +version = "57.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "582f1459cdc77a082b345f03c7be047ec9b80221d50325398d47c88fad54188e" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-schema", + "bzip2", + "crc", + "flate2", + "indexmap 2.12.1", + "rand 0.9.2", + "serde", + "serde_json", + "snap", + "strum_macros 0.27.2", + "uuid", + "zstd", +] + [[package]] name = "arrow-buffer" version = "57.1.0" @@ -1029,7 +1022,6 @@ dependencies = [ "num-bigint", "num-integer", "num-traits", - "serde", ] [[package]] @@ -1184,31 +1176,6 @@ dependencies = [ "serde_with", ] -[[package]] -name = "bon" -version = "3.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ebeb9aaf9329dff6ceb65c689ca3db33dbf15f324909c60e4e5eef5701ce31b1" -dependencies = [ - "bon-macros", - "rustversion", -] - -[[package]] -name = "bon-macros" -version = "3.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77e9d642a7e3a318e37c2c9427b5a6a48aa1ad55dcd986f3034ab2239045a645" -dependencies = [ - "darling", - "ident_case", - "prettyplease", - "proc-macro2", - "quote", - "rustversion", - "syn 2.0.111", -] - [[package]] name = "borsh" version = "1.5.7" @@ -1640,6 +1607,21 @@ dependencies = [ "libc", ] +[[package]] +name = "crc" +version = "3.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5eb8a2a1cd12ab0d987a5d5e825195d372001a4094a0376319d5a0ad71c1ba0d" +dependencies = [ + "crc-catalog", +] + +[[package]] +name = "crc-catalog" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" + [[package]] name = "crc32fast" version = "1.5.0" @@ -2007,7 +1989,6 @@ name = "datafusion-common" version = "51.0.0" dependencies = [ "ahash 0.8.12", - "apache-avro", "arrow", "arrow-ipc", "chrono", @@ -2099,8 +2080,8 @@ dependencies = [ name = "datafusion-datasource-avro" version = "51.0.0" dependencies = [ - "apache-avro", "arrow", + "arrow-avro", "async-trait", "bytes", "datafusion-common", @@ -2109,7 +2090,6 @@ dependencies = [ "datafusion-physical-plan", "datafusion-session", "futures", - "num-traits", "object_store", "serde_json", ] @@ -4260,7 +4240,6 @@ checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" dependencies = [ "num-integer", "num-traits", - "serde", ] [[package]] @@ -4930,12 +4909,6 @@ dependencies = [ "syn 1.0.109", ] -[[package]] -name = "quad-rand" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a651516ddc9168ebd67b24afd085a718be02f8858fe406591b013d101ce2f40" - [[package]] name = "quick-xml" version = "0.38.3" @@ -5640,16 +5613,6 @@ dependencies = [ "serde_derive", ] -[[package]] -name = "serde_bytes" -version = "0.11.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5d440709e79d88e51ac01c4b72fc6cb7314017bb7da9eeff678aa94c10e3ea8" -dependencies = [ - "serde", - "serde_core", -] - [[package]] name = "serde_core" version = "1.0.228" @@ -6849,7 +6812,6 @@ checksum = "e2e054861b4bd027cd373e18e8d8d8e6548085000e41290d95ce0c373a654b4a" dependencies = [ "getrandom 0.3.4", "js-sys", - "serde_core", "wasm-bindgen", ] diff --git a/datafusion/datasource-avro/src/source.rs b/datafusion/datasource-avro/src/source.rs index 7921cf2d9e053..17b0b80de144d 100644 --- a/datafusion/datasource-avro/src/source.rs +++ b/datafusion/datasource-avro/src/source.rs @@ -34,7 +34,6 @@ use datafusion_datasource::TableSchema; use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; use datafusion_physical_plan::projection::ProjectionExprs; -use crate::read_avro_schema_from_reader; use object_store::ObjectStore; use serde_json::Value; @@ -76,19 +75,9 @@ impl AvroSource { fn build_projected_reader_schema(&self) -> Result { let file_schema = self.table_schema.file_schema().as_ref(); - // Fast path: no projection. If we have the original writer schema JSON - // in metadata, just reuse it as-is without parsing. + // Fast path: no projection. if self.projection.file_indices.is_empty() { - return if let Some(avro_json) = - file_schema.metadata().get(SCHEMA_METADATA_KEY) - { - Ok(AvroSchema::new(avro_json.clone())) - } else { - // Fall back to deriving Avro from the full Arrow file schema, should be ok - // if not using projection. - Ok(AvroSchema::try_from(file_schema) - .map_err(Into::::into)?) - }; + return Ok(AvroSchema::try_from(file_schema).map_err(Into::::into)?) } // Use the writer Avro schema JSON tagged upstream to build a projected reader schema match file_schema.metadata().get(SCHEMA_METADATA_KEY) { @@ -246,61 +235,6 @@ mod private { let config = Arc::clone(&self.config); Ok(Box::pin(async move { - // check if schema should be inferred - let r = object_store - .get(&partitioned_file.object_meta.location) - .await?; - let config = Arc::new(match r.payload { - GetResultPayload::File(mut file, _) => { - let schema = match config - .table_schema - .file_schema() - .metadata - .get(SCHEMA_METADATA_KEY) - { - Some(_) => Arc::clone(config.table_schema.file_schema()), - None => { - Arc::new(read_avro_schema_from_reader(&mut file).unwrap()) - } // if not inferred, read schema from file - }; - AvroSource { - table_schema: TableSchema::new( - schema, - config.table_schema.table_partition_cols().clone(), - ), - batch_size: config.batch_size, - projection: config.projection.clone(), - metrics: config.metrics.clone(), - schema_adapter_factory: config.schema_adapter_factory.clone(), - } - } - GetResultPayload::Stream(_) => { - let bytes = r.bytes().await?; - let schema = match config - .table_schema - .file_schema() - .metadata - .get(SCHEMA_METADATA_KEY) - { - Some(_) => Arc::clone(config.table_schema.file_schema()), - None => Arc::new( - read_avro_schema_from_reader(&mut bytes.reader()) - .unwrap(), - ), // if not inferred, read schema from file - }; - AvroSource { - table_schema: TableSchema::new( - schema, - config.table_schema.table_partition_cols().clone(), - ), - batch_size: config.batch_size, - projection: config.projection.clone(), - metrics: config.metrics.clone(), - schema_adapter_factory: config.schema_adapter_factory.clone(), - } - } - }); - let r = object_store .get(&partitioned_file.object_meta.location) .await?; From 0b632b31d818dec75737da80dcd5d01e1ce2d4ce Mon Sep 17 00:00:00 2001 From: Namgung Chan <9511chn@gmail.com> Date: Thu, 15 Jan 2026 01:15:28 +0900 Subject: [PATCH 32/35] apply projection feature in arrow-avro --- Cargo.lock | 86 +++++------ Cargo.toml | 20 +-- datafusion/datasource-avro/src/mod.rs | 184 ++++------------------- datafusion/datasource-avro/src/source.rs | 100 ++---------- 4 files changed, 82 insertions(+), 308 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 541d79d903999..49a1e0d9a22a2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -205,9 +205,8 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "57.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb372a7cbcac02a35d3fb7b3fc1f969ec078e871f9bb899bf00a2e1809bec8a3" +version = "57.2.0" +source = "git+https://github.com/jecsand838/arrow-rs?branch=avro-reader-projection#2b527656293781bbea03014c6e55ff5d4559371c" dependencies = [ "arrow-arith", "arrow-array", @@ -228,9 +227,8 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "57.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f377dcd19e440174596d83deb49cd724886d91060c07fec4f67014ef9d54049" +version = "57.2.0" +source = "git+https://github.com/jecsand838/arrow-rs?branch=avro-reader-projection#2b527656293781bbea03014c6e55ff5d4559371c" dependencies = [ "arrow-array", "arrow-buffer", @@ -242,9 +240,8 @@ dependencies = [ [[package]] name = "arrow-array" -version = "57.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a23eaff85a44e9fa914660fb0d0bb00b79c4a3d888b5334adb3ea4330c84f002" +version = "57.2.0" +source = "git+https://github.com/jecsand838/arrow-rs?branch=avro-reader-projection#2b527656293781bbea03014c6e55ff5d4559371c" dependencies = [ "ahash 0.8.12", "arrow-buffer", @@ -261,9 +258,8 @@ dependencies = [ [[package]] name = "arrow-avro" -version = "57.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "582f1459cdc77a082b345f03c7be047ec9b80221d50325398d47c88fad54188e" +version = "57.2.0" +source = "git+https://github.com/jecsand838/arrow-rs?branch=avro-reader-projection#2b527656293781bbea03014c6e55ff5d4559371c" dependencies = [ "arrow-array", "arrow-buffer", @@ -283,9 +279,8 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "57.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2819d893750cb3380ab31ebdc8c68874dd4429f90fd09180f3c93538bd21626" +version = "57.2.0" +source = "git+https://github.com/jecsand838/arrow-rs?branch=avro-reader-projection#2b527656293781bbea03014c6e55ff5d4559371c" dependencies = [ "bytes", "half", @@ -295,9 +290,8 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "57.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3d131abb183f80c450d4591dc784f8d7750c50c6e2bc3fcaad148afc8361271" +version = "57.2.0" +source = "git+https://github.com/jecsand838/arrow-rs?branch=avro-reader-projection#2b527656293781bbea03014c6e55ff5d4559371c" dependencies = [ "arrow-array", "arrow-buffer", @@ -317,9 +311,8 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "57.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2275877a0e5e7e7c76954669366c2aa1a829e340ab1f612e647507860906fb6b" +version = "57.2.0" +source = "git+https://github.com/jecsand838/arrow-rs?branch=avro-reader-projection#2b527656293781bbea03014c6e55ff5d4559371c" dependencies = [ "arrow-array", "arrow-cast", @@ -332,9 +325,8 @@ dependencies = [ [[package]] name = "arrow-data" -version = "57.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05738f3d42cb922b9096f7786f606fcb8669260c2640df8490533bb2fa38c9d3" +version = "57.2.0" +source = "git+https://github.com/jecsand838/arrow-rs?branch=avro-reader-projection#2b527656293781bbea03014c6e55ff5d4559371c" dependencies = [ "arrow-buffer", "arrow-schema", @@ -345,9 +337,8 @@ dependencies = [ [[package]] name = "arrow-flight" -version = "57.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b5f57c3d39d1b1b7c1376a772ea86a131e7da310aed54ebea9363124bb885e3" +version = "57.2.0" +source = "git+https://github.com/jecsand838/arrow-rs?branch=avro-reader-projection#2b527656293781bbea03014c6e55ff5d4559371c" dependencies = [ "arrow-arith", "arrow-array", @@ -373,9 +364,8 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "57.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d09446e8076c4b3f235603d9ea7c5494e73d441b01cd61fb33d7254c11964b3" +version = "57.2.0" +source = "git+https://github.com/jecsand838/arrow-rs?branch=avro-reader-projection#2b527656293781bbea03014c6e55ff5d4559371c" dependencies = [ "arrow-array", "arrow-buffer", @@ -389,9 +379,8 @@ dependencies = [ [[package]] name = "arrow-json" -version = "57.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "371ffd66fa77f71d7628c63f209c9ca5341081051aa32f9c8020feb0def787c0" +version = "57.2.0" +source = "git+https://github.com/jecsand838/arrow-rs?branch=avro-reader-projection#2b527656293781bbea03014c6e55ff5d4559371c" dependencies = [ "arrow-array", "arrow-buffer", @@ -413,9 +402,8 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "57.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cbc94fc7adec5d1ba9e8cd1b1e8d6f72423b33fe978bf1f46d970fafab787521" +version = "57.2.0" +source = "git+https://github.com/jecsand838/arrow-rs?branch=avro-reader-projection#2b527656293781bbea03014c6e55ff5d4559371c" dependencies = [ "arrow-array", "arrow-buffer", @@ -426,9 +414,8 @@ dependencies = [ [[package]] name = "arrow-row" -version = "57.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "169676f317157dc079cc5def6354d16db63d8861d61046d2f3883268ced6f99f" +version = "57.2.0" +source = "git+https://github.com/jecsand838/arrow-rs?branch=avro-reader-projection#2b527656293781bbea03014c6e55ff5d4559371c" dependencies = [ "arrow-array", "arrow-buffer", @@ -439,9 +426,8 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "57.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d27609cd7dd45f006abae27995c2729ef6f4b9361cde1ddd019dc31a5aa017e0" +version = "57.2.0" +source = "git+https://github.com/jecsand838/arrow-rs?branch=avro-reader-projection#2b527656293781bbea03014c6e55ff5d4559371c" dependencies = [ "bitflags 2.9.4", "serde", @@ -451,9 +437,8 @@ dependencies = [ [[package]] name = "arrow-select" -version = "57.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae980d021879ea119dd6e2a13912d81e64abed372d53163e804dfe84639d8010" +version = "57.2.0" +source = "git+https://github.com/jecsand838/arrow-rs?branch=avro-reader-projection#2b527656293781bbea03014c6e55ff5d4559371c" dependencies = [ "ahash 0.8.12", "arrow-array", @@ -465,9 +450,8 @@ dependencies = [ [[package]] name = "arrow-string" -version = "57.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf35e8ef49dcf0c5f6d175edee6b8af7b45611805333129c541a8b89a0fc0534" +version = "57.2.0" +source = "git+https://github.com/jecsand838/arrow-rs?branch=avro-reader-projection#2b527656293781bbea03014c6e55ff5d4559371c" dependencies = [ "arrow-array", "arrow-buffer", @@ -4440,14 +4424,12 @@ dependencies = [ [[package]] name = "parquet" -version = "57.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be3e4f6d320dd92bfa7d612e265d7d08bba0a240bab86af3425e1d255a511d89" +version = "57.2.0" +source = "git+https://github.com/jecsand838/arrow-rs?branch=avro-reader-projection#2b527656293781bbea03014c6e55ff5d4559371c" dependencies = [ "ahash 0.8.12", "arrow-array", "arrow-buffer", - "arrow-cast", "arrow-data", "arrow-ipc", "arrow-schema", diff --git a/Cargo.toml b/Cargo.toml index a5a76ac53616d..6a6e643594640 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -90,25 +90,25 @@ version = "51.0.0" ahash = { version = "0.8", default-features = false, features = [ "runtime-rng", ] } -arrow = { version = "57.1.0", features = [ +arrow = { git = "https://github.com/jecsand838/arrow-rs", branch = "avro-reader-projection", features = [ "prettyprint", "chrono-tz", -] } -arrow-avro = { version = "57.1.0", default-features = false, features = [ +] } # fixme +arrow-avro = { git = "https://github.com/jecsand838/arrow-rs", branch = "avro-reader-projection", default-features = false, features = [ "deflate", "snappy", "zstd", "bzip2", -] } -arrow-buffer = { version = "57.1.0", default-features = false } -arrow-flight = { version = "57.1.0", features = [ +] }# fixme +arrow-buffer = { git = "https://github.com/jecsand838/arrow-rs", branch = "avro-reader-projection", default-features = false }# fixme +arrow-flight = { git = "https://github.com/jecsand838/arrow-rs", branch = "avro-reader-projection", features = [# fixme "flight-sql-experimental", ] } -arrow-ipc = { version = "57.1.0", default-features = false, features = [ +arrow-ipc = { git = "https://github.com/jecsand838/arrow-rs", branch = "avro-reader-projection", default-features = false, features = [# fixme "lz4", ] } -arrow-ord = { version = "57.1.0", default-features = false } -arrow-schema = { version = "57.1.0", default-features = false } +arrow-ord = { git = "https://github.com/jecsand838/arrow-rs", branch = "avro-reader-projection", default-features = false }# fixme +arrow-schema = { git = "https://github.com/jecsand838/arrow-rs", branch = "avro-reader-projection", default-features = false }# fixme async-trait = "0.1.89" bigdecimal = "0.4.8" bytes = "1.11" @@ -171,7 +171,7 @@ log = "^0.4" num-traits = { version = "0.2" } object_store = { version = "0.12.4", default-features = false } parking_lot = "0.12" -parquet = { version = "57.1.0", default-features = false, features = [ +parquet = { git = "https://github.com/jecsand838/arrow-rs", branch = "avro-reader-projection", default-features = false, features = [ # fixme "arrow", "async", "object_store", diff --git a/datafusion/datasource-avro/src/mod.rs b/datafusion/datasource-avro/src/mod.rs index 009750ecb5828..946330166582b 100644 --- a/datafusion/datasource-avro/src/mod.rs +++ b/datafusion/datasource-avro/src/mod.rs @@ -34,188 +34,58 @@ pub mod source; use arrow::datatypes::Schema; pub use arrow_avro; use arrow_avro::reader::ReaderBuilder; -use arrow_avro::schema::SCHEMA_METADATA_KEY; -use datafusion_common::DataFusionError; pub use file_format::*; use std::io::{BufReader, Read}; -use std::sync::Arc; /// Read Avro schema given a reader pub fn read_avro_schema_from_reader( reader: &mut R, ) -> datafusion_common::Result { let avro_reader = ReaderBuilder::new().build(BufReader::new(reader))?; - let schema_ref = avro_reader.schema(); - // Extract the raw Avro JSON schema from the OCF header. - let raw_json = avro_reader - .avro_header() - .get(SCHEMA_METADATA_KEY.as_bytes()) - .map(|bytes| { - std::str::from_utf8(bytes).map_err(|e| { - DataFusionError::Execution(format!( - "Invalid UTF-8 in Avro schema metadata ({SCHEMA_METADATA_KEY}): {e}" - )) - }) - }) - .transpose()? - .map(str::to_owned); - drop(avro_reader); - if let Some(raw_json) = raw_json { - let mut schema = Arc::unwrap_or_clone(schema_ref); - // Insert the raw Avro JSON schema using `SCHEMA_METADATA_KEY`. - // This should enable the avro schema metadata to be picked downstream. - schema - .metadata - .insert(SCHEMA_METADATA_KEY.to_string(), raw_json); - Ok(schema) - } else { - // Return error because Avro spec requires the Avro schema metadata to be present in the OCF header. - Err(DataFusionError::Execution(format!( - "Avro schema metadata ({SCHEMA_METADATA_KEY}) is missing from OCF header" - ))) - } + Ok(avro_reader.schema().as_ref().clone()) } #[cfg(test)] mod test { use super::*; - use arrow::array::{BinaryArray, BooleanArray, Float64Array}; - use arrow::datatypes::DataType; - use arrow_avro::reader::ReaderBuilder; - use arrow_avro::schema::{AvroSchema, SCHEMA_METADATA_KEY}; use datafusion_common::test_util::arrow_test_data; - use datafusion_common::{DataFusionError, Result as DFResult}; - use serde_json::Value; - use std::collections::HashMap; + use datafusion_common::Result as DFResult; use std::fs::File; - use std::io::BufReader; + use arrow::datatypes::{DataType, Field, TimeUnit}; fn avro_test_file(name: &str) -> String { format!("{}/avro/{name}", arrow_test_data()) } #[test] - fn read_avro_schema_includes_avro_json_metadata() -> DFResult<()> { - let path = avro_test_file("alltypes_plain.avro"); - let mut file = File::open(&path)?; - let schema = read_avro_schema_from_reader(&mut file)?; - let meta_json = schema - .metadata() - .get(SCHEMA_METADATA_KEY) - .expect("schema metadata missing avro.schema entry"); - assert!( - !meta_json.is_empty(), - "avro.schema metadata should not be empty" - ); - let mut raw = File::open(&path)?; - let avro_reader = ReaderBuilder::new().build(BufReader::new(&mut raw))?; - let header_json = avro_reader - .avro_header() - .get(SCHEMA_METADATA_KEY.as_bytes()) - .and_then(|bytes| std::str::from_utf8(bytes).ok()) - .expect("missing avro.schema metadata in OCF header"); - assert_eq!( - meta_json, header_json, - "schema metadata avro.schema should match OCF header" - ); - Ok(()) - } - - #[test] - fn read_and_project_using_schema_metadata() -> DFResult<()> { + fn test_read_avro_schema_from_reader() -> DFResult<()> { let path = avro_test_file("alltypes_dictionary.avro"); let mut file = File::open(&path)?; let file_schema = read_avro_schema_from_reader(&mut file)?; - let projected_field_names = vec!["string_col", "double_col", "bool_col"]; - let avro_json = file_schema - .metadata() - .get(SCHEMA_METADATA_KEY) - .expect("schema metadata missing avro.schema entry"); - let projected_avro_schema = - build_projected_reader_schema(avro_json, &projected_field_names)?; - let mut reader = ReaderBuilder::new() - .with_reader_schema(projected_avro_schema) - .with_batch_size(64) - .build(BufReader::new(File::open(&path)?))?; - let batch = reader.next().expect("no batch produced")?; - assert_eq!(3, batch.num_columns()); - assert_eq!(2, batch.num_rows()); - let schema = batch.schema(); - assert_eq!("string_col", schema.field(0).name()); - assert_eq!(&DataType::Binary, schema.field(0).data_type()); - let col = batch - .column(0) - .as_any() - .downcast_ref::() - .expect("column 0 not BinaryArray"); - assert_eq!("0".as_bytes(), col.value(0)); - assert_eq!("1".as_bytes(), col.value(1)); - assert_eq!("double_col", schema.field(1).name()); - assert_eq!(&DataType::Float64, schema.field(1).data_type()); - let col = batch - .column(1) - .as_any() - .downcast_ref::() - .expect("column 1 not Float64Array"); - assert_eq!(0.0, col.value(0)); - assert_eq!(10.1, col.value(1)); - assert_eq!("bool_col", schema.field(2).name()); - assert_eq!(&DataType::Boolean, schema.field(2).data_type()); - let col = batch - .column(2) - .as_any() - .downcast_ref::() - .expect("column 2 not BooleanArray"); - assert!(col.value(0)); - assert!(!col.value(1)); - Ok(()) - } - fn build_projected_reader_schema( - avro_json: &str, - projected_field_names: &[&str], - ) -> DFResult { - let mut schema_json: Value = serde_json::from_str(avro_json).map_err(|e| { - DataFusionError::Execution(format!( - "Failed to parse Avro schema JSON from metadata: {e}" - )) - })?; - let obj = schema_json.as_object_mut().ok_or_else(|| { - DataFusionError::Execution( - "Top-level Avro schema JSON is not an object".to_string(), - ) - })?; - let fields_val = obj.get_mut("fields").ok_or_else(|| { - DataFusionError::Execution( - "Top-level Avro schema JSON has no `fields` key".to_string(), - ) - })?; - let fields = fields_val.as_array_mut().ok_or_else(|| { - DataFusionError::Execution( - "Top-level Avro schema `fields` is not an array".to_string(), - ) - })?; - let mut by_name: HashMap = HashMap::new(); - for field in fields.iter() { - if let Some(name) = field.get("name").and_then(|v| v.as_str()) { - by_name.insert(name.to_string(), field.clone()); - } - } - let mut projected_fields = Vec::with_capacity(projected_field_names.len()); - for name in projected_field_names { - let Some(field) = by_name.get(*name) else { - return Err(DataFusionError::Execution(format!( - "Projected field `{name}` not found in Avro writer schema" - ))); - }; - projected_fields.push(field.clone()); + let expected_fields = vec![ + Field::new("id", DataType::Int32, true), + Field::new("bool_col", DataType::Boolean, true), + Field::new("tinyint_col", DataType::Int32, true), + Field::new("smallint_col", DataType::Int32, true), + Field::new("int_col", DataType::Int32, true), + Field::new("bigint_col", DataType::Int64, true), + Field::new("float_col", DataType::Float32, true), + Field::new("double_col", DataType::Float64, true), + Field::new("date_string_col", DataType::Binary, true), + Field::new("string_col", DataType::Binary, true), + Field::new( + "timestamp_col", + DataType::Timestamp(TimeUnit::Microsecond, Some("+00:00".into())), + true, + ), + ]; + + assert_eq!(file_schema.fields.len(), expected_fields.len()); + for (i, field) in file_schema.fields.iter().enumerate() { + assert_eq!(field.as_ref(), &expected_fields[i]); } - *fields_val = Value::Array(projected_fields); - let projected_json = serde_json::to_string(&schema_json).map_err(|e| { - DataFusionError::Execution(format!( - "Failed to serialize projected Avro schema JSON: {e}" - )) - })?; - Ok(AvroSchema::new(projected_json)) + + Ok(()) } } diff --git a/datafusion/datasource-avro/src/source.rs b/datafusion/datasource-avro/src/source.rs index 17b0b80de144d..455806676ab4f 100644 --- a/datafusion/datasource-avro/src/source.rs +++ b/datafusion/datasource-avro/src/source.rs @@ -18,13 +18,11 @@ //! Execution plan for reading line-delimited Avro files use std::any::Any; -use std::collections::HashMap; use std::sync::Arc; use arrow_avro::reader::{Reader, ReaderBuilder}; -use arrow_avro::schema::{AvroSchema, SCHEMA_METADATA_KEY}; +use arrow_avro::schema::AvroSchema; use datafusion_common::error::Result; -use datafusion_common::DataFusionError; use datafusion_datasource::file::FileSource; use datafusion_datasource::file_scan_config::FileScanConfig; use datafusion_datasource::file_stream::FileOpener; @@ -35,7 +33,6 @@ use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; use datafusion_physical_plan::projection::ProjectionExprs; use object_store::ObjectStore; -use serde_json::Value; /// AvroSource holds the extra configuration that is necessary for opening avro files #[derive(Clone)] @@ -61,86 +58,14 @@ impl AvroSource { } fn open(&self, reader: R) -> Result> { - // TODO: Once `ReaderBuilder::with_projection` is available, we should use it instead. - // This should be an easy change. We'd simply need to: - // 1. Use the full file schema to generate the reader `AvroSchema`. - // 2. Pass `&self.projection` into `ReaderBuilder::with_projection`. - // 3. Remove the `build_projected_reader_schema` methods. ReaderBuilder::new() - .with_reader_schema(self.build_projected_reader_schema()?) + .with_reader_schema(AvroSchema::try_from(self.table_schema.file_schema().as_ref()).unwrap()) .with_batch_size(self.batch_size.expect("Batch size must set before open")) + .with_projection(self.projection.file_indices.clone()) .build(reader) .map_err(Into::into) } - fn build_projected_reader_schema(&self) -> Result { - let file_schema = self.table_schema.file_schema().as_ref(); - // Fast path: no projection. - if self.projection.file_indices.is_empty() { - return Ok(AvroSchema::try_from(file_schema).map_err(Into::::into)?) - } - // Use the writer Avro schema JSON tagged upstream to build a projected reader schema - match file_schema.metadata().get(SCHEMA_METADATA_KEY) { - Some(avro_json) => { - let mut schema_json: Value = - serde_json::from_str(avro_json).map_err(|e| { - DataFusionError::Execution(format!( - "Failed to parse Avro schema JSON from metadata: {e}" - )) - })?; - let obj = schema_json.as_object_mut().ok_or_else(|| { - DataFusionError::Execution( - "Top-level Avro schema JSON must be an object".to_string(), - ) - })?; - let fields_val = obj.get_mut("fields").ok_or_else(|| { - DataFusionError::Execution( - "Top-level Avro schema JSON must contain a `fields` array" - .to_string(), - ) - })?; - let fields_arr = fields_val.as_array_mut().ok_or_else(|| { - DataFusionError::Execution( - "Top-level Avro schema `fields` must be an array".to_string(), - ) - })?; - // Move existing fields out so we can rebuild them in projected order. - let original_fields = std::mem::take(fields_arr); - let mut by_name: HashMap = - HashMap::with_capacity(original_fields.len()); - for field in original_fields { - if let Some(name) = field.get("name").and_then(|v| v.as_str()) { - by_name.insert(name.to_string(), field); - } - } - // Rebuild `fields` in the same order as the projected Arrow schema. - let projection = self.projection.file_indices.as_ref(); - let projected_schema = file_schema.project(projection)?; - let mut projected_fields = - Vec::with_capacity(projected_schema.fields().len()); - for arrow_field in projected_schema.fields() { - let name = arrow_field.name(); - let field = by_name.remove(name).ok_or_else(|| { - DataFusionError::Execution(format!( - "Projected field `{name}` not found in Avro writer schema" - )) - })?; - projected_fields.push(field); - } - *fields_val = Value::Array(projected_fields); - let projected_json = - serde_json::to_string(&schema_json).map_err(|e| { - DataFusionError::Execution(format!( - "Failed to serialize projected Avro schema JSON: {e}" - )) - })?; - Ok(AvroSchema::new(projected_json)) - } - None => Err(DataFusionError::Execution(format!( - "Avro schema metadata ({SCHEMA_METADATA_KEY}) is missing from file schema, but is required for projection" - ))), - } - } } impl FileSource for AvroSource { @@ -267,16 +192,17 @@ mod tests { TimestampMicrosecondArray, }; use arrow::datatypes::{DataType, Field}; - use arrow::datatypes::{Schema, TimeUnit}; + use arrow::datatypes::TimeUnit; use std::fs::File; use std::io::BufReader; - fn build_reader(name: &'_ str, schema: Option<&Schema>) -> Reader> { + fn build_reader(name: &'_ str, projection : Option>) -> Reader> { let testdata = datafusion_common::test_util::arrow_test_data(); let filename = format!("{testdata}/avro/{name}"); - let mut builder = ReaderBuilder::new().with_batch_size(64); - if let Some(schema) = schema { - builder = builder.with_reader_schema(AvroSchema::try_from(schema).unwrap()); + let mut builder = ReaderBuilder::new() + .with_batch_size(64); + if let Some(proj) = projection { + builder = builder.with_projection(proj); } builder .build(BufReader::new(File::open(filename).unwrap())) @@ -382,14 +308,10 @@ mod tests { #[test] fn test_avro_with_projection() { // Test projection to filter and reorder columns - let projected_schema = Schema::new(vec![ - Field::new("string_col", DataType::Binary, false), - Field::new("double_col", DataType::Float64, false), - Field::new("bool_col", DataType::Boolean, false), - ]); + let projection = vec![9, 7, 1]; // string_col, double_col, bool_col let mut reader = - build_reader("alltypes_dictionary.avro", Some(&projected_schema)); + build_reader("alltypes_dictionary.avro", Some(projection)); let batch = reader.next().unwrap().unwrap(); // Only 3 columns should be present (not all 11) From bdd3820c42f5bc1e9a49be01267fdc9495b7be08 Mon Sep 17 00:00:00 2001 From: Namgung Chan <9511chn@gmail.com> Date: Thu, 15 Jan 2026 01:19:37 +0900 Subject: [PATCH 33/35] apply projection feature in arrow-avro --- Cargo.lock | 3 +-- Cargo.toml | 1 + datafusion/datasource-avro/Cargo.toml | 2 -- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 49a1e0d9a22a2..3c041c5445f5b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -268,6 +268,7 @@ dependencies = [ "crc", "flate2", "indexmap 2.12.1", + "liblzma", "rand 0.9.2", "serde", "serde_json", @@ -2070,12 +2071,10 @@ dependencies = [ "bytes", "datafusion-common", "datafusion-datasource", - "datafusion-physical-expr-common", "datafusion-physical-plan", "datafusion-session", "futures", "object_store", - "serde_json", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 6a6e643594640..caee4499d1729 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -99,6 +99,7 @@ arrow-avro = { git = "https://github.com/jecsand838/arrow-rs", branch = "avro-re "snappy", "zstd", "bzip2", + "xz", ] }# fixme arrow-buffer = { git = "https://github.com/jecsand838/arrow-rs", branch = "avro-reader-projection", default-features = false }# fixme arrow-flight = { git = "https://github.com/jecsand838/arrow-rs", branch = "avro-reader-projection", features = [# fixme diff --git a/datafusion/datasource-avro/Cargo.toml b/datafusion/datasource-avro/Cargo.toml index 8f9ed075f8468..28439c46addda 100644 --- a/datafusion/datasource-avro/Cargo.toml +++ b/datafusion/datasource-avro/Cargo.toml @@ -37,12 +37,10 @@ async-trait = { workspace = true } bytes = { workspace = true } datafusion-common = { workspace = true, features = ["object_store"] } datafusion-datasource = { workspace = true } -datafusion-physical-expr-common = { workspace = true } datafusion-physical-plan = { workspace = true } datafusion-session = { workspace = true } futures = { workspace = true } object_store = { workspace = true } -serde_json = { workspace = true } [dev-dependencies] From 9da238802d7e6725870b068a03fd82eaed81fbe7 Mon Sep 17 00:00:00 2001 From: Namgung Chan <9511chn@gmail.com> Date: Fri, 16 Jan 2026 19:36:51 +0900 Subject: [PATCH 34/35] apply projection feature in arrow-avro --- Cargo.lock | 34 +++++++++++++++++----------------- Cargo.toml | 20 ++++++++++---------- 2 files changed, 27 insertions(+), 27 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3c041c5445f5b..0aae6d8c085db 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -206,7 +206,7 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" version = "57.2.0" -source = "git+https://github.com/jecsand838/arrow-rs?branch=avro-reader-projection#2b527656293781bbea03014c6e55ff5d4559371c" +source = "git+https://github.com/apache/arrow-rs?rev=1db1a8869cceb179aa885ed58da9f0b49c03eafe#1db1a8869cceb179aa885ed58da9f0b49c03eafe" dependencies = [ "arrow-arith", "arrow-array", @@ -228,7 +228,7 @@ dependencies = [ [[package]] name = "arrow-arith" version = "57.2.0" -source = "git+https://github.com/jecsand838/arrow-rs?branch=avro-reader-projection#2b527656293781bbea03014c6e55ff5d4559371c" +source = "git+https://github.com/apache/arrow-rs?rev=1db1a8869cceb179aa885ed58da9f0b49c03eafe#1db1a8869cceb179aa885ed58da9f0b49c03eafe" dependencies = [ "arrow-array", "arrow-buffer", @@ -241,7 +241,7 @@ dependencies = [ [[package]] name = "arrow-array" version = "57.2.0" -source = "git+https://github.com/jecsand838/arrow-rs?branch=avro-reader-projection#2b527656293781bbea03014c6e55ff5d4559371c" +source = "git+https://github.com/apache/arrow-rs?rev=1db1a8869cceb179aa885ed58da9f0b49c03eafe#1db1a8869cceb179aa885ed58da9f0b49c03eafe" dependencies = [ "ahash 0.8.12", "arrow-buffer", @@ -259,7 +259,7 @@ dependencies = [ [[package]] name = "arrow-avro" version = "57.2.0" -source = "git+https://github.com/jecsand838/arrow-rs?branch=avro-reader-projection#2b527656293781bbea03014c6e55ff5d4559371c" +source = "git+https://github.com/apache/arrow-rs?rev=1db1a8869cceb179aa885ed58da9f0b49c03eafe#1db1a8869cceb179aa885ed58da9f0b49c03eafe" dependencies = [ "arrow-array", "arrow-buffer", @@ -281,7 +281,7 @@ dependencies = [ [[package]] name = "arrow-buffer" version = "57.2.0" -source = "git+https://github.com/jecsand838/arrow-rs?branch=avro-reader-projection#2b527656293781bbea03014c6e55ff5d4559371c" +source = "git+https://github.com/apache/arrow-rs?rev=1db1a8869cceb179aa885ed58da9f0b49c03eafe#1db1a8869cceb179aa885ed58da9f0b49c03eafe" dependencies = [ "bytes", "half", @@ -292,7 +292,7 @@ dependencies = [ [[package]] name = "arrow-cast" version = "57.2.0" -source = "git+https://github.com/jecsand838/arrow-rs?branch=avro-reader-projection#2b527656293781bbea03014c6e55ff5d4559371c" +source = "git+https://github.com/apache/arrow-rs?rev=1db1a8869cceb179aa885ed58da9f0b49c03eafe#1db1a8869cceb179aa885ed58da9f0b49c03eafe" dependencies = [ "arrow-array", "arrow-buffer", @@ -313,7 +313,7 @@ dependencies = [ [[package]] name = "arrow-csv" version = "57.2.0" -source = "git+https://github.com/jecsand838/arrow-rs?branch=avro-reader-projection#2b527656293781bbea03014c6e55ff5d4559371c" +source = "git+https://github.com/apache/arrow-rs?rev=1db1a8869cceb179aa885ed58da9f0b49c03eafe#1db1a8869cceb179aa885ed58da9f0b49c03eafe" dependencies = [ "arrow-array", "arrow-cast", @@ -327,7 +327,7 @@ dependencies = [ [[package]] name = "arrow-data" version = "57.2.0" -source = "git+https://github.com/jecsand838/arrow-rs?branch=avro-reader-projection#2b527656293781bbea03014c6e55ff5d4559371c" +source = "git+https://github.com/apache/arrow-rs?rev=1db1a8869cceb179aa885ed58da9f0b49c03eafe#1db1a8869cceb179aa885ed58da9f0b49c03eafe" dependencies = [ "arrow-buffer", "arrow-schema", @@ -339,7 +339,7 @@ dependencies = [ [[package]] name = "arrow-flight" version = "57.2.0" -source = "git+https://github.com/jecsand838/arrow-rs?branch=avro-reader-projection#2b527656293781bbea03014c6e55ff5d4559371c" +source = "git+https://github.com/apache/arrow-rs?rev=1db1a8869cceb179aa885ed58da9f0b49c03eafe#1db1a8869cceb179aa885ed58da9f0b49c03eafe" dependencies = [ "arrow-arith", "arrow-array", @@ -366,7 +366,7 @@ dependencies = [ [[package]] name = "arrow-ipc" version = "57.2.0" -source = "git+https://github.com/jecsand838/arrow-rs?branch=avro-reader-projection#2b527656293781bbea03014c6e55ff5d4559371c" +source = "git+https://github.com/apache/arrow-rs?rev=1db1a8869cceb179aa885ed58da9f0b49c03eafe#1db1a8869cceb179aa885ed58da9f0b49c03eafe" dependencies = [ "arrow-array", "arrow-buffer", @@ -381,7 +381,7 @@ dependencies = [ [[package]] name = "arrow-json" version = "57.2.0" -source = "git+https://github.com/jecsand838/arrow-rs?branch=avro-reader-projection#2b527656293781bbea03014c6e55ff5d4559371c" +source = "git+https://github.com/apache/arrow-rs?rev=1db1a8869cceb179aa885ed58da9f0b49c03eafe#1db1a8869cceb179aa885ed58da9f0b49c03eafe" dependencies = [ "arrow-array", "arrow-buffer", @@ -404,7 +404,7 @@ dependencies = [ [[package]] name = "arrow-ord" version = "57.2.0" -source = "git+https://github.com/jecsand838/arrow-rs?branch=avro-reader-projection#2b527656293781bbea03014c6e55ff5d4559371c" +source = "git+https://github.com/apache/arrow-rs?rev=1db1a8869cceb179aa885ed58da9f0b49c03eafe#1db1a8869cceb179aa885ed58da9f0b49c03eafe" dependencies = [ "arrow-array", "arrow-buffer", @@ -416,7 +416,7 @@ dependencies = [ [[package]] name = "arrow-row" version = "57.2.0" -source = "git+https://github.com/jecsand838/arrow-rs?branch=avro-reader-projection#2b527656293781bbea03014c6e55ff5d4559371c" +source = "git+https://github.com/apache/arrow-rs?rev=1db1a8869cceb179aa885ed58da9f0b49c03eafe#1db1a8869cceb179aa885ed58da9f0b49c03eafe" dependencies = [ "arrow-array", "arrow-buffer", @@ -428,7 +428,7 @@ dependencies = [ [[package]] name = "arrow-schema" version = "57.2.0" -source = "git+https://github.com/jecsand838/arrow-rs?branch=avro-reader-projection#2b527656293781bbea03014c6e55ff5d4559371c" +source = "git+https://github.com/apache/arrow-rs?rev=1db1a8869cceb179aa885ed58da9f0b49c03eafe#1db1a8869cceb179aa885ed58da9f0b49c03eafe" dependencies = [ "bitflags 2.9.4", "serde", @@ -439,7 +439,7 @@ dependencies = [ [[package]] name = "arrow-select" version = "57.2.0" -source = "git+https://github.com/jecsand838/arrow-rs?branch=avro-reader-projection#2b527656293781bbea03014c6e55ff5d4559371c" +source = "git+https://github.com/apache/arrow-rs?rev=1db1a8869cceb179aa885ed58da9f0b49c03eafe#1db1a8869cceb179aa885ed58da9f0b49c03eafe" dependencies = [ "ahash 0.8.12", "arrow-array", @@ -452,7 +452,7 @@ dependencies = [ [[package]] name = "arrow-string" version = "57.2.0" -source = "git+https://github.com/jecsand838/arrow-rs?branch=avro-reader-projection#2b527656293781bbea03014c6e55ff5d4559371c" +source = "git+https://github.com/apache/arrow-rs?rev=1db1a8869cceb179aa885ed58da9f0b49c03eafe#1db1a8869cceb179aa885ed58da9f0b49c03eafe" dependencies = [ "arrow-array", "arrow-buffer", @@ -4424,7 +4424,7 @@ dependencies = [ [[package]] name = "parquet" version = "57.2.0" -source = "git+https://github.com/jecsand838/arrow-rs?branch=avro-reader-projection#2b527656293781bbea03014c6e55ff5d4559371c" +source = "git+https://github.com/apache/arrow-rs?rev=1db1a8869cceb179aa885ed58da9f0b49c03eafe#1db1a8869cceb179aa885ed58da9f0b49c03eafe" dependencies = [ "ahash 0.8.12", "arrow-array", diff --git a/Cargo.toml b/Cargo.toml index caee4499d1729..8a8a98d34d244 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -90,26 +90,26 @@ version = "51.0.0" ahash = { version = "0.8", default-features = false, features = [ "runtime-rng", ] } -arrow = { git = "https://github.com/jecsand838/arrow-rs", branch = "avro-reader-projection", features = [ +arrow = { git = "https://github.com/apache/arrow-rs", rev = "1db1a8869cceb179aa885ed58da9f0b49c03eafe", features = [ # fixme "prettyprint", "chrono-tz", -] } # fixme -arrow-avro = { git = "https://github.com/jecsand838/arrow-rs", branch = "avro-reader-projection", default-features = false, features = [ +] } +arrow-avro = { git = "https://github.com/apache/arrow-rs", rev = "1db1a8869cceb179aa885ed58da9f0b49c03eafe", default-features = false, features = [ # fixme "deflate", "snappy", "zstd", "bzip2", "xz", -] }# fixme -arrow-buffer = { git = "https://github.com/jecsand838/arrow-rs", branch = "avro-reader-projection", default-features = false }# fixme -arrow-flight = { git = "https://github.com/jecsand838/arrow-rs", branch = "avro-reader-projection", features = [# fixme +] } +arrow-buffer = { git = "https://github.com/apache/arrow-rs", rev = "1db1a8869cceb179aa885ed58da9f0b49c03eafe", default-features = false } # fixme +arrow-flight = { git = "https://github.com/apache/arrow-rs", rev = "1db1a8869cceb179aa885ed58da9f0b49c03eafe", features = [ # fixme "flight-sql-experimental", ] } -arrow-ipc = { git = "https://github.com/jecsand838/arrow-rs", branch = "avro-reader-projection", default-features = false, features = [# fixme +arrow-ipc = { git = "https://github.com/apache/arrow-rs", rev = "1db1a8869cceb179aa885ed58da9f0b49c03eafe", default-features = false, features = [ # fixme "lz4", ] } -arrow-ord = { git = "https://github.com/jecsand838/arrow-rs", branch = "avro-reader-projection", default-features = false }# fixme -arrow-schema = { git = "https://github.com/jecsand838/arrow-rs", branch = "avro-reader-projection", default-features = false }# fixme +arrow-ord = { git = "https://github.com/apache/arrow-rs", rev = "1db1a8869cceb179aa885ed58da9f0b49c03eafe", default-features = false } # fixme +arrow-schema = { git = "https://github.com/apache/arrow-rs", rev = "1db1a8869cceb179aa885ed58da9f0b49c03eafe", default-features = false } # fixme async-trait = "0.1.89" bigdecimal = "0.4.8" bytes = "1.11" @@ -172,7 +172,7 @@ log = "^0.4" num-traits = { version = "0.2" } object_store = { version = "0.12.4", default-features = false } parking_lot = "0.12" -parquet = { git = "https://github.com/jecsand838/arrow-rs", branch = "avro-reader-projection", default-features = false, features = [ # fixme +parquet = { git = "https://github.com/apache/arrow-rs", rev = "1db1a8869cceb179aa885ed58da9f0b49c03eafe", default-features = false, features = [ # fixme "arrow", "async", "object_store", From ee742eec873b31880347aa08bd6d7f1945eed910 Mon Sep 17 00:00:00 2001 From: Namgung Chan <9511chn@gmail.com> Date: Fri, 16 Jan 2026 19:43:31 +0900 Subject: [PATCH 35/35] resolve conflict --- Cargo.lock | 764 ++++++++++++++++++----------------------------------- Cargo.toml | 89 +++---- 2 files changed, 302 insertions(+), 551 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0aae6d8c085db..1cad87a9a1a79 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -56,17 +56,6 @@ version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" -[[package]] -name = "ahash" -version = "0.7.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "891477e0c6a8957309ee5c45a6368af3ae14bb510732d2684ffa19af310920f9" -dependencies = [ - "getrandom 0.2.16", - "once_cell", - "version_check", -] - [[package]] name = "ahash" version = "0.8.12" @@ -243,7 +232,7 @@ name = "arrow-array" version = "57.2.0" source = "git+https://github.com/apache/arrow-rs?rev=1db1a8869cceb179aa885ed58da9f0b49c03eafe#1db1a8869cceb179aa885ed58da9f0b49c03eafe" dependencies = [ - "ahash 0.8.12", + "ahash", "arrow-buffer", "arrow-data", "arrow-schema", @@ -267,7 +256,7 @@ dependencies = [ "bzip2", "crc", "flate2", - "indexmap 2.12.1", + "indexmap 2.13.0", "liblzma", "rand 0.9.2", "serde", @@ -390,7 +379,7 @@ dependencies = [ "arrow-schema", "chrono", "half", - "indexmap 2.12.1", + "indexmap 2.13.0", "itoa", "lexical-core", "memchr", @@ -430,7 +419,7 @@ name = "arrow-schema" version = "57.2.0" source = "git+https://github.com/apache/arrow-rs?rev=1db1a8869cceb179aa885ed58da9f0b49c03eafe#1db1a8869cceb179aa885ed58da9f0b49c03eafe" dependencies = [ - "bitflags 2.9.4", + "bitflags", "serde", "serde_core", "serde_json", @@ -441,7 +430,7 @@ name = "arrow-select" version = "57.2.0" source = "git+https://github.com/apache/arrow-rs?rev=1db1a8869cceb179aa885ed58da9f0b49c03eafe#1db1a8869cceb179aa885ed58da9f0b49c03eafe" dependencies = [ - "ahash 0.8.12", + "ahash", "arrow-array", "arrow-buffer", "arrow-data", @@ -495,13 +484,12 @@ dependencies = [ [[package]] name = "async-compression" -version = "0.4.35" +version = "0.4.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07a926debf178f2d355197f9caddb08e54a9329d44748034bba349c5848cb519" +checksum = "d10e4f991a553474232bc0a31799f6d24b034a84c0971d80d2e2f78b2e576e40" dependencies = [ "compression-codecs", "compression-core", - "futures-core", "pin-project-lite", "tokio", ] @@ -523,7 +511,7 @@ checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -545,7 +533,7 @@ checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -556,7 +544,7 @@ checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -582,9 +570,9 @@ checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" [[package]] name = "aws-config" -version = "1.8.11" +version = "1.8.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a0149602eeaf915158e14029ba0c78dedb8c08d554b024d54c8f239aab46511d" +checksum = "96571e6996817bf3d58f6b569e4b9fd2e9d2fcf9f7424eed07b2ce9bb87535e5" dependencies = [ "aws-credential-types", "aws-runtime", @@ -612,9 +600,9 @@ dependencies = [ [[package]] name = "aws-credential-types" -version = "1.2.10" +version = "1.2.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b01c9521fa01558f750d183c8c68c81b0155b9d193a4ba7f84c36bd1b6d04a06" +checksum = "3cd362783681b15d136480ad555a099e82ecd8e2d10a841e14dfd0078d67fee3" dependencies = [ "aws-smithy-async", "aws-smithy-runtime-api", @@ -647,9 +635,9 @@ dependencies = [ [[package]] name = "aws-runtime" -version = "1.5.16" +version = "1.5.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ce527fb7e53ba9626fc47824f25e256250556c40d8f81d27dd92aa38239d632" +checksum = "959dab27ce613e6c9658eb3621064d0e2027e5f2acb65bc526a43577facea557" dependencies = [ "aws-credential-types", "aws-sigv4", @@ -671,15 +659,16 @@ dependencies = [ [[package]] name = "aws-sdk-sso" -version = "1.90.0" +version = "1.92.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f18e53542c522459e757f81e274783a78f8c81acdfc8d1522ee8a18b5fb1c66" +checksum = "b7d63bd2bdeeb49aa3f9b00c15e18583503b778b2e792fc06284d54e7d5b6566" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", "aws-smithy-json", + "aws-smithy-observability", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -693,15 +682,16 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.92.0" +version = "1.94.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "532f4d866012ffa724a4385c82e8dd0e59f0ca0e600f3f22d4c03b6824b34e4a" +checksum = "532d93574bf731f311bafb761366f9ece345a0416dbcc273d81d6d1a1205239b" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", "aws-smithy-json", + "aws-smithy-observability", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -715,15 +705,16 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.94.0" +version = "1.96.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1be6fbbfa1a57724788853a623378223fe828fc4c09b146c992f0c95b6256174" +checksum = "357e9a029c7524db6a0099cd77fbd5da165540339e7296cca603531bc783b56c" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", "aws-smithy-json", + "aws-smithy-observability", "aws-smithy-query", "aws-smithy-runtime", "aws-smithy-runtime-api", @@ -738,9 +729,9 @@ dependencies = [ [[package]] name = "aws-sigv4" -version = "1.3.6" +version = "1.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c35452ec3f001e1f2f6db107b6373f1f48f05ec63ba2c5c9fa91f07dad32af11" +checksum = "69e523e1c4e8e7e8ff219d732988e22bfeae8a1cafdbe6d9eca1546fa080be7c" dependencies = [ "aws-credential-types", "aws-smithy-http", @@ -760,9 +751,9 @@ dependencies = [ [[package]] name = "aws-smithy-async" -version = "1.2.6" +version = "1.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "127fcfad33b7dfc531141fda7e1c402ac65f88aca5511a4d31e2e3d2cd01ce9c" +checksum = "9ee19095c7c4dda59f1697d028ce704c24b2d33c6718790c7f1d5a3015b4107c" dependencies = [ "futures-util", "pin-project-lite", @@ -771,9 +762,9 @@ dependencies = [ [[package]] name = "aws-smithy-http" -version = "0.62.5" +version = "0.62.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "445d5d720c99eed0b4aa674ed00d835d9b1427dd73e04adaf2f94c6b2d6f9fca" +checksum = "826141069295752372f8203c17f28e30c464d22899a43a0c9fd9c458d469c88b" dependencies = [ "aws-smithy-runtime-api", "aws-smithy-types", @@ -792,9 +783,9 @@ dependencies = [ [[package]] name = "aws-smithy-http-client" -version = "1.1.4" +version = "1.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "623254723e8dfd535f566ee7b2381645f8981da086b5c4aa26c0c41582bb1d2c" +checksum = "59e62db736db19c488966c8d787f52e6270be565727236fd5579eaa301e7bc4a" dependencies = [ "aws-smithy-async", "aws-smithy-runtime-api", @@ -816,27 +807,27 @@ dependencies = [ [[package]] name = "aws-smithy-json" -version = "0.61.7" +version = "0.61.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2db31f727935fc63c6eeae8b37b438847639ec330a9161ece694efba257e0c54" +checksum = "49fa1213db31ac95288d981476f78d05d9cbb0353d22cdf3472cc05bb02f6551" dependencies = [ "aws-smithy-types", ] [[package]] name = "aws-smithy-observability" -version = "0.1.4" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d1881b1ea6d313f9890710d65c158bdab6fb08c91ea825f74c1c8c357baf4cc" +checksum = "ef1fcbefc7ece1d70dcce29e490f269695dfca2d2bacdeaf9e5c3f799e4e6a42" dependencies = [ "aws-smithy-runtime-api", ] [[package]] name = "aws-smithy-query" -version = "0.60.8" +version = "0.60.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d28a63441360c477465f80c7abac3b9c4d075ca638f982e605b7dc2a2c7156c9" +checksum = "ae5d689cf437eae90460e944a58b5668530d433b4ff85789e69d2f2a556e057d" dependencies = [ "aws-smithy-types", "urlencoding", @@ -844,9 +835,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.9.4" +version = "1.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0bbe9d018d646b96c7be063dd07987849862b0e6d07c778aad7d93d1be6c1ef0" +checksum = "bb5b6167fcdf47399024e81ac08e795180c576a20e4d4ce67949f9a88ae37dc1" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -868,9 +859,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime-api" -version = "1.9.2" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec7204f9fd94749a7c53b26da1b961b4ac36bf070ef1e0b94bb09f79d4f6c193" +checksum = "efce7aaaf59ad53c5412f14fc19b2d5c6ab2c3ec688d272fd31f76ec12f44fb0" dependencies = [ "aws-smithy-async", "aws-smithy-types", @@ -885,9 +876,9 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "1.3.4" +version = "1.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "25f535879a207fce0db74b679cfc3e91a3159c8144d717d55f5832aea9eef46e" +checksum = "65f172bcb02424eb94425db8aed1b6d583b5104d4d5ddddf22402c661a320048" dependencies = [ "base64-simd", "bytes", @@ -908,18 +899,18 @@ dependencies = [ [[package]] name = "aws-smithy-xml" -version = "0.60.12" +version = "0.60.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eab77cdd036b11056d2a30a7af7b775789fb024bf216acc13884c6c97752ae56" +checksum = "11b2f670422ff42bf7065031e72b45bc52a3508bd089f743ea90731ca2b6ea57" dependencies = [ "xmlparser", ] [[package]] name = "aws-types" -version = "1.3.10" +version = "1.3.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d79fb68e3d7fe5d4833ea34dc87d2e97d26d3086cb3da660bb6b1f76d98680b6" +checksum = "1d980627d2dd7bfc32a3c025685a033eeab8d365cc840c631ef59d1b8f428164" dependencies = [ "aws-credential-types", "aws-smithy-async", @@ -1015,7 +1006,7 @@ version = "0.72.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895" dependencies = [ - "bitflags 2.9.4", + "bitflags", "cexpr", "clang-sys", "itertools 0.13.0", @@ -1026,33 +1017,15 @@ dependencies = [ "regex", "rustc-hash", "shlex", - "syn 2.0.111", + "syn 2.0.114", ] -[[package]] -name = "bitflags" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" - [[package]] name = "bitflags" version = "2.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2261d10cca569e4643e526d8dc2e62e433cc8aba21ab764233731f8d369bf394" -[[package]] -name = "bitvec" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bc2832c24239b0141d5674bb9174f9d68a8b5b3f2753311927c172ca46f7e9c" -dependencies = [ - "funty", - "radium", - "tap", - "wyz", -] - [[package]] name = "blake2" version = "0.10.6" @@ -1086,13 +1059,13 @@ dependencies = [ [[package]] name = "bollard" -version = "0.19.3" +version = "0.19.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec7646ee90964aa59e9f832a67182791396a19a5b1d76eb17599a8310a7e2e09" +checksum = "87a52479c9237eb04047ddb94788c41ca0d26eaff8b697ecfbb4c32f7fdc3b1b" dependencies = [ "async-stream", "base64 0.22.1", - "bitflags 2.9.4", + "bitflags", "bollard-buildkit-proto", "bollard-stubs", "bytes", @@ -1161,29 +1134,6 @@ dependencies = [ "serde_with", ] -[[package]] -name = "borsh" -version = "1.5.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad8646f98db542e39fc66e68a20b2144f6a732636df7c2354e74645faaa433ce" -dependencies = [ - "borsh-derive", - "cfg_aliases", -] - -[[package]] -name = "borsh-derive" -version = "1.5.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fdd1d3c0c2f5833f22386f252fe8ed005c7f59fdcddeef025c01b4c3b9fd9ac3" -dependencies = [ - "once_cell", - "proc-macro-crate", - "proc-macro2", - "quote", - "syn 2.0.111", -] - [[package]] name = "brotli" version = "8.0.2" @@ -1221,28 +1171,6 @@ version = "3.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43" -[[package]] -name = "bytecheck" -version = "0.6.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23cdc57ce23ac53c931e88a43d06d070a6fd142f2617be5855eb75efc9beb1c2" -dependencies = [ - "bytecheck_derive", - "ptr_meta", - "simdutf8", -] - -[[package]] -name = "bytecheck_derive" -version = "0.6.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3db406d29fbcd95542e92559bed4d8ad92636d1ca8b3b72ede10b4bcc010e659" -dependencies = [ - "proc-macro2", - "quote", - "syn 1.0.109", -] - [[package]] name = "byteorder" version = "1.5.0" @@ -1375,17 +1303,6 @@ dependencies = [ "libloading 0.8.9", ] -[[package]] -name = "clap" -version = "2.34.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c" -dependencies = [ - "bitflags 1.3.2", - "textwrap", - "unicode-width 0.1.14", -] - [[package]] name = "clap" version = "4.5.53" @@ -1414,10 +1331,10 @@ version = "4.5.49" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2a0b5487afeab2deb2ff4e03a807ad1a03ac532ff5a2cee5d86884440c7f7671" dependencies = [ - "heck 0.5.0", + "heck", "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -1463,9 +1380,9 @@ dependencies = [ [[package]] name = "compression-codecs" -version = "0.4.34" +version = "0.4.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34a3cbbb8b6eca96f3a5c4bf6938d5b27ced3675d69f95bb51948722870bc323" +checksum = "00828ba6fd27b45a448e57dbfe84f1029d4c9f26b368157e9a448a5f49a2ec2a" dependencies = [ "bzip2", "compression-core", @@ -1626,7 +1543,7 @@ dependencies = [ "anes", "cast", "ciborium", - "clap 4.5.53", + "clap", "criterion-plot", "futures", "itertools 0.13.0", @@ -1767,7 +1684,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -1778,7 +1695,7 @@ checksum = "d38308df82d1080de0afee5d069fa14b0326a88c14f15c5ccda35b4a6c414c81" dependencies = [ "darling_core", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -1797,7 +1714,7 @@ dependencies = [ [[package]] name = "datafusion" -version = "51.0.0" +version = "52.0.0" dependencies = [ "arrow", "arrow-schema", @@ -1853,6 +1770,7 @@ dependencies = [ "paste", "rand 0.9.2", "rand_distr", + "recursive", "regex", "rstest", "serde", @@ -1869,9 +1787,10 @@ dependencies = [ [[package]] name = "datafusion-benchmarks" -version = "51.0.0" +version = "52.0.0" dependencies = [ "arrow", + "clap", "datafusion", "datafusion-common", "datafusion-proto", @@ -1887,14 +1806,13 @@ dependencies = [ "serde", "serde_json", "snmalloc-rs", - "structopt", "tokio", "tokio-util", ] [[package]] name = "datafusion-catalog" -version = "51.0.0" +version = "52.0.0" dependencies = [ "arrow", "async-trait", @@ -1917,7 +1835,7 @@ dependencies = [ [[package]] name = "datafusion-catalog-listing" -version = "51.0.0" +version = "52.0.0" dependencies = [ "arrow", "async-trait", @@ -1939,14 +1857,14 @@ dependencies = [ [[package]] name = "datafusion-cli" -version = "51.0.0" +version = "52.0.0" dependencies = [ "arrow", "async-trait", "aws-config", "aws-credential-types", "chrono", - "clap 4.5.53", + "clap", "ctor", "datafusion", "datafusion-common", @@ -1963,7 +1881,6 @@ dependencies = [ "regex", "rstest", "rustyline", - "testcontainers", "testcontainers-modules", "tokio", "url", @@ -1971,16 +1888,17 @@ dependencies = [ [[package]] name = "datafusion-common" -version = "51.0.0" +version = "52.0.0" dependencies = [ - "ahash 0.8.12", + "ahash", "arrow", "arrow-ipc", "chrono", + "criterion", "half", - "hashbrown 0.14.5", + "hashbrown 0.16.1", "hex", - "indexmap 2.12.1", + "indexmap 2.13.0", "insta", "libc", "log", @@ -1996,7 +1914,7 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" -version = "51.0.0" +version = "52.0.0" dependencies = [ "futures", "log", @@ -2005,7 +1923,7 @@ dependencies = [ [[package]] name = "datafusion-datasource" -version = "51.0.0" +version = "52.0.0" dependencies = [ "arrow", "async-compression", @@ -2040,7 +1958,7 @@ dependencies = [ [[package]] name = "datafusion-datasource-arrow" -version = "51.0.0" +version = "52.0.0" dependencies = [ "arrow", "arrow-ipc", @@ -2063,7 +1981,7 @@ dependencies = [ [[package]] name = "datafusion-datasource-avro" -version = "51.0.0" +version = "52.0.0" dependencies = [ "arrow", "arrow-avro", @@ -2079,7 +1997,7 @@ dependencies = [ [[package]] name = "datafusion-datasource-csv" -version = "51.0.0" +version = "52.0.0" dependencies = [ "arrow", "async-trait", @@ -2100,7 +2018,7 @@ dependencies = [ [[package]] name = "datafusion-datasource-json" -version = "51.0.0" +version = "52.0.0" dependencies = [ "arrow", "async-trait", @@ -2120,18 +2038,20 @@ dependencies = [ [[package]] name = "datafusion-datasource-parquet" -version = "51.0.0" +version = "52.0.0" dependencies = [ "arrow", "async-trait", "bytes", "chrono", + "criterion", "datafusion-common", "datafusion-common-runtime", "datafusion-datasource", "datafusion-execution", "datafusion-expr", "datafusion-functions-aggregate-common", + "datafusion-functions-nested", "datafusion-physical-expr", "datafusion-physical-expr-adapter", "datafusion-physical-expr-common", @@ -2144,16 +2064,17 @@ dependencies = [ "object_store", "parking_lot", "parquet", + "tempfile", "tokio", ] [[package]] name = "datafusion-doc" -version = "51.0.0" +version = "52.0.0" [[package]] name = "datafusion-examples" -version = "51.0.0" +version = "52.0.0" dependencies = [ "arrow", "arrow-flight", @@ -2192,7 +2113,7 @@ dependencies = [ [[package]] name = "datafusion-execution" -version = "51.0.0" +version = "52.0.0" dependencies = [ "arrow", "async-trait", @@ -2200,6 +2121,7 @@ dependencies = [ "dashmap", "datafusion-common", "datafusion-expr", + "datafusion-physical-expr-common", "futures", "insta", "log", @@ -2213,7 +2135,7 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "51.0.0" +version = "52.0.0" dependencies = [ "arrow", "async-trait", @@ -2226,7 +2148,7 @@ dependencies = [ "datafusion-functions-window-common", "datafusion-physical-expr-common", "env_logger", - "indexmap 2.12.1", + "indexmap 2.13.0", "insta", "itertools 0.14.0", "paste", @@ -2237,18 +2159,18 @@ dependencies = [ [[package]] name = "datafusion-expr-common" -version = "51.0.0" +version = "52.0.0" dependencies = [ "arrow", "datafusion-common", - "indexmap 2.12.1", + "indexmap 2.13.0", "itertools 0.14.0", "paste", ] [[package]] name = "datafusion-ffi" -version = "51.0.0" +version = "52.0.0" dependencies = [ "abi_stable", "arrow", @@ -2264,6 +2186,7 @@ dependencies = [ "datafusion-functions", "datafusion-functions-aggregate", "datafusion-functions-aggregate-common", + "datafusion-functions-table", "datafusion-functions-window", "datafusion-physical-expr", "datafusion-physical-expr-common", @@ -2281,7 +2204,7 @@ dependencies = [ [[package]] name = "datafusion-functions" -version = "51.0.0" +version = "52.0.0" dependencies = [ "arrow", "arrow-buffer", @@ -2289,6 +2212,7 @@ dependencies = [ "blake2", "blake3", "chrono", + "chrono-tz", "criterion", "ctor", "datafusion-common", @@ -2313,9 +2237,9 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" -version = "51.0.0" +version = "52.0.0" dependencies = [ - "ahash 0.8.12", + "ahash", "arrow", "criterion", "datafusion-common", @@ -2334,9 +2258,9 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate-common" -version = "51.0.0" +version = "52.0.0" dependencies = [ - "ahash 0.8.12", + "ahash", "arrow", "criterion", "datafusion-common", @@ -2347,7 +2271,7 @@ dependencies = [ [[package]] name = "datafusion-functions-nested" -version = "51.0.0" +version = "52.0.0" dependencies = [ "arrow", "arrow-ord", @@ -2370,7 +2294,7 @@ dependencies = [ [[package]] name = "datafusion-functions-table" -version = "51.0.0" +version = "52.0.0" dependencies = [ "arrow", "async-trait", @@ -2384,9 +2308,10 @@ dependencies = [ [[package]] name = "datafusion-functions-window" -version = "51.0.0" +version = "52.0.0" dependencies = [ "arrow", + "criterion", "datafusion-common", "datafusion-doc", "datafusion-expr", @@ -2400,7 +2325,7 @@ dependencies = [ [[package]] name = "datafusion-functions-window-common" -version = "51.0.0" +version = "52.0.0" dependencies = [ "datafusion-common", "datafusion-physical-expr-common", @@ -2408,16 +2333,16 @@ dependencies = [ [[package]] name = "datafusion-macros" -version = "51.0.0" +version = "52.0.0" dependencies = [ "datafusion-doc", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] name = "datafusion-optimizer" -version = "51.0.0" +version = "52.0.0" dependencies = [ "arrow", "async-trait", @@ -2433,7 +2358,7 @@ dependencies = [ "datafusion-physical-expr", "datafusion-sql", "env_logger", - "indexmap 2.12.1", + "indexmap 2.13.0", "insta", "itertools 0.14.0", "log", @@ -2444,9 +2369,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" -version = "51.0.0" +version = "52.0.0" dependencies = [ - "ahash 0.8.12", + "ahash", "arrow", "criterion", "datafusion-common", @@ -2456,21 +2381,22 @@ dependencies = [ "datafusion-functions-aggregate-common", "datafusion-physical-expr-common", "half", - "hashbrown 0.14.5", - "indexmap 2.12.1", + "hashbrown 0.16.1", + "indexmap 2.13.0", "insta", "itertools 0.14.0", "parking_lot", "paste", "petgraph 0.8.3", "rand 0.9.2", + "recursive", "rstest", "tokio", ] [[package]] name = "datafusion-physical-expr-adapter" -version = "51.0.0" +version = "52.0.0" dependencies = [ "arrow", "datafusion-common", @@ -2483,19 +2409,22 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-common" -version = "51.0.0" +version = "52.0.0" dependencies = [ - "ahash 0.8.12", + "ahash", "arrow", + "chrono", "datafusion-common", "datafusion-expr-common", - "hashbrown 0.14.5", + "hashbrown 0.16.1", + "indexmap 2.13.0", "itertools 0.14.0", + "parking_lot", ] [[package]] name = "datafusion-physical-optimizer" -version = "51.0.0" +version = "52.0.0" dependencies = [ "arrow", "datafusion-common", @@ -2515,9 +2444,9 @@ dependencies = [ [[package]] name = "datafusion-physical-plan" -version = "51.0.0" +version = "52.0.0" dependencies = [ - "ahash 0.8.12", + "ahash", "arrow", "arrow-ord", "arrow-schema", @@ -2536,11 +2465,12 @@ dependencies = [ "datafusion-physical-expr-common", "futures", "half", - "hashbrown 0.14.5", - "indexmap 2.12.1", + "hashbrown 0.16.1", + "indexmap 2.13.0", "insta", "itertools 0.14.0", "log", + "num-traits", "parking_lot", "pin-project-lite", "rand 0.9.2", @@ -2551,7 +2481,7 @@ dependencies = [ [[package]] name = "datafusion-proto" -version = "51.0.0" +version = "52.0.0" dependencies = [ "arrow", "async-trait", @@ -2588,7 +2518,7 @@ dependencies = [ [[package]] name = "datafusion-proto-common" -version = "51.0.0" +version = "52.0.0" dependencies = [ "arrow", "datafusion-common", @@ -2600,7 +2530,7 @@ dependencies = [ [[package]] name = "datafusion-pruning" -version = "51.0.0" +version = "52.0.0" dependencies = [ "arrow", "datafusion-common", @@ -2618,7 +2548,7 @@ dependencies = [ [[package]] name = "datafusion-session" -version = "51.0.0" +version = "52.0.0" dependencies = [ "async-trait", "datafusion-common", @@ -2630,7 +2560,7 @@ dependencies = [ [[package]] name = "datafusion-spark" -version = "51.0.0" +version = "52.0.0" dependencies = [ "arrow", "bigdecimal", @@ -2642,6 +2572,7 @@ dependencies = [ "datafusion-execution", "datafusion-expr", "datafusion-functions", + "datafusion-functions-aggregate", "datafusion-functions-nested", "log", "percent-encoding", @@ -2652,7 +2583,7 @@ dependencies = [ [[package]] name = "datafusion-sql" -version = "51.0.0" +version = "52.0.0" dependencies = [ "arrow", "bigdecimal", @@ -2665,7 +2596,7 @@ dependencies = [ "datafusion-functions-nested", "datafusion-functions-window", "env_logger", - "indexmap 2.12.1", + "indexmap 2.13.0", "insta", "itertools 0.14.0", "log", @@ -2678,14 +2609,14 @@ dependencies = [ [[package]] name = "datafusion-sqllogictest" -version = "51.0.0" +version = "52.0.0" dependencies = [ "arrow", "async-trait", "bigdecimal", "bytes", "chrono", - "clap 4.5.53", + "clap", "datafusion", "datafusion-spark", "datafusion-substrait", @@ -2696,14 +2627,11 @@ dependencies = [ "itertools 0.14.0", "log", "object_store", - "postgres-protocol", "postgres-types", "regex", - "rust_decimal", "sqllogictest", "sqlparser", "tempfile", - "testcontainers", "testcontainers-modules", "thiserror", "tokio", @@ -2712,7 +2640,7 @@ dependencies = [ [[package]] name = "datafusion-substrait" -version = "51.0.0" +version = "52.0.0" dependencies = [ "async-recursion", "async-trait", @@ -2734,7 +2662,7 @@ dependencies = [ [[package]] name = "datafusion-wasmtest" -version = "51.0.0" +version = "52.0.0" dependencies = [ "chrono", "console_error_panic_hook", @@ -2809,7 +2737,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -2865,7 +2793,7 @@ dependencies = [ "enum-ordinalize", "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -2903,7 +2831,7 @@ checksum = "0d28318a75d4aead5c4db25382e8ef717932d0346600cacae6357eb5941bc5ff" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -2959,13 +2887,12 @@ checksum = "5692dd7b5a1978a5aeb0ce83b7655c58ca8efdcb79d21036ea249da95afec2c6" [[package]] name = "etcetera" -version = "0.10.0" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26c7b13d0780cb82722fd59f6f57f925e143427e4a75313a6c77243bf5326ae6" +checksum = "de48cc4d1c1d97a20fd819def54b890cadde72ed3ad0c614822a0a433361be96" dependencies = [ "cfg-if", - "home", - "windows-sys 0.59.0", + "windows-sys 0.61.0", ] [[package]] @@ -2991,6 +2918,17 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "ferroid" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb330bbd4cb7a5b9f559427f06f98a4f853a137c8298f3bd3f8ca57663e21986" +dependencies = [ + "portable-atomic", + "rand 0.9.2", + "web-time", +] + [[package]] name = "ffi_example_table_provider" version = "0.1.0" @@ -3016,6 +2954,7 @@ version = "0.1.0" dependencies = [ "abi_stable", "datafusion", + "datafusion-ffi", "ffi_module_interface", "tokio", ] @@ -3050,19 +2989,19 @@ version = "25.2.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1045398c1bfd89168b5fd3f1fc11f6e70b34f6f66300c87d44d3de849463abf1" dependencies = [ - "bitflags 2.9.4", + "bitflags", "rustc_version", ] [[package]] name = "flate2" -version = "1.1.5" +version = "1.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfe33edd8e85a12a67454e37f8c75e730830d83e313556ab9ebf9ee7fbeb3bfb" +checksum = "b375d6465b98090a5f25b1c7703f3859783755aa9a80433b36e0379a3ec2f369" dependencies = [ "crc32fast", - "libz-rs-sys", "miniz_oxide", + "zlib-rs", ] [[package]] @@ -3077,6 +3016,12 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" +[[package]] +name = "foldhash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" + [[package]] name = "form_urlencoded" version = "1.2.2" @@ -3101,12 +3046,6 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" -[[package]] -name = "funty" -version = "2.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c" - [[package]] name = "futures" version = "0.3.31" @@ -3163,7 +3102,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -3295,7 +3234,7 @@ dependencies = [ "futures-core", "futures-sink", "http 1.3.1", - "indexmap 2.12.1", + "indexmap 2.13.0", "slab", "tokio", "tokio-util", @@ -3319,19 +3258,12 @@ name = "hashbrown" version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" -dependencies = [ - "ahash 0.7.8", -] [[package]] name = "hashbrown" version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" -dependencies = [ - "ahash 0.8.12", - "allocator-api2", -] [[package]] name = "hashbrown" @@ -3341,7 +3273,7 @@ checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" dependencies = [ "allocator-api2", "equivalent", - "foldhash", + "foldhash 0.1.5", ] [[package]] @@ -3349,14 +3281,10 @@ name = "hashbrown" version = "0.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" - -[[package]] -name = "heck" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c" dependencies = [ - "unicode-segmentation", + "allocator-api2", + "equivalent", + "foldhash 0.2.0", ] [[package]] @@ -3720,9 +3648,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.12.1" +version = "2.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ad4bb2b565bca0645f4d68c5c9af97fba094e9791da685bf83cb5f3ce74acf2" +checksum = "7714e70437a7dc3ac8eb7e6f8df75fd8eb422675fc7678aff7364301092b1017" dependencies = [ "equivalent", "hashbrown 0.16.1", @@ -3745,9 +3673,9 @@ dependencies = [ [[package]] name = "insta" -version = "1.44.3" +version = "1.46.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5c943d4415edd8153251b6f197de5eb1640e56d84e8d9159bea190421c73698" +checksum = "248b42847813a1550dafd15296fd9748c651d0c32194559dbc05d804d54b21e8" dependencies = [ "console 0.15.11", "globset", @@ -3755,6 +3683,7 @@ dependencies = [ "regex", "serde", "similar", + "tempfile", "walkdir", ] @@ -3842,7 +3771,7 @@ checksum = "03343451ff899767262ec32146f6d559dd759fdadf42ff0e227c7c48f72594b4" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -3936,9 +3865,9 @@ checksum = "2c4a545a15244c7d945065b5d392b2d2d7f21526fba56ce51467b06ed445e8f7" [[package]] name = "libc" -version = "0.2.177" +version = "0.2.180" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2874a2af47a2325c2001a6e6fad9b16a53b802102b528163885171cf92b15976" +checksum = "bcc35a38544a891a5f7c865aca548a982ccb3b8650a5b06d0fd33a10283c56fc" [[package]] name = "libloading" @@ -4003,7 +3932,7 @@ version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "416f7e718bdb06000964960ffa43b4335ad4012ae8b99060261aa4a8088d5ccb" dependencies = [ - "bitflags 2.9.4", + "bitflags", "libc", "redox_syscall", ] @@ -4016,19 +3945,10 @@ checksum = "5297962ef19edda4ce33aaa484386e0a5b3d7f2f4e037cbeee00503ef6b29d33" dependencies = [ "anstream", "anstyle", - "clap 4.5.53", + "clap", "escape8259", ] -[[package]] -name = "libz-rs-sys" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "840db8cf39d9ec4dd794376f38acc40d0fc65eec2a8f484f7fd375b84602becd" -dependencies = [ - "zlib-rs", -] - [[package]] name = "linux-raw-sys" version = "0.11.0" @@ -4167,7 +4087,7 @@ version = "0.30.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "74523f3a35e05aba87a1d978330aef40f67b0304ac79c1c00b294c9830543db6" dependencies = [ - "bitflags 2.9.4", + "bitflags", "cfg-if", "cfg_aliases", "libc", @@ -4287,7 +4207,7 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1c10c2894a6fed806ade6027bcd50662746363a9589d3ec9d9bef30a4e4bc166" dependencies = [ - "bitflags 2.9.4", + "bitflags", ] [[package]] @@ -4426,7 +4346,7 @@ name = "parquet" version = "57.2.0" source = "git+https://github.com/apache/arrow-rs?rev=1db1a8869cceb179aa885ed58da9f0b49c03eafe#1db1a8869cceb179aa885ed58da9f0b49c03eafe" dependencies = [ - "ahash 0.8.12", + "ahash", "arrow-array", "arrow-buffer", "arrow-data", @@ -4479,7 +4399,7 @@ dependencies = [ "regex", "regex-syntax", "structmeta", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -4504,7 +4424,7 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "af22d08a625a2213a78dbb0ffa253318c5c79ce3133d32d296655a7bdfb02095" dependencies = [ - "heck 0.5.0", + "heck", "itertools 0.14.0", "prost", "prost-types", @@ -4538,7 +4458,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772" dependencies = [ "fixedbitset", - "indexmap 2.12.1", + "indexmap 2.13.0", ] [[package]] @@ -4549,7 +4469,7 @@ checksum = "8701b58ea97060d5e5b155d383a69952a60943f0e6dfe30b04c287beb0b27455" dependencies = [ "fixedbitset", "hashbrown 0.15.5", - "indexmap 2.12.1", + "indexmap 2.13.0", "serde", ] @@ -4607,7 +4527,7 @@ checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -4658,9 +4578,9 @@ dependencies = [ [[package]] name = "portable-atomic" -version = "1.11.1" +version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f84267b20a16ea918e43c6a88433c2d54fa145c92a811b5b047ccbe153674483" +checksum = "f89776e4d69bb58bc6993e99ffa1d11f228b839984854c7daeb5d37f87cbe950" [[package]] name = "portable-atomic-util" @@ -4677,10 +4597,10 @@ version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "56df96f5394370d1b20e49de146f9e6c25aa9ae750f449c9d665eafecb3ccae6" dependencies = [ - "heck 0.5.0", + "heck", "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -4755,7 +4675,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" dependencies = [ "proc-macro2", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -4767,30 +4687,6 @@ dependencies = [ "toml_edit", ] -[[package]] -name = "proc-macro-error" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" -dependencies = [ - "proc-macro-error-attr", - "proc-macro2", - "quote", - "syn 1.0.109", - "version_check", -] - -[[package]] -name = "proc-macro-error-attr" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" -dependencies = [ - "proc-macro2", - "quote", - "version_check", -] - [[package]] name = "proc-macro2" version = "1.0.101" @@ -4816,7 +4712,7 @@ version = "0.14.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac6c3320f9abac597dcbc668774ef006702672474aad53c6d596b62e487b40b1" dependencies = [ - "heck 0.5.0", + "heck", "itertools 0.14.0", "log", "multimap", @@ -4826,7 +4722,7 @@ dependencies = [ "prost", "prost-types", "regex", - "syn 2.0.111", + "syn 2.0.114", "tempfile", ] @@ -4840,7 +4736,7 @@ dependencies = [ "itertools 0.14.0", "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -4870,26 +4766,6 @@ dependencies = [ "cc", ] -[[package]] -name = "ptr_meta" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0738ccf7ea06b608c10564b31debd4f5bc5e197fc8bfe088f68ae5ce81e7a4f1" -dependencies = [ - "ptr_meta_derive", -] - -[[package]] -name = "ptr_meta_derive" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16b845dbfca988fa33db069c0e230574d15a3088f147a87b64c7589eb662c9ac" -dependencies = [ - "proc-macro2", - "quote", - "syn 1.0.109", -] - [[package]] name = "quick-xml" version = "0.38.3" @@ -4970,12 +4846,6 @@ version = "5.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" -[[package]] -name = "radium" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc33ff2d4973d518d823d61aa239014831e521c75da58e3df4840d3f47749d09" - [[package]] name = "radix_trie" version = "0.2.1" @@ -5092,7 +4962,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" dependencies = [ "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -5101,7 +4971,7 @@ version = "0.5.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5407465600fb0548f1442edf71dd20683c6ed326200ace4b1ef0763521bb3b77" dependencies = [ - "bitflags 2.9.4", + "bitflags", ] [[package]] @@ -5132,7 +5002,7 @@ checksum = "1165225c21bff1f3bbce98f5a1f889949bc902d3575308cc7b0de30b4f6d27c7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -5186,15 +5056,6 @@ version = "1.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ba39f3699c378cd8970968dcbff9c43159ea4cfbd88d43c00b22f2ef10a435d2" -[[package]] -name = "rend" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "71fe3824f5629716b1589be05dacd749f6aa084c87e00e016714a8cdfccc997c" -dependencies = [ - "bytecheck", -] - [[package]] name = "repr_offset" version = "0.2.2" @@ -5260,35 +5121,6 @@ dependencies = [ "windows-sys 0.52.0", ] -[[package]] -name = "rkyv" -version = "0.7.45" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9008cd6385b9e161d8229e1f6549dd23c3d022f132a2ea37ac3a10ac4935779b" -dependencies = [ - "bitvec", - "bytecheck", - "bytes", - "hashbrown 0.12.3", - "ptr_meta", - "rend", - "rkyv_derive", - "seahash", - "tinyvec", - "uuid", -] - -[[package]] -name = "rkyv_derive" -version = "0.7.45" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "503d1d27590a2b0a3a4ca4c94755aa2875657196ecbf401a42eff41d7de532c0" -dependencies = [ - "proc-macro2", - "quote", - "syn 1.0.109", -] - [[package]] name = "rstest" version = "0.26.1" @@ -5314,7 +5146,7 @@ dependencies = [ "regex", "relative-path", "rustc_version", - "syn 2.0.111", + "syn 2.0.114", "unicode-ident", ] @@ -5326,24 +5158,7 @@ checksum = "b3a8fb4672e840a587a66fc577a5491375df51ddb88f2a2c2a792598c326fe14" dependencies = [ "quote", "rand 0.8.5", - "syn 2.0.111", -] - -[[package]] -name = "rust_decimal" -version = "1.38.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8975fc98059f365204d635119cf9c5a60ae67b841ed49b5422a9a7e56cdfac0" -dependencies = [ - "arrayvec", - "borsh", - "bytes", - "num-traits", - "postgres-types", - "rand 0.8.5", - "rkyv", - "serde", - "serde_json", + "syn 2.0.114", ] [[package]] @@ -5367,7 +5182,7 @@ version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cd15f8a2c5551a84d56efdc1cd049089e409ac19a3072d5037a17fd70719ff3e" dependencies = [ - "bitflags 2.9.4", + "bitflags", "errno", "libc", "linux-raw-sys", @@ -5445,7 +5260,7 @@ version = "17.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e902948a25149d50edc1a8e0141aad50f54e22ba83ff988cf8f7c9ef07f50564" dependencies = [ - "bitflags 2.9.4", + "bitflags", "cfg-if", "clipboard-win", "fd-lock", @@ -5530,7 +5345,7 @@ dependencies = [ "proc-macro2", "quote", "serde_derive_internals", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -5539,19 +5354,13 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" -[[package]] -name = "seahash" -version = "4.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c107b6f4780854c8b126e228ea8869f4d7b71260f962fefb57b996b8959ba6b" - [[package]] name = "security-framework" version = "3.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cc198e42d9b7510827939c9a15f5062a0c913f3371d765977e586d2fe6c16f4a" dependencies = [ - "bitflags 2.9.4", + "bitflags", "core-foundation", "core-foundation-sys", "libc", @@ -5611,7 +5420,7 @@ checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -5622,7 +5431,7 @@ checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -5646,7 +5455,7 @@ checksum = "175ee3e80ae9982737ca543e96133087cbd9a485eecc3bc4de9c1a37b47ea59c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -5658,7 +5467,7 @@ dependencies = [ "proc-macro2", "quote", "serde", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -5683,7 +5492,7 @@ dependencies = [ "chrono", "hex", "indexmap 1.9.3", - "indexmap 2.12.1", + "indexmap 2.13.0", "schemars 0.9.0", "schemars 1.0.4", "serde", @@ -5702,7 +5511,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -5711,7 +5520,7 @@ version = "0.9.34+deprecated" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47" dependencies = [ - "indexmap 2.12.1", + "indexmap 2.13.0", "itoa", "ryu", "serde", @@ -5836,9 +5645,9 @@ dependencies = [ [[package]] name = "sqllogictest" -version = "0.28.4" +version = "0.29.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3566426f72a13e393aa34ca3d542c5b0eb86da4c0db137ee9b5cfccc6179e52d" +checksum = "dffbf03091090a9330529c3926313be0a0570f036edfd490b11db39eea4b7118" dependencies = [ "async-trait", "educe", @@ -5861,9 +5670,9 @@ dependencies = [ [[package]] name = "sqlparser" -version = "0.59.0" +version = "0.60.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4591acadbcf52f0af60eafbb2c003232b2b4cd8de5f0e9437cb8b1b59046cc0f" +checksum = "505aa16b045c4c1375bf5f125cce3813d0176325bfe9ffc4a903f423de7774ff" dependencies = [ "log", "recursive", @@ -5872,13 +5681,13 @@ dependencies = [ [[package]] name = "sqlparser_derive" -version = "0.3.0" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da5fc6819faabb412da764b99d3b713bb55083c11e7e0c00144d386cd6a1939c" +checksum = "028e551d5e270b31b9f3ea271778d9d827148d4287a5d96167b6bb9787f5cc38" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -5926,7 +5735,7 @@ dependencies = [ "proc-macro2", "quote", "structmeta-derive", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -5937,31 +5746,7 @@ checksum = "152a0b65a590ff6c3da95cabe2353ee04e6167c896b28e3b14478c2636c922fc" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", -] - -[[package]] -name = "structopt" -version = "0.3.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c6b5c64445ba8094a6ab0c3cd2ad323e07171012d9c98b0b15651daf1787a10" -dependencies = [ - "clap 2.34.0", - "lazy_static", - "structopt-derive", -] - -[[package]] -name = "structopt-derive" -version = "0.4.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dcb5ae327f9cc13b68763b5749770cb9e048a99bd9dfdfa58d0cf05d5f64afe0" -dependencies = [ - "heck 0.3.3", - "proc-macro-error", - "proc-macro2", - "quote", - "syn 1.0.109", + "syn 2.0.114", ] [[package]] @@ -5982,11 +5767,11 @@ version = "0.26.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" dependencies = [ - "heck 0.5.0", + "heck", "proc-macro2", "quote", "rustversion", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -5995,10 +5780,10 @@ version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7695ce3845ea4b33927c055a39dc438a45b059f7c1b3d91d38d10355fb8cbca7" dependencies = [ - "heck 0.5.0", + "heck", "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -6017,7 +5802,7 @@ version = "0.62.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "21f1cb6d0bcd097a39fc25f7236236be29881fe122e282e4173d6d007a929927" dependencies = [ - "heck 0.5.0", + "heck", "pbjson", "pbjson-build", "pbjson-types", @@ -6032,7 +5817,7 @@ dependencies = [ "serde", "serde_json", "serde_yaml", - "syn 2.0.111", + "syn 2.0.114", "typify", "walkdir", ] @@ -6056,9 +5841,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.111" +version = "2.0.114" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "390cc9a294ab71bdb1aa2e99d13be9c753cd2d7bd6560c77118597410c4d2e87" +checksum = "d4d107df263a3013ef9b1879b0df87d706ff80f65a86ea879bd9c31f9b307c2a" dependencies = [ "proc-macro2", "quote", @@ -6082,7 +5867,7 @@ checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -6099,12 +5884,6 @@ dependencies = [ "windows", ] -[[package]] -name = "tap" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" - [[package]] name = "tempfile" version = "3.23.0" @@ -6131,9 +5910,9 @@ dependencies = [ [[package]] name = "testcontainers" -version = "0.25.2" +version = "0.26.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f3ac71069f20ecfa60c396316c283fbf35e6833a53dff551a31b5458da05edc" +checksum = "a81ec0158db5fbb9831e09d1813fe5ea9023a2b5e6e8e0a5fe67e2a820733629" dependencies = [ "astral-tokio-tar", "async-trait", @@ -6142,7 +5921,9 @@ dependencies = [ "docker_credential", "either", "etcetera", + "ferroid", "futures", + "itertools 0.14.0", "log", "memchr", "parse-display", @@ -6154,28 +5935,18 @@ dependencies = [ "tokio", "tokio-stream", "tokio-util", - "ulid", "url", ] [[package]] name = "testcontainers-modules" -version = "0.13.0" +version = "0.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1966329d5bb3f89d33602d2db2da971fb839f9297dad16527abf4564e2ae0a6d" +checksum = "5e75e78ff453128a2c7da9a5d5a3325ea34ea214d4bf51eab3417de23a4e5147" dependencies = [ "testcontainers", ] -[[package]] -name = "textwrap" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" -dependencies = [ - "unicode-width 0.1.14", -] - [[package]] name = "thiserror" version = "2.0.17" @@ -6193,7 +5964,7 @@ checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -6316,7 +6087,7 @@ checksum = "af407857209536a95c8e56f8231ef2c2e2aff839b22e07a1ffcbc617e9db9fa5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -6394,7 +6165,7 @@ version = "0.23.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f3effe7c0e86fdff4f69cdd2ccc1b96f933e24811c5441d44904e8683e27184b" dependencies = [ - "indexmap 2.12.1", + "indexmap 2.13.0", "toml_datetime", "toml_parser", "winnow", @@ -6457,7 +6228,7 @@ checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9" dependencies = [ "futures-core", "futures-util", - "indexmap 2.12.1", + "indexmap 2.13.0", "pin-project-lite", "slab", "sync_wrapper", @@ -6474,7 +6245,7 @@ version = "0.6.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "adc82fd73de2a9722ac5da747f12383d2bfdb93591ee6c58486e0097890f05f2" dependencies = [ - "bitflags 2.9.4", + "bitflags", "bytes", "futures-util", "http 1.3.1", @@ -6517,7 +6288,7 @@ checksum = "81383ab64e72a7a8b8e13130c49e3dab29def6d0c7d76a03087b3cf71c5c6903" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -6616,7 +6387,7 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a1eb359f7ffa4f9ebe947fa11a1b2da054564502968db5f317b7e37693cb2240" dependencies = [ - "heck 0.5.0", + "heck", "log", "proc-macro2", "quote", @@ -6625,7 +6396,7 @@ dependencies = [ "semver", "serde", "serde_json", - "syn 2.0.111", + "syn 2.0.114", "thiserror", "unicode-ident", ] @@ -6643,20 +6414,10 @@ dependencies = [ "serde", "serde_json", "serde_tokenstream", - "syn 2.0.111", + "syn 2.0.114", "typify-impl", ] -[[package]] -name = "ulid" -version = "1.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "470dbf6591da1b39d43c14523b2b469c86879a53e8b758c8e090a470fe7b1fbe" -dependencies = [ - "rand 0.9.2", - "web-time", -] - [[package]] name = "unicode-bidi" version = "0.3.18" @@ -6899,7 +6660,7 @@ dependencies = [ "bumpalo", "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", "wasm-bindgen-shared", ] @@ -6941,7 +6702,7 @@ checksum = "7150335716dce6028bead2b848e72f47b45e7b9422f64cccdc23bedca89affc1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -7082,7 +6843,7 @@ checksum = "a47fddd13af08290e67f4acabf4b459f647552718f683a7b415d290ac744a836" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -7093,7 +6854,7 @@ checksum = "bd9211b69f8dcdfa817bfd14bf1c97c9188afa36f4750130fcdf3f400eca9fa8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -7331,15 +7092,6 @@ version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ea2f10b9bb0928dfb1b42b65e1f9e36f7f54dbdf08457afefb38afcdec4fa2bb" -[[package]] -name = "wyz" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05f360fc0b24296329c78fda852a1e9ae82de9cf7b27dae4b7f62f118f77b9ed" -dependencies = [ - "tap", -] - [[package]] name = "xattr" version = "1.6.1" @@ -7382,7 +7134,7 @@ checksum = "38da3c9736e16c5d3c8c597a9aaa5d1fa565d0532ae05e27c24aa62fb32c0ab6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", "synstructure", ] @@ -7403,7 +7155,7 @@ checksum = "88d2b8d9c68ad2b9e4340d7832716a4d21a22a1154777ad56ea55c51a9cf3831" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] @@ -7423,7 +7175,7 @@ checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", "synstructure", ] @@ -7463,14 +7215,14 @@ checksum = "5b96237efa0c878c64bd89c436f661be4e46b2f3eff1ebb976f7ef2321d2f58f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.111", + "syn 2.0.114", ] [[package]] name = "zlib-rs" -version = "0.5.2" +version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f06ae92f42f5e5c42443fd094f245eb656abf56dd7cce9b8b263236565e00f2" +checksum = "40990edd51aae2c2b6907af74ffb635029d5788228222c4bb811e9351c0caad3" [[package]] name = "zstd" diff --git a/Cargo.toml b/Cargo.toml index 8a8a98d34d244..dcdb7599278f4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -79,7 +79,7 @@ repository = "https://github.com/apache/datafusion" # Define Minimum Supported Rust Version (MSRV) rust-version = "1.88.0" # Define DataFusion version -version = "51.0.0" +version = "52.0.0" [workspace.dependencies] # We turn off default-features for some dependencies here so the workspaces which inherit them can @@ -118,54 +118,54 @@ chrono = { version = "0.4.42", default-features = false } criterion = "0.8" ctor = "0.6.3" dashmap = "6.0.1" -datafusion = { path = "datafusion/core", version = "51.0.0", default-features = false } -datafusion-catalog = { path = "datafusion/catalog", version = "51.0.0" } -datafusion-catalog-listing = { path = "datafusion/catalog-listing", version = "51.0.0" } -datafusion-common = { path = "datafusion/common", version = "51.0.0", default-features = false } -datafusion-common-runtime = { path = "datafusion/common-runtime", version = "51.0.0" } -datafusion-datasource = { path = "datafusion/datasource", version = "51.0.0", default-features = false } -datafusion-datasource-arrow = { path = "datafusion/datasource-arrow", version = "51.0.0", default-features = false } -datafusion-datasource-avro = { path = "datafusion/datasource-avro", version = "51.0.0", default-features = false } -datafusion-datasource-csv = { path = "datafusion/datasource-csv", version = "51.0.0", default-features = false } -datafusion-datasource-json = { path = "datafusion/datasource-json", version = "51.0.0", default-features = false } -datafusion-datasource-parquet = { path = "datafusion/datasource-parquet", version = "51.0.0", default-features = false } -datafusion-doc = { path = "datafusion/doc", version = "51.0.0" } -datafusion-execution = { path = "datafusion/execution", version = "51.0.0", default-features = false } -datafusion-expr = { path = "datafusion/expr", version = "51.0.0", default-features = false } -datafusion-expr-common = { path = "datafusion/expr-common", version = "51.0.0" } -datafusion-ffi = { path = "datafusion/ffi", version = "51.0.0" } -datafusion-functions = { path = "datafusion/functions", version = "51.0.0" } -datafusion-functions-aggregate = { path = "datafusion/functions-aggregate", version = "51.0.0" } -datafusion-functions-aggregate-common = { path = "datafusion/functions-aggregate-common", version = "51.0.0" } -datafusion-functions-nested = { path = "datafusion/functions-nested", version = "51.0.0", default-features = false } -datafusion-functions-table = { path = "datafusion/functions-table", version = "51.0.0" } -datafusion-functions-window = { path = "datafusion/functions-window", version = "51.0.0" } -datafusion-functions-window-common = { path = "datafusion/functions-window-common", version = "51.0.0" } -datafusion-macros = { path = "datafusion/macros", version = "51.0.0" } -datafusion-optimizer = { path = "datafusion/optimizer", version = "51.0.0", default-features = false } -datafusion-physical-expr = { path = "datafusion/physical-expr", version = "51.0.0", default-features = false } -datafusion-physical-expr-adapter = { path = "datafusion/physical-expr-adapter", version = "51.0.0", default-features = false } -datafusion-physical-expr-common = { path = "datafusion/physical-expr-common", version = "51.0.0", default-features = false } -datafusion-physical-optimizer = { path = "datafusion/physical-optimizer", version = "51.0.0" } -datafusion-physical-plan = { path = "datafusion/physical-plan", version = "51.0.0" } -datafusion-proto = { path = "datafusion/proto", version = "51.0.0" } -datafusion-proto-common = { path = "datafusion/proto-common", version = "51.0.0" } -datafusion-pruning = { path = "datafusion/pruning", version = "51.0.0" } -datafusion-session = { path = "datafusion/session", version = "51.0.0" } -datafusion-spark = { path = "datafusion/spark", version = "51.0.0" } -datafusion-sql = { path = "datafusion/sql", version = "51.0.0" } -datafusion-substrait = { path = "datafusion/substrait", version = "51.0.0" } +datafusion = { path = "datafusion/core", version = "52.0.0", default-features = false } +datafusion-catalog = { path = "datafusion/catalog", version = "52.0.0" } +datafusion-catalog-listing = { path = "datafusion/catalog-listing", version = "52.0.0" } +datafusion-common = { path = "datafusion/common", version = "52.0.0", default-features = false } +datafusion-common-runtime = { path = "datafusion/common-runtime", version = "52.0.0" } +datafusion-datasource = { path = "datafusion/datasource", version = "52.0.0", default-features = false } +datafusion-datasource-arrow = { path = "datafusion/datasource-arrow", version = "52.0.0", default-features = false } +datafusion-datasource-avro = { path = "datafusion/datasource-avro", version = "52.0.0", default-features = false } +datafusion-datasource-csv = { path = "datafusion/datasource-csv", version = "52.0.0", default-features = false } +datafusion-datasource-json = { path = "datafusion/datasource-json", version = "52.0.0", default-features = false } +datafusion-datasource-parquet = { path = "datafusion/datasource-parquet", version = "52.0.0", default-features = false } +datafusion-doc = { path = "datafusion/doc", version = "52.0.0" } +datafusion-execution = { path = "datafusion/execution", version = "52.0.0", default-features = false } +datafusion-expr = { path = "datafusion/expr", version = "52.0.0", default-features = false } +datafusion-expr-common = { path = "datafusion/expr-common", version = "52.0.0" } +datafusion-ffi = { path = "datafusion/ffi", version = "52.0.0" } +datafusion-functions = { path = "datafusion/functions", version = "52.0.0" } +datafusion-functions-aggregate = { path = "datafusion/functions-aggregate", version = "52.0.0" } +datafusion-functions-aggregate-common = { path = "datafusion/functions-aggregate-common", version = "52.0.0" } +datafusion-functions-nested = { path = "datafusion/functions-nested", version = "52.0.0", default-features = false } +datafusion-functions-table = { path = "datafusion/functions-table", version = "52.0.0" } +datafusion-functions-window = { path = "datafusion/functions-window", version = "52.0.0" } +datafusion-functions-window-common = { path = "datafusion/functions-window-common", version = "52.0.0" } +datafusion-macros = { path = "datafusion/macros", version = "52.0.0" } +datafusion-optimizer = { path = "datafusion/optimizer", version = "52.0.0", default-features = false } +datafusion-physical-expr = { path = "datafusion/physical-expr", version = "52.0.0", default-features = false } +datafusion-physical-expr-adapter = { path = "datafusion/physical-expr-adapter", version = "52.0.0", default-features = false } +datafusion-physical-expr-common = { path = "datafusion/physical-expr-common", version = "52.0.0", default-features = false } +datafusion-physical-optimizer = { path = "datafusion/physical-optimizer", version = "52.0.0" } +datafusion-physical-plan = { path = "datafusion/physical-plan", version = "52.0.0" } +datafusion-proto = { path = "datafusion/proto", version = "52.0.0" } +datafusion-proto-common = { path = "datafusion/proto-common", version = "52.0.0" } +datafusion-pruning = { path = "datafusion/pruning", version = "52.0.0" } +datafusion-session = { path = "datafusion/session", version = "52.0.0" } +datafusion-spark = { path = "datafusion/spark", version = "52.0.0" } +datafusion-sql = { path = "datafusion/sql", version = "52.0.0" } +datafusion-substrait = { path = "datafusion/substrait", version = "52.0.0" } doc-comment = "0.3" env_logger = "0.11" -flate2 = "1.1.5" +flate2 = "1.1.8" futures = "0.3" glob = "0.3.0" half = { version = "2.7.0", default-features = false } -hashbrown = { version = "0.14.5", features = ["raw"] } +hashbrown = { version = "0.16.1" } hex = { version = "0.4.3" } -indexmap = "2.12.1" -insta = { version = "1.44.3", features = ["glob", "filters"] } +indexmap = "2.13.0" +insta = { version = "1.46.0", features = ["glob", "filters"] } itertools = "0.14" liblzma = { version = "0.4.4", features = ["static"] } log = "^0.4" @@ -187,12 +187,11 @@ recursive = "0.1.1" regex = "1.12" rstest = "0.26.1" serde_json = "1" -sqlparser = { version = "0.59.0", default-features = false, features = ["std", "visitor"] } +sqlparser = { version = "0.60.0", default-features = false, features = ["std", "visitor"] } strum = "0.27.2" strum_macros = "0.27.2" tempfile = "3" -testcontainers = { version = "0.25.2", features = ["default"] } -testcontainers-modules = { version = "0.13" } +testcontainers-modules = { version = "0.14" } tokio = { version = "1.48", features = ["macros", "rt", "sync"] } url = "2.5.7" zstd = { version = "0.13", default-features = false }