Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
e27e1f8
draft: using arrow-avro
getChan Oct 1, 2025
027e79d
draft: using arrow-avro
getChan Oct 1, 2025
2bf2c89
Merge remote-tracking branch 'origin/main' into arrow-avro
getChan Oct 19, 2025
2c7e364
cargo fmt
getChan Oct 19, 2025
7c9f976
check public API's can be replaced
getChan Oct 19, 2025
7bc24ea
revert arrow dependency
getChan Oct 27, 2025
185ede5
Merge remote-tracking branch 'origin/main' into arrow-avro
getChan Oct 27, 2025
51557fe
apply arrow-avro 57.0.0
getChan Oct 27, 2025
8e12f47
BufReader
getChan Oct 29, 2025
83ec01d
remove apache-avro dependencies
getChan Oct 29, 2025
f37fcd3
Merge remote-tracking branch 'origin/main' into arrow-avro
getChan Oct 29, 2025
b12f563
remove apache-avro dependencies
getChan Oct 29, 2025
581ecae
remove apache-avro dependencies
getChan Oct 29, 2025
74792ff
fix test result
getChan Oct 30, 2025
151ea95
pass projected schema to arrow-avro
getChan Oct 30, 2025
78df065
testing submodule checkout
getChan Oct 31, 2025
c7b3b63
taplo format
getChan Oct 31, 2025
2556591
replace with arrow-avro
getChan Nov 4, 2025
91db92b
Merge remote-tracking branch 'origin/main' into arrow-avro
getChan Nov 4, 2025
ac97245
replace with arrow-avro
getChan Nov 4, 2025
b50df22
remove unused dependencies
getChan Nov 4, 2025
b4c3b5c
parallel avro file reading
getChan Nov 4, 2025
3014b8a
parallel avro file reading
getChan Nov 4, 2025
b2e9eb1
Merge remote-tracking branch 'origin/main' into arrow-avro
getChan Nov 10, 2025
d55f945
Apply suggestions from code review
getChan Nov 26, 2025
dbc082c
merge suggestions
getChan Nov 26, 2025
62d5d16
dependency conflict
getChan Nov 26, 2025
3cd6712
remove empty file
getChan Nov 26, 2025
c6cebbd
remove empty file
getChan Nov 26, 2025
3e2667a
projection indices
getChan Nov 26, 2025
d574fe7
check schema should be inferred
getChan Dec 2, 2025
d647ec6
fix test table column type
getChan Dec 7, 2025
5832449
Merge remote-tracking branch 'origin/main' into arrow-avro
getChan Dec 15, 2025
f6d80fa
resolve conflicts
getChan Dec 15, 2025
9404bd6
resolve conflicts
getChan Dec 15, 2025
c6b9a7c
resolve conflicts
getChan Dec 15, 2025
c40ecb8
use default repartition
getChan Dec 16, 2025
b91a351
Merge remote-tracking branch 'origin/main' into arrow-avro
getChan Dec 17, 2025
d8e0828
use default repartition
getChan Dec 20, 2025
0b632b3
apply projection feature in arrow-avro
getChan Jan 14, 2026
bdd3820
apply projection feature in arrow-avro
getChan Jan 14, 2026
9da2388
apply projection feature in arrow-avro
getChan Jan 16, 2026
31a07ca
Merge remote-tracking branch 'origin/main' into arrow-avro
getChan Jan 16, 2026
ee742ee
resolve conflict
getChan Jan 16, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,054 changes: 532 additions & 522 deletions Cargo.lock

Large diffs are not rendered by default.

22 changes: 14 additions & 8 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -90,20 +90,26 @@ version = "52.0.0"
ahash = { version = "0.8", default-features = false, features = [
"runtime-rng",
] }
apache-avro = { version = "0.21", default-features = false }
arrow = { version = "57.2.0", features = [
arrow = { git = "https://github.com/apache/arrow-rs", rev = "1db1a8869cceb179aa885ed58da9f0b49c03eafe", features = [ # fixme
"prettyprint",
"chrono-tz",
] }
arrow-buffer = { version = "57.2.0", default-features = false }
arrow-flight = { version = "57.2.0", features = [
arrow-avro = { git = "https://github.com/apache/arrow-rs", rev = "1db1a8869cceb179aa885ed58da9f0b49c03eafe", default-features = false, features = [ # fixme
"deflate",
"snappy",
"zstd",
"bzip2",
"xz",
] }
arrow-buffer = { git = "https://github.com/apache/arrow-rs", rev = "1db1a8869cceb179aa885ed58da9f0b49c03eafe", default-features = false } # fixme
arrow-flight = { git = "https://github.com/apache/arrow-rs", rev = "1db1a8869cceb179aa885ed58da9f0b49c03eafe", features = [ # fixme
"flight-sql-experimental",
] }
arrow-ipc = { version = "57.2.0", default-features = false, features = [
arrow-ipc = { git = "https://github.com/apache/arrow-rs", rev = "1db1a8869cceb179aa885ed58da9f0b49c03eafe", default-features = false, features = [ # fixme
"lz4",
] }
arrow-ord = { version = "57.2.0", default-features = false }
arrow-schema = { version = "57.2.0", default-features = false }
arrow-ord = { git = "https://github.com/apache/arrow-rs", rev = "1db1a8869cceb179aa885ed58da9f0b49c03eafe", default-features = false } # fixme
arrow-schema = { git = "https://github.com/apache/arrow-rs", rev = "1db1a8869cceb179aa885ed58da9f0b49c03eafe", default-features = false } # fixme
async-trait = "0.1.89"
bigdecimal = "0.4.8"
bytes = "1.11"
Expand Down Expand Up @@ -166,7 +172,7 @@ log = "^0.4"
num-traits = { version = "0.2" }
object_store = { version = "0.12.4", default-features = false }
parking_lot = "0.12"
parquet = { version = "57.2.0", default-features = false, features = [
parquet = { git = "https://github.com/apache/arrow-rs", rev = "1db1a8869cceb179aa885ed58da9f0b49c03eafe", default-features = false, features = [ # fixme
"arrow",
"async",
"object_store",
Expand Down
7 changes: 0 additions & 7 deletions datafusion/common/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ workspace = true
name = "datafusion_common"

[features]
avro = ["apache-avro"]
backtrace = []
parquet_encryption = [
"parquet",
Expand All @@ -59,12 +58,6 @@ name = "with_hashes"

[dependencies]
ahash = { workspace = true }
apache-avro = { workspace = true, features = [
"bzip",
"snappy",
"xz",
"zstandard",
], optional = true }
arrow = { workspace = true }
arrow-ipc = { workspace = true }
chrono = { workspace = true }
Expand Down
18 changes: 0 additions & 18 deletions datafusion/common/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,6 @@ use std::sync::Arc;
use crate::utils::datafusion_strsim::normalized_levenshtein;
use crate::utils::quote_identifier;
use crate::{Column, DFSchema, Diagnostic, TableReference};
#[cfg(feature = "avro")]
use apache_avro::Error as AvroError;
use arrow::error::ArrowError;
#[cfg(feature = "parquet")]
use parquet::errors::ParquetError;
Expand All @@ -76,9 +74,6 @@ pub enum DataFusionError {
/// Error when reading / writing Parquet data.
#[cfg(feature = "parquet")]
ParquetError(Box<ParquetError>),
/// Error when reading Avro data.
#[cfg(feature = "avro")]
AvroError(Box<AvroError>),
/// Error when reading / writing to / from an object_store (e.g. S3 or LocalFile)
#[cfg(feature = "object_store")]
ObjectStore(Box<object_store::Error>),
Expand Down Expand Up @@ -332,13 +327,6 @@ impl From<ParquetError> for DataFusionError {
}
}

#[cfg(feature = "avro")]
impl From<AvroError> for DataFusionError {
fn from(e: AvroError) -> Self {
DataFusionError::AvroError(Box::new(e))
}
}

#[cfg(feature = "object_store")]
impl From<object_store::Error> for DataFusionError {
fn from(e: object_store::Error) -> Self {
Expand Down Expand Up @@ -389,8 +377,6 @@ impl Error for DataFusionError {
DataFusionError::ArrowError(e, _) => Some(e.as_ref()),
#[cfg(feature = "parquet")]
DataFusionError::ParquetError(e) => Some(e.as_ref()),
#[cfg(feature = "avro")]
DataFusionError::AvroError(e) => Some(e.as_ref()),
#[cfg(feature = "object_store")]
DataFusionError::ObjectStore(e) => Some(e.as_ref()),
DataFusionError::IoError(e) => Some(e),
Expand Down Expand Up @@ -520,8 +506,6 @@ impl DataFusionError {
DataFusionError::ArrowError(_, _) => "Arrow error: ",
#[cfg(feature = "parquet")]
DataFusionError::ParquetError(_) => "Parquet error: ",
#[cfg(feature = "avro")]
DataFusionError::AvroError(_) => "Avro error: ",
#[cfg(feature = "object_store")]
DataFusionError::ObjectStore(_) => "Object Store error: ",
DataFusionError::IoError(_) => "IO error: ",
Expand Down Expand Up @@ -561,8 +545,6 @@ impl DataFusionError {
}
#[cfg(feature = "parquet")]
DataFusionError::ParquetError(ref desc) => Cow::Owned(desc.to_string()),
#[cfg(feature = "avro")]
DataFusionError::AvroError(ref desc) => Cow::Owned(desc.to_string()),
DataFusionError::IoError(ref desc) => Cow::Owned(desc.to_string()),
#[cfg(feature = "sql")]
DataFusionError::SQL(ref desc, ref backtrace) => {
Expand Down
2 changes: 1 addition & 1 deletion datafusion/core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ nested_expressions = ["datafusion-functions-nested"]
# This feature is deprecated. Use the `nested_expressions` feature instead.
array_expressions = ["nested_expressions"]
# Used to enable the avro format
avro = ["datafusion-common/avro", "datafusion-datasource-avro"]
avro = ["datafusion-datasource-avro"]
backtrace = ["datafusion-common/backtrace"]
compression = [
"liblzma",
Expand Down
26 changes: 13 additions & 13 deletions datafusion/core/src/datasource/file_format/avro.rs
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ mod tests {
"double_col: Float64",
"date_string_col: Binary",
"string_col: Binary",
"timestamp_col: Timestamp(Microsecond, None)",
"timestamp_col: Timestamp(Microsecond, Some(\"+00:00\"))",
],
x
);
Expand All @@ -118,18 +118,18 @@ mod tests {
assert_eq!(batches.len(), 1);

assert_snapshot!(batches_to_string(&batches),@r"
+----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+
| id | bool_col | tinyint_col | smallint_col | int_col | bigint_col | float_col | double_col | date_string_col | string_col | timestamp_col |
+----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+
| 4 | true | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 30332f30312f3039 | 30 | 2009-03-01T00:00:00 |
| 5 | false | 1 | 1 | 1 | 10 | 1.1 | 10.1 | 30332f30312f3039 | 31 | 2009-03-01T00:01:00 |
| 6 | true | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 30342f30312f3039 | 30 | 2009-04-01T00:00:00 |
| 7 | false | 1 | 1 | 1 | 10 | 1.1 | 10.1 | 30342f30312f3039 | 31 | 2009-04-01T00:01:00 |
| 2 | true | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 30322f30312f3039 | 30 | 2009-02-01T00:00:00 |
| 3 | false | 1 | 1 | 1 | 10 | 1.1 | 10.1 | 30322f30312f3039 | 31 | 2009-02-01T00:01:00 |
| 0 | true | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 30312f30312f3039 | 30 | 2009-01-01T00:00:00 |
| 1 | false | 1 | 1 | 1 | 10 | 1.1 | 10.1 | 30312f30312f3039 | 31 | 2009-01-01T00:01:00 |
+----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+
+----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+----------------------+
| id | bool_col | tinyint_col | smallint_col | int_col | bigint_col | float_col | double_col | date_string_col | string_col | timestamp_col |
+----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+----------------------+
| 4 | true | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 30332f30312f3039 | 30 | 2009-03-01T00:00:00Z |
| 5 | false | 1 | 1 | 1 | 10 | 1.1 | 10.1 | 30332f30312f3039 | 31 | 2009-03-01T00:01:00Z |
| 6 | true | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 30342f30312f3039 | 30 | 2009-04-01T00:00:00Z |
| 7 | false | 1 | 1 | 1 | 10 | 1.1 | 10.1 | 30342f30312f3039 | 31 | 2009-04-01T00:01:00Z |
| 2 | true | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 30322f30312f3039 | 30 | 2009-02-01T00:00:00Z |
| 3 | false | 1 | 1 | 1 | 10 | 1.1 | 10.1 | 30322f30312f3039 | 31 | 2009-02-01T00:01:00Z |
| 0 | true | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 30312f30312f3039 | 30 | 2009-01-01T00:00:00Z |
| 1 | false | 1 | 1 | 1 | 10 | 1.1 | 10.1 | 30312f30312f3039 | 31 | 2009-01-01T00:01:00Z |
+----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+----------------------+
");
Ok(())
}
Expand Down
2 changes: 1 addition & 1 deletion datafusion/core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -787,7 +787,7 @@ pub use object_store;
pub use parquet;

#[cfg(feature = "avro")]
pub use datafusion_datasource_avro::apache_avro;
pub use datafusion_datasource_avro::arrow_avro;

// re-export DataFusion sub-crates at the top level. Use `pub use *`
// so that the contents of the subcrates appears in rustdocs
Expand Down
7 changes: 2 additions & 5 deletions datafusion/datasource-avro/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,21 +31,18 @@ version.workspace = true
all-features = true

[dependencies]
apache-avro = { workspace = true }
arrow = { workspace = true }
arrow-avro = { workspace = true }
async-trait = { workspace = true }
bytes = { workspace = true }
datafusion-common = { workspace = true, features = ["object_store", "avro"] }
datafusion-common = { workspace = true, features = ["object_store"] }
datafusion-datasource = { workspace = true }
datafusion-physical-expr-common = { workspace = true }
datafusion-physical-plan = { workspace = true }
datafusion-session = { workspace = true }
futures = { workspace = true }
num-traits = { workspace = true }
object_store = { workspace = true }

[dev-dependencies]
serde_json = { workspace = true }

# Note: add additional linter rules in lib.rs.
# Rust does not support workspace + new linter rules in subcrates yet
Expand Down
Loading
Loading