From 8b5c8bf6306c60954ac6b7f4102b7a732e834fca Mon Sep 17 00:00:00 2001 From: ShashidharM0118 Date: Sat, 20 Dec 2025 12:45:25 +0530 Subject: [PATCH 1/4] feat: output statistics for constant columns in projections --- datafusion/physical-expr/src/projection.rs | 100 ++++++++++++++++++++- 1 file changed, 98 insertions(+), 2 deletions(-) diff --git a/datafusion/physical-expr/src/projection.rs b/datafusion/physical-expr/src/projection.rs index c46df87fd8b3f..a91684169cd41 100644 --- a/datafusion/physical-expr/src/projection.rs +++ b/datafusion/physical-expr/src/projection.rs @@ -21,12 +21,12 @@ use std::ops::Deref; use std::sync::Arc; use crate::PhysicalExpr; -use crate::expressions::Column; +use crate::expressions::{Column, Literal}; use crate::utils::collect_columns; use arrow::array::{RecordBatch, RecordBatchOptions}; use arrow::datatypes::{Field, Schema, SchemaRef}; -use datafusion_common::stats::ColumnStatistics; +use datafusion_common::stats::{ColumnStatistics, Precision}; use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; use datafusion_common::{ Result, assert_or_internal_err, internal_datafusion_err, plan_err, @@ -587,6 +587,38 @@ impl ProjectionExprs { let expr = &proj_expr.expr; let col_stats = if let Some(col) = expr.as_any().downcast_ref::() { std::mem::take(&mut stats.column_statistics[col.index()]) + } else if let Some(literal) = expr.as_any().downcast_ref::() + && !literal.value().is_null() + { + // For constant columns (non-null literals), output proper statistics + let value = literal.value(); + let data_type = expr.data_type(output_schema)?; + + // For constant columns: + // - min_value = max_value = the literal value + // - distinct_count = 1 + // - null_count = 0 + // - byte_size = calculated from data type and num_rows + let distinct_count = Precision::Exact(1); + let null_count = Precision::Exact(0); + + // Calculate byte_size: for primitive types, use width * num_rows + let byte_size = if let Some(byte_width) = data_type.primitive_width() { + stats.num_rows.multiply(&Precision::Exact(byte_width)) + } else { + // For complex types (Utf8, List, etc.), we can't calculate exact size + // without knowing the actual data, so leave it as Absent + Precision::Absent + }; + + ColumnStatistics { + min_value: Precision::Exact(value.clone()), + max_value: Precision::Exact(value.clone()), + distinct_count, + null_count, + sum_value: Precision::Absent, // Sum doesn't make sense for constants + byte_size, + } } else { // TODO stats: estimate more statistics from expressions // (expressions should compute their statistics themselves) @@ -2593,4 +2625,68 @@ pub(crate) mod tests { Ok(()) } + + #[test] + fn test_project_statistics_with_literal() -> Result<()> { + let input_stats = get_stats(); + let input_schema = get_schema(); + + // Projection with literal: SELECT 42 AS constant, col0 AS num + let projection = ProjectionExprs::new(vec![ + ProjectionExpr { + expr: Arc::new(Literal::new(ScalarValue::Int64(Some(42)))), + alias: "constant".to_string(), + }, + ProjectionExpr { + expr: Arc::new(Column::new("col0", 0)), + alias: "num".to_string(), + }, + ]); + + let output_stats = projection.project_statistics( + input_stats, + &projection.project_schema(&input_schema)?, + )?; + + // Row count should be preserved + assert_eq!(output_stats.num_rows, Precision::Exact(5)); + + // Should have 2 column statistics + assert_eq!(output_stats.column_statistics.len(), 2); + + // First column (literal 42) should have proper constant statistics + assert_eq!( + output_stats.column_statistics[0].min_value, + Precision::Exact(ScalarValue::Int64(Some(42))) + ); + assert_eq!( + output_stats.column_statistics[0].max_value, + Precision::Exact(ScalarValue::Int64(Some(42))) + ); + assert_eq!( + output_stats.column_statistics[0].distinct_count, + Precision::Exact(1) + ); + assert_eq!( + output_stats.column_statistics[0].null_count, + Precision::Exact(0) + ); + // Int64 is 8 bytes, 5 rows = 40 bytes + assert_eq!( + output_stats.column_statistics[0].byte_size, + Precision::Exact(40) + ); + + // Second column (col0) should preserve statistics + assert_eq!( + output_stats.column_statistics[1].distinct_count, + Precision::Exact(5) + ); + assert_eq!( + output_stats.column_statistics[1].max_value, + Precision::Exact(ScalarValue::Int64(Some(21))) + ); + + Ok(()) + } } From 2815d0cf0f0717dc615f28e9d08b9abe4bc5b130 Mon Sep 17 00:00:00 2001 From: ShashidharM0118 Date: Mon, 22 Dec 2025 12:27:03 +0530 Subject: [PATCH 2/4] generate stats for null literals and compute sum_value --- datafusion/physical-expr/src/projection.rs | 209 ++++++++++++++++++--- 1 file changed, 180 insertions(+), 29 deletions(-) diff --git a/datafusion/physical-expr/src/projection.rs b/datafusion/physical-expr/src/projection.rs index a91684169cd41..6f724d93d70de 100644 --- a/datafusion/physical-expr/src/projection.rs +++ b/datafusion/physical-expr/src/projection.rs @@ -587,37 +587,59 @@ impl ProjectionExprs { let expr = &proj_expr.expr; let col_stats = if let Some(col) = expr.as_any().downcast_ref::() { std::mem::take(&mut stats.column_statistics[col.index()]) - } else if let Some(literal) = expr.as_any().downcast_ref::() - && !literal.value().is_null() - { - // For constant columns (non-null literals), output proper statistics - let value = literal.value(); + } else if let Some(literal) = expr.as_any().downcast_ref::() { let data_type = expr.data_type(output_schema)?; - - // For constant columns: - // - min_value = max_value = the literal value - // - distinct_count = 1 - // - null_count = 0 - // - byte_size = calculated from data type and num_rows - let distinct_count = Precision::Exact(1); - let null_count = Precision::Exact(0); - - // Calculate byte_size: for primitive types, use width * num_rows - let byte_size = if let Some(byte_width) = data_type.primitive_width() { - stats.num_rows.multiply(&Precision::Exact(byte_width)) + + if literal.value().is_null() { + // For NULL literals (constant NULL columns), output proper statistics + // This enables optimizations like constant column detection and sort elimination + // For constant NULL columns: + // - null_count = num_rows (all rows are NULL) + // - distinct_count = 1 (all NULLs are considered the same) + // - min_value/max_value = Absent (NULLs don't have min/max) + // - byte_size = Absent (NULLs don't take space in most representations) + let null_count = match stats.num_rows { + Precision::Exact(num_rows) => Precision::Exact(num_rows), + _ => Precision::Absent, // Can't determine null_count without exact row count + }; + + ColumnStatistics { + min_value: Precision::Absent, // NULLs don't have min/max + max_value: Precision::Absent, + distinct_count: Precision::Exact(1), // All NULLs are considered the same + null_count, + sum_value: Precision::Absent, // Sum doesn't make sense for NULLs + byte_size: Precision::Absent, // NULLs don't take space + } } else { - // For complex types (Utf8, List, etc.), we can't calculate exact size - // without knowing the actual data, so leave it as Absent - Precision::Absent - }; - - ColumnStatistics { - min_value: Precision::Exact(value.clone()), - max_value: Precision::Exact(value.clone()), - distinct_count, - null_count, - sum_value: Precision::Absent, // Sum doesn't make sense for constants - byte_size, + // For constant columns (non-null literals), output proper statistics + let value = literal.value(); + + // For constant columns: + // - min_value = max_value = the literal value + // - distinct_count = 1 + // - null_count = 0 + // - byte_size = calculated from data type and num_rows + let distinct_count = Precision::Exact(1); + let null_count = Precision::Exact(0); + + // Calculate byte_size: for primitive types, use width * num_rows + let byte_size = if let Some(byte_width) = data_type.primitive_width() { + stats.num_rows.multiply(&Precision::Exact(byte_width)) + } else { + // For complex types (Utf8, List, etc.), we can't calculate exact size + // without knowing the actual data, so leave it as Absent + Precision::Absent + }; + + ColumnStatistics { + min_value: Precision::Exact(value.clone()), + max_value: Precision::Exact(value.clone()), + distinct_count, + null_count, + sum_value: Precision::Absent, // Sum doesn't make sense for constants + byte_size, + } } } else { // TODO stats: estimate more statistics from expressions @@ -2689,4 +2711,133 @@ pub(crate) mod tests { Ok(()) } + + #[test] + fn test_project_statistics_with_null_literal() -> Result<()> { + let input_stats = get_stats(); + let input_schema = get_schema(); + + // Projection with NULL literal: SELECT NULL AS null_col, col0 AS num + let projection = ProjectionExprs::new(vec![ + ProjectionExpr { + expr: Arc::new(Literal::new(ScalarValue::Int64(None))), + alias: "null_col".to_string(), + }, + ProjectionExpr { + expr: Arc::new(Column::new("col0", 0)), + alias: "num".to_string(), + }, + ]); + + let output_stats = projection.project_statistics( + input_stats, + &projection.project_schema(&input_schema)?, + )?; + + // Row count should be preserved + assert_eq!(output_stats.num_rows, Precision::Exact(5)); + + // Should have 2 column statistics + assert_eq!(output_stats.column_statistics.len(), 2); + + // First column (NULL literal) should have proper constant NULL statistics + assert_eq!( + output_stats.column_statistics[0].min_value, + Precision::Absent // NULLs don't have min/max + ); + assert_eq!( + output_stats.column_statistics[0].max_value, + Precision::Absent // NULLs don't have min/max + ); + assert_eq!( + output_stats.column_statistics[0].distinct_count, + Precision::Exact(1) // All NULLs are considered the same + ); + assert_eq!( + output_stats.column_statistics[0].null_count, + Precision::Exact(5) // All rows are NULL + ); + assert_eq!( + output_stats.column_statistics[0].byte_size, + Precision::Absent // NULLs don't take space + ); + + // Second column (col0) should preserve statistics + assert_eq!( + output_stats.column_statistics[1].distinct_count, + Precision::Exact(5) + ); + assert_eq!( + output_stats.column_statistics[1].max_value, + Precision::Exact(ScalarValue::Int64(Some(21))) + ); + + Ok(()) + } + + #[test] + fn test_project_statistics_with_complex_type_literal() -> Result<()> { + let input_stats = get_stats(); + let input_schema = get_schema(); + + // Projection with Utf8 literal (complex type): SELECT 'hello' AS text, col0 AS num + let projection = ProjectionExprs::new(vec![ + ProjectionExpr { + expr: Arc::new(Literal::new(ScalarValue::Utf8(Some("hello".to_string())))), + alias: "text".to_string(), + }, + ProjectionExpr { + expr: Arc::new(Column::new("col0", 0)), + alias: "num".to_string(), + }, + ]); + + let output_stats = projection.project_statistics( + input_stats, + &projection.project_schema(&input_schema)?, + )?; + + // Row count should be preserved + assert_eq!(output_stats.num_rows, Precision::Exact(5)); + + // Should have 2 column statistics + assert_eq!(output_stats.column_statistics.len(), 2); + + // First column (Utf8 literal 'hello') should have proper constant statistics + // but byte_size should be Absent for complex types + assert_eq!( + output_stats.column_statistics[0].min_value, + Precision::Exact(ScalarValue::Utf8(Some("hello".to_string()))) + ); + assert_eq!( + output_stats.column_statistics[0].max_value, + Precision::Exact(ScalarValue::Utf8(Some("hello".to_string()))) + ); + assert_eq!( + output_stats.column_statistics[0].distinct_count, + Precision::Exact(1) + ); + assert_eq!( + output_stats.column_statistics[0].null_count, + Precision::Exact(0) + ); + // Complex types (Utf8, List, etc.) should have byte_size = Absent + // because we can't calculate exact size without knowing the actual data + assert_eq!( + output_stats.column_statistics[0].byte_size, + Precision::Absent + ); + + // Second column (col0) should preserve statistics + assert_eq!( + output_stats.column_statistics[1].distinct_count, + Precision::Exact(5) + ); + assert_eq!( + output_stats.column_statistics[1].max_value, + Precision::Exact(ScalarValue::Int64(Some(21))) + ); + + Ok(()) + } } From c23c52161e5f00e5b07cd950cd55791b8e608701 Mon Sep 17 00:00:00 2001 From: ShashidharM0118 Date: Mon, 22 Dec 2025 13:30:40 +0530 Subject: [PATCH 3/4] cleaner logic to handle sum_value --- datafusion/physical-expr/src/projection.rs | 51 ++++++++++++++++++---- 1 file changed, 43 insertions(+), 8 deletions(-) diff --git a/datafusion/physical-expr/src/projection.rs b/datafusion/physical-expr/src/projection.rs index 6f724d93d70de..656707334061c 100644 --- a/datafusion/physical-expr/src/projection.rs +++ b/datafusion/physical-expr/src/projection.rs @@ -29,7 +29,7 @@ use arrow::datatypes::{Field, Schema, SchemaRef}; use datafusion_common::stats::{ColumnStatistics, Precision}; use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; use datafusion_common::{ - Result, assert_or_internal_err, internal_datafusion_err, plan_err, + Result, ScalarValue, assert_or_internal_err, internal_datafusion_err, plan_err, }; use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr}; @@ -589,7 +589,7 @@ impl ProjectionExprs { std::mem::take(&mut stats.column_statistics[col.index()]) } else if let Some(literal) = expr.as_any().downcast_ref::() { let data_type = expr.data_type(output_schema)?; - + if literal.value().is_null() { // For NULL literals (constant NULL columns), output proper statistics // This enables optimizations like constant column detection and sort elimination @@ -602,7 +602,7 @@ impl ProjectionExprs { Precision::Exact(num_rows) => Precision::Exact(num_rows), _ => Precision::Absent, // Can't determine null_count without exact row count }; - + ColumnStatistics { min_value: Precision::Absent, // NULLs don't have min/max max_value: Precision::Absent, @@ -624,11 +624,29 @@ impl ProjectionExprs { let null_count = Precision::Exact(0); // Calculate byte_size: for primitive types, use width * num_rows - let byte_size = if let Some(byte_width) = data_type.primitive_width() { + let byte_size = if let Some(byte_width) = data_type.primitive_width() + { stats.num_rows.multiply(&Precision::Exact(byte_width)) } else { - // For complex types (Utf8, List, etc.), we can't calculate exact size - // without knowing the actual data, so leave it as Absent + // For complex types (Utf8, List, etc.), the byte_size when materialized + // as an array depends on the array encoding and representation (e.g., + // dictionary encoding, string view arrays), so we conservatively set it to Absent + Precision::Absent + }; + + // Calculate sum_value: for numeric types, sum = value * num_rows + // This is useful for optimizations (e.g., cross joins multiply sum_value by row count) + let sum_value = if !value.is_null() { + // Convert num_rows to a ScalarValue of the same type as the value + Precision::::from(stats.num_rows) + .cast_to(&value.data_type()) + .ok() + .map(|row_count| { + // Multiply value * num_rows to get the sum + Precision::Exact(value.clone()).multiply(&row_count) + }) + .unwrap_or(Precision::Absent) + } else { Precision::Absent }; @@ -637,7 +655,7 @@ impl ProjectionExprs { max_value: Precision::Exact(value.clone()), distinct_count, null_count, - sum_value: Precision::Absent, // Sum doesn't make sense for constants + sum_value, byte_size, } } @@ -2698,6 +2716,11 @@ pub(crate) mod tests { output_stats.column_statistics[0].byte_size, Precision::Exact(40) ); + // For a constant column, sum_value = value * num_rows = 42 * 5 = 210 + assert_eq!( + output_stats.column_statistics[0].sum_value, + Precision::Exact(ScalarValue::Int64(Some(210))) + ); // Second column (col0) should preserve statistics assert_eq!( @@ -2761,6 +2784,10 @@ pub(crate) mod tests { output_stats.column_statistics[0].byte_size, Precision::Absent // NULLs don't take space ); + assert_eq!( + output_stats.column_statistics[0].sum_value, + Precision::Absent // Sum doesn't make sense for NULLs + ); // Second column (col0) should preserve statistics assert_eq!( @@ -2783,7 +2810,9 @@ pub(crate) mod tests { // Projection with Utf8 literal (complex type): SELECT 'hello' AS text, col0 AS num let projection = ProjectionExprs::new(vec![ ProjectionExpr { - expr: Arc::new(Literal::new(ScalarValue::Utf8(Some("hello".to_string())))), + expr: Arc::new(Literal::new(ScalarValue::Utf8(Some( + "hello".to_string(), + )))), alias: "text".to_string(), }, ProjectionExpr { @@ -2827,6 +2856,12 @@ pub(crate) mod tests { output_stats.column_statistics[0].byte_size, Precision::Absent ); + // Non-numeric types (Utf8) should have sum_value = Absent + // because sum is only meaningful for numeric types + assert_eq!( + output_stats.column_statistics[0].sum_value, + Precision::Absent + ); // Second column (col0) should preserve statistics assert_eq!( From 7e2207ef5cbdf5338ae384e3bc9eedba56dbc21c Mon Sep 17 00:00:00 2001 From: ShashidharM0118 Date: Mon, 22 Dec 2025 13:41:45 +0530 Subject: [PATCH 4/4] remove redundant check for null literal --- datafusion/physical-expr/src/projection.rs | 55 +++++++--------------- 1 file changed, 17 insertions(+), 38 deletions(-) diff --git a/datafusion/physical-expr/src/projection.rs b/datafusion/physical-expr/src/projection.rs index 656707334061c..931790485259c 100644 --- a/datafusion/physical-expr/src/projection.rs +++ b/datafusion/physical-expr/src/projection.rs @@ -588,67 +588,43 @@ impl ProjectionExprs { let col_stats = if let Some(col) = expr.as_any().downcast_ref::() { std::mem::take(&mut stats.column_statistics[col.index()]) } else if let Some(literal) = expr.as_any().downcast_ref::() { + // Handle literal expressions (constants) by calculating proper statistics let data_type = expr.data_type(output_schema)?; if literal.value().is_null() { - // For NULL literals (constant NULL columns), output proper statistics - // This enables optimizations like constant column detection and sort elimination - // For constant NULL columns: - // - null_count = num_rows (all rows are NULL) - // - distinct_count = 1 (all NULLs are considered the same) - // - min_value/max_value = Absent (NULLs don't have min/max) - // - byte_size = Absent (NULLs don't take space in most representations) let null_count = match stats.num_rows { Precision::Exact(num_rows) => Precision::Exact(num_rows), - _ => Precision::Absent, // Can't determine null_count without exact row count + _ => Precision::Absent, }; ColumnStatistics { - min_value: Precision::Absent, // NULLs don't have min/max + min_value: Precision::Absent, max_value: Precision::Absent, - distinct_count: Precision::Exact(1), // All NULLs are considered the same + distinct_count: Precision::Exact(1), null_count, - sum_value: Precision::Absent, // Sum doesn't make sense for NULLs - byte_size: Precision::Absent, // NULLs don't take space + sum_value: Precision::Absent, + byte_size: Precision::Absent, } } else { - // For constant columns (non-null literals), output proper statistics let value = literal.value(); - - // For constant columns: - // - min_value = max_value = the literal value - // - distinct_count = 1 - // - null_count = 0 - // - byte_size = calculated from data type and num_rows let distinct_count = Precision::Exact(1); let null_count = Precision::Exact(0); - // Calculate byte_size: for primitive types, use width * num_rows let byte_size = if let Some(byte_width) = data_type.primitive_width() { stats.num_rows.multiply(&Precision::Exact(byte_width)) } else { - // For complex types (Utf8, List, etc.), the byte_size when materialized - // as an array depends on the array encoding and representation (e.g., - // dictionary encoding, string view arrays), so we conservatively set it to Absent + // Complex types depend on array encoding, so set to Absent Precision::Absent }; - // Calculate sum_value: for numeric types, sum = value * num_rows - // This is useful for optimizations (e.g., cross joins multiply sum_value by row count) - let sum_value = if !value.is_null() { - // Convert num_rows to a ScalarValue of the same type as the value - Precision::::from(stats.num_rows) - .cast_to(&value.data_type()) - .ok() - .map(|row_count| { - // Multiply value * num_rows to get the sum - Precision::Exact(value.clone()).multiply(&row_count) - }) - .unwrap_or(Precision::Absent) - } else { - Precision::Absent - }; + let sum_value = Precision::::from(stats.num_rows) + .cast_to(&value.data_type()) + .ok() + .map(|row_count| { + Precision::Exact(value.clone()).multiply(&row_count) + }) + .unwrap_or(Precision::Absent); ColumnStatistics { min_value: Precision::Exact(value.clone()), @@ -2666,6 +2642,7 @@ pub(crate) mod tests { Ok(()) } + // Test statistics calculation for non-null literal (numeric constant) #[test] fn test_project_statistics_with_literal() -> Result<()> { let input_stats = get_stats(); @@ -2735,6 +2712,7 @@ pub(crate) mod tests { Ok(()) } + // Test statistics calculation for NULL literal (constant NULL column) #[test] fn test_project_statistics_with_null_literal() -> Result<()> { let input_stats = get_stats(); @@ -2802,6 +2780,7 @@ pub(crate) mod tests { Ok(()) } + // Test statistics calculation for complex type literal (e.g., Utf8 string) #[test] fn test_project_statistics_with_complex_type_literal() -> Result<()> { let input_stats = get_stats();