From 4c34258d8257fa5cb2ef0e200f0b243b075ea5c4 Mon Sep 17 00:00:00 2001 From: "xudong.w" Date: Fri, 31 Oct 2025 15:06:33 +0800 Subject: [PATCH 01/26] Support row group limit pruning --- datafusion/core/tests/parquet/mod.rs | 52 ++- .../core/tests/parquet/row_group_pruning.rs | 336 +++++++++++++++++- datafusion/datasource-parquet/src/metrics.rs | 34 +- datafusion/datasource-parquet/src/opener.rs | 6 +- .../src/row_group_filter.rs | 97 ++++- datafusion/pruning/src/pruning_predicate.rs | 1 - 6 files changed, 505 insertions(+), 21 deletions(-) diff --git a/datafusion/core/tests/parquet/mod.rs b/datafusion/core/tests/parquet/mod.rs index 44c9a2393e3d8..16ac557f18811 100644 --- a/datafusion/core/tests/parquet/mod.rs +++ b/datafusion/core/tests/parquet/mod.rs @@ -182,6 +182,11 @@ impl TestOutput { .map(|(_pruned, matched)| matched) } + /// The number of row_groups fully matched by statistics + fn row_groups_fully_matched_statistics(&self) -> Option { + self.metric_value("row_groups_fully_matched_statistics") + } + /// The number of row_groups pruned by statistics fn row_groups_pruned_statistics(&self) -> Option { self.pruning_metric("row_groups_pruned_statistics") @@ -219,6 +224,11 @@ impl TestOutput { .map(|(pruned, _matched)| pruned) } + /// The number of row groups pruned by limit pruning + fn limit_pruned_row_groups(&self) -> Option { + self.metric_value("limit_pruned_row_groups") + } + fn description(&self) -> String { format!( "Input:\n{}\nQuery:\n{}\nOutput:\n{}\nMetrics:\n{}", @@ -232,20 +242,41 @@ impl TestOutput { /// and the appropriate scenario impl ContextWithParquet { async fn new(scenario: Scenario, unit: Unit) -> Self { - Self::with_config(scenario, unit, SessionConfig::new()).await + Self::with_config(scenario, unit, SessionConfig::new(), None, None).await + } + + /// Set custom schema and batches for the test + pub async fn with_custom_data( + scenario: Scenario, + unit: Unit, + schema: Arc, + batches: Vec, + ) -> Self { + Self::with_config( + scenario, + unit, + SessionConfig::new(), + Some(schema), + Some(batches), + ) + .await } async fn with_config( scenario: Scenario, unit: Unit, mut config: SessionConfig, + custom_schema: Option>, + custom_batches: Option>, ) -> Self { // Use a single partition for deterministic results no matter how many CPUs the host has config = config.with_target_partitions(1); let file = match unit { Unit::RowGroup(row_per_group) => { config = config.with_parquet_bloom_filter_pruning(true); - make_test_file_rg(scenario, row_per_group).await + config.options_mut().execution.parquet.pushdown_filters = true; + make_test_file_rg(scenario, row_per_group, custom_schema, custom_batches) + .await } Unit::Page(row_per_page) => { config = config.with_parquet_page_index_pruning(true); @@ -1075,7 +1106,12 @@ fn create_data_batch(scenario: Scenario) -> Vec { } /// Create a test parquet file with various data types -async fn make_test_file_rg(scenario: Scenario, row_per_group: usize) -> NamedTempFile { +async fn make_test_file_rg( + scenario: Scenario, + row_per_group: usize, + custom_schema: Option>, + custom_batches: Option>, +) -> NamedTempFile { let mut output_file = tempfile::Builder::new() .prefix("parquet_pruning") .suffix(".parquet") @@ -1088,8 +1124,14 @@ async fn make_test_file_rg(scenario: Scenario, row_per_group: usize) -> NamedTem .set_statistics_enabled(EnabledStatistics::Page) .build(); - let batches = create_data_batch(scenario); - let schema = batches[0].schema(); + let (batches, schema) = + if let (Some(schema), Some(batches)) = (custom_schema, 
custom_batches) { + (batches, schema) + } else { + let batches = create_data_batch(scenario); + let schema = batches[0].schema(); + (batches, schema) + }; let mut writer = ArrowWriter::try_new(&mut output_file, schema, Some(props)).unwrap(); diff --git a/datafusion/core/tests/parquet/row_group_pruning.rs b/datafusion/core/tests/parquet/row_group_pruning.rs index 0411298055f26..e0ba462281ce3 100644 --- a/datafusion/core/tests/parquet/row_group_pruning.rs +++ b/datafusion/core/tests/parquet/row_group_pruning.rs @@ -18,8 +18,12 @@ //! This file contains an end to end test of parquet pruning. It writes //! data into a parquet file and then verifies row groups are pruned as //! expected. +use std::sync::Arc; + +use arrow::array::{ArrayRef, Int32Array, RecordBatch}; +use arrow_schema::{DataType, Field, Schema}; use datafusion::prelude::SessionConfig; -use datafusion_common::ScalarValue; +use datafusion_common::{DataFusionError, ScalarValue}; use itertools::Itertools; use crate::parquet::Unit::RowGroup; @@ -30,10 +34,12 @@ struct RowGroupPruningTest { query: String, expected_errors: Option, expected_row_group_matched_by_statistics: Option, + expected_row_group_fully_matched_by_statistics: Option, expected_row_group_pruned_by_statistics: Option, expected_files_pruned_by_statistics: Option, expected_row_group_matched_by_bloom_filter: Option, expected_row_group_pruned_by_bloom_filter: Option, + expected_limit_pruned_row_groups: Option, expected_rows: usize, } impl RowGroupPruningTest { @@ -45,9 +51,11 @@ impl RowGroupPruningTest { expected_errors: None, expected_row_group_matched_by_statistics: None, expected_row_group_pruned_by_statistics: None, + expected_row_group_fully_matched_by_statistics: None, expected_files_pruned_by_statistics: None, expected_row_group_matched_by_bloom_filter: None, expected_row_group_pruned_by_bloom_filter: None, + expected_limit_pruned_row_groups: None, expected_rows: 0, } } @@ -76,6 +84,15 @@ impl RowGroupPruningTest { self } + // Set the expected fully matched row groups by statistics + fn with_fully_matched_by_stats( + mut self, + fully_matched_by_stats: Option, + ) -> Self { + self.expected_row_group_fully_matched_by_statistics = fully_matched_by_stats; + self + } + // Set the expected pruned row groups by statistics fn with_pruned_by_stats(mut self, pruned_by_stats: Option) -> Self { self.expected_row_group_pruned_by_statistics = pruned_by_stats; @@ -99,6 +116,11 @@ impl RowGroupPruningTest { self } + fn with_limit_pruned_row_groups(mut self, pruned_by_limit: Option) -> Self { + self.expected_limit_pruned_row_groups = pruned_by_limit; + self + } + /// Set the number of expected rows from the output of this test fn with_expected_rows(mut self, rows: usize) -> Self { self.expected_rows = rows; @@ -144,6 +166,65 @@ impl RowGroupPruningTest { self.expected_row_group_pruned_by_bloom_filter, "mismatched row_groups_pruned_bloom_filter", ); + + assert_eq!( + output.result_rows, + self.expected_rows, + "Expected {} rows, got {}: {}", + output.result_rows, + self.expected_rows, + output.description(), + ); + } + + // Execute the test with the current configuration + async fn test_row_group_prune_with_custom_data( + self, + schema: Arc, + batches: Vec, + max_row_per_group: usize, + ) { + let output = ContextWithParquet::with_custom_data( + self.scenario, + RowGroup(max_row_per_group), + schema, + batches, + ) + .await + .query(&self.query) + .await; + + println!("{}", output.description()); + assert_eq!( + output.predicate_evaluation_errors(), + self.expected_errors, + 
"mismatched predicate_evaluation error" + ); + assert_eq!( + output.row_groups_matched_statistics(), + self.expected_row_group_matched_by_statistics, + "mismatched row_groups_matched_statistics", + ); + assert_eq!( + output.row_groups_fully_matched_statistics(), + self.expected_row_group_fully_matched_by_statistics, + "mismatched row_groups_fully_matched_statistics", + ); + assert_eq!( + output.row_groups_pruned_statistics(), + self.expected_row_group_pruned_by_statistics, + "mismatched row_groups_pruned_statistics", + ); + assert_eq!( + output.files_ranges_pruned_statistics(), + self.expected_files_pruned_by_statistics, + "mismatched files_ranges_pruned_statistics", + ); + assert_eq!( + output.limit_pruned_row_groups(), + self.expected_limit_pruned_row_groups, + "mismatched limit_pruned_row_groups", + ); assert_eq!( output.result_rows, self.expected_rows, @@ -289,11 +370,16 @@ async fn prune_disabled() { let expected_rows = 10; let config = SessionConfig::new().with_parquet_pruning(false); - let output = - ContextWithParquet::with_config(Scenario::Timestamps, RowGroup(5), config) - .await - .query(query) - .await; + let output = ContextWithParquet::with_config( + Scenario::Timestamps, + RowGroup(5), + config, + None, + None, + ) + .await + .query(query) + .await; println!("{}", output.description()); // This should not prune any @@ -1636,3 +1722,241 @@ async fn test_bloom_filter_decimal_dict() { .test_row_group_prune() .await; } + +// Helper function to create a batch with a single Int32 column. +fn make_i32_batch( + name: &str, + values: Vec, +) -> datafusion_common::error::Result { + let schema = Arc::new(Schema::new(vec![Field::new(name, DataType::Int32, false)])); + let array: ArrayRef = Arc::new(Int32Array::from(values)); + RecordBatch::try_new(schema, vec![array]).map_err(DataFusionError::from) +} + +// Helper function to create a batch with two Int32 columns +fn make_two_col_i32_batch( + name_a: &str, + name_b: &str, + values_a: Vec, + values_b: Vec, +) -> datafusion_common::error::Result { + let schema = Arc::new(Schema::new(vec![ + Field::new(name_a, DataType::Int32, false), + Field::new(name_b, DataType::Int32, false), + ])); + let array_a: ArrayRef = Arc::new(Int32Array::from(values_a)); + let array_b: ArrayRef = Arc::new(Int32Array::from(values_b)); + RecordBatch::try_new(schema, vec![array_a, array_b]).map_err(DataFusionError::from) +} + +#[tokio::test] +async fn test_limit_pruning_basic() -> datafusion_common::error::Result<()> { + // Scenario: Simple integer column, multiple row groups + // Query: SELECT c1 FROM t WHERE c1 = 0 LIMIT 2 + // We expect 2 rows in total. + + // Row Group 0: c1 = [0, -2] -> Partially matched, 1 row + // Row Group 1: c1 = [1, 2] -> Fully matched, 2 rows + // Row Group 2: c1 = [3, 4] -> Fully matched, 2 rows + // Row Group 3: c1 = [5, 6] -> Fully matched, 2 rows + // Row Group 4: c1 = [-1, -2] -> Not matched + + // If limit = 2, and RG1 is fully matched and has 2 rows, we should + // only scan RG1 and prune other row groups + // RG4 is pruned by statistics. RG2 and RG3 are pruned by limit. + // So 2 row groups are effectively pruned due to limit pruning. 
+
+#[tokio::test]
+async fn test_limit_pruning_complex_filter() -> datafusion_common::error::Result<()> {
+    // Test Case 1: Complex filter with two columns (a = 1 AND b > 1 AND b < 4)
+    // Row Group 0: a=[1,1,1], b=[0,2,3] -> Partially matched, 2 rows match (b=2,3)
+    // Row Group 1: a=[1,1,1], b=[2,2,2] -> Fully matched, 3 rows
+    // Row Group 2: a=[1,1,1], b=[2,3,3] -> Fully matched, 3 rows
+    // Row Group 3: a=[1,1,1], b=[2,2,3] -> Fully matched, 3 rows
+    // Row Group 4: a=[2,2,2], b=[2,2,2] -> Not matched (a != 1)
+    // Row Group 5: a=[1,1,1], b=[5,6,7] -> Not matched (b >= 4)
+
+    // With LIMIT 5, RG1 (3 rows) plus RG2 (3 rows, of which only 2 are
+    // emitted) cover the limit. RG4 and RG5 are pruned by statistics.
+    // RG0 and RG3 are pruned by limit: once the fully matched groups cover
+    // the limit, the remaining matched groups are no longer needed.
+
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("a", DataType::Int32, false),
+        Field::new("b", DataType::Int32, false),
+    ]));
+    let query = "SELECT a, b FROM t WHERE a = 1 AND b > 1 AND b < 4 LIMIT 5";
+
+    let batches = vec![
+        make_two_col_i32_batch("a", "b", vec![1, 1, 1], vec![0, 2, 3])?,
+        make_two_col_i32_batch("a", "b", vec![1, 1, 1], vec![2, 2, 2])?,
+        make_two_col_i32_batch("a", "b", vec![1, 1, 1], vec![2, 3, 3])?,
+        make_two_col_i32_batch("a", "b", vec![1, 1, 1], vec![2, 2, 3])?,
+        make_two_col_i32_batch("a", "b", vec![2, 2, 2], vec![2, 2, 2])?,
+        make_two_col_i32_batch("a", "b", vec![1, 1, 1], vec![5, 6, 7])?,
+    ];
+
+    RowGroupPruningTest::new()
+        .with_scenario(Scenario::Int)
+        .with_query(query)
+        .with_expected_errors(Some(0))
+        .with_expected_rows(5)
+        .with_pruned_files(Some(0))
+        .with_matched_by_stats(Some(4)) // RG0,1,2,3 are matched
+        .with_fully_matched_by_stats(Some(3))
+        .with_pruned_by_stats(Some(2)) // RG4,5 are pruned
+        .with_limit_pruned_row_groups(Some(2)) // RG0 and RG3 are pruned by limit
+        .test_row_group_prune_with_custom_data(schema, batches, 3)
+        .await;
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_limit_pruning_multiple_fully_matched(
+) -> datafusion_common::error::Result<()> {
+    // Test Case 2: Limit requires multiple fully matched row groups
+    // Row Group 0: a=[5,5,5,5] -> Fully matched, 4 rows
+    // Row Group 1: a=[5,5,5,5] -> Fully matched, 4 rows
+    // Row Group 2: a=[5,5,5,5] -> Fully matched, 4 rows
+    // Row Group 3: a=[5,5,5,5] -> Fully matched, 4 rows
+    // Row Group 4: a=[1,2,3,4] -> Not matched
+
+    // With LIMIT 8, we need RG0 (4 rows) + RG1 (4 rows) = 8 rows.
+    // RG2,3 should be pruned by limit.
+    // RG4 should be pruned by statistics.
+
+    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
+    let query = "SELECT a FROM t WHERE a = 5 LIMIT 8";
+
+    let batches = vec![
+        make_i32_batch("a", vec![5, 5, 5, 5])?,
+        make_i32_batch("a", vec![5, 5, 5, 5])?,
+        make_i32_batch("a", vec![5, 5, 5, 5])?,
+        make_i32_batch("a", vec![5, 5, 5, 5])?,
+        make_i32_batch("a", vec![1, 2, 3, 4])?,
+    ];
+
+    RowGroupPruningTest::new()
+        .with_scenario(Scenario::Int)
+        .with_query(query)
+        .with_expected_errors(Some(0))
+        .with_expected_rows(8)
+        .with_pruned_files(Some(0))
+        .with_matched_by_stats(Some(4)) // RG0,1,2,3 matched
+        .with_fully_matched_by_stats(Some(4))
+        .with_pruned_by_stats(Some(1)) // RG4 pruned
+        .with_limit_pruned_row_groups(Some(2)) // RG2,3 pruned by limit
+        .test_row_group_prune_with_custom_data(schema, batches, 4)
+        .await;
+
+    Ok(())
+}
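
The fully matched counts asserted in these tests come from the inverted
predicate check added to row_group_filter.rs (shown later in this patch): if
NOT(pred) can itself be pruned for a row group using min/max statistics, then
every row in that group must satisfy pred. A std-only sketch of the idea, with
hand-written range checks standing in for PruningPredicate (all names here are
illustrative, not the patch's API):

// Predicate from the complex-filter test above: 1 < b AND b < 4.
#[derive(Clone, Copy)]
struct MinMax {
    min: i32,
    max: i32,
}

// Could the predicate be true for some row in the group?
fn maybe_matches(s: MinMax) -> bool {
    s.max > 1 && s.min < 4
}

// Could the inverted predicate (b <= 1 OR b >= 4) be true for some row?
fn maybe_matches_inverted(s: MinMax) -> bool {
    s.min <= 1 || s.max >= 4
}

fn main() {
    let rg0 = MinMax { min: 0, max: 3 }; // b = [0, 2, 3] -> partially matched
    let rg1 = MinMax { min: 2, max: 2 }; // b = [2, 2, 2] -> fully matched
    // Both groups may contain matching rows...
    assert!(maybe_matches(rg0) && maybe_matches(rg1));
    // ...but only RG1 rules out the inverted predicate entirely, which
    // proves that all of its rows satisfy 1 < b AND b < 4.
    assert!(maybe_matches_inverted(rg0));
    assert!(!maybe_matches_inverted(rg1));
}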
+
+#[tokio::test]
+async fn test_limit_pruning_no_fully_matched() -> datafusion_common::error::Result<()> {
+    // Test Case 3: No fully matched row groups - all are partially matched
+    // Row Group 0: a=[1,2,3]   -> Partially matched, 1 row (a=2)
+    // Row Group 1: a=[2,3,4]   -> Partially matched, 1 row (a=2)
+    // Row Group 2: a=[2,5,6]   -> Partially matched, 1 row (a=2)
+    // Row Group 3: a=[2,7,8]   -> Partially matched, 1 row (a=2)
+    // Row Group 4: a=[9,10,11] -> Not matched
+
+    // With LIMIT 3 we would need to scan RG0,1,2 to get 3 matching rows,
+    // but since no row group is fully matched, limit pruning cannot remove
+    // anything. Only RG4 is pruned, by statistics.
+
+    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
+    let query = "SELECT a FROM t WHERE a = 2 LIMIT 3";
+
+    let batches = vec![
+        make_i32_batch("a", vec![1, 2, 3])?,
+        make_i32_batch("a", vec![2, 3, 4])?,
+        make_i32_batch("a", vec![2, 5, 6])?,
+        make_i32_batch("a", vec![2, 7, 8])?,
+        make_i32_batch("a", vec![9, 10, 11])?,
+    ];
+
+    RowGroupPruningTest::new()
+        .with_scenario(Scenario::Int)
+        .with_query(query)
+        .with_expected_errors(Some(0))
+        .with_expected_rows(3)
+        .with_pruned_files(Some(0))
+        .with_matched_by_stats(Some(4)) // RG0,1,2,3 matched
+        .with_fully_matched_by_stats(Some(0))
+        .with_pruned_by_stats(Some(1)) // RG4 pruned
+        .with_limit_pruned_row_groups(Some(0)) // no fully matched groups, nothing pruned by limit
+        .test_row_group_prune_with_custom_data(schema, batches, 3)
+        .await;
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_limit_pruning_exceeds_fully_matched() -> datafusion_common::error::Result<()>
+{
+    // Test Case 4: Limit exceeds all fully matched rows, need partially matched
+    // Row Group 0: a=[10,11,12,12] -> Partially matched, 1 row (a=10)
+    // Row Group 1: a=[10,10,10,10] -> Fully matched, 4 rows
+    // Row Group 2: a=[10,10,10,10] -> Fully matched, 4 rows
+    // Row Group 3: a=[10,13,14,11] -> Partially matched, 1 row (a=10)
+    // Row Group 4: a=[20,21,22,22] -> Not matched
+
+    // With LIMIT 10, we get RG1 (4) + RG2 (4) = 8 rows from fully matched
+    // groups. We still need 2 more, so the partially matched RG0 and RG3
+    // must be scanned as well: all matching row groups are scanned, and
+    // only RG4 is pruned, by statistics.
+
+    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
+    let query = "SELECT a FROM t WHERE a = 10 LIMIT 10";
+
+    let batches = vec![
+        make_i32_batch("a", vec![10, 11, 12, 12])?,
+        make_i32_batch("a", vec![10, 10, 10, 10])?,
+        make_i32_batch("a", vec![10, 10, 10, 10])?,
+        make_i32_batch("a", vec![10, 13, 14, 11])?,
+        make_i32_batch("a", vec![20, 21, 22, 22])?,
+    ];
+
+    RowGroupPruningTest::new()
+        .with_scenario(Scenario::Int)
+        .with_query(query)
+        .with_expected_errors(Some(0))
+        .with_expected_rows(10) // Total matching rows: 1 + 4 + 4 + 1 = 10, exactly the limit
+        .with_pruned_files(Some(0))
+        .with_matched_by_stats(Some(4)) // RG0,1,2,3 matched
+        .with_fully_matched_by_stats(Some(2))
+
.with_pruned_by_stats(Some(1)) // RG4 pruned + .with_limit_pruned_row_groups(Some(0)) // No limit pruning since we need all RGs + .test_row_group_prune_with_custom_data(schema, batches, 4) + .await; + + Ok(()) +} diff --git a/datafusion/datasource-parquet/src/metrics.rs b/datafusion/datasource-parquet/src/metrics.rs index 8ce3a081a2e32..e9673e16de56f 100644 --- a/datafusion/datasource-parquet/src/metrics.rs +++ b/datafusion/datasource-parquet/src/metrics.rs @@ -45,10 +45,18 @@ pub struct ParquetFileMetrics { pub files_ranges_pruned_statistics: PruningMetrics, /// Number of times the predicate could not be evaluated pub predicate_evaluation_errors: Count, - /// Number of row groups whose bloom filters were checked, tracked with matched/pruned counts - pub row_groups_pruned_bloom_filter: PruningMetrics, - /// Number of row groups whose statistics were checked, tracked with matched/pruned counts - pub row_groups_pruned_statistics: PruningMetrics, + /// Number of row groups whose bloom filters were checked and matched (not pruned) + pub row_groups_matched_bloom_filter: Count, + /// Number of row groups pruned by bloom filters + pub row_groups_pruned_bloom_filter: Count, + /// Number of row groups pruned due to limit pruning. + pub limit_pruned_row_groups: Count, + /// Number of row groups whose statistics were checked and fully matched + pub row_groups_fully_matched_statistics: Count, + /// Number of row groups whose statistics were checked and matched (not pruned) + pub row_groups_matched_statistics: Count, + /// Number of row groups pruned by statistics + pub row_groups_pruned_statistics: Count, /// Total number of bytes scanned pub bytes_scanned: Count, /// Total rows filtered out by predicates pushed into parquet scan @@ -96,8 +104,19 @@ impl ParquetFileMetrics { // ----------------------- let row_groups_pruned_bloom_filter = MetricBuilder::new(metrics) .with_new_label("filename", filename.to_string()) - .with_type(MetricType::SUMMARY) - .pruning_metrics("row_groups_pruned_bloom_filter", partition); + .counter("row_groups_pruned_bloom_filter", partition); + + let limit_pruned_row_groups = MetricBuilder::new(metrics) + .with_new_label("filename", filename.to_string()) + .counter("limit_pruned_row_groups", partition); + + let row_groups_fully_matched_statistics = MetricBuilder::new(metrics) + .with_new_label("filename", filename.to_string()) + .counter("row_groups_fully_matched_statistics", partition); + + let row_groups_matched_statistics = MetricBuilder::new(metrics) + .with_new_label("filename", filename.to_string()) + .counter("row_groups_matched_statistics", partition); let row_groups_pruned_statistics = MetricBuilder::new(metrics) .with_new_label("filename", filename.to_string()) @@ -172,7 +191,10 @@ impl ParquetFileMetrics { files_ranges_pruned_statistics, predicate_evaluation_errors, row_groups_pruned_bloom_filter, + row_groups_fully_matched_statistics, + row_groups_matched_statistics, row_groups_pruned_statistics, + limit_pruned_row_groups, bytes_scanned, pushdown_rows_pruned, pushdown_rows_matched, diff --git a/datafusion/datasource-parquet/src/opener.rs b/datafusion/datasource-parquet/src/opener.rs index 570f9b4412840..c50d313ed5448 100644 --- a/datafusion/datasource-parquet/src/opener.rs +++ b/datafusion/datasource-parquet/src/opener.rs @@ -545,11 +545,15 @@ impl FileOpener for ParquetOpener { .add_matched(n_remaining_row_groups); } - let mut access_plan = row_groups.build(); + // Prune by limit + if let Some(limit) = limit { + row_groups.prune_by_limit(limit, rg_metadata, 
&file_metrics);
+            }
 
             // --------------------------------------------------------
             // Step: prune pages from the kept row groups
             //
+            let mut access_plan = row_groups.build();
             // page index pruning: if all data on individual pages can
             // be ruled out using page metadata, rows from other columns
             // with that range can be skipped as well
diff --git a/datafusion/datasource-parquet/src/row_group_filter.rs b/datafusion/datasource-parquet/src/row_group_filter.rs
index 046379cc25e23..bed29aea4a4ad 100644
--- a/datafusion/datasource-parquet/src/row_group_filter.rs
+++ b/datafusion/datasource-parquet/src/row_group_filter.rs
@@ -24,6 +24,8 @@ use arrow::datatypes::Schema;
 use datafusion_common::pruning::PruningStatistics;
 use datafusion_common::{Column, Result, ScalarValue};
 use datafusion_datasource::FileRange;
+use datafusion_physical_expr::expressions::NotExpr;
+use datafusion_physical_expr::PhysicalExprSimplifier;
 use datafusion_pruning::PruningPredicate;
 use parquet::arrow::arrow_reader::statistics::StatisticsConverter;
 use parquet::arrow::parquet_column;
@@ -46,13 +48,19 @@ use parquet::{
 pub struct RowGroupAccessPlanFilter {
     /// which row groups should be accessed
     access_plan: ParquetAccessPlan,
+    /// which row groups are fully contained within the pruning predicate
+    is_fully_matched: Vec<bool>,
 }
 
 impl RowGroupAccessPlanFilter {
     /// Create a new `RowGroupPlanBuilder` for pruning out the groups to scan
     /// based on metadata and statistics
     pub fn new(access_plan: ParquetAccessPlan) -> Self {
-        Self { access_plan }
+        let num_row_groups = access_plan.len();
+        Self {
+            access_plan,
+            is_fully_matched: vec![false; num_row_groups],
+        }
     }
 
     /// Return true if there are no row groups
@@ -70,6 +78,49 @@ impl RowGroupAccessPlanFilter {
         self.access_plan
     }
 
+    /// Returns the is_fully_matched vector
+    pub fn is_fully_matched(&self) -> &Vec<bool> {
+        &self.is_fully_matched
+    }
+
+    /// Prunes the access plan based on the limit and fully contained row groups.
+    pub fn prune_by_limit(
+        &mut self,
+        limit: usize,
+        rg_metadata: &[RowGroupMetaData],
+        metrics: &ParquetFileMetrics,
+    ) {
+        let mut fully_matched_row_group_indexes: Vec<usize> = Vec::new();
+        let mut fully_matched_rows_count: usize = 0;
+
+        // Iterate through the currently accessible row groups
+        for &idx in self.access_plan.row_group_indexes().iter() {
+            if self.is_fully_matched[idx] {
+                let row_group_row_count = rg_metadata[idx].num_rows() as usize;
+                fully_matched_row_group_indexes.push(idx);
+                fully_matched_rows_count += row_group_row_count;
+                if fully_matched_rows_count >= limit {
+                    break;
+                }
+            }
+        }
+
+        if fully_matched_rows_count >= limit {
+            let original_num_accessible_row_groups =
+                self.access_plan.row_group_indexes().len();
+            let new_num_accessible_row_groups = fully_matched_row_group_indexes.len();
+            let pruned_count = original_num_accessible_row_groups
+                .saturating_sub(new_num_accessible_row_groups);
+            metrics.limit_pruned_row_groups.add(pruned_count);
+
+            let mut new_access_plan = ParquetAccessPlan::new_none(rg_metadata.len());
+            for &idx in &fully_matched_row_group_indexes {
+                new_access_plan.scan(idx);
+            }
+            self.access_plan = new_access_plan;
+        }
+    }
+
     /// Prune remaining row groups to only those within the specified range.
/// /// Updates this set to mark row groups that should not be scanned @@ -135,13 +186,55 @@ impl RowGroupAccessPlanFilter { // try to prune the row groups in a single call match predicate.prune(&pruning_stats) { Ok(values) => { - // values[i] is false means the predicate could not be true for row group i + let mut fully_contained_candidates_original_idx: Vec = Vec::new(); for (idx, &value) in row_group_indexes.iter().zip(values.iter()) { if !value { self.access_plan.skip(*idx); metrics.row_groups_pruned_statistics.add_pruned(1); } else { metrics.row_groups_pruned_statistics.add_matched(1); + fully_contained_candidates_original_idx.push(*idx); + metrics.row_groups_matched_statistics.add(1); + } + } + + // Note: this part of code shouldn't be expensive with a limited number of row groups + // If we do find it's expensive, we can consider optimizing it further. + if !fully_contained_candidates_original_idx.is_empty() { + // Use NotExpr to create the inverted predicate + let inverted_expr = + Arc::new(NotExpr::new(Arc::clone(predicate.orig_expr()))); + // Simplify the NOT expression (e.g., NOT(c1 = 0) -> c1 != 0) + // before building the pruning predicate + let mut simplifier = PhysicalExprSimplifier::new(arrow_schema); + let inverted_expr = simplifier.simplify(inverted_expr).unwrap(); + if let Ok(inverted_predicate) = PruningPredicate::try_new( + inverted_expr, + Arc::clone(predicate.schema()), + ) { + let inverted_pruning_stats = RowGroupPruningStatistics { + parquet_schema, + row_group_metadatas: fully_contained_candidates_original_idx + .iter() + .map(|&i| &groups[i]) + .collect::>(), + arrow_schema, + }; + + if let Ok(inverted_values) = + inverted_predicate.prune(&inverted_pruning_stats) + { + for (i, &original_row_group_idx) in + fully_contained_candidates_original_idx.iter().enumerate() + { + // If the inverted predicate *also* prunes this row group (meaning inverted_values[i] is false), + // it implies that *all* rows in this group satisfy the original predicate. + if !inverted_values[i] { + self.is_fully_matched[original_row_group_idx] = true; + metrics.row_groups_fully_matched_statistics.add(1); + } + } + } } } } diff --git a/datafusion/pruning/src/pruning_predicate.rs b/datafusion/pruning/src/pruning_predicate.rs index b5b8267d7f93f..5f1b4233b5d48 100644 --- a/datafusion/pruning/src/pruning_predicate.rs +++ b/datafusion/pruning/src/pruning_predicate.rs @@ -492,7 +492,6 @@ impl PruningPredicate { // Simplify the newly created predicate to get rid of redundant casts, comparisons, etc. 
let predicate_expr = PhysicalExprSimplifier::new(&predicate_schema).simplify(predicate_expr)?; - let literal_guarantees = LiteralGuarantee::analyze(&expr); Ok(Self { From 1d78b6f9dfe58ea6256b2475f5e02657708dd478 Mon Sep 17 00:00:00 2001 From: "xudong.w" Date: Fri, 21 Nov 2025 17:42:35 +0800 Subject: [PATCH 02/26] Support row group limit pruning --- datafusion/datasource-parquet/src/metrics.rs | 28 +++++-------------- .../src/row_group_filter.rs | 7 +++-- .../physical-expr-common/src/metrics/value.rs | 9 ++++++ 3 files changed, 20 insertions(+), 24 deletions(-) diff --git a/datafusion/datasource-parquet/src/metrics.rs b/datafusion/datasource-parquet/src/metrics.rs index e9673e16de56f..fbb14d9a6d90c 100644 --- a/datafusion/datasource-parquet/src/metrics.rs +++ b/datafusion/datasource-parquet/src/metrics.rs @@ -45,18 +45,12 @@ pub struct ParquetFileMetrics { pub files_ranges_pruned_statistics: PruningMetrics, /// Number of times the predicate could not be evaluated pub predicate_evaluation_errors: Count, - /// Number of row groups whose bloom filters were checked and matched (not pruned) - pub row_groups_matched_bloom_filter: Count, /// Number of row groups pruned by bloom filters - pub row_groups_pruned_bloom_filter: Count, + pub row_groups_pruned_bloom_filter: PruningMetrics, /// Number of row groups pruned due to limit pruning. - pub limit_pruned_row_groups: Count, - /// Number of row groups whose statistics were checked and fully matched - pub row_groups_fully_matched_statistics: Count, - /// Number of row groups whose statistics were checked and matched (not pruned) - pub row_groups_matched_statistics: Count, + pub limit_pruned_row_groups: PruningMetrics, /// Number of row groups pruned by statistics - pub row_groups_pruned_statistics: Count, + pub row_groups_pruned_statistics: PruningMetrics, /// Total number of bytes scanned pub bytes_scanned: Count, /// Total rows filtered out by predicates pushed into parquet scan @@ -104,19 +98,13 @@ impl ParquetFileMetrics { // ----------------------- let row_groups_pruned_bloom_filter = MetricBuilder::new(metrics) .with_new_label("filename", filename.to_string()) - .counter("row_groups_pruned_bloom_filter", partition); + .with_type(MetricType::SUMMARY) + .pruning_metrics("row_groups_pruned_bloom_filter", partition); let limit_pruned_row_groups = MetricBuilder::new(metrics) .with_new_label("filename", filename.to_string()) - .counter("limit_pruned_row_groups", partition); - - let row_groups_fully_matched_statistics = MetricBuilder::new(metrics) - .with_new_label("filename", filename.to_string()) - .counter("row_groups_fully_matched_statistics", partition); - - let row_groups_matched_statistics = MetricBuilder::new(metrics) - .with_new_label("filename", filename.to_string()) - .counter("row_groups_matched_statistics", partition); + .with_type(MetricType::SUMMARY) + .pruning_metrics("limit_pruned_row_groups", partition); let row_groups_pruned_statistics = MetricBuilder::new(metrics) .with_new_label("filename", filename.to_string()) @@ -191,8 +179,6 @@ impl ParquetFileMetrics { files_ranges_pruned_statistics, predicate_evaluation_errors, row_groups_pruned_bloom_filter, - row_groups_fully_matched_statistics, - row_groups_matched_statistics, row_groups_pruned_statistics, limit_pruned_row_groups, bytes_scanned, diff --git a/datafusion/datasource-parquet/src/row_group_filter.rs b/datafusion/datasource-parquet/src/row_group_filter.rs index bed29aea4a4ad..50979d3687771 100644 --- a/datafusion/datasource-parquet/src/row_group_filter.rs +++ 
b/datafusion/datasource-parquet/src/row_group_filter.rs @@ -111,7 +111,7 @@ impl RowGroupAccessPlanFilter { let new_num_accessible_row_groups = fully_matched_row_group_indexes.len(); let pruned_count = original_num_accessible_row_groups .saturating_sub(new_num_accessible_row_groups); - metrics.limit_pruned_row_groups.add(pruned_count); + metrics.limit_pruned_row_groups.add_pruned(pruned_count); let mut new_access_plan = ParquetAccessPlan::new_none(rg_metadata.len()); for &idx in &fully_matched_row_group_indexes { @@ -194,7 +194,6 @@ impl RowGroupAccessPlanFilter { } else { metrics.row_groups_pruned_statistics.add_matched(1); fully_contained_candidates_original_idx.push(*idx); - metrics.row_groups_matched_statistics.add(1); } } @@ -231,7 +230,9 @@ impl RowGroupAccessPlanFilter { // it implies that *all* rows in this group satisfy the original predicate. if !inverted_values[i] { self.is_fully_matched[original_row_group_idx] = true; - metrics.row_groups_fully_matched_statistics.add(1); + metrics + .row_groups_pruned_statistics + .add_fully_matched(1); } } } diff --git a/datafusion/physical-expr-common/src/metrics/value.rs b/datafusion/physical-expr-common/src/metrics/value.rs index 9a14b804a20b5..0054813164bcc 100644 --- a/datafusion/physical-expr-common/src/metrics/value.rs +++ b/datafusion/physical-expr-common/src/metrics/value.rs @@ -372,6 +372,7 @@ impl Drop for ScopedTimerGuard<'_> { pub struct PruningMetrics { pruned: Arc, matched: Arc, + fully_matched: Arc, } impl Display for PruningMetrics { @@ -400,6 +401,7 @@ impl PruningMetrics { Self { pruned: Arc::new(AtomicUsize::new(0)), matched: Arc::new(AtomicUsize::new(0)), + fully_matched: Arc::new(AtomicUsize::new(0)), } } @@ -417,6 +419,13 @@ impl PruningMetrics { self.matched.fetch_add(n, Ordering::Relaxed); } + /// Add `n` to the metric's fully matched value + pub fn add_fully_matched(&self, n: usize) { + // relaxed ordering for operations on `value` poses no issues + // we're purely using atomic ops with no associated memory ops + self.fully_matched.fetch_add(n, Ordering::Relaxed); + } + /// Subtract `n` to the metric's matched value. 
pub fn subtract_matched(&self, n: usize) { // relaxed ordering for operations on `value` poses no issues From d1fc3bd4936239e6cc678d48c52b6b5e35f4ba2b Mon Sep 17 00:00:00 2001 From: "xudong.w" Date: Tue, 25 Nov 2025 11:33:32 +0800 Subject: [PATCH 03/26] Add fetch_order_sensitive during limit pushdown to decide if use limit pruning --- datafusion/catalog-listing/src/table.rs | 1 + datafusion/catalog/src/table.rs | 20 +++++++ datafusion/core/src/physical_planner.rs | 4 +- datafusion/core/tests/parquet/mod.rs | 38 +++++++------ .../core/tests/parquet/row_group_pruning.rs | 4 +- datafusion/datasource-parquet/src/opener.rs | 8 ++- datafusion/datasource-parquet/src/source.rs | 1 + datafusion/datasource/src/file_scan_config.rs | 20 +++++++ datafusion/expr/src/logical_plan/builder.rs | 4 +- datafusion/expr/src/logical_plan/plan.rs | 12 ++++ datafusion/expr/src/logical_plan/tree_node.rs | 2 + .../optimizer/src/optimize_projections/mod.rs | 15 ++--- datafusion/optimizer/src/push_down_filter.rs | 5 +- datafusion/optimizer/src/push_down_limit.rs | 57 ++++++++++++++++++- .../physical-expr-common/src/metrics/value.rs | 5 ++ datafusion/proto/src/logical_plan/mod.rs | 1 + 16 files changed, 160 insertions(+), 37 deletions(-) diff --git a/datafusion/catalog-listing/src/table.rs b/datafusion/catalog-listing/src/table.rs index 38456944075fc..039f276d9492b 100644 --- a/datafusion/catalog-listing/src/table.rs +++ b/datafusion/catalog-listing/src/table.rs @@ -581,6 +581,7 @@ impl TableProvider for ListingTable { .with_statistics(statistics) .with_projection_indices(projection)? .with_limit(limit) + .with_limit_order_sensitive(args.limit_order_sensitive()) .with_output_ordering(output_ordering) .with_expr_adapter(self.expr_adapter_factory.clone()) .with_partitioned_by_file_group(partitioned_by_file_group) diff --git a/datafusion/catalog/src/table.rs b/datafusion/catalog/src/table.rs index 1f223852c2b9d..4f604482bb6c2 100644 --- a/datafusion/catalog/src/table.rs +++ b/datafusion/catalog/src/table.rs @@ -361,6 +361,7 @@ pub struct ScanArgs<'a> { filters: Option<&'a [Expr]>, projection: Option<&'a [usize]>, limit: Option, + limit_order_sensitive: bool, } impl<'a> ScanArgs<'a> { @@ -422,6 +423,25 @@ impl<'a> ScanArgs<'a> { pub fn limit(&self) -> Option { self.limit } + + /// Set whether the scan's limit should be order-sensitive. + /// + /// If specified, the scan should return the limited rows in a specific order. + /// Or we can leverage limit pruning to optimize the scan. + /// + /// # Arguments + /// * `order_sensitive` - Whether the scan's limit should be order-sensitive + pub fn with_limit_order_sensitive(mut self, order_sensitive: bool) -> Self { + self.limit_order_sensitive = order_sensitive; + self + } + + /// Get whether the scan's limit should be order-sensitive. + /// + /// Returns `true` if the scan's limit should be order-sensitive, or `false` if not. + pub fn limit_order_sensitive(&self) -> bool { + self.limit_order_sensitive + } } /// Result of a table scan operation from [`TableProvider::scan_with_args`]. diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index cc7d534776d7e..83e6bf1badbbb 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -460,6 +460,7 @@ impl DefaultPhysicalPlanner { projection, filters, fetch, + fetch_order_sensitive, .. 
}) => { let source = source_as_provider(source)?; @@ -471,7 +472,8 @@ impl DefaultPhysicalPlanner { let opts = ScanArgs::default() .with_projection(projection.as_deref()) .with_filters(Some(&filters_vec)) - .with_limit(*fetch); + .with_limit(*fetch) + .with_limit_order_sensitive(*fetch_order_sensitive); let res = source.scan_with_args(session_state, opts).await?; Arc::clone(res.plan()) } diff --git a/datafusion/core/tests/parquet/mod.rs b/datafusion/core/tests/parquet/mod.rs index 16ac557f18811..4d0209267514b 100644 --- a/datafusion/core/tests/parquet/mod.rs +++ b/datafusion/core/tests/parquet/mod.rs @@ -127,7 +127,7 @@ struct TestOutput { impl TestOutput { /// retrieve the value of the named metric, if any fn metric_value(&self, metric_name: &str) -> Option { - if let Some((pruned, _matched)) = self.pruning_metric(metric_name) { + if let Some((pruned, _matched, _fully)) = self.pruning_metric(metric_name) { return Some(pruned); } @@ -141,9 +141,10 @@ impl TestOutput { }) } - fn pruning_metric(&self, metric_name: &str) -> Option<(usize, usize)> { + fn pruning_metric(&self, metric_name: &str) -> Option<(usize, usize, usize)> { let mut total_pruned = 0; let mut total_matched = 0; + let mut total_fully_matched = 0; let mut found = false; for metric in self.parquet_metrics.iter() { @@ -152,15 +153,18 @@ impl TestOutput { && let MetricValue::PruningMetrics { pruning_metrics, .. } = metric.value() - { - total_pruned += pruning_metrics.pruned(); - total_matched += pruning_metrics.matched(); - found = true; + { + total_pruned += pruning_metrics.pruned(); + total_matched += pruning_metrics.matched(); + total_fully_matched += pruning_metrics.fully_matched(); + + found = true; + } } } if found { - Some((total_pruned, total_matched)) + Some((total_pruned, total_matched, total_fully_matched)) } else { None } @@ -172,32 +176,33 @@ impl TestOutput { } /// The number of row_groups pruned / matched by bloom filter - fn row_groups_bloom_filter(&self) -> Option<(usize, usize)> { + fn row_groups_bloom_filter(&self) -> Option<(usize, usize, usize)> { self.pruning_metric("row_groups_pruned_bloom_filter") } /// The number of row_groups matched by statistics fn row_groups_matched_statistics(&self) -> Option { self.pruning_metric("row_groups_pruned_statistics") - .map(|(_pruned, matched)| matched) + .map(|(_pruned, matched, _fully)| matched) } /// The number of row_groups fully matched by statistics fn row_groups_fully_matched_statistics(&self) -> Option { - self.metric_value("row_groups_fully_matched_statistics") + self.pruning_metric("row_groups_pruned_statistics") + .map(|(_pruned, _, fully)| fully) } /// The number of row_groups pruned by statistics fn row_groups_pruned_statistics(&self) -> Option { self.pruning_metric("row_groups_pruned_statistics") - .map(|(pruned, _matched)| pruned) + .map(|(pruned, _matched, _fully)| pruned) } /// Metric `files_ranges_pruned_statistics` tracks both pruned and matched count, /// for testing purpose, here it only aggregate the `pruned` count. fn files_ranges_pruned_statistics(&self) -> Option { self.pruning_metric("files_ranges_pruned_statistics") - .map(|(pruned, _matched)| pruned) + .map(|(pruned, _matched, _fully)| pruned) } /// The number of row_groups matched by bloom filter or statistics @@ -207,13 +212,13 @@ impl TestOutput { /// count. 
fn row_groups_matched(&self) -> Option { self.row_groups_bloom_filter() - .map(|(_pruned, matched)| matched) + .map(|(_pruned, matched, _fully)| matched) } /// The number of row_groups pruned fn row_groups_pruned(&self) -> Option { self.row_groups_bloom_filter() - .map(|(pruned, _matched)| pruned) + .map(|(pruned, _matched, _fully)| pruned) .zip(self.row_groups_pruned_statistics()) .map(|(a, b)| a + b) } @@ -221,12 +226,13 @@ impl TestOutput { /// The number of row pages pruned fn row_pages_pruned(&self) -> Option { self.pruning_metric("page_index_rows_pruned") - .map(|(pruned, _matched)| pruned) + .map(|(pruned, _matched, _fully)| pruned) } /// The number of row groups pruned by limit pruning fn limit_pruned_row_groups(&self) -> Option { - self.metric_value("limit_pruned_row_groups") + self.pruning_metric("limit_pruned_row_groups") + .map(|(pruned, _, _)| pruned) } fn description(&self) -> String { diff --git a/datafusion/core/tests/parquet/row_group_pruning.rs b/datafusion/core/tests/parquet/row_group_pruning.rs index e0ba462281ce3..f2e2561945140 100644 --- a/datafusion/core/tests/parquet/row_group_pruning.rs +++ b/datafusion/core/tests/parquet/row_group_pruning.rs @@ -157,12 +157,12 @@ impl RowGroupPruningTest { ); let bloom_filter_metrics = output.row_groups_bloom_filter(); assert_eq!( - bloom_filter_metrics.map(|(_pruned, matched)| matched), + bloom_filter_metrics.map(|(_pruned, matched, _)| matched), self.expected_row_group_matched_by_bloom_filter, "mismatched row_groups_matched_bloom_filter", ); assert_eq!( - bloom_filter_metrics.map(|(pruned, _matched)| pruned), + bloom_filter_metrics.map(|(pruned, _matched, _)| pruned), self.expected_row_group_pruned_by_bloom_filter, "mismatched row_groups_pruned_bloom_filter", ); diff --git a/datafusion/datasource-parquet/src/opener.rs b/datafusion/datasource-parquet/src/opener.rs index c50d313ed5448..3947524684efa 100644 --- a/datafusion/datasource-parquet/src/opener.rs +++ b/datafusion/datasource-parquet/src/opener.rs @@ -76,6 +76,8 @@ pub(super) struct ParquetOpener { pub batch_size: usize, /// Optional limit on the number of rows to read pub limit: Option, + /// limit order sensitivity + pub limit_order_sensitive: bool, /// Optional predicate to apply during the scan pub predicate: Option>, /// Table schema, including partition columns. 
@@ -277,6 +279,8 @@ impl FileOpener for ParquetOpener { let max_predicate_cache_size = self.max_predicate_cache_size; let reverse_row_groups = self.reverse_row_groups; + let limit_order_sensitive = self.limit_order_sensitive; + Ok(Box::pin(async move { #[cfg(feature = "parquet_encryption")] let file_decryption_properties = encryption_context @@ -545,8 +549,8 @@ impl FileOpener for ParquetOpener { .add_matched(n_remaining_row_groups); } - // Prune by limit - if let Some(limit) = limit { + // Prune by limit if limit is set and limit order is not sensitive + if let (Some(limit), false) = (limit, limit_order_sensitive) { row_groups.prune_by_limit(limit, rg_metadata, &file_metrics); } diff --git a/datafusion/datasource-parquet/src/source.rs b/datafusion/datasource-parquet/src/source.rs index 2e0919b1447de..d36e0fa106c0b 100644 --- a/datafusion/datasource-parquet/src/source.rs +++ b/datafusion/datasource-parquet/src/source.rs @@ -548,6 +548,7 @@ impl FileSource for ParquetSource { .batch_size .expect("Batch size must set before creating ParquetOpener"), limit: base_config.limit, + limit_order_sensitive: base_config.limit_order_sensitive, predicate: self.predicate.clone(), table_schema: self.table_schema.clone(), metadata_size_hint: self.metadata_size_hint, diff --git a/datafusion/datasource/src/file_scan_config.rs b/datafusion/datasource/src/file_scan_config.rs index 1f7c37315c47a..082f06829f14d 100644 --- a/datafusion/datasource/src/file_scan_config.rs +++ b/datafusion/datasource/src/file_scan_config.rs @@ -152,6 +152,8 @@ pub struct FileScanConfig { /// The maximum number of records to read from this plan. If `None`, /// all records after filtering are returned. pub limit: Option, + /// Whether the scan's limit is order sensitive + pub limit_order_sensitive: bool, /// All equivalent lexicographical orderings that describe the schema. pub output_ordering: Vec, /// File compression type @@ -240,6 +242,8 @@ pub struct FileScanConfigBuilder { object_store_url: ObjectStoreUrl, file_source: Arc, limit: Option, + limit_order_sensitive: bool, + projection_indices: Option>, constraints: Option, file_groups: Vec, statistics: Option, @@ -269,6 +273,8 @@ impl FileScanConfigBuilder { output_ordering: vec![], file_compression_type: None, limit: None, + limit_order_sensitive: false, + projection_indices: None, constraints: None, batch_size: None, expr_adapter_factory: None, @@ -283,6 +289,12 @@ impl FileScanConfigBuilder { self } + /// Set whether the limit should be order-sensitive. + pub fn with_limit_order_sensitive(mut self, order_sensitive: bool) -> Self { + self.limit_order_sensitive = order_sensitive; + self + } + /// Set the file source for scanning files. /// /// This method allows you to change the file source implementation (e.g. ParquetSource, CsvSource, etc.) 
@@ -450,6 +462,8 @@ impl FileScanConfigBuilder { object_store_url, file_source, limit, + limit_order_sensitive, + projection_indices, constraints, file_groups, statistics, @@ -471,6 +485,8 @@ impl FileScanConfigBuilder { object_store_url, file_source, limit, + limit_order_sensitive, + projection_exprs, constraints, file_groups, output_ordering, @@ -493,6 +509,10 @@ impl From for FileScanConfigBuilder { output_ordering: config.output_ordering, file_compression_type: Some(config.file_compression_type), limit: config.limit, + limit_order_sensitive: config.limit_order_sensitive, + projection_indices: config + .projection_exprs + .map(|p| p.ordered_column_indices()), constraints: Some(config.constraints), batch_size: config.batch_size, expr_adapter_factory: config.expr_adapter_factory, diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 6f654428e41a1..27852fd4b9897 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -2756,12 +2756,12 @@ mod tests { assert_snapshot!(plan, @r" Union - Cross Join: + Cross Join: SubqueryAlias: left Values: (Int32(1)) SubqueryAlias: right Values: (Int32(1)) - Cross Join: + Cross Join: SubqueryAlias: left Values: (Int32(1)) SubqueryAlias: right diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 4219c24bfc9c9..df7cbb7527a9f 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -2683,6 +2683,9 @@ pub struct TableScan { pub filters: Vec, /// Optional number of rows to read pub fetch: Option, + /// If the fetch is order-sensitive, it'll be true. + /// And the limit pruning will be enabled. + pub fetch_order_sensitive: bool, } impl Debug for TableScan { @@ -2705,6 +2708,7 @@ impl PartialEq for TableScan { && self.projected_schema == other.projected_schema && self.filters == other.filters && self.fetch == other.fetch + && self.fetch_order_sensitive == other.fetch_order_sensitive } } @@ -2724,18 +2728,22 @@ impl PartialOrd for TableScan { pub filters: &'a Vec, /// Optional number of rows to read pub fetch: &'a Option, + /// Whether the fetch is order-sensitive + pub fetch_order_sensitive: bool, } let comparable_self = ComparableTableScan { table_name: &self.table_name, projection: &self.projection, filters: &self.filters, fetch: &self.fetch, + fetch_order_sensitive: self.fetch_order_sensitive, }; let comparable_other = ComparableTableScan { table_name: &other.table_name, projection: &other.projection, filters: &other.filters, fetch: &other.fetch, + fetch_order_sensitive: other.fetch_order_sensitive, }; comparable_self .partial_cmp(&comparable_other) @@ -2751,6 +2759,7 @@ impl Hash for TableScan { self.projected_schema.hash(state); self.filters.hash(state); self.fetch.hash(state); + self.fetch_order_sensitive.hash(state); } } @@ -2804,6 +2813,7 @@ impl TableScan { projected_schema, filters, fetch, + fetch_order_sensitive: false, }) } } @@ -4968,6 +4978,7 @@ mod tests { projected_schema: Arc::clone(&schema), filters: vec![], fetch: None, + fetch_order_sensitive: false, })); let col = schema.field_names()[0].clone(); @@ -4998,6 +5009,7 @@ mod tests { projected_schema: Arc::clone(&unique_schema), filters: vec![], fetch: None, + fetch_order_sensitive: false, })); let col = schema.field_names()[0].clone(); diff --git a/datafusion/expr/src/logical_plan/tree_node.rs b/datafusion/expr/src/logical_plan/tree_node.rs index 62a27b0a025ad..c9ca99c20e08b 100644 --- 
a/datafusion/expr/src/logical_plan/tree_node.rs +++ b/datafusion/expr/src/logical_plan/tree_node.rs @@ -599,6 +599,7 @@ impl LogicalPlan { projected_schema, filters, fetch, + fetch_order_sensitive, }) => filters.map_elements(f)?.update_data(|filters| { LogicalPlan::TableScan(TableScan { table_name, @@ -607,6 +608,7 @@ impl LogicalPlan { projected_schema, filters, fetch, + fetch_order_sensitive, }) }), LogicalPlan::Distinct(Distinct::On(DistinctOn { diff --git a/datafusion/optimizer/src/optimize_projections/mod.rs b/datafusion/optimizer/src/optimize_projections/mod.rs index 548eadffa242e..c85793228ba05 100644 --- a/datafusion/optimizer/src/optimize_projections/mod.rs +++ b/datafusion/optimizer/src/optimize_projections/mod.rs @@ -259,6 +259,7 @@ fn optimize_projections( projection, filters, fetch, + fetch_order_sensitive, projected_schema: _, } = table_scan; @@ -268,15 +269,11 @@ fn optimize_projections( Some(projection) => indices.into_mapped_indices(|idx| projection[idx]), None => indices.into_inner(), }; - return TableScan::try_new( - table_name, - source, - Some(projection), - filters, - fetch, - ) - .map(LogicalPlan::TableScan) - .map(Transformed::yes); + let mut new_scan = + TableScan::try_new(table_name, source, Some(projection), filters, fetch)?; + new_scan.fetch_order_sensitive = fetch_order_sensitive; + + return Ok(Transformed::yes(LogicalPlan::TableScan(new_scan))); } // Other node types are handled below _ => {} diff --git a/datafusion/optimizer/src/push_down_filter.rs b/datafusion/optimizer/src/push_down_filter.rs index 755ffdbafc869..cdb372791999d 100644 --- a/datafusion/optimizer/src/push_down_filter.rs +++ b/datafusion/optimizer/src/push_down_filter.rs @@ -2331,7 +2331,7 @@ mod tests { plan, @r" Projection: test.a, test1.d - Cross Join: + Cross Join: Projection: test.a, test.b, test.c TableScan: test, full_filters=[test.a = Int32(1)] Projection: test1.d, test1.e, test1.f @@ -2361,7 +2361,7 @@ mod tests { plan, @r" Projection: test.a, test1.a - Cross Join: + Cross Join: Projection: test.a, test.b, test.c TableScan: test, full_filters=[test.a = Int32(1)] Projection: test1.a, test1.b, test1.c @@ -3119,6 +3119,7 @@ mod tests { projection, source: Arc::new(test_provider), fetch: None, + fetch_order_sensitive: false, }); Ok(LogicalPlanBuilder::from(table_scan)) diff --git a/datafusion/optimizer/src/push_down_limit.rs b/datafusion/optimizer/src/push_down_limit.rs index 7b302adf22acc..4e0e357a289df 100644 --- a/datafusion/optimizer/src/push_down_limit.rs +++ b/datafusion/optimizer/src/push_down_limit.rs @@ -25,6 +25,7 @@ use crate::{OptimizerConfig, OptimizerRule}; use datafusion_common::Result; use datafusion_common::tree_node::Transformed; +use datafusion_common::tree_node::{Transformed, TreeNode}; use datafusion_common::utils::combine_limit; use datafusion_expr::logical_plan::{Join, JoinType, Limit, LogicalPlan}; use datafusion_expr::{FetchType, SkipType, lit}; @@ -124,6 +125,9 @@ impl OptimizerRule for PushDownLimit { })), LogicalPlan::Sort(mut sort) => { + let marked_input = + mark_fetch_order_sensitive(Arc::unwrap_or_clone(sort.input))?; + sort.input = Arc::new(marked_input); let new_fetch = { let sort_fetch = skip + fetch; Some(sort.fetch.map(|f| f.min(sort_fetch)).unwrap_or(sort_fetch)) @@ -268,6 +272,17 @@ fn push_down_join(mut join: Join, limit: usize) -> Transformed { Transformed::yes(join) } +fn mark_fetch_order_sensitive(plan: LogicalPlan) -> Result { + plan.transform_down(|node| match node { + LogicalPlan::TableScan(mut scan) => { + scan.fetch_order_sensitive = true; + 
Ok(Transformed::yes(LogicalPlan::TableScan(scan))) + } + _ => Ok(Transformed::no(node)), + }) + .map(|t| t.data) +} + #[cfg(test)] mod test { use std::cmp::Ordering; @@ -275,10 +290,11 @@ mod test { use std::vec; use super::*; - use crate::assert_optimized_plan_eq_snapshot; use crate::test::*; + use crate::{assert_optimized_plan_eq_snapshot, Optimizer}; use crate::OptimizerContext; + use datafusion_common::tree_node::TreeNodeRecursion; use datafusion_common::DFSchemaRef; use datafusion_expr::{ Expr, Extension, UserDefinedLogicalNodeCore, col, exists, @@ -1044,7 +1060,7 @@ mod test { plan, @r" Limit: skip=0, fetch=1000 - Cross Join: + Cross Join: Limit: skip=0, fetch=1000 TableScan: test, fetch=1000 Limit: skip=0, fetch=1000 @@ -1067,7 +1083,7 @@ mod test { plan, @r" Limit: skip=1000, fetch=1000 - Cross Join: + Cross Join: Limit: skip=0, fetch=2000 TableScan: test, fetch=2000 Limit: skip=0, fetch=2000 @@ -1131,4 +1147,39 @@ mod test { " ) } + + fn has_fetch_order_sensitive_scan(plan: &LogicalPlan) -> bool { + let mut found = false; + plan.apply(|node| { + if let LogicalPlan::TableScan(scan) = node { + if scan.fetch_order_sensitive { + found = true; + return Ok(TreeNodeRecursion::Stop); + } + } + Ok(TreeNodeRecursion::Continue) + }) + .expect("plan traversal"); + found + } + + #[test] + fn limit_push_down_sort_marks_scans_order_sensitive() -> Result<()> { + let table_scan = test_table_scan()?; + + let plan = LogicalPlanBuilder::from(table_scan) + .sort_by(vec![col("a")])? + .limit(0, Some(10))? + .build()?; + + let optimizer_ctx = OptimizerContext::new().with_max_passes(1); + let rules: Vec> = + vec![Arc::new(PushDownLimit::new())]; + let optimized_plan = + Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; + + assert!(has_fetch_order_sensitive_scan(&optimized_plan)); + + Ok(()) + } } diff --git a/datafusion/physical-expr-common/src/metrics/value.rs b/datafusion/physical-expr-common/src/metrics/value.rs index 0054813164bcc..4bd1eb59d9bb6 100644 --- a/datafusion/physical-expr-common/src/metrics/value.rs +++ b/datafusion/physical-expr-common/src/metrics/value.rs @@ -442,6 +442,11 @@ impl PruningMetrics { pub fn matched(&self) -> usize { self.matched.load(Ordering::Relaxed) } + + /// Number of items fully matched + pub fn fully_matched(&self) -> usize { + self.fully_matched.load(Ordering::Relaxed) + } } /// Counters tracking ratio metrics (e.g. 
matched vs total)
diff --git a/datafusion/proto/src/logical_plan/mod.rs b/datafusion/proto/src/logical_plan/mod.rs
index 218c2e4e47d04..3f3c9f51e2ce6 100644
--- a/datafusion/proto/src/logical_plan/mod.rs
+++ b/datafusion/proto/src/logical_plan/mod.rs
@@ -267,6 +267,7 @@ fn from_table_source(
         projected_schema,
         filters: vec![],
        fetch: None,
+        fetch_order_sensitive: false,
     });
 
     LogicalPlanNode::try_from_logical_plan(&r, extension_codec)

From 8170789c9a9a3d82c348a4d3a7f7eff8380c6756 Mon Sep 17 00:00:00 2001
From: "xudong.w"
Date: Tue, 25 Nov 2025 14:08:05 +0800
Subject: [PATCH 04/26] fix test format

---
 datafusion/expr/src/logical_plan/builder.rs  | 4 ++--
 datafusion/optimizer/src/push_down_filter.rs | 4 ++--
 datafusion/optimizer/src/push_down_limit.rs  | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs
index 27852fd4b9897..6f654428e41a1 100644
--- a/datafusion/expr/src/logical_plan/builder.rs
+++ b/datafusion/expr/src/logical_plan/builder.rs
@@ -2756,12 +2756,12 @@ mod tests {
         assert_snapshot!(plan, @r"
         Union
-          Cross Join: 
+          Cross Join:
             SubqueryAlias: left
               Values: (Int32(1))
             SubqueryAlias: right
               Values: (Int32(1))
-          Cross Join: 
+          Cross Join:
             SubqueryAlias: left
               Values: (Int32(1))
             SubqueryAlias: right
diff --git a/datafusion/optimizer/src/push_down_filter.rs b/datafusion/optimizer/src/push_down_filter.rs
index cdb372791999d..cc4d099777e9f 100644
--- a/datafusion/optimizer/src/push_down_filter.rs
+++ b/datafusion/optimizer/src/push_down_filter.rs
@@ -2331,7 +2331,7 @@ mod tests {
             plan,
             @r"
         Projection: test.a, test1.d
-          Cross Join: 
+          Cross Join:
             Projection: test.a, test.b, test.c
               TableScan: test, full_filters=[test.a = Int32(1)]
             Projection: test1.d, test1.e, test1.f
@@ -2361,7 +2361,7 @@ mod tests {
             plan,
             @r"
         Projection: test.a, test1.a
-          Cross Join: 
+          Cross Join:
             Projection: test.a, test.b, test.c
               TableScan: test, full_filters=[test.a = Int32(1)]
             Projection: test1.a, test1.b, test1.c
diff --git a/datafusion/optimizer/src/push_down_limit.rs b/datafusion/optimizer/src/push_down_limit.rs
index 4e0e357a289df..8accbd5bfaf5f 100644
--- a/datafusion/optimizer/src/push_down_limit.rs
+++ b/datafusion/optimizer/src/push_down_limit.rs
@@ -1060,7 +1060,7 @@ mod test {
             plan,
             @r"
         Limit: skip=0, fetch=1000
-          Cross Join: 
+          Cross Join:
             Limit: skip=0, fetch=1000
               TableScan: test, fetch=1000
             Limit: skip=0, fetch=1000
@@ -1083,7 +1083,7 @@ mod test {
             plan,
             @r"
         Limit: skip=1000, fetch=1000
-          Cross Join: 
+          Cross Join:
             Limit: skip=0, fetch=2000
               TableScan: test, fetch=2000
             Limit: skip=0, fetch=2000

From 187e10b5acc9282a1ad664b82e847c1ce6e6117f Mon Sep 17 00:00:00 2001
From: "xudong.w"
Date: Thu, 27 Nov 2025 17:36:02 +0800
Subject: [PATCH 05/26] Rename to preserve_order

---
 datafusion/catalog-listing/src/table.rs       |  2 +-
 datafusion/catalog/src/table.rs               | 22 ++++++-------------
 datafusion/core/src/physical_planner.rs       |  4 ++--
 datafusion/datasource-parquet/src/opener.rs   |  8 +++----
 datafusion/datasource-parquet/src/source.rs   |  2 +-
 datafusion/datasource/src/file_scan_config.rs | 16 +++++++-------
 datafusion/expr/src/logical_plan/plan.rs      | 21 +++++++++---------
 datafusion/expr/src/logical_plan/tree_node.rs |  4 ++--
 .../optimizer/src/optimize_projections/mod.rs |  4 ++--
 datafusion/optimizer/src/push_down_filter.rs  |  2 +-
 datafusion/optimizer/src/push_down_limit.rs   | 13 +++++------
 datafusion/proto/src/logical_plan/mod.rs      |  2 +-
 12 files changed, 45 insertions(+), 55 deletions(-)

a/datafusion/catalog-listing/src/table.rs b/datafusion/catalog-listing/src/table.rs index 039f276d9492b..be4a16a7bd1e5 100644 --- a/datafusion/catalog-listing/src/table.rs +++ b/datafusion/catalog-listing/src/table.rs @@ -581,7 +581,7 @@ impl TableProvider for ListingTable { .with_statistics(statistics) .with_projection_indices(projection)? .with_limit(limit) - .with_limit_order_sensitive(args.limit_order_sensitive()) + .with_preserve_order(args.preserve_order()) .with_output_ordering(output_ordering) .with_expr_adapter(self.expr_adapter_factory.clone()) .with_partitioned_by_file_group(partitioned_by_file_group) diff --git a/datafusion/catalog/src/table.rs b/datafusion/catalog/src/table.rs index 4f604482bb6c2..e5206b9358f8e 100644 --- a/datafusion/catalog/src/table.rs +++ b/datafusion/catalog/src/table.rs @@ -361,7 +361,7 @@ pub struct ScanArgs<'a> { filters: Option<&'a [Expr]>, projection: Option<&'a [usize]>, limit: Option, - limit_order_sensitive: bool, + preserve_order: bool, } impl<'a> ScanArgs<'a> { @@ -424,23 +424,15 @@ impl<'a> ScanArgs<'a> { self.limit } - /// Set whether the scan's limit should be order-sensitive. - /// - /// If specified, the scan should return the limited rows in a specific order. - /// Or we can leverage limit pruning to optimize the scan. - /// - /// # Arguments - /// * `order_sensitive` - Whether the scan's limit should be order-sensitive - pub fn with_limit_order_sensitive(mut self, order_sensitive: bool) -> Self { - self.limit_order_sensitive = order_sensitive; + /// Set whether the output rows should be kept in order + pub fn with_preserve_order(mut self, order_sensitive: bool) -> Self { + self.preserve_order = order_sensitive; self } - /// Get whether the scan's limit should be order-sensitive. - /// - /// Returns `true` if the scan's limit should be order-sensitive, or `false` if not. - pub fn limit_order_sensitive(&self) -> bool { - self.limit_order_sensitive + /// Get whether the output rows should be kept in order + pub fn preserve_order(&self) -> bool { + self.preserve_order } } diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 83e6bf1badbbb..fcc315be00f0f 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -460,7 +460,7 @@ impl DefaultPhysicalPlanner { projection, filters, fetch, - fetch_order_sensitive, + preserve_order, .. }) => { let source = source_as_provider(source)?; @@ -473,7 +473,7 @@ .with_projection(projection.as_deref()) .with_filters(Some(&filters_vec)) .with_limit(*fetch) - .with_limit_order_sensitive(*fetch_order_sensitive); + .with_preserve_order(*preserve_order); let res = source.scan_with_args(session_state, opts).await?; Arc::clone(res.plan()) } diff --git a/datafusion/datasource-parquet/src/opener.rs b/datafusion/datasource-parquet/src/opener.rs index 3947524684efa..891f349635c04 100644 --- a/datafusion/datasource-parquet/src/opener.rs +++ b/datafusion/datasource-parquet/src/opener.rs @@ -76,8 +76,8 @@ pub(super) struct ParquetOpener { pub batch_size: usize, /// Optional limit on the number of rows to read pub limit: Option, - /// limit order sensitivity - pub limit_order_sensitive: bool, + /// Whether the output rows should be kept in order + pub preserve_order: bool, /// Optional predicate to apply during the scan pub predicate: Option>, /// Table schema, including partition columns. 
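// NOTE (illustrative, a sketch of the intended semantics rather than text from
// this diff): for a plain `SELECT ... LIMIT n`, any n rows satisfying the
// predicate are a correct answer, so the scan may skip remaining row groups
// once enough fully matched rows are guaranteed; under `ORDER BY ... LIMIT n`
// every candidate row group must still be read so the TopK operator above can
// select the true top n rows. `preserve_order` records which case applies.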
@@ -279,7 +279,7 @@ impl FileOpener for ParquetOpener { let max_predicate_cache_size = self.max_predicate_cache_size; let reverse_row_groups = self.reverse_row_groups; - let limit_order_sensitive = self.limit_order_sensitive; + let preserve_order = self.preserve_order; Ok(Box::pin(async move { #[cfg(feature = "parquet_encryption")] @@ -550,7 +550,7 @@ impl FileOpener for ParquetOpener { } // Prune by limit if limit is set and limit order is not sensitive - if let (Some(limit), false) = (limit, limit_order_sensitive) { + if let (Some(limit), false) = (limit, preserve_order) { row_groups.prune_by_limit(limit, rg_metadata, &file_metrics); } diff --git a/datafusion/datasource-parquet/src/source.rs b/datafusion/datasource-parquet/src/source.rs index d36e0fa106c0b..07f58db185f49 100644 --- a/datafusion/datasource-parquet/src/source.rs +++ b/datafusion/datasource-parquet/src/source.rs @@ -548,7 +548,7 @@ impl FileSource for ParquetSource { .batch_size .expect("Batch size must set before creating ParquetOpener"), limit: base_config.limit, - limit_order_sensitive: base_config.limit_order_sensitive, + preserve_order: base_config.preserve_order, predicate: self.predicate.clone(), table_schema: self.table_schema.clone(), metadata_size_hint: self.metadata_size_hint, diff --git a/datafusion/datasource/src/file_scan_config.rs b/datafusion/datasource/src/file_scan_config.rs index 082f06829f14d..42067d2392831 100644 --- a/datafusion/datasource/src/file_scan_config.rs +++ b/datafusion/datasource/src/file_scan_config.rs @@ -153,7 +153,7 @@ pub struct FileScanConfig { /// all records after filtering are returned. pub limit: Option, /// Whether the scan's limit is order sensitive - pub limit_order_sensitive: bool, + pub preserve_order: bool, /// All equivalent lexicographical orderings that describe the schema. pub output_ordering: Vec, /// File compression type @@ -242,7 +242,7 @@ pub struct FileScanConfigBuilder { object_store_url: ObjectStoreUrl, file_source: Arc, limit: Option, - limit_order_sensitive: bool, + preserve_order: bool, projection_indices: Option>, constraints: Option, file_groups: Vec, @@ -273,7 +273,7 @@ impl FileScanConfigBuilder { output_ordering: vec![], file_compression_type: None, limit: None, - limit_order_sensitive: false, + preserve_order: false, projection_indices: None, constraints: None, batch_size: None, @@ -290,8 +290,8 @@ impl FileScanConfigBuilder { } /// Set whether the limit should be order-sensitive. 
- pub fn with_limit_order_sensitive(mut self, order_sensitive: bool) -> Self { - self.limit_order_sensitive = order_sensitive; + pub fn with_preserve_order(mut self, order_sensitive: bool) -> Self { + self.preserve_order = order_sensitive; self } @@ -462,7 +462,7 @@ impl FileScanConfigBuilder { object_store_url, file_source, limit, - limit_order_sensitive, + preserve_order, projection_indices, constraints, file_groups, @@ -485,7 +485,7 @@ impl FileScanConfigBuilder { object_store_url, file_source, limit, - limit_order_sensitive, + preserve_order, projection_exprs, constraints, file_groups, @@ -509,7 +509,7 @@ impl From for FileScanConfigBuilder { output_ordering: config.output_ordering, file_compression_type: Some(config.file_compression_type), limit: config.limit, - limit_order_sensitive: config.limit_order_sensitive, + preserve_order: config.preserve_order, projection_indices: config .projection_exprs .map(|p| p.ordered_column_indices()), diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index df7cbb7527a9f..9c7f365749663 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -2683,9 +2683,8 @@ pub struct TableScan { pub filters: Vec, /// Optional number of rows to read pub fetch: Option, - /// If the fetch is order-sensitive, it'll be true. - /// And the limit pruning will be enabled. - pub fetch_order_sensitive: bool, + /// Whether the output rows should be kept in order + pub preserve_order: bool, } impl Debug for TableScan { @@ -2708,7 +2707,7 @@ impl PartialEq for TableScan { && self.projected_schema == other.projected_schema && self.filters == other.filters && self.fetch == other.fetch - && self.fetch_order_sensitive == other.fetch_order_sensitive + && self.preserve_order == other.preserve_order } } @@ -2729,21 +2728,21 @@ impl PartialOrd for TableScan { /// Optional number of rows to read pub fetch: &'a Option, /// Whether the fetch is order-sensitive - pub fetch_order_sensitive: bool, + pub preserve_order: bool, } let comparable_self = ComparableTableScan { table_name: &self.table_name, projection: &self.projection, filters: &self.filters, fetch: &self.fetch, - fetch_order_sensitive: self.fetch_order_sensitive, + preserve_order: self.preserve_order, }; let comparable_other = ComparableTableScan { table_name: &other.table_name, projection: &other.projection, filters: &other.filters, fetch: &other.fetch, - fetch_order_sensitive: other.fetch_order_sensitive, + preserve_order: other.preserve_order, }; comparable_self .partial_cmp(&comparable_other) @@ -2759,7 +2758,7 @@ impl Hash for TableScan { self.projected_schema.hash(state); self.filters.hash(state); self.fetch.hash(state); - self.fetch_order_sensitive.hash(state); + self.preserve_order.hash(state); } } @@ -2813,7 +2812,7 @@ impl TableScan { projected_schema, filters, fetch, - fetch_order_sensitive: false, + preserve_order: false, }) } } @@ -4978,7 +4977,7 @@ mod tests { projected_schema: Arc::clone(&schema), filters: vec![], fetch: None, - fetch_order_sensitive: false, + preserve_order: false, })); let col = schema.field_names()[0].clone(); @@ -5009,7 +5008,7 @@ mod tests { projected_schema: Arc::clone(&unique_schema), filters: vec![], fetch: None, - fetch_order_sensitive: false, + preserve_order: false, })); let col = schema.field_names()[0].clone(); diff --git a/datafusion/expr/src/logical_plan/tree_node.rs b/datafusion/expr/src/logical_plan/tree_node.rs index c9ca99c20e08b..5cae151dd5852 100644 --- 
a/datafusion/expr/src/logical_plan/tree_node.rs +++ b/datafusion/expr/src/logical_plan/tree_node.rs @@ -599,7 +599,7 @@ impl LogicalPlan { projected_schema, filters, fetch, - fetch_order_sensitive, + preserve_order, }) => filters.map_elements(f)?.update_data(|filters| { LogicalPlan::TableScan(TableScan { table_name, @@ -608,7 +608,7 @@ impl LogicalPlan { projected_schema, filters, fetch, - fetch_order_sensitive, + preserve_order, }) }), LogicalPlan::Distinct(Distinct::On(DistinctOn { diff --git a/datafusion/optimizer/src/optimize_projections/mod.rs b/datafusion/optimizer/src/optimize_projections/mod.rs index c85793228ba05..1d7635f990e9d 100644 --- a/datafusion/optimizer/src/optimize_projections/mod.rs +++ b/datafusion/optimizer/src/optimize_projections/mod.rs @@ -259,7 +259,7 @@ fn optimize_projections( projection, filters, fetch, - fetch_order_sensitive, + preserve_order, projected_schema: _, } = table_scan; @@ -271,7 +271,7 @@ fn optimize_projections( }; let mut new_scan = TableScan::try_new(table_name, source, Some(projection), filters, fetch)?; - new_scan.fetch_order_sensitive = fetch_order_sensitive; + new_scan.preserve_order = preserve_order; return Ok(Transformed::yes(LogicalPlan::TableScan(new_scan))); } diff --git a/datafusion/optimizer/src/push_down_filter.rs b/datafusion/optimizer/src/push_down_filter.rs index cc4d099777e9f..c104184d68e1c 100644 --- a/datafusion/optimizer/src/push_down_filter.rs +++ b/datafusion/optimizer/src/push_down_filter.rs @@ -3119,7 +3119,7 @@ mod tests { projection, source: Arc::new(test_provider), fetch: None, - fetch_order_sensitive: false, + preserve_order: false, }); Ok(LogicalPlanBuilder::from(table_scan)) diff --git a/datafusion/optimizer/src/push_down_limit.rs b/datafusion/optimizer/src/push_down_limit.rs index 8accbd5bfaf5f..582f4db20d9e4 100644 --- a/datafusion/optimizer/src/push_down_limit.rs +++ b/datafusion/optimizer/src/push_down_limit.rs @@ -125,8 +125,7 @@ impl OptimizerRule for PushDownLimit { })), LogicalPlan::Sort(mut sort) => { - let marked_input = - mark_fetch_order_sensitive(Arc::unwrap_or_clone(sort.input))?; + let marked_input = mark_preserve_order(Arc::unwrap_or_clone(sort.input))?; sort.input = Arc::new(marked_input); let new_fetch = { let sort_fetch = skip + fetch; @@ -272,10 +271,10 @@ fn push_down_join(mut join: Join, limit: usize) -> Transformed { Transformed::yes(join) } -fn mark_fetch_order_sensitive(plan: LogicalPlan) -> Result { +fn mark_preserve_order(plan: LogicalPlan) -> Result { plan.transform_down(|node| match node { LogicalPlan::TableScan(mut scan) => { - scan.fetch_order_sensitive = true; + scan.preserve_order = true; Ok(Transformed::yes(LogicalPlan::TableScan(scan))) } _ => Ok(Transformed::no(node)), @@ -1148,11 +1147,11 @@ mod test { ) } - fn has_fetch_order_sensitive_scan(plan: &LogicalPlan) -> bool { + fn has_preserve_order_scan(plan: &LogicalPlan) -> bool { let mut found = false; plan.apply(|node| { if let LogicalPlan::TableScan(scan) = node { - if scan.fetch_order_sensitive { + if scan.preserve_order { found = true; return Ok(TreeNodeRecursion::Stop); } @@ -1178,7 +1177,7 @@ mod test { let optimized_plan = Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; - assert!(has_fetch_order_sensitive_scan(&optimized_plan)); + assert!(has_preserve_order_scan(&optimized_plan)); Ok(()) } diff --git a/datafusion/proto/src/logical_plan/mod.rs b/datafusion/proto/src/logical_plan/mod.rs index 3f3c9f51e2ce6..1af4db1094840 100644 --- a/datafusion/proto/src/logical_plan/mod.rs +++ 
b/datafusion/proto/src/logical_plan/mod.rs @@ -267,7 +267,7 @@ fn from_table_source( projected_schema, filters: vec![], fetch: None, - fetch_order_sensitive: false, + preserve_order: false, }); LogicalPlanNode::try_from_logical_plan(&r, extension_codec) From d6dc4b7eefc41d1a740eebd219d1fd6a9c78d420 Mon Sep 17 00:00:00 2001 From: "xudong.w" Date: Thu, 27 Nov 2025 21:10:40 +0800 Subject: [PATCH 06/26] refactor pushdown limit --- datafusion/optimizer/src/push_down_limit.rs | 374 +++++++++++++++++++- 1 file changed, 357 insertions(+), 17 deletions(-) diff --git a/datafusion/optimizer/src/push_down_limit.rs b/datafusion/optimizer/src/push_down_limit.rs index 582f4db20d9e4..f2ad2a89ce59c 100644 --- a/datafusion/optimizer/src/push_down_limit.rs +++ b/datafusion/optimizer/src/push_down_limit.rs @@ -18,6 +18,7 @@ //! [`PushDownLimit`] pushes `LIMIT` earlier in the query plan use std::cmp::min; +use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; use crate::optimizer::ApplyOrder; @@ -33,12 +34,17 @@ use datafusion_expr::{FetchType, SkipType, lit}; /// Optimization rule that tries to push down `LIMIT`. //. It will push down through projection, limits (taking the smaller limit) #[derive(Default, Debug)] -pub struct PushDownLimit {} +pub struct PushDownLimit { + /// Flag to track whether we're currently under a Sort node that requires order preservation + preserve_order: AtomicBool, +} impl PushDownLimit { #[expect(missing_docs)] pub fn new() -> Self { - Self {} + Self { + preserve_order: AtomicBool::new(false), + } } } @@ -54,6 +60,27 @@ impl OptimizerRule for PushDownLimit { config: &dyn OptimizerConfig, ) -> Result> { let _ = config.options(); + if let LogicalPlan::TableScan(mut scan) = plan { + if self.preserve_order.load(Ordering::Relaxed) && !scan.preserve_order { + scan.preserve_order = true; + return Ok(Transformed::yes(LogicalPlan::TableScan(scan))); + } + return Ok(Transformed::no(LogicalPlan::TableScan(scan))); + } + + if matches!( + plan, + LogicalPlan::Aggregate(_) + | LogicalPlan::Join(_) + | LogicalPlan::Union(_) + | LogicalPlan::Window(_) + | LogicalPlan::Distinct(_) + ) { + // These operations will break the order, so the downstream TableScan does not need to preserve order + self.preserve_order.store(false, Ordering::Relaxed); + return Ok(Transformed::no(plan)); + } + let LogicalPlan::Limit(mut limit) = plan else { return Ok(Transformed::no(plan)); }; @@ -125,8 +152,7 @@ impl OptimizerRule for PushDownLimit { })), LogicalPlan::Sort(mut sort) => { - let marked_input = mark_preserve_order(Arc::unwrap_or_clone(sort.input))?; - sort.input = Arc::new(marked_input); + self.preserve_order.store(true, Ordering::Relaxed); let new_fetch = { let sort_fetch = skip + fetch; Some(sort.fetch.map(|f| f.min(sort_fetch)).unwrap_or(sort_fetch)) @@ -271,17 +297,6 @@ fn push_down_join(mut join: Join, limit: usize) -> Transformed { Transformed::yes(join) } -fn mark_preserve_order(plan: LogicalPlan) -> Result { - plan.transform_down(|node| match node { - LogicalPlan::TableScan(mut scan) => { - scan.preserve_order = true; - Ok(Transformed::yes(LogicalPlan::TableScan(scan))) - } - _ => Ok(Transformed::no(node)), - }) - .map(|t| t.data) -} - #[cfg(test)] mod test { use std::cmp::Ordering; @@ -293,8 +308,9 @@ mod test { use crate::{assert_optimized_plan_eq_snapshot, Optimizer}; use crate::OptimizerContext; - use datafusion_common::tree_node::TreeNodeRecursion; + use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion}; use datafusion_common::DFSchemaRef; + use 
datafusion_expr::expr::WindowFunctionParams; use datafusion_expr::{ Expr, Extension, UserDefinedLogicalNodeCore, col, exists, logical_plan::builder::LogicalPlanBuilder, @@ -1163,7 +1179,42 @@ mod test { } #[test] - fn limit_push_down_sort_marks_scans_order_sensitive() -> Result<()> { + fn limit_push_down_sort_marks_scans_preserev_order() -> Result<()> { + let table_scan = test_table_scan()?; + + let plan = LogicalPlanBuilder::from(table_scan) + .sort_by(vec![col("a")])? + .limit(0, Some(10))? + .build()?; + + let optimizer_ctx = OptimizerContext::new().with_max_passes(1); + let rules: Vec> = + vec![Arc::new(PushDownLimit::new())]; + let optimized_plan = + Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; + + assert!(has_preserve_order_scan(&optimized_plan)); + + Ok(()) + } + + // Helper function to count how many TableScans have preserve_order = true + fn count_preserve_order_scans(plan: &LogicalPlan) -> usize { + let mut count = 0; + plan.apply(|node| { + if let LogicalPlan::TableScan(scan) = node { + if scan.preserve_order { + count += 1; + } + } + Ok(TreeNodeRecursion::Continue) + }) + .expect("plan traversal"); + count + } + + #[test] + fn limit_push_down_sort_marks_scans_preserve_order() -> Result<()> { let table_scan = test_table_scan()?; let plan = LogicalPlanBuilder::from(table_scan) @@ -1181,4 +1232,293 @@ mod test { Ok(()) } + + #[test] + fn limit_push_down_sort_with_projection_marks_scans() -> Result<()> { + let table_scan = test_table_scan()?; + + let plan = LogicalPlanBuilder::from(table_scan) + .project(vec![col("a"), col("b")])? + .sort_by(vec![col("a")])? + .limit(0, Some(10))? + .build()?; + + let optimizer_ctx = OptimizerContext::new().with_max_passes(1); + let rules: Vec> = + vec![Arc::new(PushDownLimit::new())]; + let optimized_plan = + Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; + + assert!( + has_preserve_order_scan(&optimized_plan), + "Projection preserves order, scan should be marked" + ); + + Ok(()) + } + + #[test] + fn limit_push_down_sort_with_filter_marks_scans() -> Result<()> { + let table_scan = test_table_scan()?; + + let plan = LogicalPlanBuilder::from(table_scan) + .filter(col("a").gt(lit(5)))? + .sort_by(vec![col("a")])? + .limit(0, Some(10))? + .build()?; + + let optimizer_ctx = OptimizerContext::new().with_max_passes(1); + let rules: Vec> = + vec![Arc::new(PushDownLimit::new())]; + let optimized_plan = + Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; + + assert!( + has_preserve_order_scan(&optimized_plan), + "Filter preserves order, scan should be marked" + ); + + Ok(()) + } + + #[test] + fn limit_push_down_sort_with_aggregate_does_not_mark_scans() -> Result<()> { + let table_scan = test_table_scan()?; + + let plan = LogicalPlanBuilder::from(table_scan) + .aggregate(vec![col("a")], vec![max(col("b"))])? + .sort_by(vec![col("a")])? + .limit(0, Some(10))? 
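+            // PushDownLimit walks the plan top-down: the Limit/Sort set its
+            // preserve_order flag first, but visiting the Aggregate resets it
+            // before the TableScan is reached, so the scan is never marked
+            // preserve_order and stays eligible for limit pruning.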
+ .build()?; + + let optimizer_ctx = OptimizerContext::new().with_max_passes(1); + let rules: Vec> = + vec![Arc::new(PushDownLimit::new())]; + let optimized_plan = + Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; + + assert!( + !has_preserve_order_scan(&optimized_plan), + "Aggregate breaks order, scan should NOT be marked" + ); + + Ok(()) + } + + #[test] + fn limit_push_down_sort_with_join_does_not_mark_scans() -> Result<()> { + let table_scan_1 = test_table_scan()?; + let table_scan_2 = test_table_scan_with_name("test2")?; + + let plan = LogicalPlanBuilder::from(table_scan_1) + .join( + LogicalPlanBuilder::from(table_scan_2).build()?, + JoinType::Inner, + (vec!["a"], vec!["a"]), + None, + )? + .sort_by(vec![col("test.a")])? + .limit(0, Some(10))? + .build()?; + + let optimizer_ctx = OptimizerContext::new().with_max_passes(1); + let rules: Vec> = + vec![Arc::new(PushDownLimit::new())]; + let optimized_plan = + Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; + + assert_eq!( + count_preserve_order_scans(&optimized_plan), + 0, + "Join breaks order, scans should NOT be marked" + ); + + Ok(()) + } + + #[test] + fn limit_push_down_sort_with_union_does_not_mark_scans() -> Result<()> { + let table_scan_1 = test_table_scan()?; + let table_scan_2 = test_table_scan_with_name("test2")?; + + let plan = LogicalPlanBuilder::from(table_scan_1) + .union(LogicalPlanBuilder::from(table_scan_2).build()?)? + .sort_by(vec![col("a")])? + .limit(0, Some(10))? + .build()?; + + let optimizer_ctx = OptimizerContext::new().with_max_passes(1); + let rules: Vec> = + vec![Arc::new(PushDownLimit::new())]; + let optimized_plan = + Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; + + assert_eq!( + count_preserve_order_scans(&optimized_plan), + 0, + "Union breaks order, scans should NOT be marked" + ); + + Ok(()) + } + + #[test] + fn limit_push_down_sort_with_window_does_not_mark_scans() -> Result<()> { + let table_scan = test_table_scan()?; + + let window_expr = + Expr::WindowFunction(Box::new(datafusion_expr::expr::WindowFunction { + fun: datafusion_expr::WindowFunctionDefinition::AggregateUDF( + datafusion_functions_aggregate::sum::sum_udaf(), + ), + params: WindowFunctionParams { + args: vec![col("b")], + partition_by: vec![col("a")], + order_by: vec![], + window_frame: datafusion_expr::WindowFrame::new(None), + null_treatment: None, + filter: None, + distinct: false, + }, + })); + + let plan = LogicalPlanBuilder::from(table_scan) + .window(vec![window_expr.alias("sum_b")])? + .sort_by(vec![col("a")])? + .limit(0, Some(10))? + .build()?; + + let optimizer_ctx = OptimizerContext::new().with_max_passes(1); + let rules: Vec> = + vec![Arc::new(PushDownLimit::new())]; + let optimized_plan = + Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; + + assert!( + !has_preserve_order_scan(&optimized_plan), + "Window function breaks order, scan should NOT be marked" + ); + + Ok(()) + } + + #[test] + fn limit_push_down_sort_with_distinct_does_not_mark_scans() -> Result<()> { + let table_scan = test_table_scan()?; + + let plan = LogicalPlanBuilder::from(table_scan) + .distinct()? + .sort_by(vec![col("a")])? + .limit(0, Some(10))? 
+ .build()?; + + let optimizer_ctx = OptimizerContext::new().with_max_passes(1); + let rules: Vec> = + vec![Arc::new(PushDownLimit::new())]; + let optimized_plan = + Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; + + assert!( + !has_preserve_order_scan(&optimized_plan), + "Distinct breaks order, scan should NOT be marked" + ); + + Ok(()) + } + + #[test] + fn limit_push_down_sort_through_multiple_order_preserving_ops() -> Result<()> { + let table_scan = test_table_scan()?; + + let plan = LogicalPlanBuilder::from(table_scan) + .project(vec![col("a"), col("b")])? + .filter(col("a").gt(lit(5)))? + .limit(0, Some(100))? + .sort_by(vec![col("a")])? + .limit(0, Some(10))? + .build()?; + + let optimizer_ctx = OptimizerContext::new().with_max_passes(1); + let rules: Vec> = + vec![Arc::new(PushDownLimit::new())]; + let optimized_plan = + Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; + + assert!( + has_preserve_order_scan(&optimized_plan), + "Multiple order-preserving ops, scan should be marked" + ); + + Ok(()) + } + + #[test] + fn limit_push_down_without_sort_does_not_mark_scans() -> Result<()> { + let table_scan = test_table_scan()?; + + let plan = LogicalPlanBuilder::from(table_scan) + .limit(0, Some(10))? + .build()?; + + let optimizer_ctx = OptimizerContext::new().with_max_passes(1); + let rules: Vec> = + vec![Arc::new(PushDownLimit::new())]; + let optimized_plan = + Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; + + assert!( + !has_preserve_order_scan(&optimized_plan), + "Limit without Sort should NOT mark scan" + ); + + Ok(()) + } + + #[test] + fn limit_push_down_sort_with_subquery_alias_marks_scans() -> Result<()> { + let table_scan = test_table_scan()?; + + let plan = LogicalPlanBuilder::from(table_scan) + .alias("subquery")? + .sort_by(vec![col("a")])? + .limit(0, Some(10))? + .build()?; + + let optimizer_ctx = OptimizerContext::new().with_max_passes(1); + let rules: Vec> = + vec![Arc::new(PushDownLimit::new())]; + let optimized_plan = + Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; + + assert!( + has_preserve_order_scan(&optimized_plan), + "SubqueryAlias preserves order, scan should be marked" + ); + + Ok(()) + } + + #[test] + fn limit_push_down_sort_complex_aggregate_case() -> Result<()> { + let table_scan = test_table_scan()?; + + let plan = LogicalPlanBuilder::from(table_scan) + .aggregate(vec![col("a")], vec![max(col("b")).alias("max_b")])? + .sort_by(vec![col("max_b")])? + .limit(0, Some(10))? 
+ .build()?; + + let optimizer_ctx = OptimizerContext::new().with_max_passes(1); + let rules: Vec> = + vec![Arc::new(PushDownLimit::new())]; + let optimized_plan = + Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; + + assert!( + !has_preserve_order_scan(&optimized_plan), + "Sort on aggregate result should NOT mark input scan" + ); + + Ok(()) + } } From 62e1725d1d4ba98087623623d87d1c4f9091366a Mon Sep 17 00:00:00 2001 From: "xudong.w" Date: Thu, 27 Nov 2025 21:21:58 +0800 Subject: [PATCH 07/26] extract some logic into identify_fully_matched_row_groups --- .../src/row_group_filter.rs | 115 +++++++++++------- 1 file changed, 74 insertions(+), 41 deletions(-) diff --git a/datafusion/datasource-parquet/src/row_group_filter.rs b/datafusion/datasource-parquet/src/row_group_filter.rs index 50979d3687771..1d5f1e99f1687 100644 --- a/datafusion/datasource-parquet/src/row_group_filter.rs +++ b/datafusion/datasource-parquet/src/row_group_filter.rs @@ -84,6 +84,9 @@ impl RowGroupAccessPlanFilter { } /// Prunes the access plan based on the limit and fully contained row groups. + /// See the [description](https://github.com/apache/datafusion/issues/18860#issuecomment-3563442093) + /// for how the pruning works and improves performance. + /// For more information, see the [paper](https://arxiv.org/pdf/2504.11540)'s "Pruning for LIMIT Queries" part pub fn prune_by_limit( &mut self, limit: usize, @@ -197,47 +200,15 @@ impl RowGroupAccessPlanFilter { } } - // Note: this part of code shouldn't be expensive with a limited number of row groups - // If we do find it's expensive, we can consider optimizing it further. - if !fully_contained_candidates_original_idx.is_empty() { - // Use NotExpr to create the inverted predicate - let inverted_expr = - Arc::new(NotExpr::new(Arc::clone(predicate.orig_expr()))); - // Simplify the NOT expression (e.g., NOT(c1 = 0) -> c1 != 0) - // before building the pruning predicate - let mut simplifier = PhysicalExprSimplifier::new(arrow_schema); - let inverted_expr = simplifier.simplify(inverted_expr).unwrap(); - if let Ok(inverted_predicate) = PruningPredicate::try_new( - inverted_expr, - Arc::clone(predicate.schema()), - ) { - let inverted_pruning_stats = RowGroupPruningStatistics { - parquet_schema, - row_group_metadatas: fully_contained_candidates_original_idx - .iter() - .map(|&i| &groups[i]) - .collect::>(), - arrow_schema, - }; - - if let Ok(inverted_values) = - inverted_predicate.prune(&inverted_pruning_stats) - { - for (i, &original_row_group_idx) in - fully_contained_candidates_original_idx.iter().enumerate() - { - // If the inverted predicate *also* prunes this row group (meaning inverted_values[i] is false), - // it implies that *all* rows in this group satisfy the original predicate. - if !inverted_values[i] { - self.is_fully_matched[original_row_group_idx] = true; - metrics - .row_groups_pruned_statistics - .add_fully_matched(1); - } - } - } - } - } + // Check if any of the matched row groups are fully contained by the predicate + self.identify_fully_matched_row_groups( + fully_contained_candidates_original_idx, + arrow_schema, + parquet_schema, + groups, + predicate, + metrics, + ); } // stats filter array could not be built, so we can't prune Err(e) => { @@ -247,6 +218,68 @@ impl RowGroupAccessPlanFilter { } } + /// Identifies row groups that are fully matched by the predicate. + /// + /// This optimization checks whether all rows in a row group satisfy the predicate + /// by inverting the predicate and checking if it prunes the row group. 
If the + /// inverted predicate prunes a row group, it means no rows match the inverted + /// predicate, which implies all rows match the original predicate. + /// + /// Note: This optimization is relatively inexpensive for a limited number of row groups. + fn identify_fully_matched_row_groups( + &mut self, + candidate_row_group_indices: Vec, + arrow_schema: &Schema, + parquet_schema: &SchemaDescriptor, + groups: &[RowGroupMetaData], + predicate: &PruningPredicate, + metrics: &ParquetFileMetrics, + ) { + if candidate_row_group_indices.is_empty() { + return; + } + + // Use NotExpr to create the inverted predicate + let inverted_expr = Arc::new(NotExpr::new(Arc::clone(predicate.orig_expr()))); + + // Simplify the NOT expression (e.g., NOT(c1 = 0) -> c1 != 0) + // before building the pruning predicate + let mut simplifier = PhysicalExprSimplifier::new(arrow_schema); + let Ok(inverted_expr) = simplifier.simplify(inverted_expr) else { + return; + }; + + let Ok(inverted_predicate) = + PruningPredicate::try_new(inverted_expr, Arc::clone(predicate.schema())) + else { + return; + }; + + let inverted_pruning_stats = RowGroupPruningStatistics { + parquet_schema, + row_group_metadatas: candidate_row_group_indices + .iter() + .map(|&i| &groups[i]) + .collect::>(), + arrow_schema, + }; + + let Ok(inverted_values) = inverted_predicate.prune(&inverted_pruning_stats) + else { + return; + }; + + for (i, &original_row_group_idx) in candidate_row_group_indices.iter().enumerate() + { + // If the inverted predicate *also* prunes this row group (meaning inverted_values[i] is false), + // it implies that *all* rows in this group satisfy the original predicate. + if !inverted_values[i] { + self.is_fully_matched[original_row_group_idx] = true; + metrics.row_groups_pruned_statistics.add_fully_matched(1); + } + } + } + /// Prune remaining row groups using available bloom filters and the /// [`PruningPredicate`]. 
/// From 0229bd53688332c6806c4dbb59ab9bee0397e0de Mon Sep 17 00:00:00 2001 From: "xudong.w" Date: Thu, 27 Nov 2025 21:31:36 +0800 Subject: [PATCH 08/26] resolve conflicts --- datafusion/datasource/src/file_scan_config.rs | 7 ------- 1 file changed, 7 deletions(-) diff --git a/datafusion/datasource/src/file_scan_config.rs b/datafusion/datasource/src/file_scan_config.rs index 42067d2392831..50cdc5c78f804 100644 --- a/datafusion/datasource/src/file_scan_config.rs +++ b/datafusion/datasource/src/file_scan_config.rs @@ -243,7 +243,6 @@ pub struct FileScanConfigBuilder { file_source: Arc, limit: Option, preserve_order: bool, - projection_indices: Option>, constraints: Option, file_groups: Vec, statistics: Option, @@ -274,7 +273,6 @@ impl FileScanConfigBuilder { file_compression_type: None, limit: None, preserve_order: false, - projection_indices: None, constraints: None, batch_size: None, expr_adapter_factory: None, @@ -463,7 +461,6 @@ impl FileScanConfigBuilder { file_source, limit, preserve_order, - projection_indices, constraints, file_groups, statistics, @@ -486,7 +483,6 @@ impl FileScanConfigBuilder { file_source, limit, preserve_order, - projection_exprs, constraints, file_groups, output_ordering, @@ -510,9 +506,6 @@ impl From for FileScanConfigBuilder { file_compression_type: Some(config.file_compression_type), limit: config.limit, preserve_order: config.preserve_order, - projection_indices: config - .projection_exprs - .map(|p| p.ordered_column_indices()), constraints: Some(config.constraints), batch_size: config.batch_size, expr_adapter_factory: config.expr_adapter_factory, From 330775fcbdcd32a0fc6db3f46ab5698f9d85bc5c Mon Sep 17 00:00:00 2001 From: "xudong.w" Date: Fri, 28 Nov 2025 09:55:45 +0800 Subject: [PATCH 09/26] Add end to end sqllogictest --- .../src/row_group_filter.rs | 4 +- .../physical-expr-common/src/metrics/value.rs | 26 ++++-- .../sqllogictest/test_files/limit_pruning.slt | 77 ++++++++++++++++++ test_files/scratch/limit_pruning/data.parquet | Bin 0 -> 2320 bytes 4 files changed, 99 insertions(+), 8 deletions(-) create mode 100644 datafusion/sqllogictest/test_files/limit_pruning.slt create mode 100644 test_files/scratch/limit_pruning/data.parquet diff --git a/datafusion/datasource-parquet/src/row_group_filter.rs b/datafusion/datasource-parquet/src/row_group_filter.rs index 1d5f1e99f1687..6674d442a94d3 100644 --- a/datafusion/datasource-parquet/src/row_group_filter.rs +++ b/datafusion/datasource-parquet/src/row_group_filter.rs @@ -202,7 +202,7 @@ impl RowGroupAccessPlanFilter { // Check if any of the matched row groups are fully contained by the predicate self.identify_fully_matched_row_groups( - fully_contained_candidates_original_idx, + &fully_contained_candidates_original_idx, arrow_schema, parquet_schema, groups, @@ -228,7 +228,7 @@ impl RowGroupAccessPlanFilter { /// Note: This optimization is relatively inexpensive for a limited number of row groups. 
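/// For example (an illustrative sketch, not text from this patch): with the
/// predicate `a > 5` and a row group whose statistics report `min(a) = 10`
/// and `max(a) = 20`, the inverted predicate `a <= 5` is pruned by those
/// same statistics, since no value in `[10, 20]` can satisfy it; that proves
/// every row in the group matches `a > 5`, so the group is marked fully matched.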
fn identify_fully_matched_row_groups( &mut self, - candidate_row_group_indices: Vec, + candidate_row_group_indices: &[usize], arrow_schema: &Schema, parquet_schema: &SchemaDescriptor, groups: &[RowGroupMetaData], diff --git a/datafusion/physical-expr-common/src/metrics/value.rs b/datafusion/physical-expr-common/src/metrics/value.rs index 4bd1eb59d9bb6..5ecaa86fc386c 100644 --- a/datafusion/physical-expr-common/src/metrics/value.rs +++ b/datafusion/physical-expr-common/src/metrics/value.rs @@ -379,13 +379,24 @@ impl Display for PruningMetrics { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { let matched = self.matched.load(Ordering::Relaxed); let total = self.pruned.load(Ordering::Relaxed) + matched; + let fully_matched = self.fully_matched.load(Ordering::Relaxed); - write!( - f, - "{} total → {} matched", - human_readable_count(total), - human_readable_count(matched) - ) + if fully_matched != 0 { + write!( + f, + "{} total → {} matched -> {} fully matched", + human_readable_count(total), + human_readable_count(matched), + human_readable_count(fully_matched) + ) + } else { + write!( + f, + "{} total → {} matched", + human_readable_count(total), + human_readable_count(matched) + ) + } } } @@ -920,8 +931,11 @@ impl MetricValue { ) => { let pruned = other_pruning_metrics.pruned.load(Ordering::Relaxed); let matched = other_pruning_metrics.matched.load(Ordering::Relaxed); + let fully_matched = + other_pruning_metrics.fully_matched.load(Ordering::Relaxed); pruning_metrics.add_pruned(pruned); pruning_metrics.add_matched(matched); + pruning_metrics.add_fully_matched(fully_matched); } ( Self::Ratio { ratio_metrics, .. }, diff --git a/datafusion/sqllogictest/test_files/limit_pruning.slt b/datafusion/sqllogictest/test_files/limit_pruning.slt new file mode 100644 index 0000000000000..cc8a17e5b78b7 --- /dev/null +++ b/datafusion/sqllogictest/test_files/limit_pruning.slt @@ -0,0 +1,77 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +statement ok +set datafusion.execution.parquet.pushdown_filters = true; + + +statement ok +CREATE TABLE t AS VALUES + ('Anow Vole', 7), + ('Brown Bear', 133), + ('Gray Wolf', 82), + ('Lynx', 71), + ('Red Fox', 40), + ('Alpine Bat', 6), + ('Nlpine Ibex', 101), + ('Nlpine Goat', 76), + ('Nlpine Sheep', 83), + ('Europ. 
Mole', 4), + ('Polecat', 16), + ('Alpine Ibex', 97); + +statement ok +COPY (SELECT column1 as a, column2 as b FROM t) +TO 'test_files/scratch/limit_pruning/data.parquet' +STORED AS PARQUET +OPTIONS ( + 'format.max_row_group_size' '3' +); + +statement ok +drop table t; + +statement ok +CREATE EXTERNAL TABLE t +STORED AS PARQUET +LOCATION 'test_files/scratch/limit_pruning/data.parquet'; + + +statement ok +set datafusion.explain.analyze_level = summary; + +# row_groups_pruned_statistics=4 total → 3 matched -> 1 fully matched +# limit_pruned_row_groups=2 total → 0 matched +query TT +explain analyze select * from t where a > 'M' AND b >= 50 limit 3; +---- +Plan with Metrics DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/limit_pruning/data.parquet]]}, projection=[a, b], limit=3, file_type=parquet, predicate=a@0 > M AND b@1 >= 50, pruning_predicate=a_null_count@1 != row_count@2 AND a_max@0 > M AND b_null_count@4 != row_count@2 AND b_max@3 >= 50, required_guarantees=[], metrics=[output_rows=3, elapsed_compute=, output_bytes=, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=4 total → 3 matched -> 1 fully matched, row_groups_pruned_bloom_filter=3 total → 3 matched, page_index_rows_pruned=3 total → 3 matched, limit_pruned_row_groups=2 total → 0 matched, bytes_scanned=, metadata_load_time=, scan_efficiency_ratio=] + +# limit_pruned_row_groups=0 total → 0 matched +# because of order by, scan needs to preserve sort, so limit pruning is disabled +query TT +explain analyze select * from t where a > 'M' AND b >= 50 order by a limit 3; +---- +Plan with Metrics +01)SortExec: TopK(fetch=3), expr=[a@0 ASC NULLS LAST], preserve_partitioning=[false], filter=[a@0 < Nlpine Sheep], metrics=[output_rows=3, elapsed_compute=, output_bytes=] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/limit_pruning/data.parquet]]}, projection=[a, b], file_type=parquet, predicate=a@0 > M AND b@1 >= 50 AND DynamicFilter [ a@0 < Nlpine Sheep ], pruning_predicate=a_null_count@1 != row_count@2 AND a_max@0 > M AND b_null_count@4 != row_count@2 AND b_max@3 >= 50 AND a_null_count@1 != row_count@2 AND a_min@5 < Nlpine Sheep, required_guarantees=[], metrics=[output_rows=3, elapsed_compute=, output_bytes=, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=4 total → 3 matched -> 1 fully matched, row_groups_pruned_bloom_filter=3 total → 3 matched, page_index_rows_pruned=9 total → 9 matched, limit_pruned_row_groups=0 total → 0 matched, bytes_scanned=, metadata_load_time=, scan_efficiency_ratio=] + +statement ok +drop table t; + +statement ok +reset datafusion.explain.analyze_level; diff --git a/test_files/scratch/limit_pruning/data.parquet b/test_files/scratch/limit_pruning/data.parquet new file mode 100644 index 0000000000000000000000000000000000000000..535026d53c5553427b4ac2bbbcd277b2393460b5 GIT binary patch literal 2320 zcmbtWJ!lj`6n?W=_kOZ5r(}j*xZ(<3qJ;buN<6ff_2&}3Xzmn$KrnmB`h)QzBk_t8I{EW zuJch|%5#ImW`OP&eLvu&WRL&=S+2}vj#noABre=)Wu}~QeXmNcT-CdlIZ>HB%Yk=s zrw2bz;hzyA6aX{~Gnm%41u#WF&$gy`1TYqZTTsL(8XAm!U}NkL;aq;K6)wR*N0j`v z@@x!k(Vxg1sLWE`a&qcI+0VG%^dDkJmYRcZKytzOyHZcDF13TfQab`2t`#<1@@I1u zZ`wS1-uI^v`(o)$_J2zCArD9YsnjB$Xzl7S1FCfYm1}utT@9932ngn{1J=}EEJ`(gH4X`%2I7QGYzj$>+l&thbc;c- z(_&r=Ry191Q;zMlTG8u{=Z13ZBgeZ~-7%L?jx0GAk~Ifr+dX7@ReT}NTpmB~U-GyP z+}x^J%-ah2G}gv-T9rUpcHZuBAk9sC2N)?EL%1mn@sNo{=G0*5V#(KmjdvcOgv!Dyym=R>4pHTx+(;8*JaVz 
zjmo!+Bn9GQCl_L=ogV<~r!btj_@s4Vi=&v`W#T1smfM`K8c_?SWi<&dDwEi*n0+T^ zcT7ICKC>0wnb(n7s7b3XRARPvH!7cEb~lC0-kZei9x?l>iT7HU7ESGGK<&BVyxu_6 zTGVbe--M`yVxJ=TB!vX;ckpBD6MPqQRtzK<-V-YxULGR2b~h@YBDhEdzjYA7XGHMz z1|q0O6P#yai8*r#XD*Oscmu8Wri&SsOJo(dW#YE5@prOZNID-9$Su6PRy@4W#BJ?v zR6fP+J8`?wN!%V2x4F3ZtaafVn6^w=EFL+LtzA#+3+tn?vmkKj_NrJT9y aR|C6aptrxb-|oSuiC(8y*zr;PU+5RUYIsNh literal 0 HcmV?d00001 From e50361c549988c87f35d38f6073cd0a0979c6879 Mon Sep 17 00:00:00 2001 From: "xudong.w" Date: Thu, 18 Dec 2025 14:43:51 +0800 Subject: [PATCH 10/26] resolve conflicts --- datafusion/core/tests/parquet/mod.rs | 11 +++++------ datafusion/core/tests/parquet/row_group_pruning.rs | 4 ++-- datafusion/datasource-parquet/src/row_group_filter.rs | 4 ++-- datafusion/optimizer/src/push_down_limit.rs | 11 +++++------ 4 files changed, 14 insertions(+), 16 deletions(-) diff --git a/datafusion/core/tests/parquet/mod.rs b/datafusion/core/tests/parquet/mod.rs index 4d0209267514b..9234ff62e1688 100644 --- a/datafusion/core/tests/parquet/mod.rs +++ b/datafusion/core/tests/parquet/mod.rs @@ -153,13 +153,12 @@ impl TestOutput { && let MetricValue::PruningMetrics { pruning_metrics, .. } = metric.value() - { - total_pruned += pruning_metrics.pruned(); - total_matched += pruning_metrics.matched(); - total_fully_matched += pruning_metrics.fully_matched(); + { + total_pruned += pruning_metrics.pruned(); + total_matched += pruning_metrics.matched(); + total_fully_matched += pruning_metrics.fully_matched(); - found = true; - } + found = true; } } diff --git a/datafusion/core/tests/parquet/row_group_pruning.rs b/datafusion/core/tests/parquet/row_group_pruning.rs index f2e2561945140..744d90d22d110 100644 --- a/datafusion/core/tests/parquet/row_group_pruning.rs +++ b/datafusion/core/tests/parquet/row_group_pruning.rs @@ -1840,8 +1840,8 @@ async fn test_limit_pruning_complex_filter() -> datafusion_common::error::Result } #[tokio::test] -async fn test_limit_pruning_multiple_fully_matched( -) -> datafusion_common::error::Result<()> { +async fn test_limit_pruning_multiple_fully_matched() +-> datafusion_common::error::Result<()> { // Test Case 2: Limit requires multiple fully matched row groups // Row Group 0: a=[5,5,5,5] -> Fully matched, 4 rows // Row Group 1: a=[5,5,5,5] -> Fully matched, 4 rows diff --git a/datafusion/datasource-parquet/src/row_group_filter.rs b/datafusion/datasource-parquet/src/row_group_filter.rs index 6674d442a94d3..4d0d97531bcc1 100644 --- a/datafusion/datasource-parquet/src/row_group_filter.rs +++ b/datafusion/datasource-parquet/src/row_group_filter.rs @@ -24,8 +24,8 @@ use arrow::datatypes::Schema; use datafusion_common::pruning::PruningStatistics; use datafusion_common::{Column, Result, ScalarValue}; use datafusion_datasource::FileRange; -use datafusion_physical_expr::expressions::NotExpr; use datafusion_physical_expr::PhysicalExprSimplifier; +use datafusion_physical_expr::expressions::NotExpr; use datafusion_pruning::PruningPredicate; use parquet::arrow::arrow_reader::statistics::StatisticsConverter; use parquet::arrow::parquet_column; @@ -244,7 +244,7 @@ impl RowGroupAccessPlanFilter { // Simplify the NOT expression (e.g., NOT(c1 = 0) -> c1 != 0) // before building the pruning predicate - let mut simplifier = PhysicalExprSimplifier::new(arrow_schema); + let simplifier = PhysicalExprSimplifier::new(arrow_schema); let Ok(inverted_expr) = simplifier.simplify(inverted_expr) else { return; }; diff --git a/datafusion/optimizer/src/push_down_limit.rs b/datafusion/optimizer/src/push_down_limit.rs index 
f2ad2a89ce59c..1838ecbd578aa 100644 --- a/datafusion/optimizer/src/push_down_limit.rs +++ b/datafusion/optimizer/src/push_down_limit.rs @@ -18,15 +18,14 @@ //! [`PushDownLimit`] pushes `LIMIT` earlier in the query plan use std::cmp::min; -use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; use crate::optimizer::ApplyOrder; use crate::{OptimizerConfig, OptimizerRule}; use datafusion_common::Result; use datafusion_common::tree_node::Transformed; -use datafusion_common::tree_node::{Transformed, TreeNode}; use datafusion_common::utils::combine_limit; use datafusion_expr::logical_plan::{Join, JoinType, Limit, LogicalPlan}; use datafusion_expr::{FetchType, SkipType, lit}; @@ -305,11 +304,11 @@ mod test { use super::*; use crate::test::*; - use crate::{assert_optimized_plan_eq_snapshot, Optimizer}; + use crate::{Optimizer, assert_optimized_plan_eq_snapshot}; use crate::OptimizerContext; - use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion}; use datafusion_common::DFSchemaRef; + use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion}; use datafusion_expr::expr::WindowFunctionParams; use datafusion_expr::{ Expr, Extension, UserDefinedLogicalNodeCore, col, exists, @@ -1075,7 +1074,7 @@ mod test { plan, @r" Limit: skip=0, fetch=1000 - Cross Join: + Cross Join: Limit: skip=0, fetch=1000 TableScan: test, fetch=1000 Limit: skip=0, fetch=1000 @@ -1098,7 +1097,7 @@ mod test { plan, @r" Limit: skip=1000, fetch=1000 - Cross Join: + Cross Join: Limit: skip=0, fetch=2000 TableScan: test, fetch=2000 Limit: skip=0, fetch=2000 From 321429cd91caa6f5eb4cfc0c2c91a4f257da5c35 Mon Sep 17 00:00:00 2001 From: "xudong.w" Date: Thu, 18 Dec 2025 18:56:04 +0800 Subject: [PATCH 11/26] redesign --- Cargo.lock | 1 + datafusion/catalog-listing/src/table.rs | 1 - datafusion/catalog/src/table.rs | 12 - datafusion/core/src/physical_planner.rs | 4 +- datafusion/expr/src/logical_plan/plan.rs | 11 - datafusion/expr/src/logical_plan/tree_node.rs | 2 - .../optimizer/src/optimize_projections/mod.rs | 4 +- datafusion/optimizer/src/push_down_filter.rs | 1 - datafusion/optimizer/src/push_down_limit.rs | 399 +----------------- datafusion/physical-optimizer/Cargo.toml | 1 + .../src/enforce_sorting/mod.rs | 9 +- .../physical-optimizer/src/limit_pushdown.rs | 64 ++- datafusion/physical-plan/src/limit.rs | 32 ++ datafusion/proto/src/logical_plan/mod.rs | 1 - 14 files changed, 109 insertions(+), 433 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2d40ab4506900..f5e01ea1e10e8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2427,6 +2427,7 @@ version = "52.0.0" dependencies = [ "arrow", "datafusion-common", + "datafusion-datasource", "datafusion-execution", "datafusion-expr", "datafusion-expr-common", diff --git a/datafusion/catalog-listing/src/table.rs b/datafusion/catalog-listing/src/table.rs index be4a16a7bd1e5..38456944075fc 100644 --- a/datafusion/catalog-listing/src/table.rs +++ b/datafusion/catalog-listing/src/table.rs @@ -581,7 +581,6 @@ impl TableProvider for ListingTable { .with_statistics(statistics) .with_projection_indices(projection)? 
.with_limit(limit) - .with_preserve_order(args.preserve_order()) .with_output_ordering(output_ordering) .with_expr_adapter(self.expr_adapter_factory.clone()) .with_partitioned_by_file_group(partitioned_by_file_group) diff --git a/datafusion/catalog/src/table.rs b/datafusion/catalog/src/table.rs index e5206b9358f8e..1f223852c2b9d 100644 --- a/datafusion/catalog/src/table.rs +++ b/datafusion/catalog/src/table.rs @@ -361,7 +361,6 @@ pub struct ScanArgs<'a> { filters: Option<&'a [Expr]>, projection: Option<&'a [usize]>, limit: Option, - preserve_order: bool, } impl<'a> ScanArgs<'a> { @@ -423,17 +422,6 @@ impl<'a> ScanArgs<'a> { pub fn limit(&self) -> Option { self.limit } - - /// Set whether the output rows should be kept in order - pub fn with_preserve_order(mut self, order_sensitive: bool) -> Self { - self.preserve_order = order_sensitive; - self - } - - /// Get whether the output rows should be kept in order - pub fn preserve_order(&self) -> bool { - self.preserve_order - } } /// Result of a table scan operation from [`TableProvider::scan_with_args`]. diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index fcc315be00f0f..cc7d534776d7e 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -460,7 +460,6 @@ impl DefaultPhysicalPlanner { projection, filters, fetch, - preserve_order, .. }) => { let source = source_as_provider(source)?; @@ -472,8 +471,7 @@ impl DefaultPhysicalPlanner { let opts = ScanArgs::default() .with_projection(projection.as_deref()) .with_filters(Some(&filters_vec)) - .with_limit(*fetch) - .with_preserve_order(*preserve_order); + .with_limit(*fetch); let res = source.scan_with_args(session_state, opts).await?; Arc::clone(res.plan()) } diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 9c7f365749663..4219c24bfc9c9 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -2683,8 +2683,6 @@ pub struct TableScan { pub filters: Vec, /// Optional number of rows to read pub fetch: Option, - /// Whether the output rows should be kept in order - pub preserve_order: bool, } impl Debug for TableScan { @@ -2707,7 +2705,6 @@ impl PartialEq for TableScan { && self.projected_schema == other.projected_schema && self.filters == other.filters && self.fetch == other.fetch - && self.preserve_order == other.preserve_order } } @@ -2727,22 +2724,18 @@ impl PartialOrd for TableScan { pub filters: &'a Vec, /// Optional number of rows to read pub fetch: &'a Option, - /// Whether the fetch is order-sensitive - pub preserve_order: bool, } let comparable_self = ComparableTableScan { table_name: &self.table_name, projection: &self.projection, filters: &self.filters, fetch: &self.fetch, - preserve_order: self.preserve_order, }; let comparable_other = ComparableTableScan { table_name: &other.table_name, projection: &other.projection, filters: &other.filters, fetch: &other.fetch, - preserve_order: other.preserve_order, }; comparable_self .partial_cmp(&comparable_other) @@ -2758,7 +2751,6 @@ impl Hash for TableScan { self.projected_schema.hash(state); self.filters.hash(state); self.fetch.hash(state); - self.preserve_order.hash(state); } } @@ -2812,7 +2804,6 @@ impl TableScan { projected_schema, filters, fetch, - preserve_order: false, }) } } @@ -4977,7 +4968,6 @@ mod tests { projected_schema: Arc::clone(&schema), filters: vec![], fetch: None, - preserve_order: false, })); let col = schema.field_names()[0].clone(); @@ -5008,7 +4998,6 
@@ mod tests { projected_schema: Arc::clone(&unique_schema), filters: vec![], fetch: None, - preserve_order: false, })); let col = schema.field_names()[0].clone(); diff --git a/datafusion/expr/src/logical_plan/tree_node.rs b/datafusion/expr/src/logical_plan/tree_node.rs index 5cae151dd5852..62a27b0a025ad 100644 --- a/datafusion/expr/src/logical_plan/tree_node.rs +++ b/datafusion/expr/src/logical_plan/tree_node.rs @@ -599,7 +599,6 @@ impl LogicalPlan { projected_schema, filters, fetch, - preserve_order, }) => filters.map_elements(f)?.update_data(|filters| { LogicalPlan::TableScan(TableScan { table_name, @@ -608,7 +607,6 @@ impl LogicalPlan { projected_schema, filters, fetch, - preserve_order, }) }), LogicalPlan::Distinct(Distinct::On(DistinctOn { diff --git a/datafusion/optimizer/src/optimize_projections/mod.rs b/datafusion/optimizer/src/optimize_projections/mod.rs index 1d7635f990e9d..f97b05ea68fbd 100644 --- a/datafusion/optimizer/src/optimize_projections/mod.rs +++ b/datafusion/optimizer/src/optimize_projections/mod.rs @@ -259,7 +259,6 @@ fn optimize_projections( projection, filters, fetch, - preserve_order, projected_schema: _, } = table_scan; @@ -269,9 +268,8 @@ fn optimize_projections( Some(projection) => indices.into_mapped_indices(|idx| projection[idx]), None => indices.into_inner(), }; - let mut new_scan = + let new_scan = TableScan::try_new(table_name, source, Some(projection), filters, fetch)?; - new_scan.preserve_order = preserve_order; return Ok(Transformed::yes(LogicalPlan::TableScan(new_scan))); } diff --git a/datafusion/optimizer/src/push_down_filter.rs b/datafusion/optimizer/src/push_down_filter.rs index c104184d68e1c..755ffdbafc869 100644 --- a/datafusion/optimizer/src/push_down_filter.rs +++ b/datafusion/optimizer/src/push_down_filter.rs @@ -3119,7 +3119,6 @@ mod tests { projection, source: Arc::new(test_provider), fetch: None, - preserve_order: false, }); Ok(LogicalPlanBuilder::from(table_scan)) diff --git a/datafusion/optimizer/src/push_down_limit.rs b/datafusion/optimizer/src/push_down_limit.rs index 1838ecbd578aa..7b302adf22acc 100644 --- a/datafusion/optimizer/src/push_down_limit.rs +++ b/datafusion/optimizer/src/push_down_limit.rs @@ -19,7 +19,6 @@ use std::cmp::min; use std::sync::Arc; -use std::sync::atomic::{AtomicBool, Ordering}; use crate::optimizer::ApplyOrder; use crate::{OptimizerConfig, OptimizerRule}; @@ -33,17 +32,12 @@ use datafusion_expr::{FetchType, SkipType, lit}; /// Optimization rule that tries to push down `LIMIT`. //. 
It will push down through projection, limits (taking the smaller limit) #[derive(Default, Debug)] -pub struct PushDownLimit { - /// Flag to track whether we're currently under a Sort node that requires order preservation - preserve_order: AtomicBool, -} +pub struct PushDownLimit {} impl PushDownLimit { #[expect(missing_docs)] pub fn new() -> Self { - Self { - preserve_order: AtomicBool::new(false), - } + Self {} } } @@ -59,27 +53,6 @@ impl OptimizerRule for PushDownLimit { config: &dyn OptimizerConfig, ) -> Result> { let _ = config.options(); - if let LogicalPlan::TableScan(mut scan) = plan { - if self.preserve_order.load(Ordering::Relaxed) && !scan.preserve_order { - scan.preserve_order = true; - return Ok(Transformed::yes(LogicalPlan::TableScan(scan))); - } - return Ok(Transformed::no(LogicalPlan::TableScan(scan))); - } - - if matches!( - plan, - LogicalPlan::Aggregate(_) - | LogicalPlan::Join(_) - | LogicalPlan::Union(_) - | LogicalPlan::Window(_) - | LogicalPlan::Distinct(_) - ) { - // These operations will break the order, so the downstream TableScan does not need to preserve order - self.preserve_order.store(false, Ordering::Relaxed); - return Ok(Transformed::no(plan)); - } - let LogicalPlan::Limit(mut limit) = plan else { return Ok(Transformed::no(plan)); }; @@ -151,7 +124,6 @@ impl OptimizerRule for PushDownLimit { })), LogicalPlan::Sort(mut sort) => { - self.preserve_order.store(true, Ordering::Relaxed); let new_fetch = { let sort_fetch = skip + fetch; Some(sort.fetch.map(|f| f.min(sort_fetch)).unwrap_or(sort_fetch)) @@ -303,13 +275,11 @@ mod test { use std::vec; use super::*; + use crate::assert_optimized_plan_eq_snapshot; use crate::test::*; - use crate::{Optimizer, assert_optimized_plan_eq_snapshot}; use crate::OptimizerContext; use datafusion_common::DFSchemaRef; - use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion}; - use datafusion_expr::expr::WindowFunctionParams; use datafusion_expr::{ Expr, Extension, UserDefinedLogicalNodeCore, col, exists, logical_plan::builder::LogicalPlanBuilder, @@ -1074,7 +1044,7 @@ mod test { plan, @r" Limit: skip=0, fetch=1000 - Cross Join: + Cross Join: Limit: skip=0, fetch=1000 TableScan: test, fetch=1000 Limit: skip=0, fetch=1000 @@ -1097,7 +1067,7 @@ mod test { plan, @r" Limit: skip=1000, fetch=1000 - Cross Join: + Cross Join: Limit: skip=0, fetch=2000 TableScan: test, fetch=2000 Limit: skip=0, fetch=2000 @@ -1161,363 +1131,4 @@ mod test { " ) } - - fn has_preserve_order_scan(plan: &LogicalPlan) -> bool { - let mut found = false; - plan.apply(|node| { - if let LogicalPlan::TableScan(scan) = node { - if scan.preserve_order { - found = true; - return Ok(TreeNodeRecursion::Stop); - } - } - Ok(TreeNodeRecursion::Continue) - }) - .expect("plan traversal"); - found - } - - #[test] - fn limit_push_down_sort_marks_scans_preserev_order() -> Result<()> { - let table_scan = test_table_scan()?; - - let plan = LogicalPlanBuilder::from(table_scan) - .sort_by(vec![col("a")])? - .limit(0, Some(10))? 
- .build()?; - - let optimizer_ctx = OptimizerContext::new().with_max_passes(1); - let rules: Vec> = - vec![Arc::new(PushDownLimit::new())]; - let optimized_plan = - Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; - - assert!(has_preserve_order_scan(&optimized_plan)); - - Ok(()) - } - - // Helper function to count how many TableScans have preserve_order = true - fn count_preserve_order_scans(plan: &LogicalPlan) -> usize { - let mut count = 0; - plan.apply(|node| { - if let LogicalPlan::TableScan(scan) = node { - if scan.preserve_order { - count += 1; - } - } - Ok(TreeNodeRecursion::Continue) - }) - .expect("plan traversal"); - count - } - - #[test] - fn limit_push_down_sort_marks_scans_preserve_order() -> Result<()> { - let table_scan = test_table_scan()?; - - let plan = LogicalPlanBuilder::from(table_scan) - .sort_by(vec![col("a")])? - .limit(0, Some(10))? - .build()?; - - let optimizer_ctx = OptimizerContext::new().with_max_passes(1); - let rules: Vec> = - vec![Arc::new(PushDownLimit::new())]; - let optimized_plan = - Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; - - assert!(has_preserve_order_scan(&optimized_plan)); - - Ok(()) - } - - #[test] - fn limit_push_down_sort_with_projection_marks_scans() -> Result<()> { - let table_scan = test_table_scan()?; - - let plan = LogicalPlanBuilder::from(table_scan) - .project(vec![col("a"), col("b")])? - .sort_by(vec![col("a")])? - .limit(0, Some(10))? - .build()?; - - let optimizer_ctx = OptimizerContext::new().with_max_passes(1); - let rules: Vec> = - vec![Arc::new(PushDownLimit::new())]; - let optimized_plan = - Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; - - assert!( - has_preserve_order_scan(&optimized_plan), - "Projection preserves order, scan should be marked" - ); - - Ok(()) - } - - #[test] - fn limit_push_down_sort_with_filter_marks_scans() -> Result<()> { - let table_scan = test_table_scan()?; - - let plan = LogicalPlanBuilder::from(table_scan) - .filter(col("a").gt(lit(5)))? - .sort_by(vec![col("a")])? - .limit(0, Some(10))? - .build()?; - - let optimizer_ctx = OptimizerContext::new().with_max_passes(1); - let rules: Vec> = - vec![Arc::new(PushDownLimit::new())]; - let optimized_plan = - Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; - - assert!( - has_preserve_order_scan(&optimized_plan), - "Filter preserves order, scan should be marked" - ); - - Ok(()) - } - - #[test] - fn limit_push_down_sort_with_aggregate_does_not_mark_scans() -> Result<()> { - let table_scan = test_table_scan()?; - - let plan = LogicalPlanBuilder::from(table_scan) - .aggregate(vec![col("a")], vec![max(col("b"))])? - .sort_by(vec![col("a")])? - .limit(0, Some(10))? - .build()?; - - let optimizer_ctx = OptimizerContext::new().with_max_passes(1); - let rules: Vec> = - vec![Arc::new(PushDownLimit::new())]; - let optimized_plan = - Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; - - assert!( - !has_preserve_order_scan(&optimized_plan), - "Aggregate breaks order, scan should NOT be marked" - ); - - Ok(()) - } - - #[test] - fn limit_push_down_sort_with_join_does_not_mark_scans() -> Result<()> { - let table_scan_1 = test_table_scan()?; - let table_scan_2 = test_table_scan_with_name("test2")?; - - let plan = LogicalPlanBuilder::from(table_scan_1) - .join( - LogicalPlanBuilder::from(table_scan_2).build()?, - JoinType::Inner, - (vec!["a"], vec!["a"]), - None, - )? - .sort_by(vec![col("test.a")])? - .limit(0, Some(10))? 
- .build()?; - - let optimizer_ctx = OptimizerContext::new().with_max_passes(1); - let rules: Vec> = - vec![Arc::new(PushDownLimit::new())]; - let optimized_plan = - Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; - - assert_eq!( - count_preserve_order_scans(&optimized_plan), - 0, - "Join breaks order, scans should NOT be marked" - ); - - Ok(()) - } - - #[test] - fn limit_push_down_sort_with_union_does_not_mark_scans() -> Result<()> { - let table_scan_1 = test_table_scan()?; - let table_scan_2 = test_table_scan_with_name("test2")?; - - let plan = LogicalPlanBuilder::from(table_scan_1) - .union(LogicalPlanBuilder::from(table_scan_2).build()?)? - .sort_by(vec![col("a")])? - .limit(0, Some(10))? - .build()?; - - let optimizer_ctx = OptimizerContext::new().with_max_passes(1); - let rules: Vec> = - vec![Arc::new(PushDownLimit::new())]; - let optimized_plan = - Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; - - assert_eq!( - count_preserve_order_scans(&optimized_plan), - 0, - "Union breaks order, scans should NOT be marked" - ); - - Ok(()) - } - - #[test] - fn limit_push_down_sort_with_window_does_not_mark_scans() -> Result<()> { - let table_scan = test_table_scan()?; - - let window_expr = - Expr::WindowFunction(Box::new(datafusion_expr::expr::WindowFunction { - fun: datafusion_expr::WindowFunctionDefinition::AggregateUDF( - datafusion_functions_aggregate::sum::sum_udaf(), - ), - params: WindowFunctionParams { - args: vec![col("b")], - partition_by: vec![col("a")], - order_by: vec![], - window_frame: datafusion_expr::WindowFrame::new(None), - null_treatment: None, - filter: None, - distinct: false, - }, - })); - - let plan = LogicalPlanBuilder::from(table_scan) - .window(vec![window_expr.alias("sum_b")])? - .sort_by(vec![col("a")])? - .limit(0, Some(10))? - .build()?; - - let optimizer_ctx = OptimizerContext::new().with_max_passes(1); - let rules: Vec> = - vec![Arc::new(PushDownLimit::new())]; - let optimized_plan = - Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; - - assert!( - !has_preserve_order_scan(&optimized_plan), - "Window function breaks order, scan should NOT be marked" - ); - - Ok(()) - } - - #[test] - fn limit_push_down_sort_with_distinct_does_not_mark_scans() -> Result<()> { - let table_scan = test_table_scan()?; - - let plan = LogicalPlanBuilder::from(table_scan) - .distinct()? - .sort_by(vec![col("a")])? - .limit(0, Some(10))? - .build()?; - - let optimizer_ctx = OptimizerContext::new().with_max_passes(1); - let rules: Vec> = - vec![Arc::new(PushDownLimit::new())]; - let optimized_plan = - Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; - - assert!( - !has_preserve_order_scan(&optimized_plan), - "Distinct breaks order, scan should NOT be marked" - ); - - Ok(()) - } - - #[test] - fn limit_push_down_sort_through_multiple_order_preserving_ops() -> Result<()> { - let table_scan = test_table_scan()?; - - let plan = LogicalPlanBuilder::from(table_scan) - .project(vec![col("a"), col("b")])? - .filter(col("a").gt(lit(5)))? - .limit(0, Some(100))? - .sort_by(vec![col("a")])? - .limit(0, Some(10))? 
- .build()?; - - let optimizer_ctx = OptimizerContext::new().with_max_passes(1); - let rules: Vec> = - vec![Arc::new(PushDownLimit::new())]; - let optimized_plan = - Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; - - assert!( - has_preserve_order_scan(&optimized_plan), - "Multiple order-preserving ops, scan should be marked" - ); - - Ok(()) - } - - #[test] - fn limit_push_down_without_sort_does_not_mark_scans() -> Result<()> { - let table_scan = test_table_scan()?; - - let plan = LogicalPlanBuilder::from(table_scan) - .limit(0, Some(10))? - .build()?; - - let optimizer_ctx = OptimizerContext::new().with_max_passes(1); - let rules: Vec> = - vec![Arc::new(PushDownLimit::new())]; - let optimized_plan = - Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; - - assert!( - !has_preserve_order_scan(&optimized_plan), - "Limit without Sort should NOT mark scan" - ); - - Ok(()) - } - - #[test] - fn limit_push_down_sort_with_subquery_alias_marks_scans() -> Result<()> { - let table_scan = test_table_scan()?; - - let plan = LogicalPlanBuilder::from(table_scan) - .alias("subquery")? - .sort_by(vec![col("a")])? - .limit(0, Some(10))? - .build()?; - - let optimizer_ctx = OptimizerContext::new().with_max_passes(1); - let rules: Vec> = - vec![Arc::new(PushDownLimit::new())]; - let optimized_plan = - Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; - - assert!( - has_preserve_order_scan(&optimized_plan), - "SubqueryAlias preserves order, scan should be marked" - ); - - Ok(()) - } - - #[test] - fn limit_push_down_sort_complex_aggregate_case() -> Result<()> { - let table_scan = test_table_scan()?; - - let plan = LogicalPlanBuilder::from(table_scan) - .aggregate(vec![col("a")], vec![max(col("b")).alias("max_b")])? - .sort_by(vec![col("max_b")])? - .limit(0, Some(10))? 
- .build()?; - - let optimizer_ctx = OptimizerContext::new().with_max_passes(1); - let rules: Vec> = - vec![Arc::new(PushDownLimit::new())]; - let optimized_plan = - Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; - - assert!( - !has_preserve_order_scan(&optimized_plan), - "Sort on aggregate result should NOT mark input scan" - ); - - Ok(()) - } } diff --git a/datafusion/physical-optimizer/Cargo.toml b/datafusion/physical-optimizer/Cargo.toml index 395da10d629ba..caa9ee7b46914 100644 --- a/datafusion/physical-optimizer/Cargo.toml +++ b/datafusion/physical-optimizer/Cargo.toml @@ -43,6 +43,7 @@ recursive_protection = ["dep:recursive"] [dependencies] arrow = { workspace = true } datafusion-common = { workspace = true } +datafusion-datasource = { workspace = true } datafusion-execution = { workspace = true } datafusion-expr = { workspace = true } datafusion-expr-common = { workspace = true, default-features = true } diff --git a/datafusion/physical-optimizer/src/enforce_sorting/mod.rs b/datafusion/physical-optimizer/src/enforce_sorting/mod.rs index a5fafb9e87e1d..a2e0ddcb3bcca 100644 --- a/datafusion/physical-optimizer/src/enforce_sorting/mod.rs +++ b/datafusion/physical-optimizer/src/enforce_sorting/mod.rs @@ -583,9 +583,14 @@ fn analyze_immediate_sort_removal( if let Some(fetch) = sort_exec.fetch() { // If the sort has a fetch, we need to add a limit: if properties.output_partitioning().partition_count() == 1 { - Arc::new(GlobalLimitExec::new(Arc::clone(sort_input), 0, Some(fetch))) + let mut global_limit = + GlobalLimitExec::new(Arc::clone(sort_input), 0, Some(fetch)); + global_limit.set_order_sensitive(true); + Arc::new(global_limit) } else { - Arc::new(LocalLimitExec::new(Arc::clone(sort_input), fetch)) + let mut local_limit = LocalLimitExec::new(Arc::clone(sort_input), fetch); + local_limit.set_order_sensitive(true); + Arc::new(local_limit) } } else { Arc::clone(sort_input) diff --git a/datafusion/physical-optimizer/src/limit_pushdown.rs b/datafusion/physical-optimizer/src/limit_pushdown.rs index 4cb3abe30bae2..b5f6c35e17295 100644 --- a/datafusion/physical-optimizer/src/limit_pushdown.rs +++ b/datafusion/physical-optimizer/src/limit_pushdown.rs @@ -27,6 +27,8 @@ use datafusion_common::config::ConfigOptions; use datafusion_common::error::Result; use datafusion_common::tree_node::{Transformed, TreeNodeRecursion}; use datafusion_common::utils::combine_limit; +use datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder}; +use datafusion_datasource::source::DataSourceExec; use datafusion_physical_plan::coalesce_partitions::CoalescePartitionsExec; use datafusion_physical_plan::limit::{GlobalLimitExec, LocalLimitExec}; use datafusion_physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; @@ -50,6 +52,7 @@ pub struct GlobalRequirements { fetch: Option, skip: usize, satisfied: bool, + order_sensitive: bool, } impl LimitPushdown { @@ -69,6 +72,7 @@ impl PhysicalOptimizerRule for LimitPushdown { fetch: None, skip: 0, satisfied: false, + order_sensitive: false, }; pushdown_limits(plan, global_state) } @@ -111,6 +115,13 @@ impl LimitExec { Self::Local(_) => 0, } } + + fn order_sensitive(&self) -> bool { + match self { + Self::Global(global) => global.order_sensitive(), + Self::Local(local) => local.order_sensitive(), + } + } } impl From for Arc { @@ -145,6 +156,7 @@ pub fn pushdown_limit_helper( ); global_state.skip = skip; global_state.fetch = fetch; + global_state.order_sensitive = limit_exec.order_sensitive(); // Now the global 
state has the most recent information, we can remove // the `LimitExec` plan. We will decide later if we should add it again @@ -241,17 +253,30 @@ pub fn pushdown_limit_helper( let maybe_fetchable = pushdown_plan.with_fetch(skip_and_fetch); if global_state.satisfied { if let Some(plan_with_fetch) = maybe_fetchable { - Ok((Transformed::yes(plan_with_fetch), global_state)) + let plan_with_preserve_order = ensure_preserve_order_if_needed( + plan_with_fetch, + global_state.order_sensitive, + ); + Ok((Transformed::yes(plan_with_preserve_order), global_state)) } else { Ok((Transformed::no(pushdown_plan), global_state)) } } else { global_state.satisfied = true; pushdown_plan = if let Some(plan_with_fetch) = maybe_fetchable { + let plan_with_preserve_order = ensure_preserve_order_if_needed( + plan_with_fetch, + global_state.order_sensitive, + ); + if global_skip > 0 { - add_global_limit(plan_with_fetch, global_skip, Some(global_fetch)) + add_global_limit( + plan_with_preserve_order, + global_skip, + Some(global_fetch), + ) } else { - plan_with_fetch + plan_with_preserve_order } } else { add_limit(pushdown_plan, global_skip, global_fetch) @@ -337,4 +362,37 @@ fn add_global_limit( Arc::new(GlobalLimitExec::new(pushdown_plan, skip, fetch)) as _ } +/// Helper function to handle DataSourceExec preserve_order setting +fn ensure_preserve_order_if_needed( + plan: Arc, + order_sensitive: bool, +) -> Arc { + if !order_sensitive { + return plan; + } + + let Some(data_source_exec) = plan.as_any().downcast_ref::() else { + return plan; + }; + + let Some(file_scan_config) = data_source_exec + .data_source() + .as_any() + .downcast_ref::() + else { + return plan; + }; + + if file_scan_config.preserve_order { + return plan; + } + + let new_config = FileScanConfigBuilder::from(file_scan_config.clone()) + .with_preserve_order(true) + .build(); + + let new_data_source_exec = DataSourceExec::new(Arc::new(new_config)); + Arc::new(new_data_source_exec) as Arc +} + // See tests in datafusion/core/tests/physical_optimizer diff --git a/datafusion/physical-plan/src/limit.rs b/datafusion/physical-plan/src/limit.rs index 05d6882821477..bf27769a9c776 100644 --- a/datafusion/physical-plan/src/limit.rs +++ b/datafusion/physical-plan/src/limit.rs @@ -51,6 +51,9 @@ pub struct GlobalLimitExec { /// Execution metrics metrics: ExecutionPlanMetricsSet, cache: PlanProperties, + /// Whether the limit is order-sensitive + /// Such as the child plan is a sort node, then the limit is order-sensitive + order_sensitive: bool, } impl GlobalLimitExec { @@ -63,6 +66,7 @@ impl GlobalLimitExec { fetch, metrics: ExecutionPlanMetricsSet::new(), cache, + order_sensitive: false, } } @@ -91,6 +95,18 @@ impl GlobalLimitExec { Boundedness::Bounded, ) } + + /// Whether the limit is order-sensitive + /// Such as the child plan is a sort node, then the limit is order-sensitive + pub fn order_sensitive(&self) -> bool { + self.order_sensitive + } + + /// Whether the limit is order-sensitive + /// Such as the child plan is a sort node, then the limit is order-sensitive + pub fn set_order_sensitive(&mut self, order_sensitive: bool) { + self.order_sensitive = order_sensitive; + } } impl DisplayAs for GlobalLimitExec { @@ -223,6 +239,9 @@ pub struct LocalLimitExec { /// Execution metrics metrics: ExecutionPlanMetricsSet, cache: PlanProperties, + /// Whether the limit is order-sensitive + /// Such as the child plan is a sort node, then the limit is order-sensitive + order_sensitive: bool, } impl LocalLimitExec { @@ -234,6 +253,7 @@ impl LocalLimitExec { fetch, 
metrics: ExecutionPlanMetricsSet::new(), cache, + order_sensitive: false, } } @@ -257,6 +277,18 @@ impl LocalLimitExec { Boundedness::Bounded, ) } + + /// Whether the limit is order-sensitive + /// Such as the child plan is a sort node, then the limit is order-sensitive + pub fn order_sensitive(&self) -> bool { + self.order_sensitive + } + + /// Whether the limit is order-sensitive + /// Such as the child plan is a sort node, then the limit is order-sensitive + pub fn set_order_sensitive(&mut self, order_sensitive: bool) { + self.order_sensitive = order_sensitive; + } } impl DisplayAs for LocalLimitExec { diff --git a/datafusion/proto/src/logical_plan/mod.rs b/datafusion/proto/src/logical_plan/mod.rs index 1af4db1094840..218c2e4e47d04 100644 --- a/datafusion/proto/src/logical_plan/mod.rs +++ b/datafusion/proto/src/logical_plan/mod.rs @@ -267,7 +267,6 @@ fn from_table_source( projected_schema, filters: vec![], fetch: None, - preserve_order: false, }); LogicalPlanNode::try_from_logical_plan(&r, extension_codec) From 31ae9cf74a5beeaee8adfba0c1052a6baf77bd52 Mon Sep 17 00:00:00 2001 From: "xudong.w" Date: Mon, 22 Dec 2025 13:23:52 +0800 Subject: [PATCH 12/26] use required_ordering --- .../src/enforce_sorting/mod.rs | 5 ++- .../physical-optimizer/src/limit_pushdown.rs | 4 +- datafusion/physical-plan/src/limit.rs | 45 +++++++++---------- 3 files changed, 26 insertions(+), 28 deletions(-) diff --git a/datafusion/physical-optimizer/src/enforce_sorting/mod.rs b/datafusion/physical-optimizer/src/enforce_sorting/mod.rs index a2e0ddcb3bcca..247ebb2785dd3 100644 --- a/datafusion/physical-optimizer/src/enforce_sorting/mod.rs +++ b/datafusion/physical-optimizer/src/enforce_sorting/mod.rs @@ -581,15 +581,16 @@ fn analyze_immediate_sort_removal( // Remove the sort: node.children = node.children.swap_remove(0).children; if let Some(fetch) = sort_exec.fetch() { + let required_ordering = sort_exec.properties().output_ordering().cloned(); // If the sort has a fetch, we need to add a limit: if properties.output_partitioning().partition_count() == 1 { let mut global_limit = GlobalLimitExec::new(Arc::clone(sort_input), 0, Some(fetch)); - global_limit.set_order_sensitive(true); + global_limit.set_required_ordering(required_ordering); Arc::new(global_limit) } else { let mut local_limit = LocalLimitExec::new(Arc::clone(sort_input), fetch); - local_limit.set_order_sensitive(true); + local_limit.set_required_ordering(required_ordering); Arc::new(local_limit) } } else { diff --git a/datafusion/physical-optimizer/src/limit_pushdown.rs b/datafusion/physical-optimizer/src/limit_pushdown.rs index b5f6c35e17295..d259025b61bf1 100644 --- a/datafusion/physical-optimizer/src/limit_pushdown.rs +++ b/datafusion/physical-optimizer/src/limit_pushdown.rs @@ -118,8 +118,8 @@ impl LimitExec { fn order_sensitive(&self) -> bool { match self { - Self::Global(global) => global.order_sensitive(), - Self::Local(local) => local.order_sensitive(), + Self::Global(global) => global.required_ordering().is_some(), + Self::Local(local) => local.required_ordering().is_some(), } } } diff --git a/datafusion/physical-plan/src/limit.rs b/datafusion/physical-plan/src/limit.rs index bf27769a9c776..b85a7a8f80405 100644 --- a/datafusion/physical-plan/src/limit.rs +++ b/datafusion/physical-plan/src/limit.rs @@ -35,6 +35,7 @@ use arrow::record_batch::RecordBatch; use datafusion_common::{Result, assert_eq_or_internal_err, internal_err}; use datafusion_execution::TaskContext; +use datafusion_physical_expr::LexOrdering; use futures::stream::{Stream, 
StreamExt}; use log::trace; @@ -51,9 +52,9 @@ pub struct GlobalLimitExec { /// Execution metrics metrics: ExecutionPlanMetricsSet, cache: PlanProperties, - /// Whether the limit is order-sensitive - /// Such as the child plan is a sort node, then the limit is order-sensitive - order_sensitive: bool, + /// If the child plan is a sort node, after the sort node is removed during + /// physical optimization, we should add the required ordering to the limit node + required_ordering: Option, } impl GlobalLimitExec { @@ -66,7 +67,7 @@ impl GlobalLimitExec { fetch, metrics: ExecutionPlanMetricsSet::new(), cache, - order_sensitive: false, + required_ordering: None, } } @@ -96,16 +97,14 @@ impl GlobalLimitExec { ) } - /// Whether the limit is order-sensitive - /// Such as the child plan is a sort node, then the limit is order-sensitive - pub fn order_sensitive(&self) -> bool { - self.order_sensitive + /// Get the required ordering from limit + pub fn required_ordering(&self) -> &Option { + &self.required_ordering } - /// Whether the limit is order-sensitive - /// Such as the child plan is a sort node, then the limit is order-sensitive - pub fn set_order_sensitive(&mut self, order_sensitive: bool) { - self.order_sensitive = order_sensitive; + /// Set the required ordering for limit + pub fn set_required_ordering(&mut self, required_ordering: Option) { + self.required_ordering = required_ordering; } } @@ -239,9 +238,9 @@ pub struct LocalLimitExec { /// Execution metrics metrics: ExecutionPlanMetricsSet, cache: PlanProperties, - /// Whether the limit is order-sensitive - /// Such as the child plan is a sort node, then the limit is order-sensitive - order_sensitive: bool, + /// If the child plan is a sort node, after the sort node is removed during + /// physical optimization, we should add the required ordering to the limit node + required_ordering: Option, } impl LocalLimitExec { @@ -253,7 +252,7 @@ impl LocalLimitExec { fetch, metrics: ExecutionPlanMetricsSet::new(), cache, - order_sensitive: false, + required_ordering: None, } } @@ -278,16 +277,14 @@ impl LocalLimitExec { ) } - /// Whether the limit is order-sensitive - /// Such as the child plan is a sort node, then the limit is order-sensitive - pub fn order_sensitive(&self) -> bool { - self.order_sensitive + /// Get the required ordering from limit + pub fn required_ordering(&self) -> &Option { + &self.required_ordering } - /// Whether the limit is order-sensitive - /// Such as the child plan is a sort node, then the limit is order-sensitive - pub fn set_order_sensitive(&mut self, order_sensitive: bool) { - self.order_sensitive = order_sensitive; + /// Set the required ordering for limit + pub fn set_required_ordering(&mut self, required_ordering: Option) { + self.required_ordering = required_ordering; } } From 4602a764bc621ac1e96af0c9af5e2e859db7da94 Mon Sep 17 00:00:00 2001 From: "xudong.w" Date: Wed, 24 Dec 2025 17:35:40 +0800 Subject: [PATCH 13/26] resolve conflicts --- datafusion/core/tests/parquet/mod.rs | 51 ++++++++++++++----- .../core/tests/parquet/row_group_pruning.rs | 4 +- datafusion/datasource-parquet/src/opener.rs | 3 ++ .../src/row_group_filter.rs | 2 +- 4 files changed, 43 insertions(+), 17 deletions(-) diff --git a/datafusion/core/tests/parquet/mod.rs b/datafusion/core/tests/parquet/mod.rs index 9234ff62e1688..3fff22d7c0855 100644 --- a/datafusion/core/tests/parquet/mod.rs +++ b/datafusion/core/tests/parquet/mod.rs @@ -110,6 +110,26 @@ struct ContextWithParquet { ctx: SessionContext, } +struct PruningMetric { + total_pruned: 
usize, + total_matched: usize, + total_fully_matched: usize, +} + +impl PruningMetric { + pub fn total_pruned(&self) -> usize { + self.total_pruned + } + + pub fn total_matched(&self) -> usize { + self.total_matched + } + + pub fn total_fully_matched(&self) -> usize { + self.total_fully_matched + } +} + /// The output of running one of the test cases struct TestOutput { /// The input query SQL @@ -127,8 +147,8 @@ struct TestOutput { impl TestOutput { /// retrieve the value of the named metric, if any fn metric_value(&self, metric_name: &str) -> Option { - if let Some((pruned, _matched, _fully)) = self.pruning_metric(metric_name) { - return Some(pruned); + if let Some(pm) = self.pruning_metric(metric_name) { + return Some(pm.total_pruned()); } self.parquet_metrics @@ -141,7 +161,7 @@ impl TestOutput { }) } - fn pruning_metric(&self, metric_name: &str) -> Option<(usize, usize, usize)> { + fn pruning_metric(&self, metric_name: &str) -> Option { let mut total_pruned = 0; let mut total_matched = 0; let mut total_fully_matched = 0; @@ -163,7 +183,11 @@ impl TestOutput { } if found { - Some((total_pruned, total_matched, total_fully_matched)) + Some(PruningMetric { + total_pruned, + total_matched, + total_fully_matched, + }) } else { None } @@ -175,33 +199,33 @@ impl TestOutput { } /// The number of row_groups pruned / matched by bloom filter - fn row_groups_bloom_filter(&self) -> Option<(usize, usize, usize)> { + fn row_groups_bloom_filter(&self) -> Option { self.pruning_metric("row_groups_pruned_bloom_filter") } /// The number of row_groups matched by statistics fn row_groups_matched_statistics(&self) -> Option { self.pruning_metric("row_groups_pruned_statistics") - .map(|(_pruned, matched, _fully)| matched) + .map(|pm| pm.total_matched()) } /// The number of row_groups fully matched by statistics fn row_groups_fully_matched_statistics(&self) -> Option { self.pruning_metric("row_groups_pruned_statistics") - .map(|(_pruned, _, fully)| fully) + .map(|pm| pm.total_fully_matched()) } /// The number of row_groups pruned by statistics fn row_groups_pruned_statistics(&self) -> Option { self.pruning_metric("row_groups_pruned_statistics") - .map(|(pruned, _matched, _fully)| pruned) + .map(|pm| pm.total_pruned()) } /// Metric `files_ranges_pruned_statistics` tracks both pruned and matched count, /// for testing purpose, here it only aggregate the `pruned` count. fn files_ranges_pruned_statistics(&self) -> Option { self.pruning_metric("files_ranges_pruned_statistics") - .map(|(pruned, _matched, _fully)| pruned) + .map(|pm| pm.total_pruned()) } /// The number of row_groups matched by bloom filter or statistics @@ -210,14 +234,13 @@ impl TestOutput { /// filter: 7 total -> 3 matched, this function returns 3 for the final matched /// count. 
fn row_groups_matched(&self) -> Option { - self.row_groups_bloom_filter() - .map(|(_pruned, matched, _fully)| matched) + self.row_groups_bloom_filter().map(|pm| pm.total_matched()) } /// The number of row_groups pruned fn row_groups_pruned(&self) -> Option { self.row_groups_bloom_filter() - .map(|(pruned, _matched, _fully)| pruned) + .map(|pm| pm.total_pruned()) .zip(self.row_groups_pruned_statistics()) .map(|(a, b)| a + b) } @@ -225,13 +248,13 @@ impl TestOutput { /// The number of row pages pruned fn row_pages_pruned(&self) -> Option { self.pruning_metric("page_index_rows_pruned") - .map(|(pruned, _matched, _fully)| pruned) + .map(|pm| pm.total_pruned()) } /// The number of row groups pruned by limit pruning fn limit_pruned_row_groups(&self) -> Option { self.pruning_metric("limit_pruned_row_groups") - .map(|(pruned, _, _)| pruned) + .map(|pm| pm.total_pruned()) } fn description(&self) -> String { diff --git a/datafusion/core/tests/parquet/row_group_pruning.rs b/datafusion/core/tests/parquet/row_group_pruning.rs index 744d90d22d110..789bd90f0b998 100644 --- a/datafusion/core/tests/parquet/row_group_pruning.rs +++ b/datafusion/core/tests/parquet/row_group_pruning.rs @@ -157,12 +157,12 @@ impl RowGroupPruningTest { ); let bloom_filter_metrics = output.row_groups_bloom_filter(); assert_eq!( - bloom_filter_metrics.map(|(_pruned, matched, _)| matched), + bloom_filter_metrics.as_ref().map(|pm| pm.total_matched()), self.expected_row_group_matched_by_bloom_filter, "mismatched row_groups_matched_bloom_filter", ); assert_eq!( - bloom_filter_metrics.map(|(pruned, _matched, _)| pruned), + bloom_filter_metrics.map(|pm| pm.total_pruned()), self.expected_row_group_pruned_by_bloom_filter, "mismatched row_groups_pruned_bloom_filter", ); diff --git a/datafusion/datasource-parquet/src/opener.rs b/datafusion/datasource-parquet/src/opener.rs index 891f349635c04..af2cb88b8aa52 100644 --- a/datafusion/datasource-parquet/src/opener.rs +++ b/datafusion/datasource-parquet/src/opener.rs @@ -1059,6 +1059,7 @@ mod test { coerce_int96: Option, max_predicate_cache_size: Option, reverse_row_groups: bool, + preserve_order: bool, } impl ParquetOpenerBuilder { @@ -1084,6 +1085,7 @@ mod test { coerce_int96: None, max_predicate_cache_size: None, reverse_row_groups: false, + preserve_order: false, } } @@ -1191,6 +1193,7 @@ mod test { encryption_factory: None, max_predicate_cache_size: self.max_predicate_cache_size, reverse_row_groups: self.reverse_row_groups, + preserve_order: self.preserve_order, } } } diff --git a/datafusion/datasource-parquet/src/row_group_filter.rs b/datafusion/datasource-parquet/src/row_group_filter.rs index 4d0d97531bcc1..3381aeec2b523 100644 --- a/datafusion/datasource-parquet/src/row_group_filter.rs +++ b/datafusion/datasource-parquet/src/row_group_filter.rs @@ -48,7 +48,7 @@ use parquet::{ pub struct RowGroupAccessPlanFilter { /// which row groups should be accessed access_plan: ParquetAccessPlan, - /// which row groups are fully contained within the pruning predicate + /// Row groups where ALL rows are known to match the pruning predicate is_fully_matched: Vec, } From e09a1929e27067a63fedcfc22d12b858ebd19c9f Mon Sep 17 00:00:00 2001 From: "xudong.w" Date: Wed, 7 Jan 2026 14:10:23 +0800 Subject: [PATCH 14/26] resolve newest review --- datafusion/core/tests/parquet/mod.rs | 5 +- .../core/tests/parquet/row_group_pruning.rs | 2 +- .../src/row_group_filter.rs | 68 ++++++++++++++++++- datafusion/physical-plan/src/limit.rs | 4 +- .../sqllogictest/test_files/limit_pruning.slt | 24 ++++--- 
docs/source/user-guide/explain-usage.md | 1 + 6 files changed, 86 insertions(+), 18 deletions(-) diff --git a/datafusion/core/tests/parquet/mod.rs b/datafusion/core/tests/parquet/mod.rs index 3fff22d7c0855..4d52521d62737 100644 --- a/datafusion/core/tests/parquet/mod.rs +++ b/datafusion/core/tests/parquet/mod.rs @@ -30,6 +30,7 @@ use arrow::{ record_batch::RecordBatch, util::pretty::pretty_format_batches, }; +use arrow_schema::SchemaRef; use chrono::{Datelike, Duration, TimeDelta}; use datafusion::{ datasource::{TableProvider, provider_as_source}, @@ -294,7 +295,7 @@ impl ContextWithParquet { scenario: Scenario, unit: Unit, mut config: SessionConfig, - custom_schema: Option>, + custom_schema: Option, custom_batches: Option>, ) -> Self { // Use a single partition for deterministic results no matter how many CPUs the host has @@ -1137,7 +1138,7 @@ fn create_data_batch(scenario: Scenario) -> Vec { async fn make_test_file_rg( scenario: Scenario, row_per_group: usize, - custom_schema: Option>, + custom_schema: Option, custom_batches: Option>, ) -> NamedTempFile { let mut output_file = tempfile::Builder::new() diff --git a/datafusion/core/tests/parquet/row_group_pruning.rs b/datafusion/core/tests/parquet/row_group_pruning.rs index 789bd90f0b998..e588dd06ca8f1 100644 --- a/datafusion/core/tests/parquet/row_group_pruning.rs +++ b/datafusion/core/tests/parquet/row_group_pruning.rs @@ -1949,7 +1949,7 @@ async fn test_limit_pruning_exceeds_fully_matched() -> datafusion_common::error: .with_scenario(Scenario::Int) .with_query(query) .with_expected_errors(Some(0)) - .with_expected_rows(10) // Total: 1 + 3 + 4 + 1 = 9 (less than limit) + .with_expected_rows(10) // Total: 1 + 4 + 4 + 1 = 10 .with_pruned_files(Some(0)) .with_matched_by_stats(Some(4)) // RG0,1,2,3 matched .with_fully_matched_by_stats(Some(2)) diff --git a/datafusion/datasource-parquet/src/row_group_filter.rs b/datafusion/datasource-parquet/src/row_group_filter.rs index 3381aeec2b523..974e0bcd9cc93 100644 --- a/datafusion/datasource-parquet/src/row_group_filter.rs +++ b/datafusion/datasource-parquet/src/row_group_filter.rs @@ -49,6 +49,7 @@ pub struct RowGroupAccessPlanFilter { /// which row groups should be accessed access_plan: ParquetAccessPlan, /// Row groups where ALL rows are known to match the pruning predicate + /// (the predicate does not filter any rows) is_fully_matched: Vec, } @@ -84,8 +85,66 @@ impl RowGroupAccessPlanFilter { } /// Prunes the access plan based on the limit and fully contained row groups. - /// See the [description](https://github.com/apache/datafusion/issues/18860#issuecomment-3563442093) - /// for how the pruning works and improves performance. + /// + /// The pruning works by leveraging the concept of fully matched row groups. 
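+    /// A row group is "fully matched" when its min/max statistics alone prove
+    /// that every one of its rows satisfies all predicates, so its row count
+    /// can be credited against a `LIMIT` without reading any of its data.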
Consider a query like: + /// `WHERE species LIKE 'Alpine%' AND s >= 50 LIMIT N` + /// + /// After initial filtering, row groups can be classified into three states: + /// + /// ``` + /// PRUNING CLASSIFICATION DIAGRAM + /// ------------------------------ + /// Legend: + /// [ ] = Not Matching / Pruned + /// [X] = Partially Matching (Row Group/Page contains some matches) + /// [F] = Fully Matching (Entire range is within predicate) + /// + /// +-----------------------------------------------------------------------+ + /// | NOT MATCHING | + /// | Partition 1 | + /// | +-----------------------------------+-----------------------------+ | + /// | | SPECIES (min: 'B...',max: 'S...') | S (min: 7, max: 133) | | + /// | +-----------------------------------+-----------------------------+ | + /// | | Snow Vole | 7 | | + /// | | Brown Bear | 133 | | + /// | | Gray Wolf | 82 | | + /// | +-----------------------------------+-----------------------------+ | + /// +-----------------------------------------------------------------------+ + /// + /// +-----------------------------------------------------------------------+ + /// | PARTIALLY MATCHING | + /// | Partition 2 Partition 4 | + /// | +------------------+--------------+ +------------------+-------+ | + /// | | SPECIES | S | | SPECIES | S | | + /// | | (min:A, max:R) |(min:6,max:70)| | (min:A, max:P) |[4-51] | | + /// | +------------------+--------------+ +------------------+-------+ | + /// | | Lynx | 71 | | Europ. Mole | 4 | | + /// | | Red Fox | 40 | | Polecat | 16 | | + /// | | Alpine Bat | 6 | | Alpine Ibex | 97 | | + /// | +------------------+--------------+ +------------------+-------+ | + /// +-----------------------------------------------------------------------+ + /// + /// +-----------------------------------------------------------------------+ + /// | FULLY MATCHING | + /// | Partition 3 | + /// | +-----------------------------------+-----------------------------+ | + /// | | SPECIES (min: 'A...',max: 'A...') | S (min: 76, max: 101) | | + /// | +-----------------------------------+-----------------------------+ | + /// | | Alpine Ibex | 101 | | + /// | | Alpine Goat | 76 | | + /// | | Alpine Sheep | 83 | | + /// | +-----------------------------------+-----------------------------+ | + /// +-----------------------------------------------------------------------+ + + /// Without limit pruning: Scan Partition 2 → Partition 3 → Partition 4 (until limit reached) + /// With limit pruning: If Partition 3 contains enough rows to satisfy the limit, + /// skip Partitions 2 and 4 entirely and go directly to Partition 3. 
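+    ///
+    /// For example, with `LIMIT 3` above: Partition 3 is fully matching and
+    /// holds 3 rows, which covers the limit on its own, so Partitions 2 and 4
+    /// (only partially matching) can be skipped without being read at all.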
+ /// + /// This optimization is particularly effective when: + /// - The limit is small relative to the total dataset size + /// - There are row groups that are fully matched by the filter predicates + /// - The fully matched row groups contain sufficient rows to satisfy the limit + /// /// For more information, see the [paper](https://arxiv.org/pdf/2504.11540)'s "Pruning for LIMIT Queries" part pub fn prune_by_limit( &mut self, @@ -96,7 +155,8 @@ impl RowGroupAccessPlanFilter { let mut fully_matched_row_group_indexes: Vec = Vec::new(); let mut fully_matched_rows_count: usize = 0; - // Iterate through the currently accessible row groups + // Iterate through the currently accessible row groups and try to + // find a set of matching row groups that can satisfy the limit for &idx in self.access_plan.row_group_indexes().iter() { if self.is_fully_matched[idx] { let row_group_row_count = rg_metadata[idx].num_rows() as usize; @@ -108,6 +168,8 @@ impl RowGroupAccessPlanFilter { } } + // If we can satisfy the limit with fully matching row groups, + // rewrite the plan to do so if fully_matched_rows_count >= limit { let original_num_accessible_row_groups = self.access_plan.row_group_indexes().len(); diff --git a/datafusion/physical-plan/src/limit.rs b/datafusion/physical-plan/src/limit.rs index b85a7a8f80405..fea7acb221304 100644 --- a/datafusion/physical-plan/src/limit.rs +++ b/datafusion/physical-plan/src/limit.rs @@ -52,8 +52,8 @@ pub struct GlobalLimitExec { /// Execution metrics metrics: ExecutionPlanMetricsSet, cache: PlanProperties, - /// If the child plan is a sort node, after the sort node is removed during - /// physical optimization, we should add the required ordering to the limit node + /// Does the limit have to preserve the order of its input, and if so what is it? + /// Some optimizations may reorder the input if no particular sort is required required_ordering: Option, } diff --git a/datafusion/sqllogictest/test_files/limit_pruning.slt b/datafusion/sqllogictest/test_files/limit_pruning.slt index cc8a17e5b78b7..0735e05cb8b6f 100644 --- a/datafusion/sqllogictest/test_files/limit_pruning.slt +++ b/datafusion/sqllogictest/test_files/limit_pruning.slt @@ -20,22 +20,26 @@ set datafusion.execution.parquet.pushdown_filters = true; statement ok -CREATE TABLE t AS VALUES +CREATE TABLE tracking_data AS VALUES +-- ***** Row Group 0 ***** ('Anow Vole', 7), ('Brown Bear', 133), ('Gray Wolf', 82), +-- ***** Row Group 1 ***** ('Lynx', 71), ('Red Fox', 40), ('Alpine Bat', 6), +-- ***** Row Group 2 ***** ('Nlpine Ibex', 101), ('Nlpine Goat', 76), ('Nlpine Sheep', 83), +-- ***** Row Group 3 ***** ('Europ. 
Mole', 4), ('Polecat', 16), ('Alpine Ibex', 97); statement ok -COPY (SELECT column1 as a, column2 as b FROM t) +COPY (SELECT column1 as species, column2 as s FROM tracking_data) TO 'test_files/scratch/limit_pruning/data.parquet' STORED AS PARQUET OPTIONS ( @@ -43,10 +47,10 @@ OPTIONS ( ); statement ok -drop table t; +drop table tracking_data; statement ok -CREATE EXTERNAL TABLE t +CREATE EXTERNAL TABLE tracking_data STORED AS PARQUET LOCATION 'test_files/scratch/limit_pruning/data.parquet'; @@ -57,21 +61,21 @@ set datafusion.explain.analyze_level = summary; # row_groups_pruned_statistics=4 total → 3 matched -> 1 fully matched # limit_pruned_row_groups=2 total → 0 matched query TT -explain analyze select * from t where a > 'M' AND b >= 50 limit 3; +explain analyze select * from tracking_data where species > 'M' AND s >= 50 limit 3; ---- -Plan with Metrics DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/limit_pruning/data.parquet]]}, projection=[a, b], limit=3, file_type=parquet, predicate=a@0 > M AND b@1 >= 50, pruning_predicate=a_null_count@1 != row_count@2 AND a_max@0 > M AND b_null_count@4 != row_count@2 AND b_max@3 >= 50, required_guarantees=[], metrics=[output_rows=3, elapsed_compute=, output_bytes=, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=4 total → 3 matched -> 1 fully matched, row_groups_pruned_bloom_filter=3 total → 3 matched, page_index_rows_pruned=3 total → 3 matched, limit_pruned_row_groups=2 total → 0 matched, bytes_scanned=, metadata_load_time=, scan_efficiency_ratio=] +Plan with Metrics DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/limit_pruning/data.parquet]]}, projection=[species, s], limit=3, file_type=parquet, predicate=species@0 > M AND s@1 >= 50, pruning_predicate=species_null_count@1 != row_count@2 AND species_max@0 > M AND s_null_count@4 != row_count@2 AND s_max@3 >= 50, required_guarantees=[], metrics=[output_rows=3, elapsed_compute=1ns, output_bytes=142.0 B, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=4 total → 3 matched -> 1 fully matched, row_groups_pruned_bloom_filter=3 total → 3 matched, page_index_rows_pruned=3 total → 3 matched, limit_pruned_row_groups=2 total → 0 matched, bytes_scanned=171, metadata_load_time=444.63µs, scan_efficiency_ratio=7.3% (171/2.35 K)] # limit_pruned_row_groups=0 total → 0 matched # because of order by, scan needs to preserve sort, so limit pruning is disabled query TT -explain analyze select * from t where a > 'M' AND b >= 50 order by a limit 3; +explain analyze select * from tracking_data where species > 'M' AND s >= 50 order by species limit 3; ---- Plan with Metrics -01)SortExec: TopK(fetch=3), expr=[a@0 ASC NULLS LAST], preserve_partitioning=[false], filter=[a@0 < Nlpine Sheep], metrics=[output_rows=3, elapsed_compute=, output_bytes=] -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/limit_pruning/data.parquet]]}, projection=[a, b], file_type=parquet, predicate=a@0 > M AND b@1 >= 50 AND DynamicFilter [ a@0 < Nlpine Sheep ], pruning_predicate=a_null_count@1 != row_count@2 AND a_max@0 > M AND b_null_count@4 != row_count@2 AND b_max@3 >= 50 AND a_null_count@1 != row_count@2 AND a_min@5 < Nlpine Sheep, required_guarantees=[], metrics=[output_rows=3, elapsed_compute=, output_bytes=, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=4 total → 3 matched -> 1 fully matched, 
row_groups_pruned_bloom_filter=3 total → 3 matched, page_index_rows_pruned=9 total → 9 matched, limit_pruned_row_groups=0 total → 0 matched, bytes_scanned=, metadata_load_time=, scan_efficiency_ratio=] +01)SortExec: TopK(fetch=3), expr=[species@0 ASC NULLS LAST], preserve_partitioning=[false], filter=[species@0 < Nlpine Sheep], metrics=[output_rows=3, elapsed_compute=2.69ms, output_bytes=72.0 B] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/limit_pruning/data.parquet]]}, projection=[species, s], file_type=parquet, predicate=species@0 > M AND s@1 >= 50 AND DynamicFilter [ species@0 < Nlpine Sheep ], pruning_predicate=species_null_count@1 != row_count@2 AND species_max@0 > M AND s_null_count@4 != row_count@2 AND s_max@3 >= 50 AND species_null_count@1 != row_count@2 AND species_min@5 < Nlpine Sheep, required_guarantees=[], metrics=[output_rows=3, elapsed_compute=1ns, output_bytes=142.0 B, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=4 total → 3 matched -> 1 fully matched, row_groups_pruned_bloom_filter=3 total → 3 matched, page_index_rows_pruned=9 total → 9 matched, limit_pruned_row_groups=0 total → 0 matched, bytes_scanned=521, metadata_load_time=512.29µs, scan_efficiency_ratio=22% (521/2.35 K)] statement ok -drop table t; +drop table tracking_data; statement ok reset datafusion.explain.analyze_level; diff --git a/docs/source/user-guide/explain-usage.md b/docs/source/user-guide/explain-usage.md index 5a1184539c034..8fe83163813da 100644 --- a/docs/source/user-guide/explain-usage.md +++ b/docs/source/user-guide/explain-usage.md @@ -228,6 +228,7 @@ When predicate pushdown is enabled, `DataSourceExec` with `ParquetSource` gains - `page_index_rows_pruned`: number of rows evaluated by page index filters. The metric reports both how many rows were considered in total and how many matched (were not pruned). - `row_groups_pruned_bloom_filter`: number of row groups evaluated by Bloom Filters, reporting both total checked groups and groups that matched. - `row_groups_pruned_statistics`: number of row groups evaluated by row-group statistics (min/max), reporting both total checked groups and groups that matched. +- `limit_pruned_row_groups`: number of row groups pruned by the limit. - `pushdown_rows_matched`: rows that were tested by any of the above filters, and passed all of them. - `pushdown_rows_pruned`: rows that were tested by any of the above filters, and did not pass at least one of them. 
- `predicate_evaluation_errors`: number of times evaluating the filter expression failed (expected to be zero in normal operation)

From 56cda2d928d07e24e71aaf2671459235d0b3a799 Mon Sep 17 00:00:00 2001
From: "xudong.w"
Date: Wed, 7 Jan 2026 14:17:56 +0800
Subject: [PATCH 15/26] remove scratch

---
 datafusion/core/tests/parquet/row_group_pruning.rs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/datafusion/core/tests/parquet/row_group_pruning.rs b/datafusion/core/tests/parquet/row_group_pruning.rs
index e588dd06ca8f1..445ae7e97f228 100644
--- a/datafusion/core/tests/parquet/row_group_pruning.rs
+++ b/datafusion/core/tests/parquet/row_group_pruning.rs
@@ -1957,7 +1957,5 @@ async fn test_limit_pruning_exceeds_fully_matched() -> datafusion_common::error:
         .with_limit_pruned_row_groups(Some(0)) // No limit pruning since we need all RGs
         .test_row_group_prune_with_custom_data(schema, batches, 4)
         .await;
-
     Ok(())
 }

From a875d41010191b292efa968b77e75dd9eaa6d8bc Mon Sep 17 00:00:00 2001
From: "xudong.w"
Date: Wed, 7 Jan 2026 14:21:26 +0800
Subject: [PATCH 16/26] remove scratch

---
 test_files/scratch/limit_pruning/data.parquet | Bin 2320 -> 0 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 test_files/scratch/limit_pruning/data.parquet

diff --git a/test_files/scratch/limit_pruning/data.parquet b/test_files/scratch/limit_pruning/data.parquet
deleted file mode 100644
index 535026d53c5553427b4ac2bbbcd277b2393460b5..0000000000000000000000000000000000000000

From 719fa82cb7d7d6e6968542aaec7509f4180ad051 Mon Sep 17 00:00:00 2001
From: "xudong.w"
Date: Wed, 7 Jan 2026 14:25:53 +0800
Subject: [PATCH 17/26] fix clippy

---
 datafusion/datasource-parquet/src/row_group_filter.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datafusion/datasource-parquet/src/row_group_filter.rs b/datafusion/datasource-parquet/src/row_group_filter.rs
index 974e0bcd9cc93..3f43c939f6ecb 100644
--- a/datafusion/datasource-parquet/src/row_group_filter.rs
+++ b/datafusion/datasource-parquet/src/row_group_filter.rs
@@ -135,7 +135,7 @@ impl RowGroupAccessPlanFilter {
     /// |   | Alpine Sheep                      | 83                          |  |
     /// |   +-----------------------------------+-----------------------------+  |
     /// +-----------------------------------------------------------------------+
-
+    ///
     /// Without limit pruning: Scan Partition 2 → Partition 3 → Partition 4 (until limit reached)
     /// With limit pruning: If Partition 3 contains enough rows to satisfy the limit,
     /// skip Partitions 2 and 4 entirely and go directly to Partition 3.
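The selection step inside `prune_by_limit` above is a greedy, order-preserving walk: it visits the still-accessible row groups in plan order, accumulates only the fully matched ones, and rewrites the access plan as soon as their combined row count covers the limit. A minimal sketch of that step, assuming flat slices in place of the real `ParquetAccessPlan` and row-group metadata accessors:

    // Returns the subset of row groups to scan if the limit can be satisfied
    // using only fully matched row groups, or None to keep the original plan.
    fn pick_fully_matched(
        accessible: &[usize],   // row group indexes still in the access plan
        fully_matched: &[bool], // per row group: do ALL rows match the predicate?
        num_rows: &[usize],     // per row group row counts from the metadata
        limit: usize,
    ) -> Option<Vec<usize>> {
        let mut chosen = Vec::new();
        let mut rows = 0;
        for &idx in accessible {
            if fully_matched[idx] {
                chosen.push(idx);
                rows += num_rows[idx];
                if rows >= limit {
                    // Enough fully matching rows: scan only `chosen`; every
                    // other accessible row group is reported as limit-pruned.
                    return Some(chosen);
                }
            }
        }
        None
    }

If no such subset exists, the original access plan is kept and the `limit_pruned_row_groups` metric reports zero pruned groups, as exercised by `test_limit_pruning_exceeds_fully_matched` above.
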
From 3540fd391f0a8849f6a2ac1c4cfb633c9ee6e828 Mon Sep 17 00:00:00 2001 From: "xudong.w" Date: Wed, 7 Jan 2026 14:39:24 +0800 Subject: [PATCH 18/26] refine comments --- datafusion/datasource-parquet/src/row_group_filter.rs | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/datafusion/datasource-parquet/src/row_group_filter.rs b/datafusion/datasource-parquet/src/row_group_filter.rs index 3f43c939f6ecb..e0907e140de40 100644 --- a/datafusion/datasource-parquet/src/row_group_filter.rs +++ b/datafusion/datasource-parquet/src/row_group_filter.rs @@ -91,13 +91,9 @@ impl RowGroupAccessPlanFilter { /// /// After initial filtering, row groups can be classified into three states: /// - /// ``` - /// PRUNING CLASSIFICATION DIAGRAM - /// ------------------------------ - /// Legend: - /// [ ] = Not Matching / Pruned - /// [X] = Partially Matching (Row Group/Page contains some matches) - /// [F] = Fully Matching (Entire range is within predicate) + /// 1. Not Matching / Pruned + /// 2. Partially Matching (Row Group/Page contains some matches) + /// 3. Fully Matching (Entire range is within predicate) /// /// +-----------------------------------------------------------------------+ /// | NOT MATCHING | From 8d60e96c14d44e75f456c376f33d1500ec1ac496 Mon Sep 17 00:00:00 2001 From: "xudong.w" Date: Wed, 7 Jan 2026 14:48:18 +0800 Subject: [PATCH 19/26] fix test --- datafusion/sqllogictest/test_files/limit_pruning.slt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/sqllogictest/test_files/limit_pruning.slt b/datafusion/sqllogictest/test_files/limit_pruning.slt index 0735e05cb8b6f..8a94bf8adc75f 100644 --- a/datafusion/sqllogictest/test_files/limit_pruning.slt +++ b/datafusion/sqllogictest/test_files/limit_pruning.slt @@ -63,7 +63,7 @@ set datafusion.explain.analyze_level = summary; query TT explain analyze select * from tracking_data where species > 'M' AND s >= 50 limit 3; ---- -Plan with Metrics DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/limit_pruning/data.parquet]]}, projection=[species, s], limit=3, file_type=parquet, predicate=species@0 > M AND s@1 >= 50, pruning_predicate=species_null_count@1 != row_count@2 AND species_max@0 > M AND s_null_count@4 != row_count@2 AND s_max@3 >= 50, required_guarantees=[], metrics=[output_rows=3, elapsed_compute=1ns, output_bytes=142.0 B, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=4 total → 3 matched -> 1 fully matched, row_groups_pruned_bloom_filter=3 total → 3 matched, page_index_rows_pruned=3 total → 3 matched, limit_pruned_row_groups=2 total → 0 matched, bytes_scanned=171, metadata_load_time=444.63µs, scan_efficiency_ratio=7.3% (171/2.35 K)] +Plan with Metrics DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/limit_pruning/data.parquet]]}, projection=[species, s], limit=3, file_type=parquet, predicate=species@0 > M AND s@1 >= 50, pruning_predicate=species_null_count@1 != row_count@2 AND species_max@0 > M AND s_null_count@4 != row_count@2 AND s_max@3 >= 50, required_guarantees=[], metrics=[output_rows=3, elapsed_compute=, output_bytes=, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=4 total → 3 matched -> 1 fully matched, row_groups_pruned_bloom_filter=3 total → 3 matched, page_index_rows_pruned=3 total → 3 matched, limit_pruned_row_groups=2 total → 0 matched, bytes_scanned=, metadata_load_time=, scan_efficiency_ratio= (171/2.35 K)] # 
limit_pruned_row_groups=0 total → 0 matched # because of order by, scan needs to preserve sort, so limit pruning is disabled @@ -71,8 +71,8 @@ query TT explain analyze select * from tracking_data where species > 'M' AND s >= 50 order by species limit 3; ---- Plan with Metrics -01)SortExec: TopK(fetch=3), expr=[species@0 ASC NULLS LAST], preserve_partitioning=[false], filter=[species@0 < Nlpine Sheep], metrics=[output_rows=3, elapsed_compute=2.69ms, output_bytes=72.0 B] -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/limit_pruning/data.parquet]]}, projection=[species, s], file_type=parquet, predicate=species@0 > M AND s@1 >= 50 AND DynamicFilter [ species@0 < Nlpine Sheep ], pruning_predicate=species_null_count@1 != row_count@2 AND species_max@0 > M AND s_null_count@4 != row_count@2 AND s_max@3 >= 50 AND species_null_count@1 != row_count@2 AND species_min@5 < Nlpine Sheep, required_guarantees=[], metrics=[output_rows=3, elapsed_compute=1ns, output_bytes=142.0 B, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=4 total → 3 matched -> 1 fully matched, row_groups_pruned_bloom_filter=3 total → 3 matched, page_index_rows_pruned=9 total → 9 matched, limit_pruned_row_groups=0 total → 0 matched, bytes_scanned=521, metadata_load_time=512.29µs, scan_efficiency_ratio=22% (521/2.35 K)] +01)SortExec: TopK(fetch=3), expr=[species@0 ASC NULLS LAST], preserve_partitioning=[false], filter=[species@0 < Nlpine Sheep], metrics=[output_rows=3, elapsed_compute=, output_bytes=] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/limit_pruning/data.parquet]]}, projection=[species, s], file_type=parquet, predicate=species@0 > M AND s@1 >= 50 AND DynamicFilter [ species@0 < Nlpine Sheep ], pruning_predicate=species_null_count@1 != row_count@2 AND species_max@0 > M AND s_null_count@4 != row_count@2 AND s_max@3 >= 50 AND species_null_count@1 != row_count@2 AND species_min@5 < Nlpine Sheep, required_guarantees=[], metrics=[output_rows=3, elapsed_compute=, output_bytes=, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=4 total → 3 matched -> 1 fully matched, row_groups_pruned_bloom_filter=3 total → 3 matched, page_index_rows_pruned=9 total → 9 matched, limit_pruned_row_groups=0 total → 0 matched, bytes_scanned=, metadata_load_time=, scan_efficiency_ratio= (521/2.35 K)] statement ok drop table tracking_data; From 88c1c2e62e51e7c1ccf7271f18fbef168a0b58b5 Mon Sep 17 00:00:00 2001 From: "xudong.w" Date: Thu, 8 Jan 2026 13:40:02 +0800 Subject: [PATCH 20/26] refine comments --- .../src/row_group_filter.rs | 42 +++++++++---------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/datafusion/datasource-parquet/src/row_group_filter.rs b/datafusion/datasource-parquet/src/row_group_filter.rs index e0907e140de40..f54b1cc4e1bc8 100644 --- a/datafusion/datasource-parquet/src/row_group_filter.rs +++ b/datafusion/datasource-parquet/src/row_group_filter.rs @@ -97,38 +97,38 @@ impl RowGroupAccessPlanFilter { /// /// +-----------------------------------------------------------------------+ /// | NOT MATCHING | - /// | Partition 1 | + /// | Row group 1 | /// | +-----------------------------------+-----------------------------+ | - /// | | SPECIES (min: 'B...',max: 'S...') | S (min: 7, max: 133) | | + /// | | SPECIES | S | | /// | +-----------------------------------+-----------------------------+ | /// | | Snow Vole | 7 | | - /// | | Brown Bear | 133 | | - 
/// | | Gray Wolf | 82 | | + /// | | Brown Bear | 133 ✅ | | + /// | | Gray Wolf | 82 ✅ | | /// | +-----------------------------------+-----------------------------+ | /// +-----------------------------------------------------------------------+ /// - /// +-----------------------------------------------------------------------+ - /// | PARTIALLY MATCHING | - /// | Partition 2 Partition 4 | - /// | +------------------+--------------+ +------------------+-------+ | - /// | | SPECIES | S | | SPECIES | S | | - /// | | (min:A, max:R) |(min:6,max:70)| | (min:A, max:P) |[4-51] | | - /// | +------------------+--------------+ +------------------+-------+ | - /// | | Lynx | 71 | | Europ. Mole | 4 | | - /// | | Red Fox | 40 | | Polecat | 16 | | - /// | | Alpine Bat | 6 | | Alpine Ibex | 97 | | - /// | +------------------+--------------+ +------------------+-------+ | - /// +-----------------------------------------------------------------------+ + /// +---------------------------------------------------------------------------+ + /// | PARTIALLY MATCHING | + /// | | + /// | Row group 2 Row group 4 | + /// | +------------------+--------------+ +------------------+----------+ | + /// | | SPECIES | S | | SPECIES | S | | + /// | +------------------+--------------+ +------------------+----------+ | + /// | | Lynx | 71 ✅ | | Europ. Mole | 4 | | + /// | | Red Fox | 40 | | Polecat | 16 | | + /// | | Alpine Bat ✅ | 6 | | Alpine Ibex ✅ | 97 ✅ | | + /// | +------------------+--------------+ +------------------+----------+ | + /// +---------------------------------------------------------------------------+ /// /// +-----------------------------------------------------------------------+ /// | FULLY MATCHING | - /// | Partition 3 | + /// | Row group 3 | /// | +-----------------------------------+-----------------------------+ | - /// | | SPECIES (min: 'A...',max: 'A...') | S (min: 76, max: 101) | | + /// | | SPECIES | S | | /// | +-----------------------------------+-----------------------------+ | - /// | | Alpine Ibex | 101 | | - /// | | Alpine Goat | 76 | | - /// | | Alpine Sheep | 83 | | + /// | | Alpine Ibex ✅ | 101 ✅ | | + /// | | Alpine Goat ✅ | 76 ✅ | | + /// | | Alpine Sheep ✅ | 83 ✅ | | /// | +-----------------------------------+-----------------------------+ | /// +-----------------------------------------------------------------------+ /// From f67193b072c533c3bddcb4d9a7d5a559c3cea6b1 Mon Sep 17 00:00:00 2001 From: "xudong.w" Date: Thu, 8 Jan 2026 14:16:05 +0800 Subject: [PATCH 21/26] rich comment --- datafusion/datasource/src/file_scan_config.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/datafusion/datasource/src/file_scan_config.rs b/datafusion/datasource/src/file_scan_config.rs index 50cdc5c78f804..009c1d822c491 100644 --- a/datafusion/datasource/src/file_scan_config.rs +++ b/datafusion/datasource/src/file_scan_config.rs @@ -153,6 +153,9 @@ pub struct FileScanConfig { /// all records after filtering are returned. pub limit: Option, /// Whether the scan's limit is order sensitive + /// When `true`, files must be read in the exact order specified to produce + /// correct results (e.g., for `ORDER BY ... LIMIT` queries). When `false`, + /// DataFusion may reorder file processing for optimization without affecting correctness. pub preserve_order: bool, /// All equivalent lexicographical orderings that describe the schema. pub output_ordering: Vec, @@ -288,6 +291,9 @@ impl FileScanConfigBuilder { } /// Set whether the limit should be order-sensitive. 
+ /// When `true`, files must be read in the exact order specified to produce + /// correct results (e.g., for `ORDER BY ... LIMIT` queries). When `false`, + /// DataFusion may reorder file processing for optimization without affecting correctness. pub fn with_preserve_order(mut self, order_sensitive: bool) -> Self { self.preserve_order = order_sensitive; self From 038285e5fed942f87f7238ef84371219215cbc5b Mon Sep 17 00:00:00 2001 From: "xudong.w" Date: Fri, 9 Jan 2026 17:47:19 +0800 Subject: [PATCH 22/26] remove downcast --- datafusion/datasource-parquet/src/opener.rs | 4 +- datafusion/datasource/src/file_scan_config.rs | 15 +++++ datafusion/datasource/src/source.rs | 17 ++++++ .../physical-optimizer/src/limit_pushdown.rs | 57 ++++--------------- .../physical-plan/src/coalesce_partitions.rs | 13 +++++ .../physical-plan/src/execution_plan.rs | 13 +++++ datafusion/physical-plan/src/filter.rs | 13 +++++ datafusion/physical-plan/src/projection.rs | 13 +++++ .../src/sorts/sort_preserving_merge.rs | 13 +++++ 9 files changed, 109 insertions(+), 49 deletions(-) diff --git a/datafusion/datasource-parquet/src/opener.rs b/datafusion/datasource-parquet/src/opener.rs index af2cb88b8aa52..8f31d2df24ac2 100644 --- a/datafusion/datasource-parquet/src/opener.rs +++ b/datafusion/datasource-parquet/src/opener.rs @@ -69,13 +69,13 @@ use parquet::file::metadata::{PageIndexPolicy, ParquetMetaDataReader, RowGroupMe /// Implements [`FileOpener`] for a parquet file pub(super) struct ParquetOpener { /// Execution partition index - pub partition_index: usize, + pub(crate) partition_index: usize, /// Projection to apply on top of the table schema (i.e. can reference partition columns). pub projection: ProjectionExprs, /// Target number of rows in each output RecordBatch pub batch_size: usize, /// Optional limit on the number of rows to read - pub limit: Option, + pub(crate) limit: Option, /// If should keep the output rows in order pub preserve_order: bool, /// Optional predicate to apply during the scan diff --git a/datafusion/datasource/src/file_scan_config.rs b/datafusion/datasource/src/file_scan_config.rs index 009c1d822c491..51b9ba9e06e9b 100644 --- a/datafusion/datasource/src/file_scan_config.rs +++ b/datafusion/datasource/src/file_scan_config.rs @@ -484,6 +484,9 @@ impl FileScanConfigBuilder { let file_compression_type = file_compression_type.unwrap_or(FileCompressionType::UNCOMPRESSED); + // If there is an output ordering, we should preserve it. + let preserve_order = preserve_order || !output_ordering.is_empty(); + FileScanConfig { object_store_url, file_source, @@ -869,6 +872,18 @@ impl DataSource for FileScanConfig { } } } + + fn with_preserve_order(&self, preserve_order: bool) -> Option> { + if self.preserve_order == preserve_order { + return Some(Arc::new(self.clone())); + } + + let new_config = FileScanConfig { + preserve_order, + ..self.clone() + }; + Some(Arc::new(new_config)) + } } impl FileScanConfig { diff --git a/datafusion/datasource/src/source.rs b/datafusion/datasource/src/source.rs index a3892dfac9778..de18b6be2235f 100644 --- a/datafusion/datasource/src/source.rs +++ b/datafusion/datasource/src/source.rs @@ -210,6 +210,11 @@ pub trait DataSource: Send + Sync + Debug { ) -> Result>> { Ok(SortOrderPushdownResult::Unsupported) } + + /// Returns a variant of this `DataSource` that is aware of order-sensitivity. 
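+    /// Implementations that can honor the hint return a copy of themselves
+    /// with the flag applied (see `FileScanConfig::with_preserve_order`
+    /// above); the default implementation returns `None`, in which case
+    /// callers should keep the original source unchanged.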
+    fn with_preserve_order(&self, _preserve_order: bool) -> Option<Arc<dyn DataSource>> {
+        None
+    }
 }
 
 /// [`ExecutionPlan`] that reads one or more files
@@ -393,6 +398,18 @@ impl ExecutionPlan for DataSourceExec {
             Ok(Arc::new(new_exec) as Arc<dyn ExecutionPlan>)
         })
     }
+
+    fn with_preserve_order(
+        &self,
+        preserve_order: bool,
+    ) -> Option<Arc<dyn ExecutionPlan>> {
+        self.data_source
+            .with_preserve_order(preserve_order)
+            .map(|new_data_source| {
+                Arc::new(self.clone().with_data_source(new_data_source))
+                    as Arc<dyn ExecutionPlan>
+            })
+    }
 }
 
 impl DataSourceExec {
diff --git a/datafusion/physical-optimizer/src/limit_pushdown.rs b/datafusion/physical-optimizer/src/limit_pushdown.rs
index d259025b61bf1..a4dac81dbacf8 100644
--- a/datafusion/physical-optimizer/src/limit_pushdown.rs
+++ b/datafusion/physical-optimizer/src/limit_pushdown.rs
@@ -27,8 +27,6 @@ use datafusion_common::config::ConfigOptions;
 use datafusion_common::error::Result;
 use datafusion_common::tree_node::{Transformed, TreeNodeRecursion};
 use datafusion_common::utils::combine_limit;
-use datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder};
-use datafusion_datasource::source::DataSourceExec;
 use datafusion_physical_plan::coalesce_partitions::CoalescePartitionsExec;
 use datafusion_physical_plan::limit::{GlobalLimitExec, LocalLimitExec};
 use datafusion_physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec;
@@ -52,7 +50,7 @@ pub struct GlobalRequirements {
     fetch: Option<usize>,
     skip: usize,
     satisfied: bool,
-    order_sensitive: bool,
+    preserve_order: bool,
 }
 
 impl LimitPushdown {
@@ -72,7 +70,7 @@ impl PhysicalOptimizerRule for LimitPushdown {
             fetch: None,
             skip: 0,
             satisfied: false,
-            order_sensitive: false,
+            preserve_order: false,
         };
         pushdown_limits(plan, global_state)
     }
@@ -116,7 +114,7 @@ impl LimitExec {
         }
     }
 
-    fn order_sensitive(&self) -> bool {
+    fn preserve_order(&self) -> bool {
         match self {
             Self::Global(global) => global.required_ordering().is_some(),
             Self::Local(local) => local.required_ordering().is_some(),
@@ -156,7 +154,7 @@ pub fn pushdown_limit_helper(
     );
     global_state.skip = skip;
     global_state.fetch = fetch;
-    global_state.order_sensitive = limit_exec.order_sensitive();
+    global_state.preserve_order = limit_exec.preserve_order();
 
     // Now the global state has the most recent information, we can remove
     // the `LimitExec` plan. We will decide later if we should add it again
@@ -253,10 +251,9 @@ pub fn pushdown_limit_helper(
     let maybe_fetchable = pushdown_plan.with_fetch(skip_and_fetch);
     if global_state.satisfied {
         if let Some(plan_with_fetch) = maybe_fetchable {
-            let plan_with_preserve_order = ensure_preserve_order_if_needed(
-                plan_with_fetch,
-                global_state.order_sensitive,
-            );
+            let plan_with_preserve_order = plan_with_fetch
+                .with_preserve_order(global_state.preserve_order)
+                .unwrap_or(plan_with_fetch);
             Ok((Transformed::yes(plan_with_preserve_order), global_state))
         } else {
             Ok((Transformed::no(pushdown_plan), global_state))
@@ -264,10 +261,9 @@
     } else {
         global_state.satisfied = true;
         pushdown_plan = if let Some(plan_with_fetch) = maybe_fetchable {
-            let plan_with_preserve_order = ensure_preserve_order_if_needed(
-                plan_with_fetch,
-                global_state.order_sensitive,
-            );
+            let plan_with_preserve_order = plan_with_fetch
+                .with_preserve_order(global_state.preserve_order)
+                .unwrap_or(plan_with_fetch);
 
             if global_skip > 0 {
                 add_global_limit(
@@ -362,37 +358,4 @@ fn add_global_limit(
     Arc::new(GlobalLimitExec::new(pushdown_plan, skip, fetch)) as _
 }
 
-/// Helper function to handle DataSourceExec preserve_order setting
-fn ensure_preserve_order_if_needed(
-    plan: Arc<dyn ExecutionPlan>,
-    order_sensitive: bool,
-) -> Arc<dyn ExecutionPlan> {
-    if !order_sensitive {
-        return plan;
-    }
-
-    let Some(data_source_exec) = plan.as_any().downcast_ref::<DataSourceExec>() else {
-        return plan;
-    };
-
-    let Some(file_scan_config) = data_source_exec
-        .data_source()
-        .as_any()
-        .downcast_ref::<FileScanConfig>()
-    else {
-        return plan;
-    };
-
-    if file_scan_config.preserve_order {
-        return plan;
-    }
-
-    let new_config = FileScanConfigBuilder::from(file_scan_config.clone())
-        .with_preserve_order(true)
-        .build();
-
-    let new_data_source_exec = DataSourceExec::new(Arc::new(new_config));
-    Arc::new(new_data_source_exec) as Arc<dyn ExecutionPlan>
-}
-
 // See tests in datafusion/core/tests/physical_optimizer
diff --git a/datafusion/physical-plan/src/coalesce_partitions.rs b/datafusion/physical-plan/src/coalesce_partitions.rs
index d83f90eb3d8c1..22dcc85d6ea3a 100644
--- a/datafusion/physical-plan/src/coalesce_partitions.rs
+++ b/datafusion/physical-plan/src/coalesce_partitions.rs
@@ -278,6 +278,19 @@ impl ExecutionPlan for CoalescePartitionsExec {
         }))
     }
 
+    fn with_preserve_order(
+        &self,
+        preserve_order: bool,
+    ) -> Option<Arc<dyn ExecutionPlan>> {
+        self.input
+            .with_preserve_order(preserve_order)
+            .and_then(|new_input| {
+                Arc::new(self.clone())
+                    .with_new_children(vec![new_input])
+                    .ok()
+            })
+    }
+
     fn gather_filters_for_pushdown(
         &self,
         _phase: FilterPushdownPhase,
diff --git a/datafusion/physical-plan/src/execution_plan.rs b/datafusion/physical-plan/src/execution_plan.rs
index 06da0b8933c18..9101cbb00944b 100644
--- a/datafusion/physical-plan/src/execution_plan.rs
+++ b/datafusion/physical-plan/src/execution_plan.rs
@@ -708,6 +708,19 @@ pub trait ExecutionPlan: Debug + DisplayAs + Send + Sync {
     ) -> Result<SortOrderPushdownResult<Arc<dyn ExecutionPlan>>> {
         Ok(SortOrderPushdownResult::Unsupported)
     }
+
+    /// Returns a variant of this `ExecutionPlan` that is aware of order-sensitivity.
+    ///
+    /// This is used to signal to data sources that the output ordering must be
+    /// preserved, even if it might be more efficient to ignore it (e.g. by
+    /// skipping some row groups in Parquet).
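+    ///
+    /// Single-child wrappers (see the `FilterExec`, `ProjectionExec`, and
+    /// `SortPreservingMergeExec` impls in this patch) delegate to their child
+    /// and rebuild themselves via `with_new_children`; the default of `None`
+    /// means the node is not order-aware, so callers keep the original plan.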
+    ///
+    fn with_preserve_order(
+        &self,
+        _preserve_order: bool,
+    ) -> Option<Arc<dyn ExecutionPlan>> {
+        None
+    }
 }
 
 /// [`ExecutionPlan`] Invariant Level
diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs
index 674fe6692adf5..a1c627c959951 100644
--- a/datafusion/physical-plan/src/filter.rs
+++ b/datafusion/physical-plan/src/filter.rs
@@ -615,6 +615,19 @@ impl ExecutionPlan for FilterExec {
             fetch,
         }))
     }
+
+    fn with_preserve_order(
+        &self,
+        preserve_order: bool,
+    ) -> Option<Arc<dyn ExecutionPlan>> {
+        self.input
+            .with_preserve_order(preserve_order)
+            .and_then(|new_input| {
+                Arc::new(self.clone())
+                    .with_new_children(vec![new_input])
+                    .ok()
+            })
+    }
 }
 
 impl EmbeddedProjection for FilterExec {
diff --git a/datafusion/physical-plan/src/projection.rs b/datafusion/physical-plan/src/projection.rs
index e8608f17a1b20..8f2f2219f4338 100644
--- a/datafusion/physical-plan/src/projection.rs
+++ b/datafusion/physical-plan/src/projection.rs
@@ -427,6 +427,19 @@ impl ExecutionPlan for ProjectionExec {
             }
         }
     }
+
+    fn with_preserve_order(
+        &self,
+        preserve_order: bool,
+    ) -> Option<Arc<dyn ExecutionPlan>> {
+        self.input
+            .with_preserve_order(preserve_order)
+            .and_then(|new_input| {
+                Arc::new(self.clone())
+                    .with_new_children(vec![new_input])
+                    .ok()
+            })
+    }
 }
 
 impl ProjectionStream {
diff --git a/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs b/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs
index 0ddea90a98bf3..68c457a0d8a3c 100644
--- a/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs
+++ b/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs
@@ -245,6 +245,19 @@ impl ExecutionPlan for SortPreservingMergeExec {
         }))
     }
 
+    fn with_preserve_order(
+        &self,
+        preserve_order: bool,
+    ) -> Option<Arc<dyn ExecutionPlan>> {
+        self.input
+            .with_preserve_order(preserve_order)
+            .and_then(|new_input| {
+                Arc::new(self.clone())
+                    .with_new_children(vec![new_input])
+                    .ok()
+            })
+    }
+
     fn required_input_distribution(&self) -> Vec<Distribution> {
         vec![Distribution::UnspecifiedDistribution]
     }

From 661a2c24790f4974aa0b9daf98600de231537595 Mon Sep 17 00:00:00 2001
From: "xudong.w"
Date: Fri, 9 Jan 2026 17:51:58 +0800
Subject: [PATCH 23/26] remove dependency

---
 Cargo.lock                               | 1 -
 datafusion/physical-optimizer/Cargo.toml | 1 -
 2 files changed, 2 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index f5e01ea1e10e8..2d40ab4506900 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2427,7 +2427,6 @@ version = "52.0.0"
 dependencies = [
  "arrow",
  "datafusion-common",
- "datafusion-datasource",
  "datafusion-execution",
  "datafusion-expr",
  "datafusion-expr-common",
diff --git a/datafusion/physical-optimizer/Cargo.toml b/datafusion/physical-optimizer/Cargo.toml
index caa9ee7b46914..395da10d629ba 100644
--- a/datafusion/physical-optimizer/Cargo.toml
+++ b/datafusion/physical-optimizer/Cargo.toml
@@ -43,7 +43,6 @@ recursive_protection = ["dep:recursive"]
 [dependencies]
 arrow = { workspace = true }
 datafusion-common = { workspace = true }
-datafusion-datasource = { workspace = true }
 datafusion-execution = { workspace = true }
 datafusion-expr = { workspace = true }
 datafusion-expr-common = { workspace = true, default-features = true }

From ca7de4fae56aaa6fad7fb15230e5be139dabfaa3 Mon Sep 17 00:00:00 2001
From: "xudong.w"
Date: Tue, 13 Jan 2026 16:00:18 +0800
Subject: [PATCH 24/26] add an example

---
 .../src/row_group_filter.rs | 27 +++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/datafusion/datasource-parquet/src/row_group_filter.rs b/datafusion/datasource-parquet/src/row_group_filter.rs
index f54b1cc4e1bc8..935f179ff4702 100644
--- a/datafusion/datasource-parquet/src/row_group_filter.rs
+++ b/datafusion/datasource-parquet/src/row_group_filter.rs
@@ -132,6 +132,33 @@ impl RowGroupAccessPlanFilter {
     /// | +-----------------------------------+-----------------------------+ |
     /// +-----------------------------------------------------------------------+
     ///
+    /// # Example with Statistics Truncation and NOT Inversion
+    ///
+    /// When statistics are truncated to length 6 (e.g., `statistics_truncate_length = 6`),
+    /// the min/max values become:
+    ///
+    /// ```
+    /// Row group 3: species_min="Alpine", species_max="Alpine" (truncated from "Alpine Ibex"/"Alpine Sheep")
+    ///              s_min=76, s_max=101
+    /// ```
+    ///
+    /// To identify this as fully matching, the system uses NOT inversion:
+    /// 1. Original predicate: `species LIKE 'Alpine%' AND s >= 50`
+    /// 2. Inverted predicate: `NOT (species LIKE 'Alpine%' AND s >= 50)`
+    ///    Simplified to: `species NOT LIKE 'Alpine%' OR s < 50`
+    /// 3. Pruning predicate generated:
+    ///    `(species_min NOT LIKE 'Alpine%' OR species_max NOT LIKE 'Alpine%') OR s_min < 50`
+    ///
+    /// For row group 3 with truncated stats:
+    /// - Evaluating `species_min NOT LIKE 'Alpine%'`: `"Alpine" NOT LIKE 'Alpine%'` = `false`
+    /// - Evaluating `species_max NOT LIKE 'Alpine%'`: `"Alpine" NOT LIKE 'Alpine%'` = `false`
+    /// - Evaluating `s_min < 50`: `76 < 50` = `false`
+    /// - Final result: `(false OR false) OR false` = `false`
+    ///
+    /// Since the inverted predicate would prune this row group (returns false), it means
+    /// no rows in this group could possibly satisfy the inverted predicate.
+    /// Therefore, all rows in this group must match the original predicate, making it fully matched.
+    ///
     /// Without limit pruning: Scan Partition 2 → Partition 3 → Partition 4 (until limit reached)
     /// With limit pruning: If Partition 3 contains enough rows to satisfy the limit,
     /// skip Partitions 2 and 4 entirely and go directly to Partition 3.
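The walkthrough in the doc comment above can be condensed into a minimal, self-contained sketch of the same min/max check. This is plain Rust, not DataFusion's actual pruning code; the `fully_matches` function and the hard-coded `'Alpine'`/`'Alpinf'` bounds are illustrative only:

```rust
/// Sketch of the "invert and prune" check for `species LIKE 'Alpine%'`,
/// i.e. the candidate range `'Alpine' <= species < 'Alpinf'`. A row group
/// is FULLY matching when its statistics prove that the inverted condition
/// (`species < 'Alpine' OR species >= 'Alpinf'`) can hold for no row.
fn fully_matches(species_min: &str, species_max: &str) -> bool {
    // Some row could satisfy `species < 'Alpine'` only if min < 'Alpine';
    // some row could satisfy `species >= 'Alpinf'` only if max >= 'Alpinf'.
    let inverted_possible = species_min < "Alpine" || species_max >= "Alpinf";
    !inverted_possible
}

fn main() {
    // Row group 3 with stats truncated to length 6: ["Alpine", "Alpine"]
    assert!(fully_matches("Alpine", "Alpine"));
    // Same group with stats truncated to length 3: ["Alp", "Alq"] is too
    // broad to prove a full match (the range could include e.g. "Alpha").
    assert!(!fully_matches("Alp", "Alq"));
}
```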
From d2b84d4d9ebf2d0ceaf2a4a5e64853949d9ddcb1 Mon Sep 17 00:00:00 2001 From: "xudong.w" Date: Tue, 13 Jan 2026 16:26:41 +0800 Subject: [PATCH 25/26] fix doc test --- datafusion/datasource-parquet/src/row_group_filter.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/datasource-parquet/src/row_group_filter.rs b/datafusion/datasource-parquet/src/row_group_filter.rs index 935f179ff4702..c3c803dbfc818 100644 --- a/datafusion/datasource-parquet/src/row_group_filter.rs +++ b/datafusion/datasource-parquet/src/row_group_filter.rs @@ -137,7 +137,7 @@ impl RowGroupAccessPlanFilter { /// When statistics are truncated to length 6 (e.g., `statistics_truncate_length = 6`), /// the min/max values become: /// - /// ``` + /// ```text /// Row group 3: species_min="Alpine", species_max="Alpine" (truncated from "Alpine Ibex"/"Alpine Sheep") /// s_min=76, s_max=101 /// ``` From 6c515b2befb8ff780d12f21c7ccedacc078e186c Mon Sep 17 00:00:00 2001 From: "xudong.w" Date: Thu, 15 Jan 2026 14:32:23 +0800 Subject: [PATCH 26/26] update doc --- .../src/row_group_filter.rs | 45 ++++++++++--------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/datafusion/datasource-parquet/src/row_group_filter.rs b/datafusion/datasource-parquet/src/row_group_filter.rs index c3c803dbfc818..7eea8285ad6b5 100644 --- a/datafusion/datasource-parquet/src/row_group_filter.rs +++ b/datafusion/datasource-parquet/src/row_group_filter.rs @@ -132,32 +132,35 @@ impl RowGroupAccessPlanFilter { /// | +-----------------------------------+-----------------------------+ | /// +-----------------------------------------------------------------------+ /// - /// # Example with Statistics Truncation and NOT Inversion + /// ### Identification of Fully Matching Row Groups /// - /// When statistics are truncated to length 6 (e.g., `statistics_truncate_length = 6`), - /// the min/max values become: + /// DataFusion identifies row groups where ALL rows satisfy the filter by inverting the + /// predicate and checking if statistics prove the inverted version is false for the group. /// - /// ```text - /// Row group 3: species_min="Alpine", species_max="Alpine" (truncated from "Alpine Ibex"/"Alpine Sheep") - /// s_min=76, s_max=101 - /// ``` + /// For example, prefix matches like `species LIKE 'Alpine%'` are pruned using ranges: + /// 1. Candidate Range: `species >= 'Alpine' AND species < 'Alpinf'` + /// 2. Inverted Condition (to prove full match): `species < 'Alpine' OR species >= 'Alpinf'` + /// 3. Statistical Evaluation (check if any row *could* satisfy the inverted condition): + /// `min < 'Alpine' OR max >= 'Alpinf'` /// - /// To identify this as fully matching, the system uses NOT inversion: - /// 1. Original predicate: `species LIKE 'Alpine%' AND s >= 50` - /// 2. Inverted predicate: `NOT (species LIKE 'Alpine%' AND s >= 50)` - /// Simplified to: `species NOT LIKE 'Alpine%' OR s < 50` - /// 3. Pruning predicate generated: - /// `(species_min NOT LIKE 'Alpine%' OR species_max NOT LIKE 'Alpine%') OR s_min < 50` + /// If this evaluation is **false**, it proves no row can fail the original filter, + /// so the row group is **FULLY MATCHING**. 
     ///
-    /// For row group 3 with truncated stats:
-    /// - Evaluating `species_min NOT LIKE 'Alpine%'`: `"Alpine" NOT LIKE 'Alpine%'` = `false`
-    /// - Evaluating `species_max NOT LIKE 'Alpine%'`: `"Alpine" NOT LIKE 'Alpine%'` = `false`
-    /// - Evaluating `s_min < 50`: `76 < 50` = `false`
-    /// - Final result: `(false OR false) OR false` = `false`
+    /// ### Impact of Statistics Truncation
     ///
-    /// Since the inverted predicate would prune this row group (returns false), it means
-    /// no rows in this group could possibly satisfy the inverted predicate.
-    /// Therefore, all rows in this group must match the original predicate, making it fully matched.
+    /// The precision of pruning depends on the quality of the metadata: truncated
+    /// statistics may prevent the system from proving a full match.
+    ///
+    /// **Example**: `WHERE species LIKE 'Alpine%'` (target range: `['Alpine', 'Alpinf')`)
+    ///
+    /// | Truncation Length | min / max           | Inverted Evaluation                                                | Status                 |
+    /// |-------------------|---------------------|--------------------------------------------------------------------|------------------------|
+    /// | **Length 6**      | `Alpine` / `Alpine` | `"Alpine" < "Alpine" (F) OR "Alpine" >= "Alpinf" (F)` -> **false** | **FULLY MATCHING**     |
+    /// | **Length 3**      | `Alp` / `Alq`       | `"Alp" < "Alpine" (T) OR "Alq" >= "Alpinf" (T)` -> **true**        | **PARTIALLY MATCHING** |
+    ///
+    /// Even though Row Group 3 only contains matching rows, truncation to length 3 makes
+    /// the statistics `[Alp, Alq]` too broad to prove it (they could include "Alpha").
+    /// The system must conservatively scan the group.
     ///
     /// Without limit pruning: Scan Partition 2 → Partition 3 → Partition 4 (until limit reached)
     /// With limit pruning: If Partition 3 contains enough rows to satisfy the limit,