From 4c34258d8257fa5cb2ef0e200f0b243b075ea5c4 Mon Sep 17 00:00:00 2001 From: "xudong.w" Date: Fri, 31 Oct 2025 15:06:33 +0800 Subject: [PATCH 01/26] Support row group limit pruning --- datafusion/core/tests/parquet/mod.rs | 52 ++- .../core/tests/parquet/row_group_pruning.rs | 336 +++++++++++++++++- datafusion/datasource-parquet/src/metrics.rs | 34 +- datafusion/datasource-parquet/src/opener.rs | 6 +- .../src/row_group_filter.rs | 97 ++++- datafusion/pruning/src/pruning_predicate.rs | 1 - 6 files changed, 505 insertions(+), 21 deletions(-) diff --git a/datafusion/core/tests/parquet/mod.rs b/datafusion/core/tests/parquet/mod.rs index 44c9a2393e3d8..16ac557f18811 100644 --- a/datafusion/core/tests/parquet/mod.rs +++ b/datafusion/core/tests/parquet/mod.rs @@ -182,6 +182,11 @@ impl TestOutput { .map(|(_pruned, matched)| matched) } + /// The number of row_groups fully matched by statistics + fn row_groups_fully_matched_statistics(&self) -> Option { + self.metric_value("row_groups_fully_matched_statistics") + } + /// The number of row_groups pruned by statistics fn row_groups_pruned_statistics(&self) -> Option { self.pruning_metric("row_groups_pruned_statistics") @@ -219,6 +224,11 @@ impl TestOutput { .map(|(pruned, _matched)| pruned) } + /// The number of row groups pruned by limit pruning + fn limit_pruned_row_groups(&self) -> Option { + self.metric_value("limit_pruned_row_groups") + } + fn description(&self) -> String { format!( "Input:\n{}\nQuery:\n{}\nOutput:\n{}\nMetrics:\n{}", @@ -232,20 +242,41 @@ impl TestOutput { /// and the appropriate scenario impl ContextWithParquet { async fn new(scenario: Scenario, unit: Unit) -> Self { - Self::with_config(scenario, unit, SessionConfig::new()).await + Self::with_config(scenario, unit, SessionConfig::new(), None, None).await + } + + /// Set custom schema and batches for the test + pub async fn with_custom_data( + scenario: Scenario, + unit: Unit, + schema: Arc, + batches: Vec, + ) -> Self { + Self::with_config( + scenario, + unit, + SessionConfig::new(), + Some(schema), + Some(batches), + ) + .await } async fn with_config( scenario: Scenario, unit: Unit, mut config: SessionConfig, + custom_schema: Option>, + custom_batches: Option>, ) -> Self { // Use a single partition for deterministic results no matter how many CPUs the host has config = config.with_target_partitions(1); let file = match unit { Unit::RowGroup(row_per_group) => { config = config.with_parquet_bloom_filter_pruning(true); - make_test_file_rg(scenario, row_per_group).await + config.options_mut().execution.parquet.pushdown_filters = true; + make_test_file_rg(scenario, row_per_group, custom_schema, custom_batches) + .await } Unit::Page(row_per_page) => { config = config.with_parquet_page_index_pruning(true); @@ -1075,7 +1106,12 @@ fn create_data_batch(scenario: Scenario) -> Vec { } /// Create a test parquet file with various data types -async fn make_test_file_rg(scenario: Scenario, row_per_group: usize) -> NamedTempFile { +async fn make_test_file_rg( + scenario: Scenario, + row_per_group: usize, + custom_schema: Option>, + custom_batches: Option>, +) -> NamedTempFile { let mut output_file = tempfile::Builder::new() .prefix("parquet_pruning") .suffix(".parquet") @@ -1088,8 +1124,14 @@ async fn make_test_file_rg(scenario: Scenario, row_per_group: usize) -> NamedTem .set_statistics_enabled(EnabledStatistics::Page) .build(); - let batches = create_data_batch(scenario); - let schema = batches[0].schema(); + let (batches, schema) = + if let (Some(schema), Some(batches)) = (custom_schema, 
custom_batches) { + (batches, schema) + } else { + let batches = create_data_batch(scenario); + let schema = batches[0].schema(); + (batches, schema) + }; let mut writer = ArrowWriter::try_new(&mut output_file, schema, Some(props)).unwrap(); diff --git a/datafusion/core/tests/parquet/row_group_pruning.rs b/datafusion/core/tests/parquet/row_group_pruning.rs index 0411298055f26..e0ba462281ce3 100644 --- a/datafusion/core/tests/parquet/row_group_pruning.rs +++ b/datafusion/core/tests/parquet/row_group_pruning.rs @@ -18,8 +18,12 @@ //! This file contains an end to end test of parquet pruning. It writes //! data into a parquet file and then verifies row groups are pruned as //! expected. +use std::sync::Arc; + +use arrow::array::{ArrayRef, Int32Array, RecordBatch}; +use arrow_schema::{DataType, Field, Schema}; use datafusion::prelude::SessionConfig; -use datafusion_common::ScalarValue; +use datafusion_common::{DataFusionError, ScalarValue}; use itertools::Itertools; use crate::parquet::Unit::RowGroup; @@ -30,10 +34,12 @@ struct RowGroupPruningTest { query: String, expected_errors: Option, expected_row_group_matched_by_statistics: Option, + expected_row_group_fully_matched_by_statistics: Option, expected_row_group_pruned_by_statistics: Option, expected_files_pruned_by_statistics: Option, expected_row_group_matched_by_bloom_filter: Option, expected_row_group_pruned_by_bloom_filter: Option, + expected_limit_pruned_row_groups: Option, expected_rows: usize, } impl RowGroupPruningTest { @@ -45,9 +51,11 @@ impl RowGroupPruningTest { expected_errors: None, expected_row_group_matched_by_statistics: None, expected_row_group_pruned_by_statistics: None, + expected_row_group_fully_matched_by_statistics: None, expected_files_pruned_by_statistics: None, expected_row_group_matched_by_bloom_filter: None, expected_row_group_pruned_by_bloom_filter: None, + expected_limit_pruned_row_groups: None, expected_rows: 0, } } @@ -76,6 +84,15 @@ impl RowGroupPruningTest { self } + // Set the expected fully matched row groups by statistics + fn with_fully_matched_by_stats( + mut self, + fully_matched_by_stats: Option, + ) -> Self { + self.expected_row_group_fully_matched_by_statistics = fully_matched_by_stats; + self + } + // Set the expected pruned row groups by statistics fn with_pruned_by_stats(mut self, pruned_by_stats: Option) -> Self { self.expected_row_group_pruned_by_statistics = pruned_by_stats; @@ -99,6 +116,11 @@ impl RowGroupPruningTest { self } + fn with_limit_pruned_row_groups(mut self, pruned_by_limit: Option) -> Self { + self.expected_limit_pruned_row_groups = pruned_by_limit; + self + } + /// Set the number of expected rows from the output of this test fn with_expected_rows(mut self, rows: usize) -> Self { self.expected_rows = rows; @@ -144,6 +166,65 @@ impl RowGroupPruningTest { self.expected_row_group_pruned_by_bloom_filter, "mismatched row_groups_pruned_bloom_filter", ); + + assert_eq!( + output.result_rows, + self.expected_rows, + "Expected {} rows, got {}: {}", + output.result_rows, + self.expected_rows, + output.description(), + ); + } + + // Execute the test with the current configuration + async fn test_row_group_prune_with_custom_data( + self, + schema: Arc, + batches: Vec, + max_row_per_group: usize, + ) { + let output = ContextWithParquet::with_custom_data( + self.scenario, + RowGroup(max_row_per_group), + schema, + batches, + ) + .await + .query(&self.query) + .await; + + println!("{}", output.description()); + assert_eq!( + output.predicate_evaluation_errors(), + self.expected_errors, + 
"mismatched predicate_evaluation error" + ); + assert_eq!( + output.row_groups_matched_statistics(), + self.expected_row_group_matched_by_statistics, + "mismatched row_groups_matched_statistics", + ); + assert_eq!( + output.row_groups_fully_matched_statistics(), + self.expected_row_group_fully_matched_by_statistics, + "mismatched row_groups_fully_matched_statistics", + ); + assert_eq!( + output.row_groups_pruned_statistics(), + self.expected_row_group_pruned_by_statistics, + "mismatched row_groups_pruned_statistics", + ); + assert_eq!( + output.files_ranges_pruned_statistics(), + self.expected_files_pruned_by_statistics, + "mismatched files_ranges_pruned_statistics", + ); + assert_eq!( + output.limit_pruned_row_groups(), + self.expected_limit_pruned_row_groups, + "mismatched limit_pruned_row_groups", + ); assert_eq!( output.result_rows, self.expected_rows, @@ -289,11 +370,16 @@ async fn prune_disabled() { let expected_rows = 10; let config = SessionConfig::new().with_parquet_pruning(false); - let output = - ContextWithParquet::with_config(Scenario::Timestamps, RowGroup(5), config) - .await - .query(query) - .await; + let output = ContextWithParquet::with_config( + Scenario::Timestamps, + RowGroup(5), + config, + None, + None, + ) + .await + .query(query) + .await; println!("{}", output.description()); // This should not prune any @@ -1636,3 +1722,241 @@ async fn test_bloom_filter_decimal_dict() { .test_row_group_prune() .await; } + +// Helper function to create a batch with a single Int32 column. +fn make_i32_batch( + name: &str, + values: Vec, +) -> datafusion_common::error::Result { + let schema = Arc::new(Schema::new(vec![Field::new(name, DataType::Int32, false)])); + let array: ArrayRef = Arc::new(Int32Array::from(values)); + RecordBatch::try_new(schema, vec![array]).map_err(DataFusionError::from) +} + +// Helper function to create a batch with two Int32 columns +fn make_two_col_i32_batch( + name_a: &str, + name_b: &str, + values_a: Vec, + values_b: Vec, +) -> datafusion_common::error::Result { + let schema = Arc::new(Schema::new(vec![ + Field::new(name_a, DataType::Int32, false), + Field::new(name_b, DataType::Int32, false), + ])); + let array_a: ArrayRef = Arc::new(Int32Array::from(values_a)); + let array_b: ArrayRef = Arc::new(Int32Array::from(values_b)); + RecordBatch::try_new(schema, vec![array_a, array_b]).map_err(DataFusionError::from) +} + +#[tokio::test] +async fn test_limit_pruning_basic() -> datafusion_common::error::Result<()> { + // Scenario: Simple integer column, multiple row groups + // Query: SELECT c1 FROM t WHERE c1 = 0 LIMIT 2 + // We expect 2 rows in total. + + // Row Group 0: c1 = [0, -2] -> Partially matched, 1 row + // Row Group 1: c1 = [1, 2] -> Fully matched, 2 rows + // Row Group 2: c1 = [3, 4] -> Fully matched, 2 rows + // Row Group 3: c1 = [5, 6] -> Fully matched, 2 rows + // Row Group 4: c1 = [-1, -2] -> Not matched + + // If limit = 2, and RG1 is fully matched and has 2 rows, we should + // only scan RG1 and prune other row groups + // RG4 is pruned by statistics. RG2 and RG3 are pruned by limit. + // So 2 row groups are effectively pruned due to limit pruning. 
+
+#[tokio::test]
+async fn test_limit_pruning_complex_filter() -> datafusion_common::error::Result<()> {
+    // Test Case 1: Complex filter with two columns (a = 1 AND b > 1 AND b < 4)
+    // Row Group 0: a=[1,1,1], b=[0,2,3] -> Partially matched, 2 rows match (b=2,3)
+    // Row Group 1: a=[1,1,1], b=[2,2,2] -> Fully matched, 3 rows
+    // Row Group 2: a=[1,1,1], b=[2,3,3] -> Fully matched, 3 rows
+    // Row Group 3: a=[1,1,1], b=[2,2,3] -> Fully matched, 3 rows
+    // Row Group 4: a=[2,2,2], b=[2,2,2] -> Not matched (a != 1)
+    // Row Group 5: a=[1,1,1], b=[5,6,7] -> Not matched (b >= 4)
+
+    // With LIMIT 5, RG1 (3 rows) plus RG2 (3 rows, of which only 2 are
+    // emitted) cover the limit. RG4 and RG5 are pruned by statistics.
+    // RG0 and RG3 are pruned by limit: once the fully matched groups cover
+    // the limit, the remaining matched groups are no longer needed.
+
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("a", DataType::Int32, false),
+        Field::new("b", DataType::Int32, false),
+    ]));
+    let query = "SELECT a, b FROM t WHERE a = 1 AND b > 1 AND b < 4 LIMIT 5";
+
+    let batches = vec![
+        make_two_col_i32_batch("a", "b", vec![1, 1, 1], vec![0, 2, 3])?,
+        make_two_col_i32_batch("a", "b", vec![1, 1, 1], vec![2, 2, 2])?,
+        make_two_col_i32_batch("a", "b", vec![1, 1, 1], vec![2, 3, 3])?,
+        make_two_col_i32_batch("a", "b", vec![1, 1, 1], vec![2, 2, 3])?,
+        make_two_col_i32_batch("a", "b", vec![2, 2, 2], vec![2, 2, 2])?,
+        make_two_col_i32_batch("a", "b", vec![1, 1, 1], vec![5, 6, 7])?,
+    ];
+
+    RowGroupPruningTest::new()
+        .with_scenario(Scenario::Int)
+        .with_query(query)
+        .with_expected_errors(Some(0))
+        .with_expected_rows(5)
+        .with_pruned_files(Some(0))
+        .with_matched_by_stats(Some(4)) // RG0,1,2,3 are matched
+        .with_fully_matched_by_stats(Some(3))
+        .with_pruned_by_stats(Some(2)) // RG4,5 are pruned
+        .with_limit_pruned_row_groups(Some(2)) // RG0 and RG3 are pruned by limit
+        .test_row_group_prune_with_custom_data(schema, batches, 3)
+        .await;
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_limit_pruning_multiple_fully_matched(
+) -> datafusion_common::error::Result<()> {
+    // Test Case 2: Limit requires multiple fully matched row groups
+    // Row Group 0: a=[5,5,5,5] -> Fully matched, 4 rows
+    // Row Group 1: a=[5,5,5,5] -> Fully matched, 4 rows
+    // Row Group 2: a=[5,5,5,5] -> Fully matched, 4 rows
+    // Row Group 3: a=[5,5,5,5] -> Fully matched, 4 rows
+    // Row Group 4: a=[1,2,3,4] -> Not matched
+
+    // With LIMIT 8, we need RG0 (4 rows) + RG1 (4 rows) = 8 rows.
+    // RG2,3 should be pruned by limit.
+    // RG4 should be pruned by statistics.
+
+    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
+    let query = "SELECT a FROM t WHERE a = 5 LIMIT 8";
+
+    let batches = vec![
+        make_i32_batch("a", vec![5, 5, 5, 5])?,
+        make_i32_batch("a", vec![5, 5, 5, 5])?,
+        make_i32_batch("a", vec![5, 5, 5, 5])?,
+        make_i32_batch("a", vec![5, 5, 5, 5])?,
+        make_i32_batch("a", vec![1, 2, 3, 4])?,
+    ];
+
+    RowGroupPruningTest::new()
+        .with_scenario(Scenario::Int)
+        .with_query(query)
+        .with_expected_errors(Some(0))
+        .with_expected_rows(8)
+        .with_pruned_files(Some(0))
+        .with_matched_by_stats(Some(4)) // RG0,1,2,3 matched
+        .with_fully_matched_by_stats(Some(4))
+        .with_pruned_by_stats(Some(1)) // RG4 pruned
+        .with_limit_pruned_row_groups(Some(2)) // RG2,3 pruned by limit
+        .test_row_group_prune_with_custom_data(schema, batches, 4)
+        .await;
+
+    Ok(())
+}
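
The fully matched counts asserted in these tests come from the inverted
predicate check added to row_group_filter.rs (shown later in this patch): if
NOT(pred) can itself be pruned for a row group using min/max statistics, then
every row in that group must satisfy pred. A std-only sketch of the idea, with
hand-written range checks standing in for PruningPredicate (all names here are
illustrative, not the patch's API):

// Predicate from the complex-filter test above: 1 < b AND b < 4.
#[derive(Clone, Copy)]
struct MinMax {
    min: i32,
    max: i32,
}

// Could the predicate be true for some row in the group?
fn maybe_matches(s: MinMax) -> bool {
    s.max > 1 && s.min < 4
}

// Could the inverted predicate (b <= 1 OR b >= 4) be true for some row?
fn maybe_matches_inverted(s: MinMax) -> bool {
    s.min <= 1 || s.max >= 4
}

fn main() {
    let rg0 = MinMax { min: 0, max: 3 }; // b = [0, 2, 3] -> partially matched
    let rg1 = MinMax { min: 2, max: 2 }; // b = [2, 2, 2] -> fully matched
    // Both groups may contain matching rows...
    assert!(maybe_matches(rg0) && maybe_matches(rg1));
    // ...but only RG1 rules out the inverted predicate entirely, which
    // proves that all of its rows satisfy 1 < b AND b < 4.
    assert!(maybe_matches_inverted(rg0));
    assert!(!maybe_matches_inverted(rg1));
}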
+
+#[tokio::test]
+async fn test_limit_pruning_no_fully_matched() -> datafusion_common::error::Result<()> {
+    // Test Case 3: No fully matched row groups - all are partially matched
+    // Row Group 0: a=[1,2,3]   -> Partially matched, 1 row (a=2)
+    // Row Group 1: a=[2,3,4]   -> Partially matched, 1 row (a=2)
+    // Row Group 2: a=[2,5,6]   -> Partially matched, 1 row (a=2)
+    // Row Group 3: a=[2,7,8]   -> Partially matched, 1 row (a=2)
+    // Row Group 4: a=[9,10,11] -> Not matched
+
+    // With LIMIT 3 we would need to scan RG0,1,2 to get 3 matching rows,
+    // but since no row group is fully matched, limit pruning cannot remove
+    // anything. Only RG4 is pruned, by statistics.
+
+    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
+    let query = "SELECT a FROM t WHERE a = 2 LIMIT 3";
+
+    let batches = vec![
+        make_i32_batch("a", vec![1, 2, 3])?,
+        make_i32_batch("a", vec![2, 3, 4])?,
+        make_i32_batch("a", vec![2, 5, 6])?,
+        make_i32_batch("a", vec![2, 7, 8])?,
+        make_i32_batch("a", vec![9, 10, 11])?,
+    ];
+
+    RowGroupPruningTest::new()
+        .with_scenario(Scenario::Int)
+        .with_query(query)
+        .with_expected_errors(Some(0))
+        .with_expected_rows(3)
+        .with_pruned_files(Some(0))
+        .with_matched_by_stats(Some(4)) // RG0,1,2,3 matched
+        .with_fully_matched_by_stats(Some(0))
+        .with_pruned_by_stats(Some(1)) // RG4 pruned
+        .with_limit_pruned_row_groups(Some(0)) // no fully matched groups, nothing pruned by limit
+        .test_row_group_prune_with_custom_data(schema, batches, 3)
+        .await;
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_limit_pruning_exceeds_fully_matched() -> datafusion_common::error::Result<()>
+{
+    // Test Case 4: Limit exceeds all fully matched rows, need partially matched
+    // Row Group 0: a=[10,11,12,12] -> Partially matched, 1 row (a=10)
+    // Row Group 1: a=[10,10,10,10] -> Fully matched, 4 rows
+    // Row Group 2: a=[10,10,10,10] -> Fully matched, 4 rows
+    // Row Group 3: a=[10,13,14,11] -> Partially matched, 1 row (a=10)
+    // Row Group 4: a=[20,21,22,22] -> Not matched
+
+    // With LIMIT 10, we get RG1 (4) + RG2 (4) = 8 rows from fully matched
+    // groups. We still need 2 more, so the partially matched RG0 and RG3
+    // must be scanned as well: all matching row groups are scanned, and
+    // only RG4 is pruned, by statistics.
+
+    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
+    let query = "SELECT a FROM t WHERE a = 10 LIMIT 10";
+
+    let batches = vec![
+        make_i32_batch("a", vec![10, 11, 12, 12])?,
+        make_i32_batch("a", vec![10, 10, 10, 10])?,
+        make_i32_batch("a", vec![10, 10, 10, 10])?,
+        make_i32_batch("a", vec![10, 13, 14, 11])?,
+        make_i32_batch("a", vec![20, 21, 22, 22])?,
+    ];
+
+    RowGroupPruningTest::new()
+        .with_scenario(Scenario::Int)
+        .with_query(query)
+        .with_expected_errors(Some(0))
+        .with_expected_rows(10) // Total matching rows: 1 + 4 + 4 + 1 = 10, exactly the limit
+        .with_pruned_files(Some(0))
+        .with_matched_by_stats(Some(4)) // RG0,1,2,3 matched
+        .with_fully_matched_by_stats(Some(2))
+
.with_pruned_by_stats(Some(1)) // RG4 pruned + .with_limit_pruned_row_groups(Some(0)) // No limit pruning since we need all RGs + .test_row_group_prune_with_custom_data(schema, batches, 4) + .await; + + Ok(()) +} diff --git a/datafusion/datasource-parquet/src/metrics.rs b/datafusion/datasource-parquet/src/metrics.rs index 8ce3a081a2e32..e9673e16de56f 100644 --- a/datafusion/datasource-parquet/src/metrics.rs +++ b/datafusion/datasource-parquet/src/metrics.rs @@ -45,10 +45,18 @@ pub struct ParquetFileMetrics { pub files_ranges_pruned_statistics: PruningMetrics, /// Number of times the predicate could not be evaluated pub predicate_evaluation_errors: Count, - /// Number of row groups whose bloom filters were checked, tracked with matched/pruned counts - pub row_groups_pruned_bloom_filter: PruningMetrics, - /// Number of row groups whose statistics were checked, tracked with matched/pruned counts - pub row_groups_pruned_statistics: PruningMetrics, + /// Number of row groups whose bloom filters were checked and matched (not pruned) + pub row_groups_matched_bloom_filter: Count, + /// Number of row groups pruned by bloom filters + pub row_groups_pruned_bloom_filter: Count, + /// Number of row groups pruned due to limit pruning. + pub limit_pruned_row_groups: Count, + /// Number of row groups whose statistics were checked and fully matched + pub row_groups_fully_matched_statistics: Count, + /// Number of row groups whose statistics were checked and matched (not pruned) + pub row_groups_matched_statistics: Count, + /// Number of row groups pruned by statistics + pub row_groups_pruned_statistics: Count, /// Total number of bytes scanned pub bytes_scanned: Count, /// Total rows filtered out by predicates pushed into parquet scan @@ -96,8 +104,19 @@ impl ParquetFileMetrics { // ----------------------- let row_groups_pruned_bloom_filter = MetricBuilder::new(metrics) .with_new_label("filename", filename.to_string()) - .with_type(MetricType::SUMMARY) - .pruning_metrics("row_groups_pruned_bloom_filter", partition); + .counter("row_groups_pruned_bloom_filter", partition); + + let limit_pruned_row_groups = MetricBuilder::new(metrics) + .with_new_label("filename", filename.to_string()) + .counter("limit_pruned_row_groups", partition); + + let row_groups_fully_matched_statistics = MetricBuilder::new(metrics) + .with_new_label("filename", filename.to_string()) + .counter("row_groups_fully_matched_statistics", partition); + + let row_groups_matched_statistics = MetricBuilder::new(metrics) + .with_new_label("filename", filename.to_string()) + .counter("row_groups_matched_statistics", partition); let row_groups_pruned_statistics = MetricBuilder::new(metrics) .with_new_label("filename", filename.to_string()) @@ -172,7 +191,10 @@ impl ParquetFileMetrics { files_ranges_pruned_statistics, predicate_evaluation_errors, row_groups_pruned_bloom_filter, + row_groups_fully_matched_statistics, + row_groups_matched_statistics, row_groups_pruned_statistics, + limit_pruned_row_groups, bytes_scanned, pushdown_rows_pruned, pushdown_rows_matched, diff --git a/datafusion/datasource-parquet/src/opener.rs b/datafusion/datasource-parquet/src/opener.rs index 570f9b4412840..c50d313ed5448 100644 --- a/datafusion/datasource-parquet/src/opener.rs +++ b/datafusion/datasource-parquet/src/opener.rs @@ -545,11 +545,15 @@ impl FileOpener for ParquetOpener { .add_matched(n_remaining_row_groups); } - let mut access_plan = row_groups.build(); + // Prune by limit + if let Some(limit) = limit { + row_groups.prune_by_limit(limit, rg_metadata, 
&file_metrics);
+            }
 
             // --------------------------------------------------------
             // Step: prune pages from the kept row groups
             //
+            let mut access_plan = row_groups.build();
             // page index pruning: if all data on individual pages can
             // be ruled out using page metadata, rows from other columns
             // with that range can be skipped as well
diff --git a/datafusion/datasource-parquet/src/row_group_filter.rs b/datafusion/datasource-parquet/src/row_group_filter.rs
index 046379cc25e23..bed29aea4a4ad 100644
--- a/datafusion/datasource-parquet/src/row_group_filter.rs
+++ b/datafusion/datasource-parquet/src/row_group_filter.rs
@@ -24,6 +24,8 @@ use arrow::datatypes::Schema;
 use datafusion_common::pruning::PruningStatistics;
 use datafusion_common::{Column, Result, ScalarValue};
 use datafusion_datasource::FileRange;
+use datafusion_physical_expr::expressions::NotExpr;
+use datafusion_physical_expr::PhysicalExprSimplifier;
 use datafusion_pruning::PruningPredicate;
 use parquet::arrow::arrow_reader::statistics::StatisticsConverter;
 use parquet::arrow::parquet_column;
@@ -46,13 +48,19 @@ use parquet::{
 pub struct RowGroupAccessPlanFilter {
     /// which row groups should be accessed
     access_plan: ParquetAccessPlan,
+    /// which row groups are fully contained within the pruning predicate
+    is_fully_matched: Vec<bool>,
 }
 
 impl RowGroupAccessPlanFilter {
     /// Create a new `RowGroupPlanBuilder` for pruning out the groups to scan
     /// based on metadata and statistics
     pub fn new(access_plan: ParquetAccessPlan) -> Self {
-        Self { access_plan }
+        let num_row_groups = access_plan.len();
+        Self {
+            access_plan,
+            is_fully_matched: vec![false; num_row_groups],
+        }
     }
 
     /// Return true if there are no row groups
@@ -70,6 +78,49 @@ impl RowGroupAccessPlanFilter {
         self.access_plan
     }
 
+    /// Returns the is_fully_matched vector
+    pub fn is_fully_matched(&self) -> &Vec<bool> {
+        &self.is_fully_matched
+    }
+
+    /// Prunes the access plan based on the limit and fully contained row groups.
+    pub fn prune_by_limit(
+        &mut self,
+        limit: usize,
+        rg_metadata: &[RowGroupMetaData],
+        metrics: &ParquetFileMetrics,
+    ) {
+        let mut fully_matched_row_group_indexes: Vec<usize> = Vec::new();
+        let mut fully_matched_rows_count: usize = 0;
+
+        // Iterate through the currently accessible row groups
+        for &idx in self.access_plan.row_group_indexes().iter() {
+            if self.is_fully_matched[idx] {
+                let row_group_row_count = rg_metadata[idx].num_rows() as usize;
+                fully_matched_row_group_indexes.push(idx);
+                fully_matched_rows_count += row_group_row_count;
+                if fully_matched_rows_count >= limit {
+                    break;
+                }
+            }
+        }
+
+        if fully_matched_rows_count >= limit {
+            let original_num_accessible_row_groups =
+                self.access_plan.row_group_indexes().len();
+            let new_num_accessible_row_groups = fully_matched_row_group_indexes.len();
+            let pruned_count = original_num_accessible_row_groups
+                .saturating_sub(new_num_accessible_row_groups);
+            metrics.limit_pruned_row_groups.add(pruned_count);
+
+            let mut new_access_plan = ParquetAccessPlan::new_none(rg_metadata.len());
+            for &idx in &fully_matched_row_group_indexes {
+                new_access_plan.scan(idx);
+            }
+            self.access_plan = new_access_plan;
+        }
+    }
+
     /// Prune remaining row groups to only those within the specified range.
/// /// Updates this set to mark row groups that should not be scanned @@ -135,13 +186,55 @@ impl RowGroupAccessPlanFilter { // try to prune the row groups in a single call match predicate.prune(&pruning_stats) { Ok(values) => { - // values[i] is false means the predicate could not be true for row group i + let mut fully_contained_candidates_original_idx: Vec = Vec::new(); for (idx, &value) in row_group_indexes.iter().zip(values.iter()) { if !value { self.access_plan.skip(*idx); metrics.row_groups_pruned_statistics.add_pruned(1); } else { metrics.row_groups_pruned_statistics.add_matched(1); + fully_contained_candidates_original_idx.push(*idx); + metrics.row_groups_matched_statistics.add(1); + } + } + + // Note: this part of code shouldn't be expensive with a limited number of row groups + // If we do find it's expensive, we can consider optimizing it further. + if !fully_contained_candidates_original_idx.is_empty() { + // Use NotExpr to create the inverted predicate + let inverted_expr = + Arc::new(NotExpr::new(Arc::clone(predicate.orig_expr()))); + // Simplify the NOT expression (e.g., NOT(c1 = 0) -> c1 != 0) + // before building the pruning predicate + let mut simplifier = PhysicalExprSimplifier::new(arrow_schema); + let inverted_expr = simplifier.simplify(inverted_expr).unwrap(); + if let Ok(inverted_predicate) = PruningPredicate::try_new( + inverted_expr, + Arc::clone(predicate.schema()), + ) { + let inverted_pruning_stats = RowGroupPruningStatistics { + parquet_schema, + row_group_metadatas: fully_contained_candidates_original_idx + .iter() + .map(|&i| &groups[i]) + .collect::>(), + arrow_schema, + }; + + if let Ok(inverted_values) = + inverted_predicate.prune(&inverted_pruning_stats) + { + for (i, &original_row_group_idx) in + fully_contained_candidates_original_idx.iter().enumerate() + { + // If the inverted predicate *also* prunes this row group (meaning inverted_values[i] is false), + // it implies that *all* rows in this group satisfy the original predicate. + if !inverted_values[i] { + self.is_fully_matched[original_row_group_idx] = true; + metrics.row_groups_fully_matched_statistics.add(1); + } + } + } } } } diff --git a/datafusion/pruning/src/pruning_predicate.rs b/datafusion/pruning/src/pruning_predicate.rs index b5b8267d7f93f..5f1b4233b5d48 100644 --- a/datafusion/pruning/src/pruning_predicate.rs +++ b/datafusion/pruning/src/pruning_predicate.rs @@ -492,7 +492,6 @@ impl PruningPredicate { // Simplify the newly created predicate to get rid of redundant casts, comparisons, etc. 
let predicate_expr = PhysicalExprSimplifier::new(&predicate_schema).simplify(predicate_expr)?; - let literal_guarantees = LiteralGuarantee::analyze(&expr); Ok(Self { From 1d78b6f9dfe58ea6256b2475f5e02657708dd478 Mon Sep 17 00:00:00 2001 From: "xudong.w" Date: Fri, 21 Nov 2025 17:42:35 +0800 Subject: [PATCH 02/26] Support row group limit pruning --- datafusion/datasource-parquet/src/metrics.rs | 28 +++++-------------- .../src/row_group_filter.rs | 7 +++-- .../physical-expr-common/src/metrics/value.rs | 9 ++++++ 3 files changed, 20 insertions(+), 24 deletions(-) diff --git a/datafusion/datasource-parquet/src/metrics.rs b/datafusion/datasource-parquet/src/metrics.rs index e9673e16de56f..fbb14d9a6d90c 100644 --- a/datafusion/datasource-parquet/src/metrics.rs +++ b/datafusion/datasource-parquet/src/metrics.rs @@ -45,18 +45,12 @@ pub struct ParquetFileMetrics { pub files_ranges_pruned_statistics: PruningMetrics, /// Number of times the predicate could not be evaluated pub predicate_evaluation_errors: Count, - /// Number of row groups whose bloom filters were checked and matched (not pruned) - pub row_groups_matched_bloom_filter: Count, /// Number of row groups pruned by bloom filters - pub row_groups_pruned_bloom_filter: Count, + pub row_groups_pruned_bloom_filter: PruningMetrics, /// Number of row groups pruned due to limit pruning. - pub limit_pruned_row_groups: Count, - /// Number of row groups whose statistics were checked and fully matched - pub row_groups_fully_matched_statistics: Count, - /// Number of row groups whose statistics were checked and matched (not pruned) - pub row_groups_matched_statistics: Count, + pub limit_pruned_row_groups: PruningMetrics, /// Number of row groups pruned by statistics - pub row_groups_pruned_statistics: Count, + pub row_groups_pruned_statistics: PruningMetrics, /// Total number of bytes scanned pub bytes_scanned: Count, /// Total rows filtered out by predicates pushed into parquet scan @@ -104,19 +98,13 @@ impl ParquetFileMetrics { // ----------------------- let row_groups_pruned_bloom_filter = MetricBuilder::new(metrics) .with_new_label("filename", filename.to_string()) - .counter("row_groups_pruned_bloom_filter", partition); + .with_type(MetricType::SUMMARY) + .pruning_metrics("row_groups_pruned_bloom_filter", partition); let limit_pruned_row_groups = MetricBuilder::new(metrics) .with_new_label("filename", filename.to_string()) - .counter("limit_pruned_row_groups", partition); - - let row_groups_fully_matched_statistics = MetricBuilder::new(metrics) - .with_new_label("filename", filename.to_string()) - .counter("row_groups_fully_matched_statistics", partition); - - let row_groups_matched_statistics = MetricBuilder::new(metrics) - .with_new_label("filename", filename.to_string()) - .counter("row_groups_matched_statistics", partition); + .with_type(MetricType::SUMMARY) + .pruning_metrics("limit_pruned_row_groups", partition); let row_groups_pruned_statistics = MetricBuilder::new(metrics) .with_new_label("filename", filename.to_string()) @@ -191,8 +179,6 @@ impl ParquetFileMetrics { files_ranges_pruned_statistics, predicate_evaluation_errors, row_groups_pruned_bloom_filter, - row_groups_fully_matched_statistics, - row_groups_matched_statistics, row_groups_pruned_statistics, limit_pruned_row_groups, bytes_scanned, diff --git a/datafusion/datasource-parquet/src/row_group_filter.rs b/datafusion/datasource-parquet/src/row_group_filter.rs index bed29aea4a4ad..50979d3687771 100644 --- a/datafusion/datasource-parquet/src/row_group_filter.rs +++ 
b/datafusion/datasource-parquet/src/row_group_filter.rs @@ -111,7 +111,7 @@ impl RowGroupAccessPlanFilter { let new_num_accessible_row_groups = fully_matched_row_group_indexes.len(); let pruned_count = original_num_accessible_row_groups .saturating_sub(new_num_accessible_row_groups); - metrics.limit_pruned_row_groups.add(pruned_count); + metrics.limit_pruned_row_groups.add_pruned(pruned_count); let mut new_access_plan = ParquetAccessPlan::new_none(rg_metadata.len()); for &idx in &fully_matched_row_group_indexes { @@ -194,7 +194,6 @@ impl RowGroupAccessPlanFilter { } else { metrics.row_groups_pruned_statistics.add_matched(1); fully_contained_candidates_original_idx.push(*idx); - metrics.row_groups_matched_statistics.add(1); } } @@ -231,7 +230,9 @@ impl RowGroupAccessPlanFilter { // it implies that *all* rows in this group satisfy the original predicate. if !inverted_values[i] { self.is_fully_matched[original_row_group_idx] = true; - metrics.row_groups_fully_matched_statistics.add(1); + metrics + .row_groups_pruned_statistics + .add_fully_matched(1); } } } diff --git a/datafusion/physical-expr-common/src/metrics/value.rs b/datafusion/physical-expr-common/src/metrics/value.rs index 9a14b804a20b5..0054813164bcc 100644 --- a/datafusion/physical-expr-common/src/metrics/value.rs +++ b/datafusion/physical-expr-common/src/metrics/value.rs @@ -372,6 +372,7 @@ impl Drop for ScopedTimerGuard<'_> { pub struct PruningMetrics { pruned: Arc, matched: Arc, + fully_matched: Arc, } impl Display for PruningMetrics { @@ -400,6 +401,7 @@ impl PruningMetrics { Self { pruned: Arc::new(AtomicUsize::new(0)), matched: Arc::new(AtomicUsize::new(0)), + fully_matched: Arc::new(AtomicUsize::new(0)), } } @@ -417,6 +419,13 @@ impl PruningMetrics { self.matched.fetch_add(n, Ordering::Relaxed); } + /// Add `n` to the metric's fully matched value + pub fn add_fully_matched(&self, n: usize) { + // relaxed ordering for operations on `value` poses no issues + // we're purely using atomic ops with no associated memory ops + self.fully_matched.fetch_add(n, Ordering::Relaxed); + } + /// Subtract `n` to the metric's matched value. 
pub fn subtract_matched(&self, n: usize) { // relaxed ordering for operations on `value` poses no issues From d1fc3bd4936239e6cc678d48c52b6b5e35f4ba2b Mon Sep 17 00:00:00 2001 From: "xudong.w" Date: Tue, 25 Nov 2025 11:33:32 +0800 Subject: [PATCH 03/26] Add fetch_order_sensitive during limit pushdown to decide if use limit pruning --- datafusion/catalog-listing/src/table.rs | 1 + datafusion/catalog/src/table.rs | 20 +++++++ datafusion/core/src/physical_planner.rs | 4 +- datafusion/core/tests/parquet/mod.rs | 38 +++++++------ .../core/tests/parquet/row_group_pruning.rs | 4 +- datafusion/datasource-parquet/src/opener.rs | 8 ++- datafusion/datasource-parquet/src/source.rs | 1 + datafusion/datasource/src/file_scan_config.rs | 20 +++++++ datafusion/expr/src/logical_plan/builder.rs | 4 +- datafusion/expr/src/logical_plan/plan.rs | 12 ++++ datafusion/expr/src/logical_plan/tree_node.rs | 2 + .../optimizer/src/optimize_projections/mod.rs | 15 ++--- datafusion/optimizer/src/push_down_filter.rs | 5 +- datafusion/optimizer/src/push_down_limit.rs | 57 ++++++++++++++++++- .../physical-expr-common/src/metrics/value.rs | 5 ++ datafusion/proto/src/logical_plan/mod.rs | 1 + 16 files changed, 160 insertions(+), 37 deletions(-) diff --git a/datafusion/catalog-listing/src/table.rs b/datafusion/catalog-listing/src/table.rs index 38456944075fc..039f276d9492b 100644 --- a/datafusion/catalog-listing/src/table.rs +++ b/datafusion/catalog-listing/src/table.rs @@ -581,6 +581,7 @@ impl TableProvider for ListingTable { .with_statistics(statistics) .with_projection_indices(projection)? .with_limit(limit) + .with_limit_order_sensitive(args.limit_order_sensitive()) .with_output_ordering(output_ordering) .with_expr_adapter(self.expr_adapter_factory.clone()) .with_partitioned_by_file_group(partitioned_by_file_group) diff --git a/datafusion/catalog/src/table.rs b/datafusion/catalog/src/table.rs index 1f223852c2b9d..4f604482bb6c2 100644 --- a/datafusion/catalog/src/table.rs +++ b/datafusion/catalog/src/table.rs @@ -361,6 +361,7 @@ pub struct ScanArgs<'a> { filters: Option<&'a [Expr]>, projection: Option<&'a [usize]>, limit: Option, + limit_order_sensitive: bool, } impl<'a> ScanArgs<'a> { @@ -422,6 +423,25 @@ impl<'a> ScanArgs<'a> { pub fn limit(&self) -> Option { self.limit } + + /// Set whether the scan's limit should be order-sensitive. + /// + /// If specified, the scan should return the limited rows in a specific order. + /// Or we can leverage limit pruning to optimize the scan. + /// + /// # Arguments + /// * `order_sensitive` - Whether the scan's limit should be order-sensitive + pub fn with_limit_order_sensitive(mut self, order_sensitive: bool) -> Self { + self.limit_order_sensitive = order_sensitive; + self + } + + /// Get whether the scan's limit should be order-sensitive. + /// + /// Returns `true` if the scan's limit should be order-sensitive, or `false` if not. + pub fn limit_order_sensitive(&self) -> bool { + self.limit_order_sensitive + } } /// Result of a table scan operation from [`TableProvider::scan_with_args`]. diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index cc7d534776d7e..83e6bf1badbbb 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -460,6 +460,7 @@ impl DefaultPhysicalPlanner { projection, filters, fetch, + fetch_order_sensitive, .. 
}) => { let source = source_as_provider(source)?; @@ -471,7 +472,8 @@ impl DefaultPhysicalPlanner { let opts = ScanArgs::default() .with_projection(projection.as_deref()) .with_filters(Some(&filters_vec)) - .with_limit(*fetch); + .with_limit(*fetch) + .with_limit_order_sensitive(*fetch_order_sensitive); let res = source.scan_with_args(session_state, opts).await?; Arc::clone(res.plan()) } diff --git a/datafusion/core/tests/parquet/mod.rs b/datafusion/core/tests/parquet/mod.rs index 16ac557f18811..4d0209267514b 100644 --- a/datafusion/core/tests/parquet/mod.rs +++ b/datafusion/core/tests/parquet/mod.rs @@ -127,7 +127,7 @@ struct TestOutput { impl TestOutput { /// retrieve the value of the named metric, if any fn metric_value(&self, metric_name: &str) -> Option { - if let Some((pruned, _matched)) = self.pruning_metric(metric_name) { + if let Some((pruned, _matched, _fully)) = self.pruning_metric(metric_name) { return Some(pruned); } @@ -141,9 +141,10 @@ impl TestOutput { }) } - fn pruning_metric(&self, metric_name: &str) -> Option<(usize, usize)> { + fn pruning_metric(&self, metric_name: &str) -> Option<(usize, usize, usize)> { let mut total_pruned = 0; let mut total_matched = 0; + let mut total_fully_matched = 0; let mut found = false; for metric in self.parquet_metrics.iter() { @@ -152,15 +153,18 @@ impl TestOutput { && let MetricValue::PruningMetrics { pruning_metrics, .. } = metric.value() - { - total_pruned += pruning_metrics.pruned(); - total_matched += pruning_metrics.matched(); - found = true; + { + total_pruned += pruning_metrics.pruned(); + total_matched += pruning_metrics.matched(); + total_fully_matched += pruning_metrics.fully_matched(); + + found = true; + } } } if found { - Some((total_pruned, total_matched)) + Some((total_pruned, total_matched, total_fully_matched)) } else { None } @@ -172,32 +176,33 @@ impl TestOutput { } /// The number of row_groups pruned / matched by bloom filter - fn row_groups_bloom_filter(&self) -> Option<(usize, usize)> { + fn row_groups_bloom_filter(&self) -> Option<(usize, usize, usize)> { self.pruning_metric("row_groups_pruned_bloom_filter") } /// The number of row_groups matched by statistics fn row_groups_matched_statistics(&self) -> Option { self.pruning_metric("row_groups_pruned_statistics") - .map(|(_pruned, matched)| matched) + .map(|(_pruned, matched, _fully)| matched) } /// The number of row_groups fully matched by statistics fn row_groups_fully_matched_statistics(&self) -> Option { - self.metric_value("row_groups_fully_matched_statistics") + self.pruning_metric("row_groups_pruned_statistics") + .map(|(_pruned, _, fully)| fully) } /// The number of row_groups pruned by statistics fn row_groups_pruned_statistics(&self) -> Option { self.pruning_metric("row_groups_pruned_statistics") - .map(|(pruned, _matched)| pruned) + .map(|(pruned, _matched, _fully)| pruned) } /// Metric `files_ranges_pruned_statistics` tracks both pruned and matched count, /// for testing purpose, here it only aggregate the `pruned` count. fn files_ranges_pruned_statistics(&self) -> Option { self.pruning_metric("files_ranges_pruned_statistics") - .map(|(pruned, _matched)| pruned) + .map(|(pruned, _matched, _fully)| pruned) } /// The number of row_groups matched by bloom filter or statistics @@ -207,13 +212,13 @@ impl TestOutput { /// count. 
fn row_groups_matched(&self) -> Option { self.row_groups_bloom_filter() - .map(|(_pruned, matched)| matched) + .map(|(_pruned, matched, _fully)| matched) } /// The number of row_groups pruned fn row_groups_pruned(&self) -> Option { self.row_groups_bloom_filter() - .map(|(pruned, _matched)| pruned) + .map(|(pruned, _matched, _fully)| pruned) .zip(self.row_groups_pruned_statistics()) .map(|(a, b)| a + b) } @@ -221,12 +226,13 @@ impl TestOutput { /// The number of row pages pruned fn row_pages_pruned(&self) -> Option { self.pruning_metric("page_index_rows_pruned") - .map(|(pruned, _matched)| pruned) + .map(|(pruned, _matched, _fully)| pruned) } /// The number of row groups pruned by limit pruning fn limit_pruned_row_groups(&self) -> Option { - self.metric_value("limit_pruned_row_groups") + self.pruning_metric("limit_pruned_row_groups") + .map(|(pruned, _, _)| pruned) } fn description(&self) -> String { diff --git a/datafusion/core/tests/parquet/row_group_pruning.rs b/datafusion/core/tests/parquet/row_group_pruning.rs index e0ba462281ce3..f2e2561945140 100644 --- a/datafusion/core/tests/parquet/row_group_pruning.rs +++ b/datafusion/core/tests/parquet/row_group_pruning.rs @@ -157,12 +157,12 @@ impl RowGroupPruningTest { ); let bloom_filter_metrics = output.row_groups_bloom_filter(); assert_eq!( - bloom_filter_metrics.map(|(_pruned, matched)| matched), + bloom_filter_metrics.map(|(_pruned, matched, _)| matched), self.expected_row_group_matched_by_bloom_filter, "mismatched row_groups_matched_bloom_filter", ); assert_eq!( - bloom_filter_metrics.map(|(pruned, _matched)| pruned), + bloom_filter_metrics.map(|(pruned, _matched, _)| pruned), self.expected_row_group_pruned_by_bloom_filter, "mismatched row_groups_pruned_bloom_filter", ); diff --git a/datafusion/datasource-parquet/src/opener.rs b/datafusion/datasource-parquet/src/opener.rs index c50d313ed5448..3947524684efa 100644 --- a/datafusion/datasource-parquet/src/opener.rs +++ b/datafusion/datasource-parquet/src/opener.rs @@ -76,6 +76,8 @@ pub(super) struct ParquetOpener { pub batch_size: usize, /// Optional limit on the number of rows to read pub limit: Option, + /// limit order sensitivity + pub limit_order_sensitive: bool, /// Optional predicate to apply during the scan pub predicate: Option>, /// Table schema, including partition columns. 
@@ -277,6 +279,8 @@ impl FileOpener for ParquetOpener { let max_predicate_cache_size = self.max_predicate_cache_size; let reverse_row_groups = self.reverse_row_groups; + let limit_order_sensitive = self.limit_order_sensitive; + Ok(Box::pin(async move { #[cfg(feature = "parquet_encryption")] let file_decryption_properties = encryption_context @@ -545,8 +549,8 @@ impl FileOpener for ParquetOpener { .add_matched(n_remaining_row_groups); } - // Prune by limit - if let Some(limit) = limit { + // Prune by limit if limit is set and limit order is not sensitive + if let (Some(limit), false) = (limit, limit_order_sensitive) { row_groups.prune_by_limit(limit, rg_metadata, &file_metrics); } diff --git a/datafusion/datasource-parquet/src/source.rs b/datafusion/datasource-parquet/src/source.rs index 2e0919b1447de..d36e0fa106c0b 100644 --- a/datafusion/datasource-parquet/src/source.rs +++ b/datafusion/datasource-parquet/src/source.rs @@ -548,6 +548,7 @@ impl FileSource for ParquetSource { .batch_size .expect("Batch size must set before creating ParquetOpener"), limit: base_config.limit, + limit_order_sensitive: base_config.limit_order_sensitive, predicate: self.predicate.clone(), table_schema: self.table_schema.clone(), metadata_size_hint: self.metadata_size_hint, diff --git a/datafusion/datasource/src/file_scan_config.rs b/datafusion/datasource/src/file_scan_config.rs index 1f7c37315c47a..082f06829f14d 100644 --- a/datafusion/datasource/src/file_scan_config.rs +++ b/datafusion/datasource/src/file_scan_config.rs @@ -152,6 +152,8 @@ pub struct FileScanConfig { /// The maximum number of records to read from this plan. If `None`, /// all records after filtering are returned. pub limit: Option, + /// Whether the scan's limit is order sensitive + pub limit_order_sensitive: bool, /// All equivalent lexicographical orderings that describe the schema. pub output_ordering: Vec, /// File compression type @@ -240,6 +242,8 @@ pub struct FileScanConfigBuilder { object_store_url: ObjectStoreUrl, file_source: Arc, limit: Option, + limit_order_sensitive: bool, + projection_indices: Option>, constraints: Option, file_groups: Vec, statistics: Option, @@ -269,6 +273,8 @@ impl FileScanConfigBuilder { output_ordering: vec![], file_compression_type: None, limit: None, + limit_order_sensitive: false, + projection_indices: None, constraints: None, batch_size: None, expr_adapter_factory: None, @@ -283,6 +289,12 @@ impl FileScanConfigBuilder { self } + /// Set whether the limit should be order-sensitive. + pub fn with_limit_order_sensitive(mut self, order_sensitive: bool) -> Self { + self.limit_order_sensitive = order_sensitive; + self + } + /// Set the file source for scanning files. /// /// This method allows you to change the file source implementation (e.g. ParquetSource, CsvSource, etc.) 
@@ -450,6 +462,8 @@ impl FileScanConfigBuilder { object_store_url, file_source, limit, + limit_order_sensitive, + projection_indices, constraints, file_groups, statistics, @@ -471,6 +485,8 @@ impl FileScanConfigBuilder { object_store_url, file_source, limit, + limit_order_sensitive, + projection_exprs, constraints, file_groups, output_ordering, @@ -493,6 +509,10 @@ impl From for FileScanConfigBuilder { output_ordering: config.output_ordering, file_compression_type: Some(config.file_compression_type), limit: config.limit, + limit_order_sensitive: config.limit_order_sensitive, + projection_indices: config + .projection_exprs + .map(|p| p.ordered_column_indices()), constraints: Some(config.constraints), batch_size: config.batch_size, expr_adapter_factory: config.expr_adapter_factory, diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 6f654428e41a1..27852fd4b9897 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -2756,12 +2756,12 @@ mod tests { assert_snapshot!(plan, @r" Union - Cross Join: + Cross Join: SubqueryAlias: left Values: (Int32(1)) SubqueryAlias: right Values: (Int32(1)) - Cross Join: + Cross Join: SubqueryAlias: left Values: (Int32(1)) SubqueryAlias: right diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 4219c24bfc9c9..df7cbb7527a9f 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -2683,6 +2683,9 @@ pub struct TableScan { pub filters: Vec, /// Optional number of rows to read pub fetch: Option, + /// If the fetch is order-sensitive, it'll be true. + /// And the limit pruning will be enabled. + pub fetch_order_sensitive: bool, } impl Debug for TableScan { @@ -2705,6 +2708,7 @@ impl PartialEq for TableScan { && self.projected_schema == other.projected_schema && self.filters == other.filters && self.fetch == other.fetch + && self.fetch_order_sensitive == other.fetch_order_sensitive } } @@ -2724,18 +2728,22 @@ impl PartialOrd for TableScan { pub filters: &'a Vec, /// Optional number of rows to read pub fetch: &'a Option, + /// Whether the fetch is order-sensitive + pub fetch_order_sensitive: bool, } let comparable_self = ComparableTableScan { table_name: &self.table_name, projection: &self.projection, filters: &self.filters, fetch: &self.fetch, + fetch_order_sensitive: self.fetch_order_sensitive, }; let comparable_other = ComparableTableScan { table_name: &other.table_name, projection: &other.projection, filters: &other.filters, fetch: &other.fetch, + fetch_order_sensitive: other.fetch_order_sensitive, }; comparable_self .partial_cmp(&comparable_other) @@ -2751,6 +2759,7 @@ impl Hash for TableScan { self.projected_schema.hash(state); self.filters.hash(state); self.fetch.hash(state); + self.fetch_order_sensitive.hash(state); } } @@ -2804,6 +2813,7 @@ impl TableScan { projected_schema, filters, fetch, + fetch_order_sensitive: false, }) } } @@ -4968,6 +4978,7 @@ mod tests { projected_schema: Arc::clone(&schema), filters: vec![], fetch: None, + fetch_order_sensitive: false, })); let col = schema.field_names()[0].clone(); @@ -4998,6 +5009,7 @@ mod tests { projected_schema: Arc::clone(&unique_schema), filters: vec![], fetch: None, + fetch_order_sensitive: false, })); let col = schema.field_names()[0].clone(); diff --git a/datafusion/expr/src/logical_plan/tree_node.rs b/datafusion/expr/src/logical_plan/tree_node.rs index 62a27b0a025ad..c9ca99c20e08b 100644 --- 
a/datafusion/expr/src/logical_plan/tree_node.rs +++ b/datafusion/expr/src/logical_plan/tree_node.rs @@ -599,6 +599,7 @@ impl LogicalPlan { projected_schema, filters, fetch, + fetch_order_sensitive, }) => filters.map_elements(f)?.update_data(|filters| { LogicalPlan::TableScan(TableScan { table_name, @@ -607,6 +608,7 @@ impl LogicalPlan { projected_schema, filters, fetch, + fetch_order_sensitive, }) }), LogicalPlan::Distinct(Distinct::On(DistinctOn { diff --git a/datafusion/optimizer/src/optimize_projections/mod.rs b/datafusion/optimizer/src/optimize_projections/mod.rs index 548eadffa242e..c85793228ba05 100644 --- a/datafusion/optimizer/src/optimize_projections/mod.rs +++ b/datafusion/optimizer/src/optimize_projections/mod.rs @@ -259,6 +259,7 @@ fn optimize_projections( projection, filters, fetch, + fetch_order_sensitive, projected_schema: _, } = table_scan; @@ -268,15 +269,11 @@ fn optimize_projections( Some(projection) => indices.into_mapped_indices(|idx| projection[idx]), None => indices.into_inner(), }; - return TableScan::try_new( - table_name, - source, - Some(projection), - filters, - fetch, - ) - .map(LogicalPlan::TableScan) - .map(Transformed::yes); + let mut new_scan = + TableScan::try_new(table_name, source, Some(projection), filters, fetch)?; + new_scan.fetch_order_sensitive = fetch_order_sensitive; + + return Ok(Transformed::yes(LogicalPlan::TableScan(new_scan))); } // Other node types are handled below _ => {} diff --git a/datafusion/optimizer/src/push_down_filter.rs b/datafusion/optimizer/src/push_down_filter.rs index 755ffdbafc869..cdb372791999d 100644 --- a/datafusion/optimizer/src/push_down_filter.rs +++ b/datafusion/optimizer/src/push_down_filter.rs @@ -2331,7 +2331,7 @@ mod tests { plan, @r" Projection: test.a, test1.d - Cross Join: + Cross Join: Projection: test.a, test.b, test.c TableScan: test, full_filters=[test.a = Int32(1)] Projection: test1.d, test1.e, test1.f @@ -2361,7 +2361,7 @@ mod tests { plan, @r" Projection: test.a, test1.a - Cross Join: + Cross Join: Projection: test.a, test.b, test.c TableScan: test, full_filters=[test.a = Int32(1)] Projection: test1.a, test1.b, test1.c @@ -3119,6 +3119,7 @@ mod tests { projection, source: Arc::new(test_provider), fetch: None, + fetch_order_sensitive: false, }); Ok(LogicalPlanBuilder::from(table_scan)) diff --git a/datafusion/optimizer/src/push_down_limit.rs b/datafusion/optimizer/src/push_down_limit.rs index 7b302adf22acc..4e0e357a289df 100644 --- a/datafusion/optimizer/src/push_down_limit.rs +++ b/datafusion/optimizer/src/push_down_limit.rs @@ -25,6 +25,7 @@ use crate::{OptimizerConfig, OptimizerRule}; use datafusion_common::Result; use datafusion_common::tree_node::Transformed; +use datafusion_common::tree_node::{Transformed, TreeNode}; use datafusion_common::utils::combine_limit; use datafusion_expr::logical_plan::{Join, JoinType, Limit, LogicalPlan}; use datafusion_expr::{FetchType, SkipType, lit}; @@ -124,6 +125,9 @@ impl OptimizerRule for PushDownLimit { })), LogicalPlan::Sort(mut sort) => { + let marked_input = + mark_fetch_order_sensitive(Arc::unwrap_or_clone(sort.input))?; + sort.input = Arc::new(marked_input); let new_fetch = { let sort_fetch = skip + fetch; Some(sort.fetch.map(|f| f.min(sort_fetch)).unwrap_or(sort_fetch)) @@ -268,6 +272,17 @@ fn push_down_join(mut join: Join, limit: usize) -> Transformed { Transformed::yes(join) } +fn mark_fetch_order_sensitive(plan: LogicalPlan) -> Result { + plan.transform_down(|node| match node { + LogicalPlan::TableScan(mut scan) => { + scan.fetch_order_sensitive = true; + 
Ok(Transformed::yes(LogicalPlan::TableScan(scan))) + } + _ => Ok(Transformed::no(node)), + }) + .map(|t| t.data) +} + #[cfg(test)] mod test { use std::cmp::Ordering; @@ -275,10 +290,11 @@ mod test { use std::vec; use super::*; - use crate::assert_optimized_plan_eq_snapshot; use crate::test::*; + use crate::{assert_optimized_plan_eq_snapshot, Optimizer}; use crate::OptimizerContext; + use datafusion_common::tree_node::TreeNodeRecursion; use datafusion_common::DFSchemaRef; use datafusion_expr::{ Expr, Extension, UserDefinedLogicalNodeCore, col, exists, @@ -1044,7 +1060,7 @@ mod test { plan, @r" Limit: skip=0, fetch=1000 - Cross Join: + Cross Join: Limit: skip=0, fetch=1000 TableScan: test, fetch=1000 Limit: skip=0, fetch=1000 @@ -1067,7 +1083,7 @@ mod test { plan, @r" Limit: skip=1000, fetch=1000 - Cross Join: + Cross Join: Limit: skip=0, fetch=2000 TableScan: test, fetch=2000 Limit: skip=0, fetch=2000 @@ -1131,4 +1147,39 @@ mod test { " ) } + + fn has_fetch_order_sensitive_scan(plan: &LogicalPlan) -> bool { + let mut found = false; + plan.apply(|node| { + if let LogicalPlan::TableScan(scan) = node { + if scan.fetch_order_sensitive { + found = true; + return Ok(TreeNodeRecursion::Stop); + } + } + Ok(TreeNodeRecursion::Continue) + }) + .expect("plan traversal"); + found + } + + #[test] + fn limit_push_down_sort_marks_scans_order_sensitive() -> Result<()> { + let table_scan = test_table_scan()?; + + let plan = LogicalPlanBuilder::from(table_scan) + .sort_by(vec![col("a")])? + .limit(0, Some(10))? + .build()?; + + let optimizer_ctx = OptimizerContext::new().with_max_passes(1); + let rules: Vec> = + vec![Arc::new(PushDownLimit::new())]; + let optimized_plan = + Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; + + assert!(has_fetch_order_sensitive_scan(&optimized_plan)); + + Ok(()) + } } diff --git a/datafusion/physical-expr-common/src/metrics/value.rs b/datafusion/physical-expr-common/src/metrics/value.rs index 0054813164bcc..4bd1eb59d9bb6 100644 --- a/datafusion/physical-expr-common/src/metrics/value.rs +++ b/datafusion/physical-expr-common/src/metrics/value.rs @@ -442,6 +442,11 @@ impl PruningMetrics { pub fn matched(&self) -> usize { self.matched.load(Ordering::Relaxed) } + + /// Number of items fully matched + pub fn fully_matched(&self) -> usize { + self.fully_matched.load(Ordering::Relaxed) + } } /// Counters tracking ratio metrics (e.g. 
matched vs total)
diff --git a/datafusion/proto/src/logical_plan/mod.rs b/datafusion/proto/src/logical_plan/mod.rs
index 218c2e4e47d04..3f3c9f51e2ce6 100644
--- a/datafusion/proto/src/logical_plan/mod.rs
+++ b/datafusion/proto/src/logical_plan/mod.rs
@@ -267,6 +267,7 @@ fn from_table_source(
         projected_schema,
         filters: vec![],
        fetch: None,
+        fetch_order_sensitive: false,
     });
 
     LogicalPlanNode::try_from_logical_plan(&r, extension_codec)

From 8170789c9a9a3d82c348a4d3a7f7eff8380c6756 Mon Sep 17 00:00:00 2001
From: "xudong.w"
Date: Tue, 25 Nov 2025 14:08:05 +0800
Subject: [PATCH 04/26] fix test format

---
 datafusion/expr/src/logical_plan/builder.rs  | 4 ++--
 datafusion/optimizer/src/push_down_filter.rs | 4 ++--
 datafusion/optimizer/src/push_down_limit.rs  | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs
index 27852fd4b9897..6f654428e41a1 100644
--- a/datafusion/expr/src/logical_plan/builder.rs
+++ b/datafusion/expr/src/logical_plan/builder.rs
@@ -2756,12 +2756,12 @@ mod tests {
         assert_snapshot!(plan, @r"
         Union
-          Cross Join: 
+          Cross Join:
             SubqueryAlias: left
               Values: (Int32(1))
             SubqueryAlias: right
               Values: (Int32(1))
-          Cross Join: 
+          Cross Join:
             SubqueryAlias: left
               Values: (Int32(1))
             SubqueryAlias: right
diff --git a/datafusion/optimizer/src/push_down_filter.rs b/datafusion/optimizer/src/push_down_filter.rs
index cdb372791999d..cc4d099777e9f 100644
--- a/datafusion/optimizer/src/push_down_filter.rs
+++ b/datafusion/optimizer/src/push_down_filter.rs
@@ -2331,7 +2331,7 @@ mod tests {
             plan,
             @r"
         Projection: test.a, test1.d
-          Cross Join: 
+          Cross Join:
             Projection: test.a, test.b, test.c
               TableScan: test, full_filters=[test.a = Int32(1)]
             Projection: test1.d, test1.e, test1.f
@@ -2361,7 +2361,7 @@ mod tests {
             plan,
             @r"
         Projection: test.a, test1.a
-          Cross Join: 
+          Cross Join:
             Projection: test.a, test.b, test.c
               TableScan: test, full_filters=[test.a = Int32(1)]
             Projection: test1.a, test1.b, test1.c
diff --git a/datafusion/optimizer/src/push_down_limit.rs b/datafusion/optimizer/src/push_down_limit.rs
index 4e0e357a289df..8accbd5bfaf5f 100644
--- a/datafusion/optimizer/src/push_down_limit.rs
+++ b/datafusion/optimizer/src/push_down_limit.rs
@@ -1060,7 +1060,7 @@ mod test {
             plan,
             @r"
         Limit: skip=0, fetch=1000
-          Cross Join: 
+          Cross Join:
             Limit: skip=0, fetch=1000
               TableScan: test, fetch=1000
             Limit: skip=0, fetch=1000
@@ -1083,7 +1083,7 @@ mod test {
             plan,
             @r"
         Limit: skip=1000, fetch=1000
-          Cross Join: 
+          Cross Join:
             Limit: skip=0, fetch=2000
               TableScan: test, fetch=2000
             Limit: skip=0, fetch=2000

From 187e10b5acc9282a1ad664b82e847c1ce6e6117f Mon Sep 17 00:00:00 2001
From: "xudong.w"
Date: Thu, 27 Nov 2025 17:36:02 +0800
Subject: [PATCH 05/26] Rename to preserve_order

---
 datafusion/catalog-listing/src/table.rs       |  2 +-
 datafusion/catalog/src/table.rs               | 22 ++++++-------------
 datafusion/core/src/physical_planner.rs       |  4 ++--
 datafusion/datasource-parquet/src/opener.rs   |  8 +++----
 datafusion/datasource-parquet/src/source.rs   |  2 +-
 datafusion/datasource/src/file_scan_config.rs | 16 +++++++-------
 datafusion/expr/src/logical_plan/plan.rs      | 21 +++++++++---------
 datafusion/expr/src/logical_plan/tree_node.rs |  4 ++--
 .../optimizer/src/optimize_projections/mod.rs |  4 ++--
 datafusion/optimizer/src/push_down_filter.rs  |  2 +-
 datafusion/optimizer/src/push_down_limit.rs   | 13 +++++------
 datafusion/proto/src/logical_plan/mod.rs      |  2 +-
 12 files changed, 45 insertions(+), 55 deletions(-)

a/datafusion/catalog-listing/src/table.rs b/datafusion/catalog-listing/src/table.rs index 039f276d9492b..be4a16a7bd1e5 100644 --- a/datafusion/catalog-listing/src/table.rs +++ b/datafusion/catalog-listing/src/table.rs @@ -581,7 +581,7 @@ impl TableProvider for ListingTable { .with_statistics(statistics) .with_projection_indices(projection)? .with_limit(limit) - .with_limit_order_sensitive(args.limit_order_sensitive()) + .with_preserve_order(args.preserve_order()) .with_output_ordering(output_ordering) .with_expr_adapter(self.expr_adapter_factory.clone()) .with_partitioned_by_file_group(partitioned_by_file_group) diff --git a/datafusion/catalog/src/table.rs b/datafusion/catalog/src/table.rs index 4f604482bb6c2..e5206b9358f8e 100644 --- a/datafusion/catalog/src/table.rs +++ b/datafusion/catalog/src/table.rs @@ -361,7 +361,7 @@ pub struct ScanArgs<'a> { filters: Option<&'a [Expr]>, projection: Option<&'a [usize]>, limit: Option, - limit_order_sensitive: bool, + preserve_order: bool, } impl<'a> ScanArgs<'a> { @@ -424,23 +424,15 @@ impl<'a> ScanArgs<'a> { self.limit } - /// Set whether the scan's limit should be order-sensitive. - /// - /// If specified, the scan should return the limited rows in a specific order. - /// Or we can leverage limit pruning to optimize the scan. - /// - /// # Arguments - /// * `order_sensitive` - Whether the scan's limit should be order-sensitive - pub fn with_limit_order_sensitive(mut self, order_sensitive: bool) -> Self { - self.limit_order_sensitive = order_sensitive; + /// Set whether the output rows should be kept in order + pub fn with_preserve_order(mut self, order_sensitive: bool) -> Self { + self.preserve_order = order_sensitive; self } - /// Get whether the scan's limit should be order-sensitive. - /// - /// Returns `true` if the scan's limit should be order-sensitive, or `false` if not. - pub fn limit_order_sensitive(&self) -> bool { - self.limit_order_sensitive + /// Get whether the output rows should be kept in order + pub fn preserve_order(&self) -> bool { + self.preserve_order } } diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 83e6bf1badbbb..fcc315be00f0f 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -460,7 +460,7 @@ impl DefaultPhysicalPlanner { projection, filters, fetch, - fetch_order_sensitive, + preserve_order, .. }) => { let source = source_as_provider(source)?; @@ -473,7 +473,7 @@ .with_projection(projection.as_deref()) .with_filters(Some(&filters_vec)) .with_limit(*fetch) - .with_limit_order_sensitive(*fetch_order_sensitive); + .with_preserve_order(*preserve_order); let res = source.scan_with_args(session_state, opts).await?; Arc::clone(res.plan()) } diff --git a/datafusion/datasource-parquet/src/opener.rs b/datafusion/datasource-parquet/src/opener.rs index 3947524684efa..891f349635c04 100644 --- a/datafusion/datasource-parquet/src/opener.rs +++ b/datafusion/datasource-parquet/src/opener.rs @@ -76,8 +76,8 @@ pub(super) struct ParquetOpener { pub batch_size: usize, /// Optional limit on the number of rows to read pub limit: Option, - /// limit order sensitivity - pub limit_order_sensitive: bool, + /// Whether the output rows should be kept in order + pub preserve_order: bool, /// Optional predicate to apply during the scan pub predicate: Option>, /// Table schema, including partition columns. 
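// NOTE (illustrative, a sketch of the intended semantics rather than text from
// this diff): for a plain `SELECT ... LIMIT n`, any n rows satisfying the
// predicate are a correct answer, so the scan may skip remaining row groups
// once enough fully matched rows are guaranteed; under `ORDER BY ... LIMIT n`
// every candidate row group must still be read so the TopK operator above can
// select the true top n rows. `preserve_order` records which case applies.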
@@ -279,7 +279,7 @@ impl FileOpener for ParquetOpener { let max_predicate_cache_size = self.max_predicate_cache_size; let reverse_row_groups = self.reverse_row_groups; - let limit_order_sensitive = self.limit_order_sensitive; + let preserve_order = self.preserve_order; Ok(Box::pin(async move { #[cfg(feature = "parquet_encryption")] @@ -550,7 +550,7 @@ impl FileOpener for ParquetOpener { } // Prune by limit if limit is set and limit order is not sensitive - if let (Some(limit), false) = (limit, limit_order_sensitive) { + if let (Some(limit), false) = (limit, preserve_order) { row_groups.prune_by_limit(limit, rg_metadata, &file_metrics); } diff --git a/datafusion/datasource-parquet/src/source.rs b/datafusion/datasource-parquet/src/source.rs index d36e0fa106c0b..07f58db185f49 100644 --- a/datafusion/datasource-parquet/src/source.rs +++ b/datafusion/datasource-parquet/src/source.rs @@ -548,7 +548,7 @@ impl FileSource for ParquetSource { .batch_size .expect("Batch size must set before creating ParquetOpener"), limit: base_config.limit, - limit_order_sensitive: base_config.limit_order_sensitive, + preserve_order: base_config.preserve_order, predicate: self.predicate.clone(), table_schema: self.table_schema.clone(), metadata_size_hint: self.metadata_size_hint, diff --git a/datafusion/datasource/src/file_scan_config.rs b/datafusion/datasource/src/file_scan_config.rs index 082f06829f14d..42067d2392831 100644 --- a/datafusion/datasource/src/file_scan_config.rs +++ b/datafusion/datasource/src/file_scan_config.rs @@ -153,7 +153,7 @@ pub struct FileScanConfig { /// all records after filtering are returned. pub limit: Option, /// Whether the scan's limit is order sensitive - pub limit_order_sensitive: bool, + pub preserve_order: bool, /// All equivalent lexicographical orderings that describe the schema. pub output_ordering: Vec, /// File compression type @@ -242,7 +242,7 @@ pub struct FileScanConfigBuilder { object_store_url: ObjectStoreUrl, file_source: Arc, limit: Option, - limit_order_sensitive: bool, + preserve_order: bool, projection_indices: Option>, constraints: Option, file_groups: Vec, @@ -273,7 +273,7 @@ impl FileScanConfigBuilder { output_ordering: vec![], file_compression_type: None, limit: None, - limit_order_sensitive: false, + preserve_order: false, projection_indices: None, constraints: None, batch_size: None, @@ -290,8 +290,8 @@ impl FileScanConfigBuilder { } /// Set whether the limit should be order-sensitive. 
- pub fn with_limit_order_sensitive(mut self, order_sensitive: bool) -> Self { - self.limit_order_sensitive = order_sensitive; + pub fn with_preserve_order(mut self, order_sensitive: bool) -> Self { + self.preserve_order = order_sensitive; self } @@ -462,7 +462,7 @@ impl FileScanConfigBuilder { object_store_url, file_source, limit, - limit_order_sensitive, + preserve_order, projection_indices, constraints, file_groups, @@ -485,7 +485,7 @@ impl FileScanConfigBuilder { object_store_url, file_source, limit, - limit_order_sensitive, + preserve_order, projection_exprs, constraints, file_groups, @@ -509,7 +509,7 @@ impl From for FileScanConfigBuilder { output_ordering: config.output_ordering, file_compression_type: Some(config.file_compression_type), limit: config.limit, - limit_order_sensitive: config.limit_order_sensitive, + preserve_order: config.preserve_order, projection_indices: config .projection_exprs .map(|p| p.ordered_column_indices()), diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index df7cbb7527a9f..9c7f365749663 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -2683,9 +2683,8 @@ pub struct TableScan { pub filters: Vec, /// Optional number of rows to read pub fetch: Option, - /// If the fetch is order-sensitive, it'll be true. - /// And the limit pruning will be enabled. - pub fetch_order_sensitive: bool, + /// Whether the output rows should be kept in order + pub preserve_order: bool, } impl Debug for TableScan { @@ -2708,7 +2707,7 @@ impl PartialEq for TableScan { && self.projected_schema == other.projected_schema && self.filters == other.filters && self.fetch == other.fetch - && self.fetch_order_sensitive == other.fetch_order_sensitive + && self.preserve_order == other.preserve_order } } @@ -2729,21 +2728,21 @@ impl PartialOrd for TableScan { /// Optional number of rows to read pub fetch: &'a Option, /// Whether the fetch is order-sensitive - pub fetch_order_sensitive: bool, + pub preserve_order: bool, } let comparable_self = ComparableTableScan { table_name: &self.table_name, projection: &self.projection, filters: &self.filters, fetch: &self.fetch, - fetch_order_sensitive: self.fetch_order_sensitive, + preserve_order: self.preserve_order, }; let comparable_other = ComparableTableScan { table_name: &other.table_name, projection: &other.projection, filters: &other.filters, fetch: &other.fetch, - fetch_order_sensitive: other.fetch_order_sensitive, + preserve_order: other.preserve_order, }; comparable_self .partial_cmp(&comparable_other) @@ -2759,7 +2758,7 @@ impl Hash for TableScan { self.projected_schema.hash(state); self.filters.hash(state); self.fetch.hash(state); - self.fetch_order_sensitive.hash(state); + self.preserve_order.hash(state); } } @@ -2813,7 +2812,7 @@ impl TableScan { projected_schema, filters, fetch, - fetch_order_sensitive: false, + preserve_order: false, }) } } @@ -4978,7 +4977,7 @@ mod tests { projected_schema: Arc::clone(&schema), filters: vec![], fetch: None, - fetch_order_sensitive: false, + preserve_order: false, })); let col = schema.field_names()[0].clone(); @@ -5009,7 +5008,7 @@ mod tests { projected_schema: Arc::clone(&unique_schema), filters: vec![], fetch: None, - fetch_order_sensitive: false, + preserve_order: false, })); let col = schema.field_names()[0].clone(); diff --git a/datafusion/expr/src/logical_plan/tree_node.rs b/datafusion/expr/src/logical_plan/tree_node.rs index c9ca99c20e08b..5cae151dd5852 100644 --- 
a/datafusion/expr/src/logical_plan/tree_node.rs +++ b/datafusion/expr/src/logical_plan/tree_node.rs @@ -599,7 +599,7 @@ impl LogicalPlan { projected_schema, filters, fetch, - fetch_order_sensitive, + preserve_order, }) => filters.map_elements(f)?.update_data(|filters| { LogicalPlan::TableScan(TableScan { table_name, @@ -608,7 +608,7 @@ impl LogicalPlan { projected_schema, filters, fetch, - fetch_order_sensitive, + preserve_order, }) }), LogicalPlan::Distinct(Distinct::On(DistinctOn { diff --git a/datafusion/optimizer/src/optimize_projections/mod.rs b/datafusion/optimizer/src/optimize_projections/mod.rs index c85793228ba05..1d7635f990e9d 100644 --- a/datafusion/optimizer/src/optimize_projections/mod.rs +++ b/datafusion/optimizer/src/optimize_projections/mod.rs @@ -259,7 +259,7 @@ fn optimize_projections( projection, filters, fetch, - fetch_order_sensitive, + preserve_order, projected_schema: _, } = table_scan; @@ -271,7 +271,7 @@ fn optimize_projections( }; let mut new_scan = TableScan::try_new(table_name, source, Some(projection), filters, fetch)?; - new_scan.fetch_order_sensitive = fetch_order_sensitive; + new_scan.preserve_order = preserve_order; return Ok(Transformed::yes(LogicalPlan::TableScan(new_scan))); } diff --git a/datafusion/optimizer/src/push_down_filter.rs b/datafusion/optimizer/src/push_down_filter.rs index cc4d099777e9f..c104184d68e1c 100644 --- a/datafusion/optimizer/src/push_down_filter.rs +++ b/datafusion/optimizer/src/push_down_filter.rs @@ -3119,7 +3119,7 @@ mod tests { projection, source: Arc::new(test_provider), fetch: None, - fetch_order_sensitive: false, + preserve_order: false, }); Ok(LogicalPlanBuilder::from(table_scan)) diff --git a/datafusion/optimizer/src/push_down_limit.rs b/datafusion/optimizer/src/push_down_limit.rs index 8accbd5bfaf5f..582f4db20d9e4 100644 --- a/datafusion/optimizer/src/push_down_limit.rs +++ b/datafusion/optimizer/src/push_down_limit.rs @@ -125,8 +125,7 @@ impl OptimizerRule for PushDownLimit { })), LogicalPlan::Sort(mut sort) => { - let marked_input = - mark_fetch_order_sensitive(Arc::unwrap_or_clone(sort.input))?; + let marked_input = mark_preserve_order(Arc::unwrap_or_clone(sort.input))?; sort.input = Arc::new(marked_input); let new_fetch = { let sort_fetch = skip + fetch; @@ -272,10 +271,10 @@ fn push_down_join(mut join: Join, limit: usize) -> Transformed { Transformed::yes(join) } -fn mark_fetch_order_sensitive(plan: LogicalPlan) -> Result { +fn mark_preserve_order(plan: LogicalPlan) -> Result { plan.transform_down(|node| match node { LogicalPlan::TableScan(mut scan) => { - scan.fetch_order_sensitive = true; + scan.preserve_order = true; Ok(Transformed::yes(LogicalPlan::TableScan(scan))) } _ => Ok(Transformed::no(node)), @@ -1148,11 +1147,11 @@ mod test { ) } - fn has_fetch_order_sensitive_scan(plan: &LogicalPlan) -> bool { + fn has_preserve_order_scan(plan: &LogicalPlan) -> bool { let mut found = false; plan.apply(|node| { if let LogicalPlan::TableScan(scan) = node { - if scan.fetch_order_sensitive { + if scan.preserve_order { found = true; return Ok(TreeNodeRecursion::Stop); } @@ -1178,7 +1177,7 @@ mod test { let optimized_plan = Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; - assert!(has_fetch_order_sensitive_scan(&optimized_plan)); + assert!(has_preserve_order_scan(&optimized_plan)); Ok(()) } diff --git a/datafusion/proto/src/logical_plan/mod.rs b/datafusion/proto/src/logical_plan/mod.rs index 3f3c9f51e2ce6..1af4db1094840 100644 --- a/datafusion/proto/src/logical_plan/mod.rs +++ 
b/datafusion/proto/src/logical_plan/mod.rs @@ -267,7 +267,7 @@ fn from_table_source( projected_schema, filters: vec![], fetch: None, - fetch_order_sensitive: false, + preserve_order: false, }); LogicalPlanNode::try_from_logical_plan(&r, extension_codec) From d6dc4b7eefc41d1a740eebd219d1fd6a9c78d420 Mon Sep 17 00:00:00 2001 From: "xudong.w" Date: Thu, 27 Nov 2025 21:10:40 +0800 Subject: [PATCH 06/26] refactor pushdown limit --- datafusion/optimizer/src/push_down_limit.rs | 374 +++++++++++++++++++- 1 file changed, 357 insertions(+), 17 deletions(-) diff --git a/datafusion/optimizer/src/push_down_limit.rs b/datafusion/optimizer/src/push_down_limit.rs index 582f4db20d9e4..f2ad2a89ce59c 100644 --- a/datafusion/optimizer/src/push_down_limit.rs +++ b/datafusion/optimizer/src/push_down_limit.rs @@ -18,6 +18,7 @@ //! [`PushDownLimit`] pushes `LIMIT` earlier in the query plan use std::cmp::min; +use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; use crate::optimizer::ApplyOrder; @@ -33,12 +34,17 @@ use datafusion_expr::{FetchType, SkipType, lit}; /// Optimization rule that tries to push down `LIMIT`. //. It will push down through projection, limits (taking the smaller limit) #[derive(Default, Debug)] -pub struct PushDownLimit {} +pub struct PushDownLimit { + /// Flag to track whether we're currently under a Sort node that requires order preservation + preserve_order: AtomicBool, +} impl PushDownLimit { #[expect(missing_docs)] pub fn new() -> Self { - Self {} + Self { + preserve_order: AtomicBool::new(false), + } } } @@ -54,6 +60,27 @@ impl OptimizerRule for PushDownLimit { config: &dyn OptimizerConfig, ) -> Result> { let _ = config.options(); + if let LogicalPlan::TableScan(mut scan) = plan { + if self.preserve_order.load(Ordering::Relaxed) && !scan.preserve_order { + scan.preserve_order = true; + return Ok(Transformed::yes(LogicalPlan::TableScan(scan))); + } + return Ok(Transformed::no(LogicalPlan::TableScan(scan))); + } + + if matches!( + plan, + LogicalPlan::Aggregate(_) + | LogicalPlan::Join(_) + | LogicalPlan::Union(_) + | LogicalPlan::Window(_) + | LogicalPlan::Distinct(_) + ) { + // These operations will break the order, so the downstream TableScan does not need to preserve order + self.preserve_order.store(false, Ordering::Relaxed); + return Ok(Transformed::no(plan)); + } + let LogicalPlan::Limit(mut limit) = plan else { return Ok(Transformed::no(plan)); }; @@ -125,8 +152,7 @@ impl OptimizerRule for PushDownLimit { })), LogicalPlan::Sort(mut sort) => { - let marked_input = mark_preserve_order(Arc::unwrap_or_clone(sort.input))?; - sort.input = Arc::new(marked_input); + self.preserve_order.store(true, Ordering::Relaxed); let new_fetch = { let sort_fetch = skip + fetch; Some(sort.fetch.map(|f| f.min(sort_fetch)).unwrap_or(sort_fetch)) @@ -271,17 +297,6 @@ fn push_down_join(mut join: Join, limit: usize) -> Transformed { Transformed::yes(join) } -fn mark_preserve_order(plan: LogicalPlan) -> Result { - plan.transform_down(|node| match node { - LogicalPlan::TableScan(mut scan) => { - scan.preserve_order = true; - Ok(Transformed::yes(LogicalPlan::TableScan(scan))) - } - _ => Ok(Transformed::no(node)), - }) - .map(|t| t.data) -} - #[cfg(test)] mod test { use std::cmp::Ordering; @@ -293,8 +308,9 @@ mod test { use crate::{assert_optimized_plan_eq_snapshot, Optimizer}; use crate::OptimizerContext; - use datafusion_common::tree_node::TreeNodeRecursion; + use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion}; use datafusion_common::DFSchemaRef; + use 
datafusion_expr::expr::WindowFunctionParams; use datafusion_expr::{ Expr, Extension, UserDefinedLogicalNodeCore, col, exists, logical_plan::builder::LogicalPlanBuilder, @@ -1163,7 +1179,42 @@ mod test { } #[test] - fn limit_push_down_sort_marks_scans_order_sensitive() -> Result<()> { + fn limit_push_down_sort_marks_scans_preserev_order() -> Result<()> { + let table_scan = test_table_scan()?; + + let plan = LogicalPlanBuilder::from(table_scan) + .sort_by(vec![col("a")])? + .limit(0, Some(10))? + .build()?; + + let optimizer_ctx = OptimizerContext::new().with_max_passes(1); + let rules: Vec> = + vec![Arc::new(PushDownLimit::new())]; + let optimized_plan = + Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; + + assert!(has_preserve_order_scan(&optimized_plan)); + + Ok(()) + } + + // Helper function to count how many TableScans have preserve_order = true + fn count_preserve_order_scans(plan: &LogicalPlan) -> usize { + let mut count = 0; + plan.apply(|node| { + if let LogicalPlan::TableScan(scan) = node { + if scan.preserve_order { + count += 1; + } + } + Ok(TreeNodeRecursion::Continue) + }) + .expect("plan traversal"); + count + } + + #[test] + fn limit_push_down_sort_marks_scans_preserve_order() -> Result<()> { let table_scan = test_table_scan()?; let plan = LogicalPlanBuilder::from(table_scan) @@ -1181,4 +1232,293 @@ mod test { Ok(()) } + + #[test] + fn limit_push_down_sort_with_projection_marks_scans() -> Result<()> { + let table_scan = test_table_scan()?; + + let plan = LogicalPlanBuilder::from(table_scan) + .project(vec![col("a"), col("b")])? + .sort_by(vec![col("a")])? + .limit(0, Some(10))? + .build()?; + + let optimizer_ctx = OptimizerContext::new().with_max_passes(1); + let rules: Vec> = + vec![Arc::new(PushDownLimit::new())]; + let optimized_plan = + Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; + + assert!( + has_preserve_order_scan(&optimized_plan), + "Projection preserves order, scan should be marked" + ); + + Ok(()) + } + + #[test] + fn limit_push_down_sort_with_filter_marks_scans() -> Result<()> { + let table_scan = test_table_scan()?; + + let plan = LogicalPlanBuilder::from(table_scan) + .filter(col("a").gt(lit(5)))? + .sort_by(vec![col("a")])? + .limit(0, Some(10))? + .build()?; + + let optimizer_ctx = OptimizerContext::new().with_max_passes(1); + let rules: Vec> = + vec![Arc::new(PushDownLimit::new())]; + let optimized_plan = + Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; + + assert!( + has_preserve_order_scan(&optimized_plan), + "Filter preserves order, scan should be marked" + ); + + Ok(()) + } + + #[test] + fn limit_push_down_sort_with_aggregate_does_not_mark_scans() -> Result<()> { + let table_scan = test_table_scan()?; + + let plan = LogicalPlanBuilder::from(table_scan) + .aggregate(vec![col("a")], vec![max(col("b"))])? + .sort_by(vec![col("a")])? + .limit(0, Some(10))? 
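+            // PushDownLimit walks the plan top-down: the Limit/Sort set its
+            // preserve_order flag first, but visiting the Aggregate resets it
+            // before the TableScan is reached, so the scan is never marked
+            // preserve_order and stays eligible for limit pruning.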
+ .build()?; + + let optimizer_ctx = OptimizerContext::new().with_max_passes(1); + let rules: Vec> = + vec![Arc::new(PushDownLimit::new())]; + let optimized_plan = + Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; + + assert!( + !has_preserve_order_scan(&optimized_plan), + "Aggregate breaks order, scan should NOT be marked" + ); + + Ok(()) + } + + #[test] + fn limit_push_down_sort_with_join_does_not_mark_scans() -> Result<()> { + let table_scan_1 = test_table_scan()?; + let table_scan_2 = test_table_scan_with_name("test2")?; + + let plan = LogicalPlanBuilder::from(table_scan_1) + .join( + LogicalPlanBuilder::from(table_scan_2).build()?, + JoinType::Inner, + (vec!["a"], vec!["a"]), + None, + )? + .sort_by(vec![col("test.a")])? + .limit(0, Some(10))? + .build()?; + + let optimizer_ctx = OptimizerContext::new().with_max_passes(1); + let rules: Vec> = + vec![Arc::new(PushDownLimit::new())]; + let optimized_plan = + Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; + + assert_eq!( + count_preserve_order_scans(&optimized_plan), + 0, + "Join breaks order, scans should NOT be marked" + ); + + Ok(()) + } + + #[test] + fn limit_push_down_sort_with_union_does_not_mark_scans() -> Result<()> { + let table_scan_1 = test_table_scan()?; + let table_scan_2 = test_table_scan_with_name("test2")?; + + let plan = LogicalPlanBuilder::from(table_scan_1) + .union(LogicalPlanBuilder::from(table_scan_2).build()?)? + .sort_by(vec![col("a")])? + .limit(0, Some(10))? + .build()?; + + let optimizer_ctx = OptimizerContext::new().with_max_passes(1); + let rules: Vec> = + vec![Arc::new(PushDownLimit::new())]; + let optimized_plan = + Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; + + assert_eq!( + count_preserve_order_scans(&optimized_plan), + 0, + "Union breaks order, scans should NOT be marked" + ); + + Ok(()) + } + + #[test] + fn limit_push_down_sort_with_window_does_not_mark_scans() -> Result<()> { + let table_scan = test_table_scan()?; + + let window_expr = + Expr::WindowFunction(Box::new(datafusion_expr::expr::WindowFunction { + fun: datafusion_expr::WindowFunctionDefinition::AggregateUDF( + datafusion_functions_aggregate::sum::sum_udaf(), + ), + params: WindowFunctionParams { + args: vec![col("b")], + partition_by: vec![col("a")], + order_by: vec![], + window_frame: datafusion_expr::WindowFrame::new(None), + null_treatment: None, + filter: None, + distinct: false, + }, + })); + + let plan = LogicalPlanBuilder::from(table_scan) + .window(vec![window_expr.alias("sum_b")])? + .sort_by(vec![col("a")])? + .limit(0, Some(10))? + .build()?; + + let optimizer_ctx = OptimizerContext::new().with_max_passes(1); + let rules: Vec> = + vec![Arc::new(PushDownLimit::new())]; + let optimized_plan = + Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; + + assert!( + !has_preserve_order_scan(&optimized_plan), + "Window function breaks order, scan should NOT be marked" + ); + + Ok(()) + } + + #[test] + fn limit_push_down_sort_with_distinct_does_not_mark_scans() -> Result<()> { + let table_scan = test_table_scan()?; + + let plan = LogicalPlanBuilder::from(table_scan) + .distinct()? + .sort_by(vec![col("a")])? + .limit(0, Some(10))? 
+ .build()?; + + let optimizer_ctx = OptimizerContext::new().with_max_passes(1); + let rules: Vec> = + vec![Arc::new(PushDownLimit::new())]; + let optimized_plan = + Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; + + assert!( + !has_preserve_order_scan(&optimized_plan), + "Distinct breaks order, scan should NOT be marked" + ); + + Ok(()) + } + + #[test] + fn limit_push_down_sort_through_multiple_order_preserving_ops() -> Result<()> { + let table_scan = test_table_scan()?; + + let plan = LogicalPlanBuilder::from(table_scan) + .project(vec![col("a"), col("b")])? + .filter(col("a").gt(lit(5)))? + .limit(0, Some(100))? + .sort_by(vec![col("a")])? + .limit(0, Some(10))? + .build()?; + + let optimizer_ctx = OptimizerContext::new().with_max_passes(1); + let rules: Vec> = + vec![Arc::new(PushDownLimit::new())]; + let optimized_plan = + Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; + + assert!( + has_preserve_order_scan(&optimized_plan), + "Multiple order-preserving ops, scan should be marked" + ); + + Ok(()) + } + + #[test] + fn limit_push_down_without_sort_does_not_mark_scans() -> Result<()> { + let table_scan = test_table_scan()?; + + let plan = LogicalPlanBuilder::from(table_scan) + .limit(0, Some(10))? + .build()?; + + let optimizer_ctx = OptimizerContext::new().with_max_passes(1); + let rules: Vec> = + vec![Arc::new(PushDownLimit::new())]; + let optimized_plan = + Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; + + assert!( + !has_preserve_order_scan(&optimized_plan), + "Limit without Sort should NOT mark scan" + ); + + Ok(()) + } + + #[test] + fn limit_push_down_sort_with_subquery_alias_marks_scans() -> Result<()> { + let table_scan = test_table_scan()?; + + let plan = LogicalPlanBuilder::from(table_scan) + .alias("subquery")? + .sort_by(vec![col("a")])? + .limit(0, Some(10))? + .build()?; + + let optimizer_ctx = OptimizerContext::new().with_max_passes(1); + let rules: Vec> = + vec![Arc::new(PushDownLimit::new())]; + let optimized_plan = + Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; + + assert!( + has_preserve_order_scan(&optimized_plan), + "SubqueryAlias preserves order, scan should be marked" + ); + + Ok(()) + } + + #[test] + fn limit_push_down_sort_complex_aggregate_case() -> Result<()> { + let table_scan = test_table_scan()?; + + let plan = LogicalPlanBuilder::from(table_scan) + .aggregate(vec![col("a")], vec![max(col("b")).alias("max_b")])? + .sort_by(vec![col("max_b")])? + .limit(0, Some(10))? 
+ .build()?; + + let optimizer_ctx = OptimizerContext::new().with_max_passes(1); + let rules: Vec> = + vec![Arc::new(PushDownLimit::new())]; + let optimized_plan = + Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; + + assert!( + !has_preserve_order_scan(&optimized_plan), + "Sort on aggregate result should NOT mark input scan" + ); + + Ok(()) + } } From 62e1725d1d4ba98087623623d87d1c4f9091366a Mon Sep 17 00:00:00 2001 From: "xudong.w" Date: Thu, 27 Nov 2025 21:21:58 +0800 Subject: [PATCH 07/26] extract some logic into identify_fully_matched_row_groups --- .../src/row_group_filter.rs | 115 +++++++++++------- 1 file changed, 74 insertions(+), 41 deletions(-) diff --git a/datafusion/datasource-parquet/src/row_group_filter.rs b/datafusion/datasource-parquet/src/row_group_filter.rs index 50979d3687771..1d5f1e99f1687 100644 --- a/datafusion/datasource-parquet/src/row_group_filter.rs +++ b/datafusion/datasource-parquet/src/row_group_filter.rs @@ -84,6 +84,9 @@ impl RowGroupAccessPlanFilter { } /// Prunes the access plan based on the limit and fully contained row groups. + /// See the [description](https://github.com/apache/datafusion/issues/18860#issuecomment-3563442093) + /// for how the pruning works and improves performance. + /// For more information, see the [paper](https://arxiv.org/pdf/2504.11540)'s "Pruning for LIMIT Queries" part pub fn prune_by_limit( &mut self, limit: usize, @@ -197,47 +200,15 @@ impl RowGroupAccessPlanFilter { } } - // Note: this part of code shouldn't be expensive with a limited number of row groups - // If we do find it's expensive, we can consider optimizing it further. - if !fully_contained_candidates_original_idx.is_empty() { - // Use NotExpr to create the inverted predicate - let inverted_expr = - Arc::new(NotExpr::new(Arc::clone(predicate.orig_expr()))); - // Simplify the NOT expression (e.g., NOT(c1 = 0) -> c1 != 0) - // before building the pruning predicate - let mut simplifier = PhysicalExprSimplifier::new(arrow_schema); - let inverted_expr = simplifier.simplify(inverted_expr).unwrap(); - if let Ok(inverted_predicate) = PruningPredicate::try_new( - inverted_expr, - Arc::clone(predicate.schema()), - ) { - let inverted_pruning_stats = RowGroupPruningStatistics { - parquet_schema, - row_group_metadatas: fully_contained_candidates_original_idx - .iter() - .map(|&i| &groups[i]) - .collect::>(), - arrow_schema, - }; - - if let Ok(inverted_values) = - inverted_predicate.prune(&inverted_pruning_stats) - { - for (i, &original_row_group_idx) in - fully_contained_candidates_original_idx.iter().enumerate() - { - // If the inverted predicate *also* prunes this row group (meaning inverted_values[i] is false), - // it implies that *all* rows in this group satisfy the original predicate. - if !inverted_values[i] { - self.is_fully_matched[original_row_group_idx] = true; - metrics - .row_groups_pruned_statistics - .add_fully_matched(1); - } - } - } - } - } + // Check if any of the matched row groups are fully contained by the predicate + self.identify_fully_matched_row_groups( + fully_contained_candidates_original_idx, + arrow_schema, + parquet_schema, + groups, + predicate, + metrics, + ); } // stats filter array could not be built, so we can't prune Err(e) => { @@ -247,6 +218,68 @@ impl RowGroupAccessPlanFilter { } } + /// Identifies row groups that are fully matched by the predicate. + /// + /// This optimization checks whether all rows in a row group satisfy the predicate + /// by inverting the predicate and checking if it prunes the row group. 
If the + /// inverted predicate prunes a row group, it means no rows match the inverted + /// predicate, which implies all rows match the original predicate. + /// + /// Note: This optimization is relatively inexpensive for a limited number of row groups. + fn identify_fully_matched_row_groups( + &mut self, + candidate_row_group_indices: Vec, + arrow_schema: &Schema, + parquet_schema: &SchemaDescriptor, + groups: &[RowGroupMetaData], + predicate: &PruningPredicate, + metrics: &ParquetFileMetrics, + ) { + if candidate_row_group_indices.is_empty() { + return; + } + + // Use NotExpr to create the inverted predicate + let inverted_expr = Arc::new(NotExpr::new(Arc::clone(predicate.orig_expr()))); + + // Simplify the NOT expression (e.g., NOT(c1 = 0) -> c1 != 0) + // before building the pruning predicate + let mut simplifier = PhysicalExprSimplifier::new(arrow_schema); + let Ok(inverted_expr) = simplifier.simplify(inverted_expr) else { + return; + }; + + let Ok(inverted_predicate) = + PruningPredicate::try_new(inverted_expr, Arc::clone(predicate.schema())) + else { + return; + }; + + let inverted_pruning_stats = RowGroupPruningStatistics { + parquet_schema, + row_group_metadatas: candidate_row_group_indices + .iter() + .map(|&i| &groups[i]) + .collect::>(), + arrow_schema, + }; + + let Ok(inverted_values) = inverted_predicate.prune(&inverted_pruning_stats) + else { + return; + }; + + for (i, &original_row_group_idx) in candidate_row_group_indices.iter().enumerate() + { + // If the inverted predicate *also* prunes this row group (meaning inverted_values[i] is false), + // it implies that *all* rows in this group satisfy the original predicate. + if !inverted_values[i] { + self.is_fully_matched[original_row_group_idx] = true; + metrics.row_groups_pruned_statistics.add_fully_matched(1); + } + } + } + /// Prune remaining row groups using available bloom filters and the /// [`PruningPredicate`]. 
/// From 0229bd53688332c6806c4dbb59ab9bee0397e0de Mon Sep 17 00:00:00 2001 From: "xudong.w" Date: Thu, 27 Nov 2025 21:31:36 +0800 Subject: [PATCH 08/26] resolve conflicts --- datafusion/datasource/src/file_scan_config.rs | 7 ------- 1 file changed, 7 deletions(-) diff --git a/datafusion/datasource/src/file_scan_config.rs b/datafusion/datasource/src/file_scan_config.rs index 42067d2392831..50cdc5c78f804 100644 --- a/datafusion/datasource/src/file_scan_config.rs +++ b/datafusion/datasource/src/file_scan_config.rs @@ -243,7 +243,6 @@ pub struct FileScanConfigBuilder { file_source: Arc, limit: Option, preserve_order: bool, - projection_indices: Option>, constraints: Option, file_groups: Vec, statistics: Option, @@ -274,7 +273,6 @@ impl FileScanConfigBuilder { file_compression_type: None, limit: None, preserve_order: false, - projection_indices: None, constraints: None, batch_size: None, expr_adapter_factory: None, @@ -463,7 +461,6 @@ impl FileScanConfigBuilder { file_source, limit, preserve_order, - projection_indices, constraints, file_groups, statistics, @@ -486,7 +483,6 @@ impl FileScanConfigBuilder { file_source, limit, preserve_order, - projection_exprs, constraints, file_groups, output_ordering, @@ -510,9 +506,6 @@ impl From for FileScanConfigBuilder { file_compression_type: Some(config.file_compression_type), limit: config.limit, preserve_order: config.preserve_order, - projection_indices: config - .projection_exprs - .map(|p| p.ordered_column_indices()), constraints: Some(config.constraints), batch_size: config.batch_size, expr_adapter_factory: config.expr_adapter_factory, From 330775fcbdcd32a0fc6db3f46ab5698f9d85bc5c Mon Sep 17 00:00:00 2001 From: "xudong.w" Date: Fri, 28 Nov 2025 09:55:45 +0800 Subject: [PATCH 09/26] Add end to end sqllogictest --- .../src/row_group_filter.rs | 4 +- .../physical-expr-common/src/metrics/value.rs | 26 ++++-- .../sqllogictest/test_files/limit_pruning.slt | 77 ++++++++++++++++++ test_files/scratch/limit_pruning/data.parquet | Bin 0 -> 2320 bytes 4 files changed, 99 insertions(+), 8 deletions(-) create mode 100644 datafusion/sqllogictest/test_files/limit_pruning.slt create mode 100644 test_files/scratch/limit_pruning/data.parquet diff --git a/datafusion/datasource-parquet/src/row_group_filter.rs b/datafusion/datasource-parquet/src/row_group_filter.rs index 1d5f1e99f1687..6674d442a94d3 100644 --- a/datafusion/datasource-parquet/src/row_group_filter.rs +++ b/datafusion/datasource-parquet/src/row_group_filter.rs @@ -202,7 +202,7 @@ impl RowGroupAccessPlanFilter { // Check if any of the matched row groups are fully contained by the predicate self.identify_fully_matched_row_groups( - fully_contained_candidates_original_idx, + &fully_contained_candidates_original_idx, arrow_schema, parquet_schema, groups, @@ -228,7 +228,7 @@ impl RowGroupAccessPlanFilter { /// Note: This optimization is relatively inexpensive for a limited number of row groups. 
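/// For example (an illustrative sketch, not text from this patch): with the
/// predicate `a > 5` and a row group whose statistics report `min(a) = 10`
/// and `max(a) = 20`, the inverted predicate `a <= 5` is pruned by those
/// same statistics, since no value in `[10, 20]` can satisfy it; that proves
/// every row in the group matches `a > 5`, so the group is marked fully matched.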
fn identify_fully_matched_row_groups( &mut self, - candidate_row_group_indices: Vec, + candidate_row_group_indices: &[usize], arrow_schema: &Schema, parquet_schema: &SchemaDescriptor, groups: &[RowGroupMetaData], diff --git a/datafusion/physical-expr-common/src/metrics/value.rs b/datafusion/physical-expr-common/src/metrics/value.rs index 4bd1eb59d9bb6..5ecaa86fc386c 100644 --- a/datafusion/physical-expr-common/src/metrics/value.rs +++ b/datafusion/physical-expr-common/src/metrics/value.rs @@ -379,13 +379,24 @@ impl Display for PruningMetrics { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { let matched = self.matched.load(Ordering::Relaxed); let total = self.pruned.load(Ordering::Relaxed) + matched; + let fully_matched = self.fully_matched.load(Ordering::Relaxed); - write!( - f, - "{} total → {} matched", - human_readable_count(total), - human_readable_count(matched) - ) + if fully_matched != 0 { + write!( + f, + "{} total → {} matched -> {} fully matched", + human_readable_count(total), + human_readable_count(matched), + human_readable_count(fully_matched) + ) + } else { + write!( + f, + "{} total → {} matched", + human_readable_count(total), + human_readable_count(matched) + ) + } } } @@ -920,8 +931,11 @@ impl MetricValue { ) => { let pruned = other_pruning_metrics.pruned.load(Ordering::Relaxed); let matched = other_pruning_metrics.matched.load(Ordering::Relaxed); + let fully_matched = + other_pruning_metrics.fully_matched.load(Ordering::Relaxed); pruning_metrics.add_pruned(pruned); pruning_metrics.add_matched(matched); + pruning_metrics.add_fully_matched(fully_matched); } ( Self::Ratio { ratio_metrics, .. }, diff --git a/datafusion/sqllogictest/test_files/limit_pruning.slt b/datafusion/sqllogictest/test_files/limit_pruning.slt new file mode 100644 index 0000000000000..cc8a17e5b78b7 --- /dev/null +++ b/datafusion/sqllogictest/test_files/limit_pruning.slt @@ -0,0 +1,77 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +statement ok +set datafusion.execution.parquet.pushdown_filters = true; + + +statement ok +CREATE TABLE t AS VALUES + ('Anow Vole', 7), + ('Brown Bear', 133), + ('Gray Wolf', 82), + ('Lynx', 71), + ('Red Fox', 40), + ('Alpine Bat', 6), + ('Nlpine Ibex', 101), + ('Nlpine Goat', 76), + ('Nlpine Sheep', 83), + ('Europ. 
Mole', 4), + ('Polecat', 16), + ('Alpine Ibex', 97); + +statement ok +COPY (SELECT column1 as a, column2 as b FROM t) +TO 'test_files/scratch/limit_pruning/data.parquet' +STORED AS PARQUET +OPTIONS ( + 'format.max_row_group_size' '3' +); + +statement ok +drop table t; + +statement ok +CREATE EXTERNAL TABLE t +STORED AS PARQUET +LOCATION 'test_files/scratch/limit_pruning/data.parquet'; + + +statement ok +set datafusion.explain.analyze_level = summary; + +# row_groups_pruned_statistics=4 total → 3 matched -> 1 fully matched +# limit_pruned_row_groups=2 total → 0 matched +query TT +explain analyze select * from t where a > 'M' AND b >= 50 limit 3; +---- +Plan with Metrics DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/limit_pruning/data.parquet]]}, projection=[a, b], limit=3, file_type=parquet, predicate=a@0 > M AND b@1 >= 50, pruning_predicate=a_null_count@1 != row_count@2 AND a_max@0 > M AND b_null_count@4 != row_count@2 AND b_max@3 >= 50, required_guarantees=[], metrics=[output_rows=3, elapsed_compute=, output_bytes=, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=4 total → 3 matched -> 1 fully matched, row_groups_pruned_bloom_filter=3 total → 3 matched, page_index_rows_pruned=3 total → 3 matched, limit_pruned_row_groups=2 total → 0 matched, bytes_scanned=, metadata_load_time=, scan_efficiency_ratio=] + +# limit_pruned_row_groups=0 total → 0 matched +# because of order by, scan needs to preserve sort, so limit pruning is disabled +query TT +explain analyze select * from t where a > 'M' AND b >= 50 order by a limit 3; +---- +Plan with Metrics +01)SortExec: TopK(fetch=3), expr=[a@0 ASC NULLS LAST], preserve_partitioning=[false], filter=[a@0 < Nlpine Sheep], metrics=[output_rows=3, elapsed_compute=, output_bytes=] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/limit_pruning/data.parquet]]}, projection=[a, b], file_type=parquet, predicate=a@0 > M AND b@1 >= 50 AND DynamicFilter [ a@0 < Nlpine Sheep ], pruning_predicate=a_null_count@1 != row_count@2 AND a_max@0 > M AND b_null_count@4 != row_count@2 AND b_max@3 >= 50 AND a_null_count@1 != row_count@2 AND a_min@5 < Nlpine Sheep, required_guarantees=[], metrics=[output_rows=3, elapsed_compute=, output_bytes=, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=4 total → 3 matched -> 1 fully matched, row_groups_pruned_bloom_filter=3 total → 3 matched, page_index_rows_pruned=9 total → 9 matched, limit_pruned_row_groups=0 total → 0 matched, bytes_scanned=, metadata_load_time=, scan_efficiency_ratio=] + +statement ok +drop table t; + +statement ok +reset datafusion.explain.analyze_level; diff --git a/test_files/scratch/limit_pruning/data.parquet b/test_files/scratch/limit_pruning/data.parquet new file mode 100644 index 0000000000000000000000000000000000000000..535026d53c5553427b4ac2bbbcd277b2393460b5 GIT binary patch literal 2320 zcmbtWJ!lj`6n?W=_kOZ5r(}j*xZ(<3qJ;buN<6ff_2&}3Xzmn$KrnmB`h)QzBk_t8I{EW zuJch|%5#ImW`OP&eLvu&WRL&=S+2}vj#noABre=)Wu}~QeXmNcT-CdlIZ>HB%Yk=s zrw2bz;hzyA6aX{~Gnm%41u#WF&$gy`1TYqZTTsL(8XAm!U}NkL;aq;K6)wR*N0j`v z@@x!k(Vxg1sLWE`a&qcI+0VG%^dDkJmYRcZKytzOyHZcDF13TfQab`2t`#<1@@I1u zZ`wS1-uI^v`(o)$_J2zCArD9YsnjB$Xzl7S1FCfYm1}utT@9932ngn{1J=}EEJ`(gH4X`%2I7QGYzj$>+l&thbc;c- z(_&r=Ry191Q;zMlTG8u{=Z13ZBgeZ~-7%L?jx0GAk~Ifr+dX7@ReT}NTpmB~U-GyP z+}x^J%-ah2G}gv-T9rUpcHZuBAk9sC2N)?EL%1mn@sNo{=G0*5V#(KmjdvcOgv!Dyym=R>4pHTx+(;8*JaVz 
zjmo!+Bn9GQCl_L=ogV<~r!btj_@s4Vi=&v`W#T1smfM`K8c_?SWi<&dDwEi*n0+T^ zcT7ICKC>0wnb(n7s7b3XRARPvH!7cEb~lC0-kZei9x?l>iT7HU7ESGGK<&BVyxu_6 zTGVbe--M`yVxJ=TB!vX;ckpBD6MPqQRtzK<-V-YxULGR2b~h@YBDhEdzjYA7XGHMz z1|q0O6P#yai8*r#XD*Oscmu8Wri&SsOJo(dW#YE5@prOZNID-9$Su6PRy@4W#BJ?v zR6fP+J8`?wN!%V2x4F3ZtaafVn6^w=EFL+LtzA#+3+tn?vmkKj_NrJT9y aR|C6aptrxb-|oSuiC(8y*zr;PU+5RUYIsNh literal 0 HcmV?d00001 From e50361c549988c87f35d38f6073cd0a0979c6879 Mon Sep 17 00:00:00 2001 From: "xudong.w" Date: Thu, 18 Dec 2025 14:43:51 +0800 Subject: [PATCH 10/26] resolve conflicts --- datafusion/core/tests/parquet/mod.rs | 11 +++++------ datafusion/core/tests/parquet/row_group_pruning.rs | 4 ++-- datafusion/datasource-parquet/src/row_group_filter.rs | 4 ++-- datafusion/optimizer/src/push_down_limit.rs | 11 +++++------ 4 files changed, 14 insertions(+), 16 deletions(-) diff --git a/datafusion/core/tests/parquet/mod.rs b/datafusion/core/tests/parquet/mod.rs index 4d0209267514b..9234ff62e1688 100644 --- a/datafusion/core/tests/parquet/mod.rs +++ b/datafusion/core/tests/parquet/mod.rs @@ -153,13 +153,12 @@ impl TestOutput { && let MetricValue::PruningMetrics { pruning_metrics, .. } = metric.value() - { - total_pruned += pruning_metrics.pruned(); - total_matched += pruning_metrics.matched(); - total_fully_matched += pruning_metrics.fully_matched(); + { + total_pruned += pruning_metrics.pruned(); + total_matched += pruning_metrics.matched(); + total_fully_matched += pruning_metrics.fully_matched(); - found = true; - } + found = true; } } diff --git a/datafusion/core/tests/parquet/row_group_pruning.rs b/datafusion/core/tests/parquet/row_group_pruning.rs index f2e2561945140..744d90d22d110 100644 --- a/datafusion/core/tests/parquet/row_group_pruning.rs +++ b/datafusion/core/tests/parquet/row_group_pruning.rs @@ -1840,8 +1840,8 @@ async fn test_limit_pruning_complex_filter() -> datafusion_common::error::Result } #[tokio::test] -async fn test_limit_pruning_multiple_fully_matched( -) -> datafusion_common::error::Result<()> { +async fn test_limit_pruning_multiple_fully_matched() +-> datafusion_common::error::Result<()> { // Test Case 2: Limit requires multiple fully matched row groups // Row Group 0: a=[5,5,5,5] -> Fully matched, 4 rows // Row Group 1: a=[5,5,5,5] -> Fully matched, 4 rows diff --git a/datafusion/datasource-parquet/src/row_group_filter.rs b/datafusion/datasource-parquet/src/row_group_filter.rs index 6674d442a94d3..4d0d97531bcc1 100644 --- a/datafusion/datasource-parquet/src/row_group_filter.rs +++ b/datafusion/datasource-parquet/src/row_group_filter.rs @@ -24,8 +24,8 @@ use arrow::datatypes::Schema; use datafusion_common::pruning::PruningStatistics; use datafusion_common::{Column, Result, ScalarValue}; use datafusion_datasource::FileRange; -use datafusion_physical_expr::expressions::NotExpr; use datafusion_physical_expr::PhysicalExprSimplifier; +use datafusion_physical_expr::expressions::NotExpr; use datafusion_pruning::PruningPredicate; use parquet::arrow::arrow_reader::statistics::StatisticsConverter; use parquet::arrow::parquet_column; @@ -244,7 +244,7 @@ impl RowGroupAccessPlanFilter { // Simplify the NOT expression (e.g., NOT(c1 = 0) -> c1 != 0) // before building the pruning predicate - let mut simplifier = PhysicalExprSimplifier::new(arrow_schema); + let simplifier = PhysicalExprSimplifier::new(arrow_schema); let Ok(inverted_expr) = simplifier.simplify(inverted_expr) else { return; }; diff --git a/datafusion/optimizer/src/push_down_limit.rs b/datafusion/optimizer/src/push_down_limit.rs index 
f2ad2a89ce59c..1838ecbd578aa 100644 --- a/datafusion/optimizer/src/push_down_limit.rs +++ b/datafusion/optimizer/src/push_down_limit.rs @@ -18,15 +18,14 @@ //! [`PushDownLimit`] pushes `LIMIT` earlier in the query plan use std::cmp::min; -use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; use crate::optimizer::ApplyOrder; use crate::{OptimizerConfig, OptimizerRule}; use datafusion_common::Result; use datafusion_common::tree_node::Transformed; -use datafusion_common::tree_node::{Transformed, TreeNode}; use datafusion_common::utils::combine_limit; use datafusion_expr::logical_plan::{Join, JoinType, Limit, LogicalPlan}; use datafusion_expr::{FetchType, SkipType, lit}; @@ -305,11 +304,11 @@ mod test { use super::*; use crate::test::*; - use crate::{assert_optimized_plan_eq_snapshot, Optimizer}; + use crate::{Optimizer, assert_optimized_plan_eq_snapshot}; use crate::OptimizerContext; - use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion}; use datafusion_common::DFSchemaRef; + use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion}; use datafusion_expr::expr::WindowFunctionParams; use datafusion_expr::{ Expr, Extension, UserDefinedLogicalNodeCore, col, exists, @@ -1075,7 +1074,7 @@ mod test { plan, @r" Limit: skip=0, fetch=1000 - Cross Join: + Cross Join: Limit: skip=0, fetch=1000 TableScan: test, fetch=1000 Limit: skip=0, fetch=1000 @@ -1098,7 +1097,7 @@ mod test { plan, @r" Limit: skip=1000, fetch=1000 - Cross Join: + Cross Join: Limit: skip=0, fetch=2000 TableScan: test, fetch=2000 Limit: skip=0, fetch=2000 From 321429cd91caa6f5eb4cfc0c2c91a4f257da5c35 Mon Sep 17 00:00:00 2001 From: "xudong.w" Date: Thu, 18 Dec 2025 18:56:04 +0800 Subject: [PATCH 11/26] redesign --- Cargo.lock | 1 + datafusion/catalog-listing/src/table.rs | 1 - datafusion/catalog/src/table.rs | 12 - datafusion/core/src/physical_planner.rs | 4 +- datafusion/expr/src/logical_plan/plan.rs | 11 - datafusion/expr/src/logical_plan/tree_node.rs | 2 - .../optimizer/src/optimize_projections/mod.rs | 4 +- datafusion/optimizer/src/push_down_filter.rs | 1 - datafusion/optimizer/src/push_down_limit.rs | 399 +----------------- datafusion/physical-optimizer/Cargo.toml | 1 + .../src/enforce_sorting/mod.rs | 9 +- .../physical-optimizer/src/limit_pushdown.rs | 64 ++- datafusion/physical-plan/src/limit.rs | 32 ++ datafusion/proto/src/logical_plan/mod.rs | 1 - 14 files changed, 109 insertions(+), 433 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2d40ab4506900..f5e01ea1e10e8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2427,6 +2427,7 @@ version = "52.0.0" dependencies = [ "arrow", "datafusion-common", + "datafusion-datasource", "datafusion-execution", "datafusion-expr", "datafusion-expr-common", diff --git a/datafusion/catalog-listing/src/table.rs b/datafusion/catalog-listing/src/table.rs index be4a16a7bd1e5..38456944075fc 100644 --- a/datafusion/catalog-listing/src/table.rs +++ b/datafusion/catalog-listing/src/table.rs @@ -581,7 +581,6 @@ impl TableProvider for ListingTable { .with_statistics(statistics) .with_projection_indices(projection)? 
.with_limit(limit) - .with_preserve_order(args.preserve_order()) .with_output_ordering(output_ordering) .with_expr_adapter(self.expr_adapter_factory.clone()) .with_partitioned_by_file_group(partitioned_by_file_group) diff --git a/datafusion/catalog/src/table.rs b/datafusion/catalog/src/table.rs index e5206b9358f8e..1f223852c2b9d 100644 --- a/datafusion/catalog/src/table.rs +++ b/datafusion/catalog/src/table.rs @@ -361,7 +361,6 @@ pub struct ScanArgs<'a> { filters: Option<&'a [Expr]>, projection: Option<&'a [usize]>, limit: Option, - preserve_order: bool, } impl<'a> ScanArgs<'a> { @@ -423,17 +422,6 @@ impl<'a> ScanArgs<'a> { pub fn limit(&self) -> Option { self.limit } - - /// Set whether the output rows should be kept in order - pub fn with_preserve_order(mut self, order_sensitive: bool) -> Self { - self.preserve_order = order_sensitive; - self - } - - /// Get whether the output rows should be kept in order - pub fn preserve_order(&self) -> bool { - self.preserve_order - } } /// Result of a table scan operation from [`TableProvider::scan_with_args`]. diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index fcc315be00f0f..cc7d534776d7e 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -460,7 +460,6 @@ impl DefaultPhysicalPlanner { projection, filters, fetch, - preserve_order, .. }) => { let source = source_as_provider(source)?; @@ -472,8 +471,7 @@ impl DefaultPhysicalPlanner { let opts = ScanArgs::default() .with_projection(projection.as_deref()) .with_filters(Some(&filters_vec)) - .with_limit(*fetch) - .with_preserve_order(*preserve_order); + .with_limit(*fetch); let res = source.scan_with_args(session_state, opts).await?; Arc::clone(res.plan()) } diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 9c7f365749663..4219c24bfc9c9 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -2683,8 +2683,6 @@ pub struct TableScan { pub filters: Vec, /// Optional number of rows to read pub fetch: Option, - /// Whether the output rows should be kept in order - pub preserve_order: bool, } impl Debug for TableScan { @@ -2707,7 +2705,6 @@ impl PartialEq for TableScan { && self.projected_schema == other.projected_schema && self.filters == other.filters && self.fetch == other.fetch - && self.preserve_order == other.preserve_order } } @@ -2727,22 +2724,18 @@ impl PartialOrd for TableScan { pub filters: &'a Vec, /// Optional number of rows to read pub fetch: &'a Option, - /// Whether the fetch is order-sensitive - pub preserve_order: bool, } let comparable_self = ComparableTableScan { table_name: &self.table_name, projection: &self.projection, filters: &self.filters, fetch: &self.fetch, - preserve_order: self.preserve_order, }; let comparable_other = ComparableTableScan { table_name: &other.table_name, projection: &other.projection, filters: &other.filters, fetch: &other.fetch, - preserve_order: other.preserve_order, }; comparable_self .partial_cmp(&comparable_other) @@ -2758,7 +2751,6 @@ impl Hash for TableScan { self.projected_schema.hash(state); self.filters.hash(state); self.fetch.hash(state); - self.preserve_order.hash(state); } } @@ -2812,7 +2804,6 @@ impl TableScan { projected_schema, filters, fetch, - preserve_order: false, }) } } @@ -4977,7 +4968,6 @@ mod tests { projected_schema: Arc::clone(&schema), filters: vec![], fetch: None, - preserve_order: false, })); let col = schema.field_names()[0].clone(); @@ -5008,7 +4998,6 
@@ mod tests { projected_schema: Arc::clone(&unique_schema), filters: vec![], fetch: None, - preserve_order: false, })); let col = schema.field_names()[0].clone(); diff --git a/datafusion/expr/src/logical_plan/tree_node.rs b/datafusion/expr/src/logical_plan/tree_node.rs index 5cae151dd5852..62a27b0a025ad 100644 --- a/datafusion/expr/src/logical_plan/tree_node.rs +++ b/datafusion/expr/src/logical_plan/tree_node.rs @@ -599,7 +599,6 @@ impl LogicalPlan { projected_schema, filters, fetch, - preserve_order, }) => filters.map_elements(f)?.update_data(|filters| { LogicalPlan::TableScan(TableScan { table_name, @@ -608,7 +607,6 @@ impl LogicalPlan { projected_schema, filters, fetch, - preserve_order, }) }), LogicalPlan::Distinct(Distinct::On(DistinctOn { diff --git a/datafusion/optimizer/src/optimize_projections/mod.rs b/datafusion/optimizer/src/optimize_projections/mod.rs index 1d7635f990e9d..f97b05ea68fbd 100644 --- a/datafusion/optimizer/src/optimize_projections/mod.rs +++ b/datafusion/optimizer/src/optimize_projections/mod.rs @@ -259,7 +259,6 @@ fn optimize_projections( projection, filters, fetch, - preserve_order, projected_schema: _, } = table_scan; @@ -269,9 +268,8 @@ fn optimize_projections( Some(projection) => indices.into_mapped_indices(|idx| projection[idx]), None => indices.into_inner(), }; - let mut new_scan = + let new_scan = TableScan::try_new(table_name, source, Some(projection), filters, fetch)?; - new_scan.preserve_order = preserve_order; return Ok(Transformed::yes(LogicalPlan::TableScan(new_scan))); } diff --git a/datafusion/optimizer/src/push_down_filter.rs b/datafusion/optimizer/src/push_down_filter.rs index c104184d68e1c..755ffdbafc869 100644 --- a/datafusion/optimizer/src/push_down_filter.rs +++ b/datafusion/optimizer/src/push_down_filter.rs @@ -3119,7 +3119,6 @@ mod tests { projection, source: Arc::new(test_provider), fetch: None, - preserve_order: false, }); Ok(LogicalPlanBuilder::from(table_scan)) diff --git a/datafusion/optimizer/src/push_down_limit.rs b/datafusion/optimizer/src/push_down_limit.rs index 1838ecbd578aa..7b302adf22acc 100644 --- a/datafusion/optimizer/src/push_down_limit.rs +++ b/datafusion/optimizer/src/push_down_limit.rs @@ -19,7 +19,6 @@ use std::cmp::min; use std::sync::Arc; -use std::sync::atomic::{AtomicBool, Ordering}; use crate::optimizer::ApplyOrder; use crate::{OptimizerConfig, OptimizerRule}; @@ -33,17 +32,12 @@ use datafusion_expr::{FetchType, SkipType, lit}; /// Optimization rule that tries to push down `LIMIT`. //. 
It will push down through projection, limits (taking the smaller limit) #[derive(Default, Debug)] -pub struct PushDownLimit { - /// Flag to track whether we're currently under a Sort node that requires order preservation - preserve_order: AtomicBool, -} +pub struct PushDownLimit {} impl PushDownLimit { #[expect(missing_docs)] pub fn new() -> Self { - Self { - preserve_order: AtomicBool::new(false), - } + Self {} } } @@ -59,27 +53,6 @@ impl OptimizerRule for PushDownLimit { config: &dyn OptimizerConfig, ) -> Result> { let _ = config.options(); - if let LogicalPlan::TableScan(mut scan) = plan { - if self.preserve_order.load(Ordering::Relaxed) && !scan.preserve_order { - scan.preserve_order = true; - return Ok(Transformed::yes(LogicalPlan::TableScan(scan))); - } - return Ok(Transformed::no(LogicalPlan::TableScan(scan))); - } - - if matches!( - plan, - LogicalPlan::Aggregate(_) - | LogicalPlan::Join(_) - | LogicalPlan::Union(_) - | LogicalPlan::Window(_) - | LogicalPlan::Distinct(_) - ) { - // These operations will break the order, so the downstream TableScan does not need to preserve order - self.preserve_order.store(false, Ordering::Relaxed); - return Ok(Transformed::no(plan)); - } - let LogicalPlan::Limit(mut limit) = plan else { return Ok(Transformed::no(plan)); }; @@ -151,7 +124,6 @@ impl OptimizerRule for PushDownLimit { })), LogicalPlan::Sort(mut sort) => { - self.preserve_order.store(true, Ordering::Relaxed); let new_fetch = { let sort_fetch = skip + fetch; Some(sort.fetch.map(|f| f.min(sort_fetch)).unwrap_or(sort_fetch)) @@ -303,13 +275,11 @@ mod test { use std::vec; use super::*; + use crate::assert_optimized_plan_eq_snapshot; use crate::test::*; - use crate::{Optimizer, assert_optimized_plan_eq_snapshot}; use crate::OptimizerContext; use datafusion_common::DFSchemaRef; - use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion}; - use datafusion_expr::expr::WindowFunctionParams; use datafusion_expr::{ Expr, Extension, UserDefinedLogicalNodeCore, col, exists, logical_plan::builder::LogicalPlanBuilder, @@ -1074,7 +1044,7 @@ mod test { plan, @r" Limit: skip=0, fetch=1000 - Cross Join: + Cross Join: Limit: skip=0, fetch=1000 TableScan: test, fetch=1000 Limit: skip=0, fetch=1000 @@ -1097,7 +1067,7 @@ mod test { plan, @r" Limit: skip=1000, fetch=1000 - Cross Join: + Cross Join: Limit: skip=0, fetch=2000 TableScan: test, fetch=2000 Limit: skip=0, fetch=2000 @@ -1161,363 +1131,4 @@ mod test { " ) } - - fn has_preserve_order_scan(plan: &LogicalPlan) -> bool { - let mut found = false; - plan.apply(|node| { - if let LogicalPlan::TableScan(scan) = node { - if scan.preserve_order { - found = true; - return Ok(TreeNodeRecursion::Stop); - } - } - Ok(TreeNodeRecursion::Continue) - }) - .expect("plan traversal"); - found - } - - #[test] - fn limit_push_down_sort_marks_scans_preserev_order() -> Result<()> { - let table_scan = test_table_scan()?; - - let plan = LogicalPlanBuilder::from(table_scan) - .sort_by(vec![col("a")])? - .limit(0, Some(10))? 
- .build()?; - - let optimizer_ctx = OptimizerContext::new().with_max_passes(1); - let rules: Vec> = - vec![Arc::new(PushDownLimit::new())]; - let optimized_plan = - Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; - - assert!(has_preserve_order_scan(&optimized_plan)); - - Ok(()) - } - - // Helper function to count how many TableScans have preserve_order = true - fn count_preserve_order_scans(plan: &LogicalPlan) -> usize { - let mut count = 0; - plan.apply(|node| { - if let LogicalPlan::TableScan(scan) = node { - if scan.preserve_order { - count += 1; - } - } - Ok(TreeNodeRecursion::Continue) - }) - .expect("plan traversal"); - count - } - - #[test] - fn limit_push_down_sort_marks_scans_preserve_order() -> Result<()> { - let table_scan = test_table_scan()?; - - let plan = LogicalPlanBuilder::from(table_scan) - .sort_by(vec![col("a")])? - .limit(0, Some(10))? - .build()?; - - let optimizer_ctx = OptimizerContext::new().with_max_passes(1); - let rules: Vec> = - vec![Arc::new(PushDownLimit::new())]; - let optimized_plan = - Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; - - assert!(has_preserve_order_scan(&optimized_plan)); - - Ok(()) - } - - #[test] - fn limit_push_down_sort_with_projection_marks_scans() -> Result<()> { - let table_scan = test_table_scan()?; - - let plan = LogicalPlanBuilder::from(table_scan) - .project(vec![col("a"), col("b")])? - .sort_by(vec![col("a")])? - .limit(0, Some(10))? - .build()?; - - let optimizer_ctx = OptimizerContext::new().with_max_passes(1); - let rules: Vec> = - vec![Arc::new(PushDownLimit::new())]; - let optimized_plan = - Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; - - assert!( - has_preserve_order_scan(&optimized_plan), - "Projection preserves order, scan should be marked" - ); - - Ok(()) - } - - #[test] - fn limit_push_down_sort_with_filter_marks_scans() -> Result<()> { - let table_scan = test_table_scan()?; - - let plan = LogicalPlanBuilder::from(table_scan) - .filter(col("a").gt(lit(5)))? - .sort_by(vec![col("a")])? - .limit(0, Some(10))? - .build()?; - - let optimizer_ctx = OptimizerContext::new().with_max_passes(1); - let rules: Vec> = - vec![Arc::new(PushDownLimit::new())]; - let optimized_plan = - Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; - - assert!( - has_preserve_order_scan(&optimized_plan), - "Filter preserves order, scan should be marked" - ); - - Ok(()) - } - - #[test] - fn limit_push_down_sort_with_aggregate_does_not_mark_scans() -> Result<()> { - let table_scan = test_table_scan()?; - - let plan = LogicalPlanBuilder::from(table_scan) - .aggregate(vec![col("a")], vec![max(col("b"))])? - .sort_by(vec![col("a")])? - .limit(0, Some(10))? - .build()?; - - let optimizer_ctx = OptimizerContext::new().with_max_passes(1); - let rules: Vec> = - vec![Arc::new(PushDownLimit::new())]; - let optimized_plan = - Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; - - assert!( - !has_preserve_order_scan(&optimized_plan), - "Aggregate breaks order, scan should NOT be marked" - ); - - Ok(()) - } - - #[test] - fn limit_push_down_sort_with_join_does_not_mark_scans() -> Result<()> { - let table_scan_1 = test_table_scan()?; - let table_scan_2 = test_table_scan_with_name("test2")?; - - let plan = LogicalPlanBuilder::from(table_scan_1) - .join( - LogicalPlanBuilder::from(table_scan_2).build()?, - JoinType::Inner, - (vec!["a"], vec!["a"]), - None, - )? - .sort_by(vec![col("test.a")])? - .limit(0, Some(10))? 
- .build()?; - - let optimizer_ctx = OptimizerContext::new().with_max_passes(1); - let rules: Vec> = - vec![Arc::new(PushDownLimit::new())]; - let optimized_plan = - Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; - - assert_eq!( - count_preserve_order_scans(&optimized_plan), - 0, - "Join breaks order, scans should NOT be marked" - ); - - Ok(()) - } - - #[test] - fn limit_push_down_sort_with_union_does_not_mark_scans() -> Result<()> { - let table_scan_1 = test_table_scan()?; - let table_scan_2 = test_table_scan_with_name("test2")?; - - let plan = LogicalPlanBuilder::from(table_scan_1) - .union(LogicalPlanBuilder::from(table_scan_2).build()?)? - .sort_by(vec![col("a")])? - .limit(0, Some(10))? - .build()?; - - let optimizer_ctx = OptimizerContext::new().with_max_passes(1); - let rules: Vec> = - vec![Arc::new(PushDownLimit::new())]; - let optimized_plan = - Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; - - assert_eq!( - count_preserve_order_scans(&optimized_plan), - 0, - "Union breaks order, scans should NOT be marked" - ); - - Ok(()) - } - - #[test] - fn limit_push_down_sort_with_window_does_not_mark_scans() -> Result<()> { - let table_scan = test_table_scan()?; - - let window_expr = - Expr::WindowFunction(Box::new(datafusion_expr::expr::WindowFunction { - fun: datafusion_expr::WindowFunctionDefinition::AggregateUDF( - datafusion_functions_aggregate::sum::sum_udaf(), - ), - params: WindowFunctionParams { - args: vec![col("b")], - partition_by: vec![col("a")], - order_by: vec![], - window_frame: datafusion_expr::WindowFrame::new(None), - null_treatment: None, - filter: None, - distinct: false, - }, - })); - - let plan = LogicalPlanBuilder::from(table_scan) - .window(vec![window_expr.alias("sum_b")])? - .sort_by(vec![col("a")])? - .limit(0, Some(10))? - .build()?; - - let optimizer_ctx = OptimizerContext::new().with_max_passes(1); - let rules: Vec> = - vec![Arc::new(PushDownLimit::new())]; - let optimized_plan = - Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; - - assert!( - !has_preserve_order_scan(&optimized_plan), - "Window function breaks order, scan should NOT be marked" - ); - - Ok(()) - } - - #[test] - fn limit_push_down_sort_with_distinct_does_not_mark_scans() -> Result<()> { - let table_scan = test_table_scan()?; - - let plan = LogicalPlanBuilder::from(table_scan) - .distinct()? - .sort_by(vec![col("a")])? - .limit(0, Some(10))? - .build()?; - - let optimizer_ctx = OptimizerContext::new().with_max_passes(1); - let rules: Vec> = - vec![Arc::new(PushDownLimit::new())]; - let optimized_plan = - Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; - - assert!( - !has_preserve_order_scan(&optimized_plan), - "Distinct breaks order, scan should NOT be marked" - ); - - Ok(()) - } - - #[test] - fn limit_push_down_sort_through_multiple_order_preserving_ops() -> Result<()> { - let table_scan = test_table_scan()?; - - let plan = LogicalPlanBuilder::from(table_scan) - .project(vec![col("a"), col("b")])? - .filter(col("a").gt(lit(5)))? - .limit(0, Some(100))? - .sort_by(vec![col("a")])? - .limit(0, Some(10))? 
- .build()?; - - let optimizer_ctx = OptimizerContext::new().with_max_passes(1); - let rules: Vec> = - vec![Arc::new(PushDownLimit::new())]; - let optimized_plan = - Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; - - assert!( - has_preserve_order_scan(&optimized_plan), - "Multiple order-preserving ops, scan should be marked" - ); - - Ok(()) - } - - #[test] - fn limit_push_down_without_sort_does_not_mark_scans() -> Result<()> { - let table_scan = test_table_scan()?; - - let plan = LogicalPlanBuilder::from(table_scan) - .limit(0, Some(10))? - .build()?; - - let optimizer_ctx = OptimizerContext::new().with_max_passes(1); - let rules: Vec> = - vec![Arc::new(PushDownLimit::new())]; - let optimized_plan = - Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; - - assert!( - !has_preserve_order_scan(&optimized_plan), - "Limit without Sort should NOT mark scan" - ); - - Ok(()) - } - - #[test] - fn limit_push_down_sort_with_subquery_alias_marks_scans() -> Result<()> { - let table_scan = test_table_scan()?; - - let plan = LogicalPlanBuilder::from(table_scan) - .alias("subquery")? - .sort_by(vec![col("a")])? - .limit(0, Some(10))? - .build()?; - - let optimizer_ctx = OptimizerContext::new().with_max_passes(1); - let rules: Vec> = - vec![Arc::new(PushDownLimit::new())]; - let optimized_plan = - Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; - - assert!( - has_preserve_order_scan(&optimized_plan), - "SubqueryAlias preserves order, scan should be marked" - ); - - Ok(()) - } - - #[test] - fn limit_push_down_sort_complex_aggregate_case() -> Result<()> { - let table_scan = test_table_scan()?; - - let plan = LogicalPlanBuilder::from(table_scan) - .aggregate(vec![col("a")], vec![max(col("b")).alias("max_b")])? - .sort_by(vec![col("max_b")])? - .limit(0, Some(10))? 
- .build()?; - - let optimizer_ctx = OptimizerContext::new().with_max_passes(1); - let rules: Vec> = - vec![Arc::new(PushDownLimit::new())]; - let optimized_plan = - Optimizer::with_rules(rules).optimize(plan, &optimizer_ctx, |_, _| {})?; - - assert!( - !has_preserve_order_scan(&optimized_plan), - "Sort on aggregate result should NOT mark input scan" - ); - - Ok(()) - } } diff --git a/datafusion/physical-optimizer/Cargo.toml b/datafusion/physical-optimizer/Cargo.toml index 395da10d629ba..caa9ee7b46914 100644 --- a/datafusion/physical-optimizer/Cargo.toml +++ b/datafusion/physical-optimizer/Cargo.toml @@ -43,6 +43,7 @@ recursive_protection = ["dep:recursive"] [dependencies] arrow = { workspace = true } datafusion-common = { workspace = true } +datafusion-datasource = { workspace = true } datafusion-execution = { workspace = true } datafusion-expr = { workspace = true } datafusion-expr-common = { workspace = true, default-features = true } diff --git a/datafusion/physical-optimizer/src/enforce_sorting/mod.rs b/datafusion/physical-optimizer/src/enforce_sorting/mod.rs index a5fafb9e87e1d..a2e0ddcb3bcca 100644 --- a/datafusion/physical-optimizer/src/enforce_sorting/mod.rs +++ b/datafusion/physical-optimizer/src/enforce_sorting/mod.rs @@ -583,9 +583,14 @@ fn analyze_immediate_sort_removal( if let Some(fetch) = sort_exec.fetch() { // If the sort has a fetch, we need to add a limit: if properties.output_partitioning().partition_count() == 1 { - Arc::new(GlobalLimitExec::new(Arc::clone(sort_input), 0, Some(fetch))) + let mut global_limit = + GlobalLimitExec::new(Arc::clone(sort_input), 0, Some(fetch)); + global_limit.set_order_sensitive(true); + Arc::new(global_limit) } else { - Arc::new(LocalLimitExec::new(Arc::clone(sort_input), fetch)) + let mut local_limit = LocalLimitExec::new(Arc::clone(sort_input), fetch); + local_limit.set_order_sensitive(true); + Arc::new(local_limit) } } else { Arc::clone(sort_input) diff --git a/datafusion/physical-optimizer/src/limit_pushdown.rs b/datafusion/physical-optimizer/src/limit_pushdown.rs index 4cb3abe30bae2..b5f6c35e17295 100644 --- a/datafusion/physical-optimizer/src/limit_pushdown.rs +++ b/datafusion/physical-optimizer/src/limit_pushdown.rs @@ -27,6 +27,8 @@ use datafusion_common::config::ConfigOptions; use datafusion_common::error::Result; use datafusion_common::tree_node::{Transformed, TreeNodeRecursion}; use datafusion_common::utils::combine_limit; +use datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder}; +use datafusion_datasource::source::DataSourceExec; use datafusion_physical_plan::coalesce_partitions::CoalescePartitionsExec; use datafusion_physical_plan::limit::{GlobalLimitExec, LocalLimitExec}; use datafusion_physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; @@ -50,6 +52,7 @@ pub struct GlobalRequirements { fetch: Option, skip: usize, satisfied: bool, + order_sensitive: bool, } impl LimitPushdown { @@ -69,6 +72,7 @@ impl PhysicalOptimizerRule for LimitPushdown { fetch: None, skip: 0, satisfied: false, + order_sensitive: false, }; pushdown_limits(plan, global_state) } @@ -111,6 +115,13 @@ impl LimitExec { Self::Local(_) => 0, } } + + fn order_sensitive(&self) -> bool { + match self { + Self::Global(global) => global.order_sensitive(), + Self::Local(local) => local.order_sensitive(), + } + } } impl From for Arc { @@ -145,6 +156,7 @@ pub fn pushdown_limit_helper( ); global_state.skip = skip; global_state.fetch = fetch; + global_state.order_sensitive = limit_exec.order_sensitive(); // Now the global 
state has the most recent information, we can remove // the `LimitExec` plan. We will decide later if we should add it again @@ -241,17 +253,30 @@ pub fn pushdown_limit_helper( let maybe_fetchable = pushdown_plan.with_fetch(skip_and_fetch); if global_state.satisfied { if let Some(plan_with_fetch) = maybe_fetchable { - Ok((Transformed::yes(plan_with_fetch), global_state)) + let plan_with_preserve_order = ensure_preserve_order_if_needed( + plan_with_fetch, + global_state.order_sensitive, + ); + Ok((Transformed::yes(plan_with_preserve_order), global_state)) } else { Ok((Transformed::no(pushdown_plan), global_state)) } } else { global_state.satisfied = true; pushdown_plan = if let Some(plan_with_fetch) = maybe_fetchable { + let plan_with_preserve_order = ensure_preserve_order_if_needed( + plan_with_fetch, + global_state.order_sensitive, + ); + if global_skip > 0 { - add_global_limit(plan_with_fetch, global_skip, Some(global_fetch)) + add_global_limit( + plan_with_preserve_order, + global_skip, + Some(global_fetch), + ) } else { - plan_with_fetch + plan_with_preserve_order } } else { add_limit(pushdown_plan, global_skip, global_fetch) @@ -337,4 +362,37 @@ fn add_global_limit( Arc::new(GlobalLimitExec::new(pushdown_plan, skip, fetch)) as _ } +/// Helper function to handle DataSourceExec preserve_order setting +fn ensure_preserve_order_if_needed( + plan: Arc, + order_sensitive: bool, +) -> Arc { + if !order_sensitive { + return plan; + } + + let Some(data_source_exec) = plan.as_any().downcast_ref::() else { + return plan; + }; + + let Some(file_scan_config) = data_source_exec + .data_source() + .as_any() + .downcast_ref::() + else { + return plan; + }; + + if file_scan_config.preserve_order { + return plan; + } + + let new_config = FileScanConfigBuilder::from(file_scan_config.clone()) + .with_preserve_order(true) + .build(); + + let new_data_source_exec = DataSourceExec::new(Arc::new(new_config)); + Arc::new(new_data_source_exec) as Arc +} + // See tests in datafusion/core/tests/physical_optimizer diff --git a/datafusion/physical-plan/src/limit.rs b/datafusion/physical-plan/src/limit.rs index 05d6882821477..bf27769a9c776 100644 --- a/datafusion/physical-plan/src/limit.rs +++ b/datafusion/physical-plan/src/limit.rs @@ -51,6 +51,9 @@ pub struct GlobalLimitExec { /// Execution metrics metrics: ExecutionPlanMetricsSet, cache: PlanProperties, + /// Whether the limit is order-sensitive + /// Such as the child plan is a sort node, then the limit is order-sensitive + order_sensitive: bool, } impl GlobalLimitExec { @@ -63,6 +66,7 @@ impl GlobalLimitExec { fetch, metrics: ExecutionPlanMetricsSet::new(), cache, + order_sensitive: false, } } @@ -91,6 +95,18 @@ impl GlobalLimitExec { Boundedness::Bounded, ) } + + /// Whether the limit is order-sensitive + /// Such as the child plan is a sort node, then the limit is order-sensitive + pub fn order_sensitive(&self) -> bool { + self.order_sensitive + } + + /// Whether the limit is order-sensitive + /// Such as the child plan is a sort node, then the limit is order-sensitive + pub fn set_order_sensitive(&mut self, order_sensitive: bool) { + self.order_sensitive = order_sensitive; + } } impl DisplayAs for GlobalLimitExec { @@ -223,6 +239,9 @@ pub struct LocalLimitExec { /// Execution metrics metrics: ExecutionPlanMetricsSet, cache: PlanProperties, + /// Whether the limit is order-sensitive + /// Such as the child plan is a sort node, then the limit is order-sensitive + order_sensitive: bool, } impl LocalLimitExec { @@ -234,6 +253,7 @@ impl LocalLimitExec { fetch, 
metrics: ExecutionPlanMetricsSet::new(), cache, + order_sensitive: false, } } @@ -257,6 +277,18 @@ impl LocalLimitExec { Boundedness::Bounded, ) } + + /// Whether the limit is order-sensitive + /// Such as the child plan is a sort node, then the limit is order-sensitive + pub fn order_sensitive(&self) -> bool { + self.order_sensitive + } + + /// Whether the limit is order-sensitive + /// Such as the child plan is a sort node, then the limit is order-sensitive + pub fn set_order_sensitive(&mut self, order_sensitive: bool) { + self.order_sensitive = order_sensitive; + } } impl DisplayAs for LocalLimitExec { diff --git a/datafusion/proto/src/logical_plan/mod.rs b/datafusion/proto/src/logical_plan/mod.rs index 1af4db1094840..218c2e4e47d04 100644 --- a/datafusion/proto/src/logical_plan/mod.rs +++ b/datafusion/proto/src/logical_plan/mod.rs @@ -267,7 +267,6 @@ fn from_table_source( projected_schema, filters: vec![], fetch: None, - preserve_order: false, }); LogicalPlanNode::try_from_logical_plan(&r, extension_codec) From 31ae9cf74a5beeaee8adfba0c1052a6baf77bd52 Mon Sep 17 00:00:00 2001 From: "xudong.w" Date: Mon, 22 Dec 2025 13:23:52 +0800 Subject: [PATCH 12/26] use required_ordering --- .../src/enforce_sorting/mod.rs | 5 ++- .../physical-optimizer/src/limit_pushdown.rs | 4 +- datafusion/physical-plan/src/limit.rs | 45 +++++++++---------- 3 files changed, 26 insertions(+), 28 deletions(-) diff --git a/datafusion/physical-optimizer/src/enforce_sorting/mod.rs b/datafusion/physical-optimizer/src/enforce_sorting/mod.rs index a2e0ddcb3bcca..247ebb2785dd3 100644 --- a/datafusion/physical-optimizer/src/enforce_sorting/mod.rs +++ b/datafusion/physical-optimizer/src/enforce_sorting/mod.rs @@ -581,15 +581,16 @@ fn analyze_immediate_sort_removal( // Remove the sort: node.children = node.children.swap_remove(0).children; if let Some(fetch) = sort_exec.fetch() { + let required_ordering = sort_exec.properties().output_ordering().cloned(); // If the sort has a fetch, we need to add a limit: if properties.output_partitioning().partition_count() == 1 { let mut global_limit = GlobalLimitExec::new(Arc::clone(sort_input), 0, Some(fetch)); - global_limit.set_order_sensitive(true); + global_limit.set_required_ordering(required_ordering); Arc::new(global_limit) } else { let mut local_limit = LocalLimitExec::new(Arc::clone(sort_input), fetch); - local_limit.set_order_sensitive(true); + local_limit.set_required_ordering(required_ordering); Arc::new(local_limit) } } else { diff --git a/datafusion/physical-optimizer/src/limit_pushdown.rs b/datafusion/physical-optimizer/src/limit_pushdown.rs index b5f6c35e17295..d259025b61bf1 100644 --- a/datafusion/physical-optimizer/src/limit_pushdown.rs +++ b/datafusion/physical-optimizer/src/limit_pushdown.rs @@ -118,8 +118,8 @@ impl LimitExec { fn order_sensitive(&self) -> bool { match self { - Self::Global(global) => global.order_sensitive(), - Self::Local(local) => local.order_sensitive(), + Self::Global(global) => global.required_ordering().is_some(), + Self::Local(local) => local.required_ordering().is_some(), } } } diff --git a/datafusion/physical-plan/src/limit.rs b/datafusion/physical-plan/src/limit.rs index bf27769a9c776..b85a7a8f80405 100644 --- a/datafusion/physical-plan/src/limit.rs +++ b/datafusion/physical-plan/src/limit.rs @@ -35,6 +35,7 @@ use arrow::record_batch::RecordBatch; use datafusion_common::{Result, assert_eq_or_internal_err, internal_err}; use datafusion_execution::TaskContext; +use datafusion_physical_expr::LexOrdering; use futures::stream::{Stream, 
StreamExt}; use log::trace; @@ -51,9 +52,9 @@ pub struct GlobalLimitExec { /// Execution metrics metrics: ExecutionPlanMetricsSet, cache: PlanProperties, - /// Whether the limit is order-sensitive - /// Such as the child plan is a sort node, then the limit is order-sensitive - order_sensitive: bool, + /// If the child plan is a sort node, after the sort node is removed during + /// physical optimization, we should add the required ordering to the limit node + required_ordering: Option, } impl GlobalLimitExec { @@ -66,7 +67,7 @@ impl GlobalLimitExec { fetch, metrics: ExecutionPlanMetricsSet::new(), cache, - order_sensitive: false, + required_ordering: None, } } @@ -96,16 +97,14 @@ impl GlobalLimitExec { ) } - /// Whether the limit is order-sensitive - /// Such as the child plan is a sort node, then the limit is order-sensitive - pub fn order_sensitive(&self) -> bool { - self.order_sensitive + /// Get the required ordering from limit + pub fn required_ordering(&self) -> &Option { + &self.required_ordering } - /// Whether the limit is order-sensitive - /// Such as the child plan is a sort node, then the limit is order-sensitive - pub fn set_order_sensitive(&mut self, order_sensitive: bool) { - self.order_sensitive = order_sensitive; + /// Set the required ordering for limit + pub fn set_required_ordering(&mut self, required_ordering: Option) { + self.required_ordering = required_ordering; } } @@ -239,9 +238,9 @@ pub struct LocalLimitExec { /// Execution metrics metrics: ExecutionPlanMetricsSet, cache: PlanProperties, - /// Whether the limit is order-sensitive - /// Such as the child plan is a sort node, then the limit is order-sensitive - order_sensitive: bool, + /// If the child plan is a sort node, after the sort node is removed during + /// physical optimization, we should add the required ordering to the limit node + required_ordering: Option, } impl LocalLimitExec { @@ -253,7 +252,7 @@ impl LocalLimitExec { fetch, metrics: ExecutionPlanMetricsSet::new(), cache, - order_sensitive: false, + required_ordering: None, } } @@ -278,16 +277,14 @@ impl LocalLimitExec { ) } - /// Whether the limit is order-sensitive - /// Such as the child plan is a sort node, then the limit is order-sensitive - pub fn order_sensitive(&self) -> bool { - self.order_sensitive + /// Get the required ordering from limit + pub fn required_ordering(&self) -> &Option { + &self.required_ordering } - /// Whether the limit is order-sensitive - /// Such as the child plan is a sort node, then the limit is order-sensitive - pub fn set_order_sensitive(&mut self, order_sensitive: bool) { - self.order_sensitive = order_sensitive; + /// Set the required ordering for limit + pub fn set_required_ordering(&mut self, required_ordering: Option) { + self.required_ordering = required_ordering; } } From 4602a764bc621ac1e96af0c9af5e2e859db7da94 Mon Sep 17 00:00:00 2001 From: "xudong.w" Date: Wed, 24 Dec 2025 17:35:40 +0800 Subject: [PATCH 13/26] resolve conflicts --- datafusion/core/tests/parquet/mod.rs | 51 ++++++++++++++----- .../core/tests/parquet/row_group_pruning.rs | 4 +- datafusion/datasource-parquet/src/opener.rs | 3 ++ .../src/row_group_filter.rs | 2 +- 4 files changed, 43 insertions(+), 17 deletions(-) diff --git a/datafusion/core/tests/parquet/mod.rs b/datafusion/core/tests/parquet/mod.rs index 9234ff62e1688..3fff22d7c0855 100644 --- a/datafusion/core/tests/parquet/mod.rs +++ b/datafusion/core/tests/parquet/mod.rs @@ -110,6 +110,26 @@ struct ContextWithParquet { ctx: SessionContext, } +struct PruningMetric { + total_pruned: 
usize, + total_matched: usize, + total_fully_matched: usize, +} + +impl PruningMetric { + pub fn total_pruned(&self) -> usize { + self.total_pruned + } + + pub fn total_matched(&self) -> usize { + self.total_matched + } + + pub fn total_fully_matched(&self) -> usize { + self.total_fully_matched + } +} + /// The output of running one of the test cases struct TestOutput { /// The input query SQL @@ -127,8 +147,8 @@ struct TestOutput { impl TestOutput { /// retrieve the value of the named metric, if any fn metric_value(&self, metric_name: &str) -> Option { - if let Some((pruned, _matched, _fully)) = self.pruning_metric(metric_name) { - return Some(pruned); + if let Some(pm) = self.pruning_metric(metric_name) { + return Some(pm.total_pruned()); } self.parquet_metrics @@ -141,7 +161,7 @@ impl TestOutput { }) } - fn pruning_metric(&self, metric_name: &str) -> Option<(usize, usize, usize)> { + fn pruning_metric(&self, metric_name: &str) -> Option { let mut total_pruned = 0; let mut total_matched = 0; let mut total_fully_matched = 0; @@ -163,7 +183,11 @@ impl TestOutput { } if found { - Some((total_pruned, total_matched, total_fully_matched)) + Some(PruningMetric { + total_pruned, + total_matched, + total_fully_matched, + }) } else { None } @@ -175,33 +199,33 @@ impl TestOutput { } /// The number of row_groups pruned / matched by bloom filter - fn row_groups_bloom_filter(&self) -> Option<(usize, usize, usize)> { + fn row_groups_bloom_filter(&self) -> Option { self.pruning_metric("row_groups_pruned_bloom_filter") } /// The number of row_groups matched by statistics fn row_groups_matched_statistics(&self) -> Option { self.pruning_metric("row_groups_pruned_statistics") - .map(|(_pruned, matched, _fully)| matched) + .map(|pm| pm.total_matched()) } /// The number of row_groups fully matched by statistics fn row_groups_fully_matched_statistics(&self) -> Option { self.pruning_metric("row_groups_pruned_statistics") - .map(|(_pruned, _, fully)| fully) + .map(|pm| pm.total_fully_matched()) } /// The number of row_groups pruned by statistics fn row_groups_pruned_statistics(&self) -> Option { self.pruning_metric("row_groups_pruned_statistics") - .map(|(pruned, _matched, _fully)| pruned) + .map(|pm| pm.total_pruned()) } /// Metric `files_ranges_pruned_statistics` tracks both pruned and matched count, /// for testing purpose, here it only aggregate the `pruned` count. fn files_ranges_pruned_statistics(&self) -> Option { self.pruning_metric("files_ranges_pruned_statistics") - .map(|(pruned, _matched, _fully)| pruned) + .map(|pm| pm.total_pruned()) } /// The number of row_groups matched by bloom filter or statistics @@ -210,14 +234,13 @@ impl TestOutput { /// filter: 7 total -> 3 matched, this function returns 3 for the final matched /// count. 
fn row_groups_matched(&self) -> Option { - self.row_groups_bloom_filter() - .map(|(_pruned, matched, _fully)| matched) + self.row_groups_bloom_filter().map(|pm| pm.total_matched()) } /// The number of row_groups pruned fn row_groups_pruned(&self) -> Option { self.row_groups_bloom_filter() - .map(|(pruned, _matched, _fully)| pruned) + .map(|pm| pm.total_pruned()) .zip(self.row_groups_pruned_statistics()) .map(|(a, b)| a + b) } @@ -225,13 +248,13 @@ impl TestOutput { /// The number of row pages pruned fn row_pages_pruned(&self) -> Option { self.pruning_metric("page_index_rows_pruned") - .map(|(pruned, _matched, _fully)| pruned) + .map(|pm| pm.total_pruned()) } /// The number of row groups pruned by limit pruning fn limit_pruned_row_groups(&self) -> Option { self.pruning_metric("limit_pruned_row_groups") - .map(|(pruned, _, _)| pruned) + .map(|pm| pm.total_pruned()) } fn description(&self) -> String { diff --git a/datafusion/core/tests/parquet/row_group_pruning.rs b/datafusion/core/tests/parquet/row_group_pruning.rs index 744d90d22d110..789bd90f0b998 100644 --- a/datafusion/core/tests/parquet/row_group_pruning.rs +++ b/datafusion/core/tests/parquet/row_group_pruning.rs @@ -157,12 +157,12 @@ impl RowGroupPruningTest { ); let bloom_filter_metrics = output.row_groups_bloom_filter(); assert_eq!( - bloom_filter_metrics.map(|(_pruned, matched, _)| matched), + bloom_filter_metrics.as_ref().map(|pm| pm.total_matched()), self.expected_row_group_matched_by_bloom_filter, "mismatched row_groups_matched_bloom_filter", ); assert_eq!( - bloom_filter_metrics.map(|(pruned, _matched, _)| pruned), + bloom_filter_metrics.map(|pm| pm.total_pruned()), self.expected_row_group_pruned_by_bloom_filter, "mismatched row_groups_pruned_bloom_filter", ); diff --git a/datafusion/datasource-parquet/src/opener.rs b/datafusion/datasource-parquet/src/opener.rs index 891f349635c04..af2cb88b8aa52 100644 --- a/datafusion/datasource-parquet/src/opener.rs +++ b/datafusion/datasource-parquet/src/opener.rs @@ -1059,6 +1059,7 @@ mod test { coerce_int96: Option, max_predicate_cache_size: Option, reverse_row_groups: bool, + preserve_order: bool, } impl ParquetOpenerBuilder { @@ -1084,6 +1085,7 @@ mod test { coerce_int96: None, max_predicate_cache_size: None, reverse_row_groups: false, + preserve_order: false, } } @@ -1191,6 +1193,7 @@ mod test { encryption_factory: None, max_predicate_cache_size: self.max_predicate_cache_size, reverse_row_groups: self.reverse_row_groups, + preserve_order: self.preserve_order, } } } diff --git a/datafusion/datasource-parquet/src/row_group_filter.rs b/datafusion/datasource-parquet/src/row_group_filter.rs index 4d0d97531bcc1..3381aeec2b523 100644 --- a/datafusion/datasource-parquet/src/row_group_filter.rs +++ b/datafusion/datasource-parquet/src/row_group_filter.rs @@ -48,7 +48,7 @@ use parquet::{ pub struct RowGroupAccessPlanFilter { /// which row groups should be accessed access_plan: ParquetAccessPlan, - /// which row groups are fully contained within the pruning predicate + /// Row groups where ALL rows are known to match the pruning predicate is_fully_matched: Vec, } From e09a1929e27067a63fedcfc22d12b858ebd19c9f Mon Sep 17 00:00:00 2001 From: "xudong.w" Date: Wed, 7 Jan 2026 14:10:23 +0800 Subject: [PATCH 14/26] resolve newest review --- datafusion/core/tests/parquet/mod.rs | 5 +- .../core/tests/parquet/row_group_pruning.rs | 2 +- .../src/row_group_filter.rs | 68 ++++++++++++++++++- datafusion/physical-plan/src/limit.rs | 4 +- .../sqllogictest/test_files/limit_pruning.slt | 24 ++++--- 
docs/source/user-guide/explain-usage.md | 1 + 6 files changed, 86 insertions(+), 18 deletions(-) diff --git a/datafusion/core/tests/parquet/mod.rs b/datafusion/core/tests/parquet/mod.rs index 3fff22d7c0855..4d52521d62737 100644 --- a/datafusion/core/tests/parquet/mod.rs +++ b/datafusion/core/tests/parquet/mod.rs @@ -30,6 +30,7 @@ use arrow::{ record_batch::RecordBatch, util::pretty::pretty_format_batches, }; +use arrow_schema::SchemaRef; use chrono::{Datelike, Duration, TimeDelta}; use datafusion::{ datasource::{TableProvider, provider_as_source}, @@ -294,7 +295,7 @@ impl ContextWithParquet { scenario: Scenario, unit: Unit, mut config: SessionConfig, - custom_schema: Option>, + custom_schema: Option, custom_batches: Option>, ) -> Self { // Use a single partition for deterministic results no matter how many CPUs the host has @@ -1137,7 +1138,7 @@ fn create_data_batch(scenario: Scenario) -> Vec { async fn make_test_file_rg( scenario: Scenario, row_per_group: usize, - custom_schema: Option>, + custom_schema: Option, custom_batches: Option>, ) -> NamedTempFile { let mut output_file = tempfile::Builder::new() diff --git a/datafusion/core/tests/parquet/row_group_pruning.rs b/datafusion/core/tests/parquet/row_group_pruning.rs index 789bd90f0b998..e588dd06ca8f1 100644 --- a/datafusion/core/tests/parquet/row_group_pruning.rs +++ b/datafusion/core/tests/parquet/row_group_pruning.rs @@ -1949,7 +1949,7 @@ async fn test_limit_pruning_exceeds_fully_matched() -> datafusion_common::error: .with_scenario(Scenario::Int) .with_query(query) .with_expected_errors(Some(0)) - .with_expected_rows(10) // Total: 1 + 3 + 4 + 1 = 9 (less than limit) + .with_expected_rows(10) // Total: 1 + 4 + 4 + 1 = 10 .with_pruned_files(Some(0)) .with_matched_by_stats(Some(4)) // RG0,1,2,3 matched .with_fully_matched_by_stats(Some(2)) diff --git a/datafusion/datasource-parquet/src/row_group_filter.rs b/datafusion/datasource-parquet/src/row_group_filter.rs index 3381aeec2b523..974e0bcd9cc93 100644 --- a/datafusion/datasource-parquet/src/row_group_filter.rs +++ b/datafusion/datasource-parquet/src/row_group_filter.rs @@ -49,6 +49,7 @@ pub struct RowGroupAccessPlanFilter { /// which row groups should be accessed access_plan: ParquetAccessPlan, /// Row groups where ALL rows are known to match the pruning predicate + /// (the predicate does not filter any rows) is_fully_matched: Vec, } @@ -84,8 +85,66 @@ impl RowGroupAccessPlanFilter { } /// Prunes the access plan based on the limit and fully contained row groups. - /// See the [description](https://github.com/apache/datafusion/issues/18860#issuecomment-3563442093) - /// for how the pruning works and improves performance. + /// + /// The pruning works by leveraging the concept of fully matched row groups. 
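+    /// A row group is "fully matched" when its min/max statistics alone prove
+    /// that every one of its rows satisfies all predicates, so its row count
+    /// can be credited against a `LIMIT` without reading any of its data.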
Consider a query like: + /// `WHERE species LIKE 'Alpine%' AND s >= 50 LIMIT N` + /// + /// After initial filtering, row groups can be classified into three states: + /// + /// ``` + /// PRUNING CLASSIFICATION DIAGRAM + /// ------------------------------ + /// Legend: + /// [ ] = Not Matching / Pruned + /// [X] = Partially Matching (Row Group/Page contains some matches) + /// [F] = Fully Matching (Entire range is within predicate) + /// + /// +-----------------------------------------------------------------------+ + /// | NOT MATCHING | + /// | Partition 1 | + /// | +-----------------------------------+-----------------------------+ | + /// | | SPECIES (min: 'B...',max: 'S...') | S (min: 7, max: 133) | | + /// | +-----------------------------------+-----------------------------+ | + /// | | Snow Vole | 7 | | + /// | | Brown Bear | 133 | | + /// | | Gray Wolf | 82 | | + /// | +-----------------------------------+-----------------------------+ | + /// +-----------------------------------------------------------------------+ + /// + /// +-----------------------------------------------------------------------+ + /// | PARTIALLY MATCHING | + /// | Partition 2 Partition 4 | + /// | +------------------+--------------+ +------------------+-------+ | + /// | | SPECIES | S | | SPECIES | S | | + /// | | (min:A, max:R) |(min:6,max:70)| | (min:A, max:P) |[4-51] | | + /// | +------------------+--------------+ +------------------+-------+ | + /// | | Lynx | 71 | | Europ. Mole | 4 | | + /// | | Red Fox | 40 | | Polecat | 16 | | + /// | | Alpine Bat | 6 | | Alpine Ibex | 97 | | + /// | +------------------+--------------+ +------------------+-------+ | + /// +-----------------------------------------------------------------------+ + /// + /// +-----------------------------------------------------------------------+ + /// | FULLY MATCHING | + /// | Partition 3 | + /// | +-----------------------------------+-----------------------------+ | + /// | | SPECIES (min: 'A...',max: 'A...') | S (min: 76, max: 101) | | + /// | +-----------------------------------+-----------------------------+ | + /// | | Alpine Ibex | 101 | | + /// | | Alpine Goat | 76 | | + /// | | Alpine Sheep | 83 | | + /// | +-----------------------------------+-----------------------------+ | + /// +-----------------------------------------------------------------------+ + + /// Without limit pruning: Scan Partition 2 → Partition 3 → Partition 4 (until limit reached) + /// With limit pruning: If Partition 3 contains enough rows to satisfy the limit, + /// skip Partitions 2 and 4 entirely and go directly to Partition 3. 
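+    ///
+    /// For example, with `LIMIT 3` above: Partition 3 is fully matching and
+    /// holds 3 rows, which covers the limit on its own, so Partitions 2 and 4
+    /// (only partially matching) can be skipped without being read at all.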
+ /// + /// This optimization is particularly effective when: + /// - The limit is small relative to the total dataset size + /// - There are row groups that are fully matched by the filter predicates + /// - The fully matched row groups contain sufficient rows to satisfy the limit + /// /// For more information, see the [paper](https://arxiv.org/pdf/2504.11540)'s "Pruning for LIMIT Queries" part pub fn prune_by_limit( &mut self, @@ -96,7 +155,8 @@ impl RowGroupAccessPlanFilter { let mut fully_matched_row_group_indexes: Vec = Vec::new(); let mut fully_matched_rows_count: usize = 0; - // Iterate through the currently accessible row groups + // Iterate through the currently accessible row groups and try to + // find a set of matching row groups that can satisfy the limit for &idx in self.access_plan.row_group_indexes().iter() { if self.is_fully_matched[idx] { let row_group_row_count = rg_metadata[idx].num_rows() as usize; @@ -108,6 +168,8 @@ impl RowGroupAccessPlanFilter { } } + // If we can satisfy the limit with fully matching row groups, + // rewrite the plan to do so if fully_matched_rows_count >= limit { let original_num_accessible_row_groups = self.access_plan.row_group_indexes().len(); diff --git a/datafusion/physical-plan/src/limit.rs b/datafusion/physical-plan/src/limit.rs index b85a7a8f80405..fea7acb221304 100644 --- a/datafusion/physical-plan/src/limit.rs +++ b/datafusion/physical-plan/src/limit.rs @@ -52,8 +52,8 @@ pub struct GlobalLimitExec { /// Execution metrics metrics: ExecutionPlanMetricsSet, cache: PlanProperties, - /// If the child plan is a sort node, after the sort node is removed during - /// physical optimization, we should add the required ordering to the limit node + /// Does the limit have to preserve the order of its input, and if so what is it? + /// Some optimizations may reorder the input if no particular sort is required required_ordering: Option, } diff --git a/datafusion/sqllogictest/test_files/limit_pruning.slt b/datafusion/sqllogictest/test_files/limit_pruning.slt index cc8a17e5b78b7..0735e05cb8b6f 100644 --- a/datafusion/sqllogictest/test_files/limit_pruning.slt +++ b/datafusion/sqllogictest/test_files/limit_pruning.slt @@ -20,22 +20,26 @@ set datafusion.execution.parquet.pushdown_filters = true; statement ok -CREATE TABLE t AS VALUES +CREATE TABLE tracking_data AS VALUES +-- ***** Row Group 0 ***** ('Anow Vole', 7), ('Brown Bear', 133), ('Gray Wolf', 82), +-- ***** Row Group 1 ***** ('Lynx', 71), ('Red Fox', 40), ('Alpine Bat', 6), +-- ***** Row Group 2 ***** ('Nlpine Ibex', 101), ('Nlpine Goat', 76), ('Nlpine Sheep', 83), +-- ***** Row Group 3 ***** ('Europ. 
Mole', 4), ('Polecat', 16), ('Alpine Ibex', 97); statement ok -COPY (SELECT column1 as a, column2 as b FROM t) +COPY (SELECT column1 as species, column2 as s FROM tracking_data) TO 'test_files/scratch/limit_pruning/data.parquet' STORED AS PARQUET OPTIONS ( @@ -43,10 +47,10 @@ OPTIONS ( ); statement ok -drop table t; +drop table tracking_data; statement ok -CREATE EXTERNAL TABLE t +CREATE EXTERNAL TABLE tracking_data STORED AS PARQUET LOCATION 'test_files/scratch/limit_pruning/data.parquet'; @@ -57,21 +61,21 @@ set datafusion.explain.analyze_level = summary; # row_groups_pruned_statistics=4 total → 3 matched -> 1 fully matched # limit_pruned_row_groups=2 total → 0 matched query TT -explain analyze select * from t where a > 'M' AND b >= 50 limit 3; +explain analyze select * from tracking_data where species > 'M' AND s >= 50 limit 3; ---- -Plan with Metrics DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/limit_pruning/data.parquet]]}, projection=[a, b], limit=3, file_type=parquet, predicate=a@0 > M AND b@1 >= 50, pruning_predicate=a_null_count@1 != row_count@2 AND a_max@0 > M AND b_null_count@4 != row_count@2 AND b_max@3 >= 50, required_guarantees=[], metrics=[output_rows=3, elapsed_compute=, output_bytes=, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=4 total → 3 matched -> 1 fully matched, row_groups_pruned_bloom_filter=3 total → 3 matched, page_index_rows_pruned=3 total → 3 matched, limit_pruned_row_groups=2 total → 0 matched, bytes_scanned=, metadata_load_time=, scan_efficiency_ratio=] +Plan with Metrics DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/limit_pruning/data.parquet]]}, projection=[species, s], limit=3, file_type=parquet, predicate=species@0 > M AND s@1 >= 50, pruning_predicate=species_null_count@1 != row_count@2 AND species_max@0 > M AND s_null_count@4 != row_count@2 AND s_max@3 >= 50, required_guarantees=[], metrics=[output_rows=3, elapsed_compute=1ns, output_bytes=142.0 B, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=4 total → 3 matched -> 1 fully matched, row_groups_pruned_bloom_filter=3 total → 3 matched, page_index_rows_pruned=3 total → 3 matched, limit_pruned_row_groups=2 total → 0 matched, bytes_scanned=171, metadata_load_time=444.63µs, scan_efficiency_ratio=7.3% (171/2.35 K)] # limit_pruned_row_groups=0 total → 0 matched # because of order by, scan needs to preserve sort, so limit pruning is disabled query TT -explain analyze select * from t where a > 'M' AND b >= 50 order by a limit 3; +explain analyze select * from tracking_data where species > 'M' AND s >= 50 order by species limit 3; ---- Plan with Metrics -01)SortExec: TopK(fetch=3), expr=[a@0 ASC NULLS LAST], preserve_partitioning=[false], filter=[a@0 < Nlpine Sheep], metrics=[output_rows=3, elapsed_compute=, output_bytes=] -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/limit_pruning/data.parquet]]}, projection=[a, b], file_type=parquet, predicate=a@0 > M AND b@1 >= 50 AND DynamicFilter [ a@0 < Nlpine Sheep ], pruning_predicate=a_null_count@1 != row_count@2 AND a_max@0 > M AND b_null_count@4 != row_count@2 AND b_max@3 >= 50 AND a_null_count@1 != row_count@2 AND a_min@5 < Nlpine Sheep, required_guarantees=[], metrics=[output_rows=3, elapsed_compute=, output_bytes=, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=4 total → 3 matched -> 1 fully matched, 
row_groups_pruned_bloom_filter=3 total → 3 matched, page_index_rows_pruned=9 total → 9 matched, limit_pruned_row_groups=0 total → 0 matched, bytes_scanned=, metadata_load_time=, scan_efficiency_ratio=] +01)SortExec: TopK(fetch=3), expr=[species@0 ASC NULLS LAST], preserve_partitioning=[false], filter=[species@0 < Nlpine Sheep], metrics=[output_rows=3, elapsed_compute=2.69ms, output_bytes=72.0 B] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/limit_pruning/data.parquet]]}, projection=[species, s], file_type=parquet, predicate=species@0 > M AND s@1 >= 50 AND DynamicFilter [ species@0 < Nlpine Sheep ], pruning_predicate=species_null_count@1 != row_count@2 AND species_max@0 > M AND s_null_count@4 != row_count@2 AND s_max@3 >= 50 AND species_null_count@1 != row_count@2 AND species_min@5 < Nlpine Sheep, required_guarantees=[], metrics=[output_rows=3, elapsed_compute=1ns, output_bytes=142.0 B, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=4 total → 3 matched -> 1 fully matched, row_groups_pruned_bloom_filter=3 total → 3 matched, page_index_rows_pruned=9 total → 9 matched, limit_pruned_row_groups=0 total → 0 matched, bytes_scanned=521, metadata_load_time=512.29µs, scan_efficiency_ratio=22% (521/2.35 K)] statement ok -drop table t; +drop table tracking_data; statement ok reset datafusion.explain.analyze_level; diff --git a/docs/source/user-guide/explain-usage.md b/docs/source/user-guide/explain-usage.md index 5a1184539c034..8fe83163813da 100644 --- a/docs/source/user-guide/explain-usage.md +++ b/docs/source/user-guide/explain-usage.md @@ -228,6 +228,7 @@ When predicate pushdown is enabled, `DataSourceExec` with `ParquetSource` gains - `page_index_rows_pruned`: number of rows evaluated by page index filters. The metric reports both how many rows were considered in total and how many matched (were not pruned). - `row_groups_pruned_bloom_filter`: number of row groups evaluated by Bloom Filters, reporting both total checked groups and groups that matched. - `row_groups_pruned_statistics`: number of row groups evaluated by row-group statistics (min/max), reporting both total checked groups and groups that matched. +- `limit_pruned_row_groups`: number of row groups pruned by the limit. - `pushdown_rows_matched`: rows that were tested by any of the above filters, and passed all of them. - `pushdown_rows_pruned`: rows that were tested by any of the above filters, and did not pass at least one of them. 
- `predicate_evaluation_errors`: number of times evaluating the filter expression failed (expected to be zero in normal operation)

From 56cda2d928d07e24e71aaf2671459235d0b3a799 Mon Sep 17 00:00:00 2001
From: "xudong.w"
Date: Wed, 7 Jan 2026 14:17:56 +0800
Subject: [PATCH 15/26] remove scratch

---
 datafusion/core/tests/parquet/row_group_pruning.rs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/datafusion/core/tests/parquet/row_group_pruning.rs b/datafusion/core/tests/parquet/row_group_pruning.rs
index e588dd06ca8f1..445ae7e97f228 100644
--- a/datafusion/core/tests/parquet/row_group_pruning.rs
+++ b/datafusion/core/tests/parquet/row_group_pruning.rs
@@ -1957,7 +1957,5 @@ async fn test_limit_pruning_exceeds_fully_matched() -> datafusion_common::error:
         .with_limit_pruned_row_groups(Some(0)) // No limit pruning since we need all RGs
         .test_row_group_prune_with_custom_data(schema, batches, 4)
         .await;
-
     Ok(())
 }

From a875d41010191b292efa968b77e75dd9eaa6d8bc Mon Sep 17 00:00:00 2001
From: "xudong.w"
Date: Wed, 7 Jan 2026 14:21:26 +0800
Subject: [PATCH 16/26] remove scratch

---
 test_files/scratch/limit_pruning/data.parquet | Bin 2320 -> 0 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 test_files/scratch/limit_pruning/data.parquet

diff --git a/test_files/scratch/limit_pruning/data.parquet b/test_files/scratch/limit_pruning/data.parquet
deleted file mode 100644
index 535026d53c5553427b4ac2bbbcd277b2393460b5..0000000000000000000000000000000000000000

From 719fa82cb7d7d6e6968542aaec7509f4180ad051 Mon Sep 17 00:00:00 2001
From: "xudong.w"
Date: Wed, 7 Jan 2026 14:25:53 +0800
Subject: [PATCH 17/26] fix clippy

---
 datafusion/datasource-parquet/src/row_group_filter.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datafusion/datasource-parquet/src/row_group_filter.rs b/datafusion/datasource-parquet/src/row_group_filter.rs
index 974e0bcd9cc93..3f43c939f6ecb 100644
--- a/datafusion/datasource-parquet/src/row_group_filter.rs
+++ b/datafusion/datasource-parquet/src/row_group_filter.rs
@@ -135,7 +135,7 @@ impl RowGroupAccessPlanFilter {
     /// |   | Alpine Sheep                      | 83                          |  |
     /// |   +-----------------------------------+-----------------------------+  |
     /// +-----------------------------------------------------------------------+
-
+    ///
     /// Without limit pruning: Scan Partition 2 → Partition 3 → Partition 4 (until limit reached)
     /// With limit pruning: If Partition 3 contains enough rows to satisfy the limit,
     /// skip Partitions 2 and 4 entirely and go directly to Partition 3.
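The selection step inside `prune_by_limit` above is a greedy, order-preserving walk: it visits the still-accessible row groups in plan order, accumulates only the fully matched ones, and rewrites the access plan as soon as their combined row count covers the limit. A minimal sketch of that step, assuming flat slices in place of the real `ParquetAccessPlan` and row-group metadata accessors:

    // Returns the subset of row groups to scan if the limit can be satisfied
    // using only fully matched row groups, or None to keep the original plan.
    fn pick_fully_matched(
        accessible: &[usize],   // row group indexes still in the access plan
        fully_matched: &[bool], // per row group: do ALL rows match the predicate?
        num_rows: &[usize],     // per row group row counts from the metadata
        limit: usize,
    ) -> Option<Vec<usize>> {
        let mut chosen = Vec::new();
        let mut rows = 0;
        for &idx in accessible {
            if fully_matched[idx] {
                chosen.push(idx);
                rows += num_rows[idx];
                if rows >= limit {
                    // Enough fully matching rows: scan only `chosen`; every
                    // other accessible row group is reported as limit-pruned.
                    return Some(chosen);
                }
            }
        }
        None
    }

If no such subset exists, the original access plan is kept and the `limit_pruned_row_groups` metric reports zero pruned groups, as exercised by `test_limit_pruning_exceeds_fully_matched` above.
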
From 3540fd391f0a8849f6a2ac1c4cfb633c9ee6e828 Mon Sep 17 00:00:00 2001 From: "xudong.w" Date: Wed, 7 Jan 2026 14:39:24 +0800 Subject: [PATCH 18/26] refine comments --- datafusion/datasource-parquet/src/row_group_filter.rs | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/datafusion/datasource-parquet/src/row_group_filter.rs b/datafusion/datasource-parquet/src/row_group_filter.rs index 3f43c939f6ecb..e0907e140de40 100644 --- a/datafusion/datasource-parquet/src/row_group_filter.rs +++ b/datafusion/datasource-parquet/src/row_group_filter.rs @@ -91,13 +91,9 @@ impl RowGroupAccessPlanFilter { /// /// After initial filtering, row groups can be classified into three states: /// - /// ``` - /// PRUNING CLASSIFICATION DIAGRAM - /// ------------------------------ - /// Legend: - /// [ ] = Not Matching / Pruned - /// [X] = Partially Matching (Row Group/Page contains some matches) - /// [F] = Fully Matching (Entire range is within predicate) + /// 1. Not Matching / Pruned + /// 2. Partially Matching (Row Group/Page contains some matches) + /// 3. Fully Matching (Entire range is within predicate) /// /// +-----------------------------------------------------------------------+ /// | NOT MATCHING | From 8d60e96c14d44e75f456c376f33d1500ec1ac496 Mon Sep 17 00:00:00 2001 From: "xudong.w" Date: Wed, 7 Jan 2026 14:48:18 +0800 Subject: [PATCH 19/26] fix test --- datafusion/sqllogictest/test_files/limit_pruning.slt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/sqllogictest/test_files/limit_pruning.slt b/datafusion/sqllogictest/test_files/limit_pruning.slt index 0735e05cb8b6f..8a94bf8adc75f 100644 --- a/datafusion/sqllogictest/test_files/limit_pruning.slt +++ b/datafusion/sqllogictest/test_files/limit_pruning.slt @@ -63,7 +63,7 @@ set datafusion.explain.analyze_level = summary; query TT explain analyze select * from tracking_data where species > 'M' AND s >= 50 limit 3; ---- -Plan with Metrics DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/limit_pruning/data.parquet]]}, projection=[species, s], limit=3, file_type=parquet, predicate=species@0 > M AND s@1 >= 50, pruning_predicate=species_null_count@1 != row_count@2 AND species_max@0 > M AND s_null_count@4 != row_count@2 AND s_max@3 >= 50, required_guarantees=[], metrics=[output_rows=3, elapsed_compute=1ns, output_bytes=142.0 B, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=4 total → 3 matched -> 1 fully matched, row_groups_pruned_bloom_filter=3 total → 3 matched, page_index_rows_pruned=3 total → 3 matched, limit_pruned_row_groups=2 total → 0 matched, bytes_scanned=171, metadata_load_time=444.63µs, scan_efficiency_ratio=7.3% (171/2.35 K)] +Plan with Metrics DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/limit_pruning/data.parquet]]}, projection=[species, s], limit=3, file_type=parquet, predicate=species@0 > M AND s@1 >= 50, pruning_predicate=species_null_count@1 != row_count@2 AND species_max@0 > M AND s_null_count@4 != row_count@2 AND s_max@3 >= 50, required_guarantees=[], metrics=[output_rows=3, elapsed_compute=, output_bytes=, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=4 total → 3 matched -> 1 fully matched, row_groups_pruned_bloom_filter=3 total → 3 matched, page_index_rows_pruned=3 total → 3 matched, limit_pruned_row_groups=2 total → 0 matched, bytes_scanned=, metadata_load_time=, scan_efficiency_ratio= (171/2.35 K)] # 
limit_pruned_row_groups=0 total → 0 matched # because of order by, scan needs to preserve sort, so limit pruning is disabled @@ -71,8 +71,8 @@ query TT explain analyze select * from tracking_data where species > 'M' AND s >= 50 order by species limit 3; ---- Plan with Metrics -01)SortExec: TopK(fetch=3), expr=[species@0 ASC NULLS LAST], preserve_partitioning=[false], filter=[species@0 < Nlpine Sheep], metrics=[output_rows=3, elapsed_compute=2.69ms, output_bytes=72.0 B] -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/limit_pruning/data.parquet]]}, projection=[species, s], file_type=parquet, predicate=species@0 > M AND s@1 >= 50 AND DynamicFilter [ species@0 < Nlpine Sheep ], pruning_predicate=species_null_count@1 != row_count@2 AND species_max@0 > M AND s_null_count@4 != row_count@2 AND s_max@3 >= 50 AND species_null_count@1 != row_count@2 AND species_min@5 < Nlpine Sheep, required_guarantees=[], metrics=[output_rows=3, elapsed_compute=1ns, output_bytes=142.0 B, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=4 total → 3 matched -> 1 fully matched, row_groups_pruned_bloom_filter=3 total → 3 matched, page_index_rows_pruned=9 total → 9 matched, limit_pruned_row_groups=0 total → 0 matched, bytes_scanned=521, metadata_load_time=512.29µs, scan_efficiency_ratio=22% (521/2.35 K)] +01)SortExec: TopK(fetch=3), expr=[species@0 ASC NULLS LAST], preserve_partitioning=[false], filter=[species@0 < Nlpine Sheep], metrics=[output_rows=3, elapsed_compute=, output_bytes=] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/limit_pruning/data.parquet]]}, projection=[species, s], file_type=parquet, predicate=species@0 > M AND s@1 >= 50 AND DynamicFilter [ species@0 < Nlpine Sheep ], pruning_predicate=species_null_count@1 != row_count@2 AND species_max@0 > M AND s_null_count@4 != row_count@2 AND s_max@3 >= 50 AND species_null_count@1 != row_count@2 AND species_min@5 < Nlpine Sheep, required_guarantees=[], metrics=[output_rows=3, elapsed_compute=, output_bytes=, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=4 total → 3 matched -> 1 fully matched, row_groups_pruned_bloom_filter=3 total → 3 matched, page_index_rows_pruned=9 total → 9 matched, limit_pruned_row_groups=0 total → 0 matched, bytes_scanned=, metadata_load_time=, scan_efficiency_ratio= (521/2.35 K)] statement ok drop table tracking_data; From 88c1c2e62e51e7c1ccf7271f18fbef168a0b58b5 Mon Sep 17 00:00:00 2001 From: "xudong.w" Date: Thu, 8 Jan 2026 13:40:02 +0800 Subject: [PATCH 20/26] refine comments --- .../src/row_group_filter.rs | 42 +++++++++---------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/datafusion/datasource-parquet/src/row_group_filter.rs b/datafusion/datasource-parquet/src/row_group_filter.rs index e0907e140de40..f54b1cc4e1bc8 100644 --- a/datafusion/datasource-parquet/src/row_group_filter.rs +++ b/datafusion/datasource-parquet/src/row_group_filter.rs @@ -97,38 +97,38 @@ impl RowGroupAccessPlanFilter { /// /// +-----------------------------------------------------------------------+ /// | NOT MATCHING | - /// | Partition 1 | + /// | Row group 1 | /// | +-----------------------------------+-----------------------------+ | - /// | | SPECIES (min: 'B...',max: 'S...') | S (min: 7, max: 133) | | + /// | | SPECIES | S | | /// | +-----------------------------------+-----------------------------+ | /// | | Snow Vole | 7 | | - /// | | Brown Bear | 133 | | - 
/// | | Gray Wolf | 82 | | + /// | | Brown Bear | 133 ✅ | | + /// | | Gray Wolf | 82 ✅ | | /// | +-----------------------------------+-----------------------------+ | /// +-----------------------------------------------------------------------+ /// - /// +-----------------------------------------------------------------------+ - /// | PARTIALLY MATCHING | - /// | Partition 2 Partition 4 | - /// | +------------------+--------------+ +------------------+-------+ | - /// | | SPECIES | S | | SPECIES | S | | - /// | | (min:A, max:R) |(min:6,max:70)| | (min:A, max:P) |[4-51] | | - /// | +------------------+--------------+ +------------------+-------+ | - /// | | Lynx | 71 | | Europ. Mole | 4 | | - /// | | Red Fox | 40 | | Polecat | 16 | | - /// | | Alpine Bat | 6 | | Alpine Ibex | 97 | | - /// | +------------------+--------------+ +------------------+-------+ | - /// +-----------------------------------------------------------------------+ + /// +---------------------------------------------------------------------------+ + /// | PARTIALLY MATCHING | + /// | | + /// | Row group 2 Row group 4 | + /// | +------------------+--------------+ +------------------+----------+ | + /// | | SPECIES | S | | SPECIES | S | | + /// | +------------------+--------------+ +------------------+----------+ | + /// | | Lynx | 71 ✅ | | Europ. Mole | 4 | | + /// | | Red Fox | 40 | | Polecat | 16 | | + /// | | Alpine Bat ✅ | 6 | | Alpine Ibex ✅ | 97 ✅ | | + /// | +------------------+--------------+ +------------------+----------+ | + /// +---------------------------------------------------------------------------+ /// /// +-----------------------------------------------------------------------+ /// | FULLY MATCHING | - /// | Partition 3 | + /// | Row group 3 | /// | +-----------------------------------+-----------------------------+ | - /// | | SPECIES (min: 'A...',max: 'A...') | S (min: 76, max: 101) | | + /// | | SPECIES | S | | /// | +-----------------------------------+-----------------------------+ | - /// | | Alpine Ibex | 101 | | - /// | | Alpine Goat | 76 | | - /// | | Alpine Sheep | 83 | | + /// | | Alpine Ibex ✅ | 101 ✅ | | + /// | | Alpine Goat ✅ | 76 ✅ | | + /// | | Alpine Sheep ✅ | 83 ✅ | | /// | +-----------------------------------+-----------------------------+ | /// +-----------------------------------------------------------------------+ /// From f67193b072c533c3bddcb4d9a7d5a559c3cea6b1 Mon Sep 17 00:00:00 2001 From: "xudong.w" Date: Thu, 8 Jan 2026 14:16:05 +0800 Subject: [PATCH 21/26] rich comment --- datafusion/datasource/src/file_scan_config.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/datafusion/datasource/src/file_scan_config.rs b/datafusion/datasource/src/file_scan_config.rs index 50cdc5c78f804..009c1d822c491 100644 --- a/datafusion/datasource/src/file_scan_config.rs +++ b/datafusion/datasource/src/file_scan_config.rs @@ -153,6 +153,9 @@ pub struct FileScanConfig { /// all records after filtering are returned. pub limit: Option, /// Whether the scan's limit is order sensitive + /// When `true`, files must be read in the exact order specified to produce + /// correct results (e.g., for `ORDER BY ... LIMIT` queries). When `false`, + /// DataFusion may reorder file processing for optimization without affecting correctness. pub preserve_order: bool, /// All equivalent lexicographical orderings that describe the schema. pub output_ordering: Vec, @@ -288,6 +291,9 @@ impl FileScanConfigBuilder { } /// Set whether the limit should be order-sensitive. 
+ /// When `true`, files must be read in the exact order specified to produce + /// correct results (e.g., for `ORDER BY ... LIMIT` queries). When `false`, + /// DataFusion may reorder file processing for optimization without affecting correctness. pub fn with_preserve_order(mut self, order_sensitive: bool) -> Self { self.preserve_order = order_sensitive; self From 038285e5fed942f87f7238ef84371219215cbc5b Mon Sep 17 00:00:00 2001 From: "xudong.w" Date: Fri, 9 Jan 2026 17:47:19 +0800 Subject: [PATCH 22/26] remove downcast --- datafusion/datasource-parquet/src/opener.rs | 4 +- datafusion/datasource/src/file_scan_config.rs | 15 +++++ datafusion/datasource/src/source.rs | 17 ++++++ .../physical-optimizer/src/limit_pushdown.rs | 57 ++++--------------- .../physical-plan/src/coalesce_partitions.rs | 13 +++++ .../physical-plan/src/execution_plan.rs | 13 +++++ datafusion/physical-plan/src/filter.rs | 13 +++++ datafusion/physical-plan/src/projection.rs | 13 +++++ .../src/sorts/sort_preserving_merge.rs | 13 +++++ 9 files changed, 109 insertions(+), 49 deletions(-) diff --git a/datafusion/datasource-parquet/src/opener.rs b/datafusion/datasource-parquet/src/opener.rs index af2cb88b8aa52..8f31d2df24ac2 100644 --- a/datafusion/datasource-parquet/src/opener.rs +++ b/datafusion/datasource-parquet/src/opener.rs @@ -69,13 +69,13 @@ use parquet::file::metadata::{PageIndexPolicy, ParquetMetaDataReader, RowGroupMe /// Implements [`FileOpener`] for a parquet file pub(super) struct ParquetOpener { /// Execution partition index - pub partition_index: usize, + pub(crate) partition_index: usize, /// Projection to apply on top of the table schema (i.e. can reference partition columns). pub projection: ProjectionExprs, /// Target number of rows in each output RecordBatch pub batch_size: usize, /// Optional limit on the number of rows to read - pub limit: Option, + pub(crate) limit: Option, /// If should keep the output rows in order pub preserve_order: bool, /// Optional predicate to apply during the scan diff --git a/datafusion/datasource/src/file_scan_config.rs b/datafusion/datasource/src/file_scan_config.rs index 009c1d822c491..51b9ba9e06e9b 100644 --- a/datafusion/datasource/src/file_scan_config.rs +++ b/datafusion/datasource/src/file_scan_config.rs @@ -484,6 +484,9 @@ impl FileScanConfigBuilder { let file_compression_type = file_compression_type.unwrap_or(FileCompressionType::UNCOMPRESSED); + // If there is an output ordering, we should preserve it. + let preserve_order = preserve_order || !output_ordering.is_empty(); + FileScanConfig { object_store_url, file_source, @@ -869,6 +872,18 @@ impl DataSource for FileScanConfig { } } } + + fn with_preserve_order(&self, preserve_order: bool) -> Option> { + if self.preserve_order == preserve_order { + return Some(Arc::new(self.clone())); + } + + let new_config = FileScanConfig { + preserve_order, + ..self.clone() + }; + Some(Arc::new(new_config)) + } } impl FileScanConfig { diff --git a/datafusion/datasource/src/source.rs b/datafusion/datasource/src/source.rs index a3892dfac9778..de18b6be2235f 100644 --- a/datafusion/datasource/src/source.rs +++ b/datafusion/datasource/src/source.rs @@ -210,6 +210,11 @@ pub trait DataSource: Send + Sync + Debug { ) -> Result>> { Ok(SortOrderPushdownResult::Unsupported) } + + /// Returns a variant of this `DataSource` that is aware of order-sensitivity. 
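+    /// Implementations that can honor the hint return a copy of themselves
+    /// with the flag applied (see `FileScanConfig::with_preserve_order`
+    /// above); the default implementation returns `None`, in which case
+    /// callers should keep the original source unchanged.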
+    fn with_preserve_order(&self, _preserve_order: bool) -> Option<Arc<dyn DataSource>> {
+        None
+    }
 }
 
 /// [`ExecutionPlan`] that reads one or more files
@@ -393,6 +398,18 @@ impl ExecutionPlan for DataSourceExec {
             Ok(Arc::new(new_exec) as Arc<dyn ExecutionPlan>)
         })
     }
+
+    fn with_preserve_order(
+        &self,
+        preserve_order: bool,
+    ) -> Option<Arc<dyn ExecutionPlan>> {
+        self.data_source
+            .with_preserve_order(preserve_order)
+            .map(|new_data_source| {
+                Arc::new(self.clone().with_data_source(new_data_source))
+                    as Arc<dyn ExecutionPlan>
+            })
+    }
 }
 
 impl DataSourceExec {
diff --git a/datafusion/physical-optimizer/src/limit_pushdown.rs b/datafusion/physical-optimizer/src/limit_pushdown.rs
index d259025b61bf1..a4dac81dbacf8 100644
--- a/datafusion/physical-optimizer/src/limit_pushdown.rs
+++ b/datafusion/physical-optimizer/src/limit_pushdown.rs
@@ -27,8 +27,6 @@ use datafusion_common::config::ConfigOptions;
 use datafusion_common::error::Result;
 use datafusion_common::tree_node::{Transformed, TreeNodeRecursion};
 use datafusion_common::utils::combine_limit;
-use datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder};
-use datafusion_datasource::source::DataSourceExec;
 use datafusion_physical_plan::coalesce_partitions::CoalescePartitionsExec;
 use datafusion_physical_plan::limit::{GlobalLimitExec, LocalLimitExec};
 use datafusion_physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec;
@@ -52,7 +50,7 @@ pub struct GlobalRequirements {
     fetch: Option<usize>,
     skip: usize,
     satisfied: bool,
-    order_sensitive: bool,
+    preserve_order: bool,
 }
 
 impl LimitPushdown {
@@ -72,7 +70,7 @@ impl PhysicalOptimizerRule for LimitPushdown {
             fetch: None,
             skip: 0,
             satisfied: false,
-            order_sensitive: false,
+            preserve_order: false,
         };
         pushdown_limits(plan, global_state)
     }
@@ -116,7 +114,7 @@ impl LimitExec {
         }
     }
 
-    fn order_sensitive(&self) -> bool {
+    fn preserve_order(&self) -> bool {
         match self {
             Self::Global(global) => global.required_ordering().is_some(),
             Self::Local(local) => local.required_ordering().is_some(),
@@ -156,7 +154,7 @@ pub fn pushdown_limit_helper(
     );
     global_state.skip = skip;
     global_state.fetch = fetch;
-    global_state.order_sensitive = limit_exec.order_sensitive();
+    global_state.preserve_order = limit_exec.preserve_order();
 
     // Now the global state has the most recent information, we can remove
     // the `LimitExec` plan. We will decide later if we should add it again
@@ -253,10 +251,9 @@ pub fn pushdown_limit_helper(
     let maybe_fetchable = pushdown_plan.with_fetch(skip_and_fetch);
     if global_state.satisfied {
         if let Some(plan_with_fetch) = maybe_fetchable {
-            let plan_with_preserve_order = ensure_preserve_order_if_needed(
-                plan_with_fetch,
-                global_state.order_sensitive,
-            );
+            let plan_with_preserve_order = plan_with_fetch
+                .with_preserve_order(global_state.preserve_order)
+                .unwrap_or(plan_with_fetch);
             Ok((Transformed::yes(plan_with_preserve_order), global_state))
         } else {
             Ok((Transformed::no(pushdown_plan), global_state))
@@ -264,10 +261,9 @@
     } else {
         global_state.satisfied = true;
         pushdown_plan = if let Some(plan_with_fetch) = maybe_fetchable {
-            let plan_with_preserve_order = ensure_preserve_order_if_needed(
-                plan_with_fetch,
-                global_state.order_sensitive,
-            );
+            let plan_with_preserve_order = plan_with_fetch
+                .with_preserve_order(global_state.preserve_order)
+                .unwrap_or(plan_with_fetch);
 
             if global_skip > 0 {
                 add_global_limit(
@@ -362,37 +358,4 @@ fn add_global_limit(
     Arc::new(GlobalLimitExec::new(pushdown_plan, skip, fetch)) as _
 }
 
-/// Helper function to handle DataSourceExec preserve_order setting
-fn ensure_preserve_order_if_needed(
-    plan: Arc<dyn ExecutionPlan>,
-    order_sensitive: bool,
-) -> Arc<dyn ExecutionPlan> {
-    if !order_sensitive {
-        return plan;
-    }
-
-    let Some(data_source_exec) = plan.as_any().downcast_ref::<DataSourceExec>() else {
-        return plan;
-    };
-
-    let Some(file_scan_config) = data_source_exec
-        .data_source()
-        .as_any()
-        .downcast_ref::<FileScanConfig>()
-    else {
-        return plan;
-    };
-
-    if file_scan_config.preserve_order {
-        return plan;
-    }
-
-    let new_config = FileScanConfigBuilder::from(file_scan_config.clone())
-        .with_preserve_order(true)
-        .build();
-
-    let new_data_source_exec = DataSourceExec::new(Arc::new(new_config));
-    Arc::new(new_data_source_exec) as Arc<dyn ExecutionPlan>
-}
-
 // See tests in datafusion/core/tests/physical_optimizer
diff --git a/datafusion/physical-plan/src/coalesce_partitions.rs b/datafusion/physical-plan/src/coalesce_partitions.rs
index d83f90eb3d8c1..22dcc85d6ea3a 100644
--- a/datafusion/physical-plan/src/coalesce_partitions.rs
+++ b/datafusion/physical-plan/src/coalesce_partitions.rs
@@ -278,6 +278,19 @@ impl ExecutionPlan for CoalescePartitionsExec {
         }))
     }
 
+    fn with_preserve_order(
+        &self,
+        preserve_order: bool,
+    ) -> Option<Arc<dyn ExecutionPlan>> {
+        self.input
+            .with_preserve_order(preserve_order)
+            .and_then(|new_input| {
+                Arc::new(self.clone())
+                    .with_new_children(vec![new_input])
+                    .ok()
+            })
+    }
+
     fn gather_filters_for_pushdown(
         &self,
         _phase: FilterPushdownPhase,
diff --git a/datafusion/physical-plan/src/execution_plan.rs b/datafusion/physical-plan/src/execution_plan.rs
index 06da0b8933c18..9101cbb00944b 100644
--- a/datafusion/physical-plan/src/execution_plan.rs
+++ b/datafusion/physical-plan/src/execution_plan.rs
@@ -708,6 +708,19 @@ pub trait ExecutionPlan: Debug + DisplayAs + Send + Sync {
     ) -> Result<SortOrderPushdownResult<Arc<dyn ExecutionPlan>>> {
         Ok(SortOrderPushdownResult::Unsupported)
     }
+
+    /// Returns a variant of this `ExecutionPlan` that is aware of order-sensitivity.
+    ///
+    /// This is used to signal to data sources that the output ordering must be
+    /// preserved, even if it might be more efficient to ignore it (e.g. by
+    /// skipping some row groups in Parquet).
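+    ///
+    /// Single-child wrappers (see the `FilterExec`, `ProjectionExec`, and
+    /// `SortPreservingMergeExec` impls in this patch) delegate to their child
+    /// and rebuild themselves via `with_new_children`; the default of `None`
+    /// means the node is not order-aware, so callers keep the original plan.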
+    ///
+    fn with_preserve_order(
+        &self,
+        _preserve_order: bool,
+    ) -> Option<Arc<dyn ExecutionPlan>> {
+        None
+    }
 }
 
 /// [`ExecutionPlan`] Invariant Level
diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs
index 674fe6692adf5..a1c627c959951 100644
--- a/datafusion/physical-plan/src/filter.rs
+++ b/datafusion/physical-plan/src/filter.rs
@@ -615,6 +615,19 @@ impl ExecutionPlan for FilterExec {
             fetch,
         }))
     }
+
+    fn with_preserve_order(
+        &self,
+        preserve_order: bool,
+    ) -> Option<Arc<dyn ExecutionPlan>> {
+        self.input
+            .with_preserve_order(preserve_order)
+            .and_then(|new_input| {
+                Arc::new(self.clone())
+                    .with_new_children(vec![new_input])
+                    .ok()
+            })
+    }
 }
 
 impl EmbeddedProjection for FilterExec {
diff --git a/datafusion/physical-plan/src/projection.rs b/datafusion/physical-plan/src/projection.rs
index e8608f17a1b20..8f2f2219f4338 100644
--- a/datafusion/physical-plan/src/projection.rs
+++ b/datafusion/physical-plan/src/projection.rs
@@ -427,6 +427,19 @@ impl ExecutionPlan for ProjectionExec {
             }
         }
     }
+
+    fn with_preserve_order(
+        &self,
+        preserve_order: bool,
+    ) -> Option<Arc<dyn ExecutionPlan>> {
+        self.input
+            .with_preserve_order(preserve_order)
+            .and_then(|new_input| {
+                Arc::new(self.clone())
+                    .with_new_children(vec![new_input])
+                    .ok()
+            })
+    }
 }
 
 impl ProjectionStream {
diff --git a/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs b/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs
index 0ddea90a98bf3..68c457a0d8a3c 100644
--- a/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs
+++ b/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs
@@ -245,6 +245,19 @@ impl ExecutionPlan for SortPreservingMergeExec {
         }))
     }
 
+    fn with_preserve_order(
+        &self,
+        preserve_order: bool,
+    ) -> Option<Arc<dyn ExecutionPlan>> {
+        self.input
+            .with_preserve_order(preserve_order)
+            .and_then(|new_input| {
+                Arc::new(self.clone())
+                    .with_new_children(vec![new_input])
+                    .ok()
+            })
+    }
+
     fn required_input_distribution(&self) -> Vec<Distribution> {
         vec![Distribution::UnspecifiedDistribution]
     }

From 661a2c24790f4974aa0b9daf98600de231537595 Mon Sep 17 00:00:00 2001
From: "xudong.w"
Date: Fri, 9 Jan 2026 17:51:58 +0800
Subject: [PATCH 23/26] remove dependency

---
 Cargo.lock                               | 1 -
 datafusion/physical-optimizer/Cargo.toml | 1 -
 2 files changed, 2 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index f5e01ea1e10e8..2d40ab4506900 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2427,7 +2427,6 @@ version = "52.0.0"
 dependencies = [
  "arrow",
  "datafusion-common",
- "datafusion-datasource",
  "datafusion-execution",
  "datafusion-expr",
  "datafusion-expr-common",
diff --git a/datafusion/physical-optimizer/Cargo.toml b/datafusion/physical-optimizer/Cargo.toml
index caa9ee7b46914..395da10d629ba 100644
--- a/datafusion/physical-optimizer/Cargo.toml
+++ b/datafusion/physical-optimizer/Cargo.toml
@@ -43,7 +43,6 @@ recursive_protection = ["dep:recursive"]
 [dependencies]
 arrow = { workspace = true }
 datafusion-common = { workspace = true }
-datafusion-datasource = { workspace = true }
 datafusion-execution = { workspace = true }
 datafusion-expr = { workspace = true }
 datafusion-expr-common = { workspace = true, default-features = true }

From ca7de4fae56aaa6fad7fb15230e5be139dabfaa3 Mon Sep 17 00:00:00 2001
From: "xudong.w"
Date: Tue, 13 Jan 2026 16:00:18 +0800
Subject: [PATCH 24/26] add an example

---
 .../src/row_group_filter.rs | 27 +++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/datafusion/datasource-parquet/src/row_group_filter.rs b/datafusion/datasource-parquet/src/row_group_filter.rs
index f54b1cc4e1bc8..935f179ff4702 100644
--- a/datafusion/datasource-parquet/src/row_group_filter.rs
+++ b/datafusion/datasource-parquet/src/row_group_filter.rs
@@ -132,6 +132,33 @@ impl RowGroupAccessPlanFilter {
     /// | +-----------------------------------+-----------------------------+ |
     /// +-----------------------------------------------------------------------+
     ///
+    /// # Example with Statistics Truncation and NOT Inversion
+    ///
+    /// When statistics are truncated to length 6 (e.g., `statistics_truncate_length = 6`),
+    /// the min/max values become:
+    ///
+    /// ```
+    /// Row group 3: species_min="Alpine", species_max="Alpine" (truncated from "Alpine Ibex"/"Alpine Sheep")
+    ///              s_min=76, s_max=101
+    /// ```
+    ///
+    /// To identify this as fully matching, the system uses NOT inversion:
+    /// 1. Original predicate: `species LIKE 'Alpine%' AND s >= 50`
+    /// 2. Inverted predicate: `NOT (species LIKE 'Alpine%' AND s >= 50)`
+    ///    Simplified to: `species NOT LIKE 'Alpine%' OR s < 50`
+    /// 3. Pruning predicate generated:
+    ///    `(species_min NOT LIKE 'Alpine%' OR species_max NOT LIKE 'Alpine%') OR s_min < 50`
+    ///
+    /// For row group 3 with truncated stats:
+    /// - Evaluating `species_min NOT LIKE 'Alpine%'`: `"Alpine" NOT LIKE 'Alpine%'` = `false`
+    /// - Evaluating `species_max NOT LIKE 'Alpine%'`: `"Alpine" NOT LIKE 'Alpine%'` = `false`
+    /// - Evaluating `s_min < 50`: `76 < 50` = `false`
+    /// - Final result: `(false OR false) OR false` = `false`
+    ///
+    /// Since the inverted predicate would prune this row group (returns false), it means
+    /// no rows in this group could possibly satisfy the inverted predicate.
+    /// Therefore, all rows in this group must match the original predicate, making it fully matched.
+    ///
     /// Without limit pruning: Scan Partition 2 → Partition 3 → Partition 4 (until limit reached)
     /// With limit pruning: If Partition 3 contains enough rows to satisfy the limit,
     /// skip Partitions 2 and 4 entirely and go directly to Partition 3.
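The walkthrough in the doc comment above can be condensed into a minimal, self-contained sketch of the same min/max check. This is plain Rust, not DataFusion's actual pruning code; the `fully_matches` function and the hard-coded `'Alpine'`/`'Alpinf'` bounds are illustrative only:

```rust
/// Sketch of the "invert and prune" check for `species LIKE 'Alpine%'`,
/// i.e. the candidate range `'Alpine' <= species < 'Alpinf'`. A row group
/// is FULLY matching when its statistics prove that the inverted condition
/// (`species < 'Alpine' OR species >= 'Alpinf'`) can hold for no row.
fn fully_matches(species_min: &str, species_max: &str) -> bool {
    // Some row could satisfy `species < 'Alpine'` only if min < 'Alpine';
    // some row could satisfy `species >= 'Alpinf'` only if max >= 'Alpinf'.
    let inverted_possible = species_min < "Alpine" || species_max >= "Alpinf";
    !inverted_possible
}

fn main() {
    // Row group 3 with stats truncated to length 6: ["Alpine", "Alpine"]
    assert!(fully_matches("Alpine", "Alpine"));
    // Same group with stats truncated to length 3: ["Alp", "Alq"] is too
    // broad to prove a full match (the range could include e.g. "Alpha").
    assert!(!fully_matches("Alp", "Alq"));
}
```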
From d2b84d4d9ebf2d0ceaf2a4a5e64853949d9ddcb1 Mon Sep 17 00:00:00 2001 From: "xudong.w" Date: Tue, 13 Jan 2026 16:26:41 +0800 Subject: [PATCH 25/26] fix doc test --- datafusion/datasource-parquet/src/row_group_filter.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/datasource-parquet/src/row_group_filter.rs b/datafusion/datasource-parquet/src/row_group_filter.rs index 935f179ff4702..c3c803dbfc818 100644 --- a/datafusion/datasource-parquet/src/row_group_filter.rs +++ b/datafusion/datasource-parquet/src/row_group_filter.rs @@ -137,7 +137,7 @@ impl RowGroupAccessPlanFilter { /// When statistics are truncated to length 6 (e.g., `statistics_truncate_length = 6`), /// the min/max values become: /// - /// ``` + /// ```text /// Row group 3: species_min="Alpine", species_max="Alpine" (truncated from "Alpine Ibex"/"Alpine Sheep") /// s_min=76, s_max=101 /// ``` From 6c515b2befb8ff780d12f21c7ccedacc078e186c Mon Sep 17 00:00:00 2001 From: "xudong.w" Date: Thu, 15 Jan 2026 14:32:23 +0800 Subject: [PATCH 26/26] update doc --- .../src/row_group_filter.rs | 45 ++++++++++--------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/datafusion/datasource-parquet/src/row_group_filter.rs b/datafusion/datasource-parquet/src/row_group_filter.rs index c3c803dbfc818..7eea8285ad6b5 100644 --- a/datafusion/datasource-parquet/src/row_group_filter.rs +++ b/datafusion/datasource-parquet/src/row_group_filter.rs @@ -132,32 +132,35 @@ impl RowGroupAccessPlanFilter { /// | +-----------------------------------+-----------------------------+ | /// +-----------------------------------------------------------------------+ /// - /// # Example with Statistics Truncation and NOT Inversion + /// ### Identification of Fully Matching Row Groups /// - /// When statistics are truncated to length 6 (e.g., `statistics_truncate_length = 6`), - /// the min/max values become: + /// DataFusion identifies row groups where ALL rows satisfy the filter by inverting the + /// predicate and checking if statistics prove the inverted version is false for the group. /// - /// ```text - /// Row group 3: species_min="Alpine", species_max="Alpine" (truncated from "Alpine Ibex"/"Alpine Sheep") - /// s_min=76, s_max=101 - /// ``` + /// For example, prefix matches like `species LIKE 'Alpine%'` are pruned using ranges: + /// 1. Candidate Range: `species >= 'Alpine' AND species < 'Alpinf'` + /// 2. Inverted Condition (to prove full match): `species < 'Alpine' OR species >= 'Alpinf'` + /// 3. Statistical Evaluation (check if any row *could* satisfy the inverted condition): + /// `min < 'Alpine' OR max >= 'Alpinf'` /// - /// To identify this as fully matching, the system uses NOT inversion: - /// 1. Original predicate: `species LIKE 'Alpine%' AND s >= 50` - /// 2. Inverted predicate: `NOT (species LIKE 'Alpine%' AND s >= 50)` - /// Simplified to: `species NOT LIKE 'Alpine%' OR s < 50` - /// 3. Pruning predicate generated: - /// `(species_min NOT LIKE 'Alpine%' OR species_max NOT LIKE 'Alpine%') OR s_min < 50` + /// If this evaluation is **false**, it proves no row can fail the original filter, + /// so the row group is **FULLY MATCHING**. 
     ///
-    /// For row group 3 with truncated stats:
-    /// - Evaluating `species_min NOT LIKE 'Alpine%'`: `"Alpine" NOT LIKE 'Alpine%'` = `false`
-    /// - Evaluating `species_max NOT LIKE 'Alpine%'`: `"Alpine" NOT LIKE 'Alpine%'` = `false`
-    /// - Evaluating `s_min < 50`: `76 < 50` = `false`
-    /// - Final result: `(false OR false) OR false` = `false`
+    /// ### Impact of Statistics Truncation
     ///
-    /// Since the inverted predicate would prune this row group (returns false), it means
-    /// no rows in this group could possibly satisfy the inverted predicate.
-    /// Therefore, all rows in this group must match the original predicate, making it fully matched.
+    /// The precision of pruning depends on the quality of the metadata: truncated
+    /// statistics may prevent the system from proving a full match.
+    ///
+    /// **Example**: `WHERE species LIKE 'Alpine%'` (target range: `['Alpine', 'Alpinf')`)
+    ///
+    /// | Truncation Length | min / max           | Inverted Evaluation                                                | Status                 |
+    /// |-------------------|---------------------|--------------------------------------------------------------------|------------------------|
+    /// | **Length 6**      | `Alpine` / `Alpine` | `"Alpine" < "Alpine" (F) OR "Alpine" >= "Alpinf" (F)` -> **false** | **FULLY MATCHING**     |
+    /// | **Length 3**      | `Alp` / `Alq`       | `"Alp" < "Alpine" (T) OR "Alq" >= "Alpinf" (T)` -> **true**        | **PARTIALLY MATCHING** |
+    ///
+    /// Even though Row Group 3 only contains matching rows, truncation to length 3 makes
+    /// the statistics `[Alp, Alq]` too broad to prove it (they could include "Alpha").
+    /// The system must conservatively scan the group.
     ///
     /// Without limit pruning: Scan Partition 2 → Partition 3 → Partition 4 (until limit reached)
     /// With limit pruning: If Partition 3 contains enough rows to satisfy the limit,