
Commit a1ba264

Enhance DataFrame streaming to preserve partition order and update related tests
1 parent f2e41cd commit a1ba264

3 files changed: +52 -38 lines changed

python/datafusion/dataframe.py

Lines changed: 4 additions & 2 deletions

@@ -1117,7 +1117,8 @@ def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
             Arrow PyCapsule object representing an ``ArrowArrayStream``.
         """
         # ``DataFrame.__arrow_c_stream__`` in the Rust extension leverages
-        # ``execute_stream`` under the hood to stream batches one at a time.
+        # ``execute_stream_partitioned`` under the hood to stream batches while
+        # preserving the original partition order.
         return self.df.__arrow_c_stream__(requested_schema)
 
     def __iter__(self) -> Iterator[pa.RecordBatch]:
@@ -1126,7 +1127,8 @@ def __iter__(self) -> Iterator[pa.RecordBatch]:
         This implementation streams record batches via the Arrow C Stream
         interface, allowing callers such as :func:`pyarrow.Table.from_batches` to
         consume results lazily. The DataFrame is executed using DataFusion's
-        streaming APIs so ``collect`` is never invoked.
+        partitioned streaming APIs so ``collect`` is never invoked and batch
+        order across partitions is preserved.
         """
         import pyarrow as pa
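Taken together, the two docstrings describe the user-visible contract: a DataFrame can be consumed lazily as an iterator of record batches, now yielded in partition order. A minimal consumer sketch (the context setup and column values here are illustrative, mirroring the test below):

    import pyarrow as pa
    from datafusion import SessionContext

    # Two single-batch partitions; each inner list becomes one partition.
    ctx = SessionContext()
    batch1 = pa.RecordBatch.from_pydict({"a": [1, 2]})
    batch2 = pa.RecordBatch.from_pydict({"a": [3, 4]})
    df = ctx.create_dataframe([[batch1], [batch2]])

    # ``__iter__`` streams batches via the Arrow C Stream interface, so the
    # table is assembled lazily; batch1's rows now always precede batch2's.
    table = pa.Table.from_batches(df)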
python/tests/test_dataframe.py

Lines changed: 4 additions & 2 deletions

@@ -1603,9 +1603,11 @@ def test_arrow_c_stream_to_table(fail_collect):
     df = ctx.create_dataframe([[batch1], [batch2]])
 
     table = pa.Table.from_batches(df)
-    expected = pa.Table.from_batches([batch1, batch2])
+    batches = table.to_batches()
 
-    assert table.equals(expected)
+    assert len(batches) == 2
+    assert batches[0].equals(batch1)
+    assert batches[1].equals(batch2)
     assert table.schema == df.schema()
     assert table.column("a").num_chunks == 2
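Asserting on ``table.to_batches()`` element by element is stricter than the old ``table.equals(expected)``: it pins down which partition produced which chunk, not just the concatenated values. An illustrative way to inspect the chunk layout the test relies on:

    # Illustrative: one chunk per streamed partition batch, in partition order.
    for i, chunk in enumerate(table.column("a").chunks):
        print(f"chunk {i}: {chunk.to_pylist()}")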

src/dataframe.rs

Lines changed: 44 additions & 34 deletions

@@ -383,50 +383,59 @@ impl PyDataFrame {
         Ok(html_str)
     }
 }
-/// Synchronous wrapper around a [`SendableRecordBatchStream`] used for
-/// the `__arrow_c_stream__` implementation.
+
+/// Synchronous wrapper around partitioned [`SendableRecordBatchStream`]s used
+/// for the `__arrow_c_stream__` implementation.
 ///
-/// It uses `runtime.block_on` to consume the underlying async stream,
-/// providing synchronous iteration. When a `projection` is set, each
-/// batch is converted via `record_batch_into_schema` to apply schema
-/// changes per batch.
-struct DataFrameStreamReader {
-    stream: SendableRecordBatchStream,
+/// It drains each partition's stream sequentially, yielding record batches in
+/// their original partition order. When a `projection` is set, each batch is
+/// converted via `record_batch_into_schema` to apply schema changes per batch.
+struct PartitionedDataFrameStreamReader {
+    streams: Vec<SendableRecordBatchStream>,
     schema: SchemaRef,
     projection: Option<SchemaRef>,
+    current: usize,
 }
 
-impl Iterator for DataFrameStreamReader {
+impl Iterator for PartitionedDataFrameStreamReader {
     type Item = Result<RecordBatch, ArrowError>;
 
     fn next(&mut self) -> Option<Self::Item> {
-        // Use wait_for_future to poll the underlying async stream while
-        // respecting Python signal handling (e.g. ``KeyboardInterrupt``).
-        // This mirrors the behaviour of other synchronous wrappers and
-        // prevents blocking indefinitely when a Python interrupt is raised.
-        let fut = poll_next_batch(&mut self.stream);
-        let result = Python::with_gil(|py| wait_for_future(py, fut));
-
-        match result {
-            Ok(Ok(Some(batch))) => {
-                let batch = if let Some(ref schema) = self.projection {
-                    match record_batch_into_schema(batch, schema.as_ref()) {
-                        Ok(b) => b,
-                        Err(e) => return Some(Err(e)),
-                    }
-                } else {
-                    batch
-                };
-                Some(Ok(batch))
+        while self.current < self.streams.len() {
+            let stream = &mut self.streams[self.current];
+            let fut = poll_next_batch(stream);
+            let result = Python::with_gil(|py| wait_for_future(py, fut));
+
+            match result {
+                Ok(Ok(Some(batch))) => {
+                    let batch = if let Some(ref schema) = self.projection {
+                        match record_batch_into_schema(batch, schema.as_ref()) {
+                            Ok(b) => b,
+                            Err(e) => return Some(Err(e)),
+                        }
+                    } else {
+                        batch
+                    };
+                    return Some(Ok(batch));
+                }
+                Ok(Ok(None)) => {
+                    self.current += 1;
+                    continue;
+                }
+                Ok(Err(e)) => {
+                    return Some(Err(ArrowError::ExternalError(Box::new(e))));
+                }
+                Err(e) => {
+                    return Some(Err(ArrowError::ExternalError(Box::new(e))));
+                }
             }
-            Ok(Ok(None)) => None,
-            Ok(Err(e)) => Some(Err(ArrowError::ExternalError(Box::new(e)))),
-            Err(e) => Some(Err(ArrowError::ExternalError(Box::new(e)))),
         }
+
+        None
     }
 }
 
-impl RecordBatchReader for DataFrameStreamReader {
+impl RecordBatchReader for PartitionedDataFrameStreamReader {
     fn schema(&self) -> SchemaRef {
         self.schema.clone()
     }
@@ -958,7 +967,7 @@ impl PyDataFrame {
         requested_schema: Option<Bound<'py, PyCapsule>>,
     ) -> PyDataFusionResult<Bound<'py, PyCapsule>> {
         let df = self.df.as_ref().clone();
-        let stream = spawn_stream(py, async move { df.execute_stream().await })?;
+        let streams = spawn_streams(py, async move { df.execute_stream_partitioned().await })?;
 
         let mut schema: Schema = self.df.schema().to_owned().into();
         let mut projection: Option<SchemaRef> = None;
@@ -975,10 +984,11 @@ impl PyDataFrame {
 
         let schema_ref = Arc::new(schema.clone());
 
-        let reader = DataFrameStreamReader {
-            stream,
+        let reader = PartitionedDataFrameStreamReader {
+            streams,
             schema: schema_ref,
             projection,
+            current: 0,
         };
         let reader: Box<dyn RecordBatchReader + Send> = Box::new(reader);
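The reader's control flow reduces to draining each partition stream to exhaustion before advancing to the next. A Python analogue of that strategy (illustrative only, not part of the patch):

    from typing import Iterable, Iterator, TypeVar

    T = TypeVar("T")

    def drain_in_order(streams: Iterable[Iterable[T]]) -> Iterator[T]:
        # Mirrors PartitionedDataFrameStreamReader::next: exhaust partition 0,
        # then partition 1, and so on, so cross-partition order is preserved.
        for stream in streams:
            yield from stream

    # list(drain_in_order([[1, 2], [3, 4]])) == [1, 2, 3, 4]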