@@ -25,7 +25,9 @@
 from typing import (
     TYPE_CHECKING,
     Any,
+    AsyncIterator,
     Iterable,
+    Iterator,
     Literal,
     Optional,
     Union,
@@ -42,7 +44,7 @@
 from datafusion._internal import ParquetWriterOptions as ParquetWriterOptionsInternal
 from datafusion.expr import Expr, SortExpr, sort_or_default
 from datafusion.plan import ExecutionPlan, LogicalPlan
-from datafusion.record_batch import RecordBatchStream
+from datafusion.record_batch import RecordBatch, RecordBatchStream
 
 if TYPE_CHECKING:
     import pathlib
@@ -296,6 +298,9 @@ def __init__(
 class DataFrame:
     """Two dimensional table representation of data.
 
+    DataFrame objects are iterable; iterating over a DataFrame yields
+    :class:`datafusion.RecordBatch` instances lazily.
+
     See :ref:`user_guide_concepts` in the online documentation for more information.
     """
@@ -312,7 +317,7 @@ def into_view(self) -> pa.Table:
         return self.df.into_view()
 
     def __getitem__(self, key: str | list[str]) -> DataFrame:
-        """Return a new :py:class`DataFrame` with the specified column or columns.
+        """Return a new :py:class:`DataFrame` with the specified column or columns.
 
         Args:
             key: Column name or list of column names to select.
@@ -1105,21 +1110,33 @@ def unnest_columns(self, *columns: str, preserve_nulls: bool = True) -> DataFram
         return DataFrame(self.df.unnest_columns(columns, preserve_nulls=preserve_nulls))
 
     def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
-        """Export an Arrow PyCapsule Stream.
+        """Export the DataFrame as an Arrow C Stream.
 
-        This will execute and collect the DataFrame. We will attempt to respect the
-        requested schema, but only trivial transformations will be applied such as only
-        returning the fields listed in the requested schema if their data types match
-        those in the DataFrame.
+        The DataFrame is executed using DataFusion's streaming APIs and exposed via
+        Arrow's C Stream interface. Record batches are produced incrementally, so the
+        full result set is never materialized in memory. When ``requested_schema`` is
+        provided, only straightforward projections such as column selection or
+        reordering are applied.
 
         Args:
             requested_schema: Attempt to provide the DataFrame using this schema.
 
         Returns:
-            Arrow PyCapsule object.
+            Arrow PyCapsule object representing an ``ArrowArrayStream``.
         """
+        # ``DataFrame.__arrow_c_stream__`` in the Rust extension leverages
+        # ``execute_stream_partitioned`` under the hood to stream batches while
+        # preserving the original partition order.
         return self.df.__arrow_c_stream__(requested_schema)
 
+    def __iter__(self) -> Iterator[RecordBatch]:
+        """Return an iterator over this DataFrame's record batches."""
+        return iter(self.execute_stream())
+
+    def __aiter__(self) -> AsyncIterator[RecordBatch]:
+        """Return an async iterator over this DataFrame's record batches."""
+        return self.execute_stream().__aiter__()
+
     def transform(self, func: Callable[..., DataFrame], *args: Any) -> DataFrame:
         """Apply a function to the current DataFrame which returns another DataFrame.