
Commit 5afbb2d

refactor result size statistics
1 parent 6d16001 · commit 5afbb2d

9 files changed: +280 additions, -175 deletions

bigframes/core/blocks.py

Lines changed: 41 additions & 34 deletions
@@ -37,7 +37,6 @@
     Optional,
     Sequence,
     Tuple,
-    TYPE_CHECKING,
     Union,
 )
 import warnings
@@ -70,9 +69,6 @@
 from bigframes.session import dry_runs, execution_spec
 from bigframes.session import executor as executors
 
-if TYPE_CHECKING:
-    from bigframes.session.executor import ExecuteResult
-
 # Type constraint for wherever column labels are used
 Label = typing.Hashable
 
@@ -98,7 +94,6 @@
 LevelsType = typing.Union[LevelType, typing.Sequence[LevelType]]
 
 
-@dataclasses.dataclass
 class PandasBatches(Iterator[pd.DataFrame]):
     """Interface for mutable objects with state represented by a block value object."""
 
@@ -271,10 +266,14 @@ def shape(self) -> typing.Tuple[int, int]:
         except Exception:
             pass
 
-        row_count = self.session._executor.execute(
-            self.expr.row_count(),
-            execution_spec.ExecutionSpec(promise_under_10gb=True, ordered=False),
-        ).to_py_scalar()
+        row_count = (
+            self.session._executor.execute(
+                self.expr.row_count(),
+                execution_spec.ExecutionSpec(promise_under_10gb=True, ordered=False),
+            )
+            .batches()
+            .to_py_scalar()
+        )
         return (row_count, len(self.value_columns))
 
     @property
@@ -584,7 +583,7 @@ def to_arrow(
                 ordered=ordered,
             ),
         )
-        pa_table = execute_result.to_arrow_table()
+        pa_table = execute_result.batches().to_arrow_table()
 
         pa_index_labels = []
         for index_level, index_label in enumerate(self._index_labels):
@@ -636,17 +635,13 @@ def to_pandas(
             max_download_size, sampling_method, random_state
         )
 
-        ex_result = self._materialize_local(
+        return self._materialize_local(
             materialize_options=MaterializationOptions(
                 downsampling=sampling,
                 allow_large_results=allow_large_results,
                 ordered=ordered,
             )
         )
-        df = ex_result.to_pandas()
-        df = self._copy_index_to_pandas(df)
-        df.set_axis(self.column_labels, axis=1, copy=False)
-        return df, ex_result.query_job
 
     def _get_sampling_option(
         self,
@@ -683,7 +678,7 @@ def try_peek(
                 self.expr,
                 execution_spec.ExecutionSpec(promise_under_10gb=under_10gb, peek=n),
            )
-            df = result.to_pandas()
+            df = result.batches().to_pandas()
            return self._copy_index_to_pandas(df)
         else:
             return None
@@ -704,13 +699,14 @@ def to_pandas_batches(
             if (allow_large_results is not None)
             else not bigframes.options._allow_large_results
         )
-        execute_result = self.session._executor.execute(
+        execution_result = self.session._executor.execute(
             self.expr,
             execution_spec.ExecutionSpec(
                 promise_under_10gb=under_10gb,
                 ordered=True,
             ),
         )
+        result_batches = execution_result.batches()
 
         # To reduce the number of edge cases to consider when working with the
         # results of this, always return at least one DataFrame. See:
@@ -724,19 +720,21 @@ def to_pandas_batches(
         dfs = map(
             lambda a: a[0],
             itertools.zip_longest(
-                execute_result.to_pandas_batches(page_size, max_results),
+                result_batches.to_pandas_batches(page_size, max_results),
                 [0],
                 fillvalue=empty_val,
             ),
         )
         dfs = iter(map(self._copy_index_to_pandas, dfs))
 
-        total_rows = execute_result.total_rows
+        total_rows = result_batches.approx_total_rows
         if (total_rows is not None) and (max_results is not None):
             total_rows = min(total_rows, max_results)
 
         return PandasBatches(
-            dfs, total_rows, total_bytes_processed=execute_result.total_bytes_processed
+            dfs,
+            total_rows,
+            total_bytes_processed=execution_result.total_bytes_processed,
         )
 
     def _copy_index_to_pandas(self, df: pd.DataFrame) -> pd.DataFrame:
@@ -754,7 +752,7 @@ def _copy_index_to_pandas(self, df: pd.DataFrame) -> pd.DataFrame:
 
     def _materialize_local(
         self, materialize_options: MaterializationOptions = MaterializationOptions()
-    ) -> ExecuteResult:
+    ) -> tuple[pd.DataFrame, Optional[bigquery.QueryJob]]:
         """Run query and download results as a pandas DataFrame. Return the total number of results as well."""
         # TODO(swast): Allow for dry run and timeout.
         under_10gb = (
@@ -769,9 +767,11 @@ def _materialize_local(
                 ordered=materialize_options.ordered,
             ),
         )
+        result_batches = execute_result.batches()
+
         sample_config = materialize_options.downsampling
-        if execute_result.total_bytes is not None:
-            table_mb = execute_result.total_bytes / _BYTES_TO_MEGABYTES
+        if result_batches.approx_total_bytes is not None:
+            table_mb = result_batches.approx_total_bytes / _BYTES_TO_MEGABYTES
             max_download_size = sample_config.max_download_size
             fraction = (
                 max_download_size / table_mb
@@ -792,7 +792,7 @@
 
         # TODO: Maybe materialize before downsampling
         # Some downsampling methods
-        if fraction < 1 and (execute_result.total_rows is not None):
+        if fraction < 1 and (result_batches.approx_total_rows is not None):
             if not sample_config.enable_downsampling:
                 raise RuntimeError(
                     f"The data size ({table_mb:.2f} MB) exceeds the maximum download limit of "
@@ -811,7 +811,7 @@
                     "the downloading limit."
                 )
                 warnings.warn(msg, category=UserWarning)
-            total_rows = execute_result.total_rows
+            total_rows = result_batches.approx_total_rows
             # Remove downsampling config from subsequent invocations, as otherwise could result in many
             # iterations if downsampling undershoots
             return self._downsample(
@@ -823,7 +823,10 @@
                 MaterializationOptions(ordered=materialize_options.ordered)
             )
         else:
-            return execute_result
+            df = result_batches.to_pandas()
+            df = self._copy_index_to_pandas(df)
+            df.set_axis(self.column_labels, axis=1, copy=False)
+            return df, execute_result.query_job
 
     def _downsample(
         self, total_rows: int, sampling_method: str, fraction: float, random_state
@@ -1662,15 +1665,19 @@ def retrieve_repr_request_results(
                 ordered=True,
             ),
         )
-        row_count = self.session._executor.execute(
-            self.expr.row_count(),
-            execution_spec.ExecutionSpec(
-                promise_under_10gb=True,
-                ordered=False,
-            ),
-        ).to_py_scalar()
+        row_count = (
+            self.session._executor.execute(
+                self.expr.row_count(),
+                execution_spec.ExecutionSpec(
+                    promise_under_10gb=True,
+                    ordered=False,
+                ),
+            )
+            .batches()
+            .to_py_scalar()
+        )
 
-        head_df = head_result.to_pandas()
+        head_df = head_result.batches().to_pandas()
         return self._copy_index_to_pandas(head_df), row_count, head_result.query_job
 
     def promote_offsets(self, label: Label = None) -> typing.Tuple[Block, str]:
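
For reference, here is a minimal, self-contained sketch of the access pattern this refactor introduces in blocks.py: instead of reading totals and pandas/Arrow conversions off the executor result directly, callers now go through the object returned by .batches(), which carries the data plus approximate size statistics. Only the method and attribute names visible in the diff (batches(), to_pandas(), to_arrow_table(), to_py_scalar(), approx_total_rows, approx_total_bytes) are taken from the commit; the stub class itself is hypothetical and exists only to show the shape of the API, not the bigframes implementation.

# Illustrative stub only -- mimics the shape of ExecuteResult.batches() implied
# by this diff; it is not the bigframes implementation.
import dataclasses
from typing import List, Optional

import pandas as pd
import pyarrow as pa


@dataclasses.dataclass
class StubResultBatches:
    """Stands in for the object returned by ExecuteResult.batches()."""

    arrow_batches: List[pa.RecordBatch]
    approx_total_rows: Optional[int]   # estimate, may be None
    approx_total_bytes: Optional[int]  # estimate, may be None

    def to_arrow_table(self) -> pa.Table:
        return pa.Table.from_batches(self.arrow_batches)

    def to_pandas(self) -> pd.DataFrame:
        return self.to_arrow_table().to_pandas()

    def to_py_scalar(self):
        # Single-cell results (e.g. row_count queries) collapse to a Python scalar.
        return self.to_pandas().iloc[0, 0]


# Usage mirrors the new call sites: execute(...).batches().to_py_scalar()
batches = StubResultBatches(
    arrow_batches=[pa.RecordBatch.from_pydict({"row_count": [42]})],
    approx_total_rows=1,
    approx_total_bytes=8,
)
print(batches.to_py_scalar())  # 42

Moving the size statistics onto the batches object is what lets _materialize_local return a plain (DataFrame, QueryJob) tuple instead of leaking ExecuteResult to its callers, which is the other half of this file's changes.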

bigframes/core/bq_data.py

Lines changed: 123 additions & 1 deletion
@@ -14,13 +14,21 @@
 
 from __future__ import annotations
 
+import concurrent.futures
 import dataclasses
 import datetime
 import functools
+import os
+import queue
+import threading
 import typing
-from typing import Optional, Sequence, Tuple
+from typing import Any, Iterator, Optional, Sequence, Tuple
 
+from google.cloud import bigquery_storage_v1
 import google.cloud.bigquery as bq
+import google.cloud.bigquery_storage_v1.types as bq_storage_types
+from google.protobuf import timestamp_pb2
+import pyarrow as pa
 
 import bigframes.core.schema
 
@@ -82,3 +90,117 @@ class BigqueryDataSource:
     ordering: typing.Optional[orderings.RowOrdering] = None
     # Optimization field
     n_rows: Optional[int] = None
+
+
+_WORKER_TIME_INCREMENT = 0.05
+
+
+def _iter_stream(
+    stream_name: str,
+    storage_read_client: bigquery_storage_v1.BigQueryReadClient,
+    result_queue: queue.Queue,
+    stop_event: threading.Event,
+):
+    reader = storage_read_client.read_rows(stream_name)
+    for page in reader.rows():
+        try:
+            result_queue.put(page.to_arrow(), timeout=_WORKER_TIME_INCREMENT)
+        except queue.Full:
+            continue
+        if stop_event.is_set():
+            return
+
+
+def _iter_streams(
+    streams, storage_read_client: bigquery_storage_v1.BigQueryReadClient
+) -> Iterator[pa.RecordBatch]:
+    stop_event = threading.Event()
+    result_queue: queue.Queue = queue.Queue(
+        len(streams)
+    )  # each response is large, so small queue is appropriate
+
+    in_progress: list[concurrent.futures.Future] = []
+    with concurrent.futures.ThreadPoolExecutor(max_workers=len(streams)) as pool:
+        for stream in streams:
+            in_progress.append(
+                pool.submit(
+                    _iter_stream, stream, storage_read_client, result_queue, stop_event
+                )
+            )
+
+        while in_progress:
+            try:
+                yield result_queue.get(timeout=0.1)
+            except queue.Empty:
+                new_in_progress = []
+                for future in in_progress:
+                    if future.done():
+                        try:
+                            future.result()
+                        finally:
+                            stop_event.set()
+                            raise
+                    else:
+                        new_in_progress.append(future)
+                in_progress = new_in_progress
+
+
+@dataclasses.dataclass
+class ReadResult:
+    iter: Iterator[pa.RecordBatch]
+    approx_rows: int
+    approx_bytes: int
+
+
+def get_arrow_batches(
+    data: BigqueryDataSource,
+    columns: Sequence[str],
+    storage_read_client: bigquery_storage_v1.BigQueryReadClient,
+) -> ReadResult:
+    table_mod_options = {}
+    read_options_dict: dict[str, Any] = {"selected_fields": list(columns)}
+    if data.sql_predicate:
+        read_options_dict["row_restriction"] = data.sql_predicate
+    read_options = bq_storage_types.ReadSession.TableReadOptions(**read_options_dict)
+
+    if data.at_time:
+        snapshot_time = timestamp_pb2.Timestamp()
+        snapshot_time.FromDatetime(data.at_time)
+        table_mod_options["snapshot_time"] = snapshot_time
+    table_mods = bq_storage_types.ReadSession.TableModifiers(**table_mod_options)
+
+    requested_session = bq_storage_types.stream.ReadSession(
+        table=data.table.get_table_ref().to_bqstorage(),
+        data_format=bq_storage_types.DataFormat.ARROW,
+        read_options=read_options,
+        table_modifiers=table_mods,
+    )
+    # Single stream to maintain ordering
+    request = bq_storage_types.CreateReadSessionRequest(
+        parent=f"projects/{data.table.project_id}",
+        read_session=requested_session,
+        max_stream_count=1,
+    )
+
+    if data.ordering is not None:
+        max_streams = 1
+    else:
+        max_streams = os.cpu_count() or 8
+
+    session = storage_read_client.create_read_session(
+        request=request, max_stream_count=max_streams
+    )
+
+    if not session.streams:
+        batches: Iterator[pa.RecordBatch] = iter([])
+    else:
+        batches = _iter_streams(session.streams, storage_read_client)
+
+    def process_batch(pa_batch):
+        return pa.RecordBatch.from_arrays(pa_batch.columns, names=data.schema.names)
+
+    batches = map(process_batch, batches)
+
+    return ReadResult(
+        batches, session.estimated_row_count, session.estimated_total_bytes_scanned
+    )
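
The new multi-stream read path in bq_data.py fans rows from several BigQuery Storage read streams into a single iterator: each worker thread pushes Arrow pages into a small bounded queue, the consumer yields from that queue, and a stop event lets the consumer shut workers down early. The toy example below reproduces only that producer/consumer shape, with plain lists standing in for read streams; every name in it is illustrative and none of it is bigframes or BigQuery API.

# Toy, self-contained illustration of the fan-in pattern used by _iter_streams:
# worker threads push chunks into a bounded queue, one consumer yields them,
# and a stop event lets the consumer shut workers down. The "streams" here are
# plain lists of integers, not BigQuery read streams.
import concurrent.futures
import queue
import threading
from typing import Iterator, List


def _produce(chunks: List[int], out: queue.Queue, stop: threading.Event) -> None:
    for chunk in chunks:
        while not stop.is_set():
            try:
                out.put(chunk, timeout=0.05)  # brief timeout gives backpressure
                break
            except queue.Full:
                continue
        if stop.is_set():
            return


def fan_in(streams: List[List[int]]) -> Iterator[int]:
    stop = threading.Event()
    out: queue.Queue = queue.Queue(maxsize=len(streams))  # small, bounded buffer
    with concurrent.futures.ThreadPoolExecutor(max_workers=len(streams)) as pool:
        pending = [pool.submit(_produce, s, out, stop) for s in streams]
        try:
            # Keep draining until every worker is done and the queue is empty.
            while pending or not out.empty():
                try:
                    yield out.get(timeout=0.1)
                except queue.Empty:
                    pending = [f for f in pending if not f.done()]
        finally:
            stop.set()  # stop workers if the consumer exits early


if __name__ == "__main__":
    print(sorted(fan_in([[1, 2, 3], [4, 5], [6]])))  # [1, 2, 3, 4, 5, 6]

The bounded queue is the design point worth noting: because each Arrow page can be large, workers block briefly with a timeout instead of buffering an unbounded amount of data in memory, which matches the "small queue is appropriate" comment in the diff.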

bigframes/core/indexes/base.py

Lines changed: 22 additions & 10 deletions
@@ -290,9 +290,13 @@ def get_loc(self, key) -> typing.Union[int, slice, "bigframes.series.Series"]:
         count_agg = ex_types.UnaryAggregation(agg_ops.count_op, ex.deref(offsets_id))
         count_result = filtered_block._expr.aggregate([(count_agg, "count")])
 
-        count_scalar = self._block.session._executor.execute(
-            count_result, ex_spec.ExecutionSpec(promise_under_10gb=True)
-        ).to_py_scalar()
+        count_scalar = (
+            self._block.session._executor.execute(
+                count_result, ex_spec.ExecutionSpec(promise_under_10gb=True)
+            )
+            .batches()
+            .to_py_scalar()
+        )
 
         if count_scalar == 0:
             raise KeyError(f"'{key}' is not in index")
@@ -301,9 +305,13 @@ def get_loc(self, key) -> typing.Union[int, slice, "bigframes.series.Series"]:
         if count_scalar == 1:
             min_agg = ex_types.UnaryAggregation(agg_ops.min_op, ex.deref(offsets_id))
             position_result = filtered_block._expr.aggregate([(min_agg, "position")])
-            position_scalar = self._block.session._executor.execute(
-                position_result, ex_spec.ExecutionSpec(promise_under_10gb=True)
-            ).to_py_scalar()
+            position_scalar = (
+                self._block.session._executor.execute(
+                    position_result, ex_spec.ExecutionSpec(promise_under_10gb=True)
+                )
+                .batches()
+                .to_py_scalar()
+            )
             return int(position_scalar)
 
         # Handle multiple matches based on index monotonicity
@@ -333,10 +341,14 @@ def _get_monotonic_slice(self, filtered_block, offsets_id: str) -> slice:
         combined_result = filtered_block._expr.aggregate(min_max_aggs)
 
         # Execute query and extract positions
-        result_df = self._block.session._executor.execute(
-            combined_result,
-            execution_spec=ex_spec.ExecutionSpec(promise_under_10gb=True),
-        ).to_pandas()
+        result_df = (
+            self._block.session._executor.execute(
+                combined_result,
+                execution_spec=ex_spec.ExecutionSpec(promise_under_10gb=True),
+            )
+            .batches()
+            .to_pandas()
+        )
         min_pos = int(result_df["min_pos"].iloc[0])
         max_pos = int(result_df["max_pos"].iloc[0])
 
