Commit ca51638

refactor: ExecuteResult is reusable, sampleable
1 parent 7600001 commit ca51638

7 files changed: +255 -137 lines


bigframes/session/bq_caching_executor.py

Lines changed: 19 additions & 36 deletions
@@ -18,14 +18,14 @@
 import os
 import threading
 from typing import Literal, Mapping, Optional, Sequence, Tuple
-import warnings
 import weakref

 import google.api_core.exceptions
 from google.cloud import bigquery
 import google.cloud.bigquery.job as bq_job
 import google.cloud.bigquery.table as bq_table
 import google.cloud.bigquery_storage_v1
+import pyarrow as pa

 import bigframes
 from bigframes import exceptions as bfe
@@ -157,6 +157,7 @@ def __init__(
         self._semi_executors: Sequence[semi_executor.SemiExecutor] = (
             read_api_execution.ReadApiSemiExecutor(
                 bqstoragereadclient=bqstoragereadclient,
+                bqclient=self.bqclient,
                 project=self.bqclient.project,
             ),
             local_scan_executor.LocalScanExecutor(),
@@ -347,14 +348,9 @@ def _export_gbq(
         table.schema = array_value.schema.to_bigquery()
         self.bqclient.update_table(table, ["schema"])

-        return executor.ExecuteResult(
-            row_iter.to_arrow_iterable(
-                bqstorage_client=self.bqstoragereadclient,
-                max_stream_count=_MAX_READ_STREAMS,
-            ),
-            array_value.schema,
-            query_job,
-            total_bytes_processed=row_iter.total_bytes_processed,
+        return executor.EmptyExecuteResult(
+            bf_schema=array_value.schema,
+            query_job=query_job,
         )

     def dry_run(
@@ -672,41 +668,28 @@ def _execute_plan_gbq(
             query_with_job=(destination_table is not None),
         )

-        table_info: Optional[bigquery.Table] = None
-        if query_job and query_job.destination:
-            table_info = self.bqclient.get_table(query_job.destination)
-            size_bytes = table_info.num_bytes
-        else:
-            size_bytes = None
-
         # we could actually cache even when caching is not explicitly requested, but being conservative for now
         if cache_spec is not None:
-            assert table_info is not None
+            assert query_job and query_job.destination
             assert compiled.row_order is not None
+            table_info = self.bqclient.get_table(query_job.destination)
             self.cache.cache_results_table(
                 og_plan, table_info, compiled.row_order, num_rows=table_info.num_rows
             )

-        if size_bytes is not None and size_bytes >= MAX_SMALL_RESULT_BYTES:
-            msg = bfe.format_message(
-                "The query result size has exceeded 10 GB. In BigFrames 2.0 and "
-                "later, you might need to manually set `allow_large_results=True` in "
-                "the IO method or adjust the BigFrames option: "
-                "`bigframes.options.compute.allow_large_results=True`."
+        if query_job and query_job.destination:
+            return executor.BQTableExecuteResult(
+                data=query_job.destination,
+                bf_schema=og_schema,
+                bq_client=self.bqclient,
+                storage_client=self.bqstoragereadclient,
+                query_job=query_job,
+            )
+        else:
+            return executor.LocalExecuteResult(
+                data=pa.Table.from_batches(iterator.to_arrow_iterable()),
+                bf_schema=plan.schema,
             )
-            warnings.warn(msg, FutureWarning)
-
-        return executor.ExecuteResult(
-            _arrow_batches=iterator.to_arrow_iterable(
-                bqstorage_client=self.bqstoragereadclient,
-                max_stream_count=_MAX_READ_STREAMS,
-            ),
-            schema=og_schema,
-            query_job=query_job,
-            total_bytes=size_bytes,
-            total_rows=iterator.total_rows,
-            total_bytes_processed=iterator.total_bytes_processed,
-        )


     def _if_schema_match(
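The branch that replaces the removed 10 GB warning above is the main behavioral change in this file: when the query produced a destination table, the rows stay in BigQuery behind a BQTableExecuteResult and are only read on demand; otherwise the row iterator is materialized into an in-memory pyarrow table. Below is a hypothetical sketch of that decision rule in isolation, assuming the constructor signatures added in bigframes/session/executor.py later in this commit; the helper name and its argument list are illustrative, not part of the commit.

```python
import pyarrow as pa

from bigframes.session import executor


def wrap_query_result(query_job, row_iterator, bf_schema, bq_client, storage_client):
    # Illustrative only: mirrors the destination-table branch in _execute_plan_gbq.
    if query_job is not None and query_job.destination is not None:
        # Rows stay in BigQuery and are re-read lazily via the Storage Read API.
        return executor.BQTableExecuteResult(
            data=query_job.destination,
            bf_schema=bf_schema,
            bq_client=bq_client,
            storage_client=storage_client,
            query_job=query_job,
        )
    # No destination table: pull everything into a local pyarrow table now.
    return executor.LocalExecuteResult(
        data=pa.Table.from_batches(row_iterator.to_arrow_iterable()),
        bf_schema=bf_schema,
    )
```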

bigframes/session/direct_gbq_execution.py

Lines changed: 11 additions & 7 deletions
@@ -18,6 +18,7 @@
 from google.cloud import bigquery
 import google.cloud.bigquery.job as bq_job
 import google.cloud.bigquery.table as bq_table
+import pyarrow as pa

 from bigframes.core import compile, nodes
 from bigframes.core.compile import sqlglot
@@ -64,13 +65,16 @@ def execute(
             sql=compiled.sql,
         )

-        return executor.ExecuteResult(
-            _arrow_batches=iterator.to_arrow_iterable(),
-            schema=plan.schema,
-            query_job=query_job,
-            total_rows=iterator.total_rows,
-            total_bytes_processed=iterator.total_bytes_processed,
-        )
+        if query_job is not None and query_job.destination is not None:
+            return executor.BQTableExecuteResult(
+                data=query_job.destination,
+                bf_schema=plan.schema,
+            )
+        else:
+            return executor.LocalExecuteResult(
+                data=pa.Table.from_batches(iterator.to_arrow_iterable()),
+                bf_schema=plan.schema,
+            )

     def _run_execute_query(
         self,

bigframes/session/executor.py

Lines changed: 199 additions & 11 deletions
@@ -16,13 +16,15 @@

 import abc
 import dataclasses
+import datetime
 import functools
 import itertools
-from typing import Iterator, Literal, Optional, Union
+from typing import Any, Iterator, Literal, Optional, Sequence, Union

-from google.cloud import bigquery
+from google.cloud import bigquery, bigquery_storage_v1
 import pandas as pd
 import pyarrow
+import pyarrow as pa

 import bigframes
 import bigframes.core
@@ -38,20 +40,41 @@
 )


-@dataclasses.dataclass(frozen=True)
-class ExecuteResult:
-    _arrow_batches: Iterator[pyarrow.RecordBatch]
-    schema: bigframes.core.schema.ArraySchema
-    query_job: Optional[bigquery.QueryJob] = None
-    total_bytes: Optional[int] = None
-    total_rows: Optional[int] = None
-    total_bytes_processed: Optional[int] = None
+class ExecuteResult(abc.ABC):
+    @property
+    @abc.abstractmethod
+    def query_job(self) -> Optional[bigquery.QueryJob]:
+        ...
+
+    @property
+    @abc.abstractmethod
+    def total_bytes(self) -> Optional[int]:
+        ...
+
+    @property
+    @abc.abstractmethod
+    def total_rows(self) -> Optional[int]:
+        ...
+
+    @property
+    @abc.abstractmethod
+    def total_bytes_processed(self) -> Optional[int]:
+        ...
+
+    @property
+    @abc.abstractmethod
+    def schema(self) -> bigframes.core.schema.ArraySchema:
+        ...
+
+    @abc.abstractmethod
+    def _get_arrow_batches(self) -> Iterator[pyarrow.RecordBatch]:
+        ...

     @property
     def arrow_batches(self) -> Iterator[pyarrow.RecordBatch]:
         result_rows = 0

-        for batch in self._arrow_batches:
+        for batch in self._get_arrow_batches():
             batch = pyarrow_utils.cast_batch(batch, self.schema.to_pyarrow())
             result_rows += batch.num_rows

@@ -121,6 +144,171 @@ def to_py_scalar(self):
         return column[0]


+class LocalExecuteResult(ExecuteResult):
+    def __init__(self, data: pa.Table, bf_schema: bigframes.core.schema.ArraySchema):
+        self._data = data
+        self._schema = bf_schema
+
+    @property
+    def query_job(self) -> Optional[bigquery.QueryJob]:
+        return None
+
+    @property
+    def total_bytes(self) -> Optional[int]:
+        return None
+
+    @property
+    def total_rows(self) -> Optional[int]:
+        return self._data.num_rows
+
+    @property
+    def total_bytes_processed(self) -> Optional[int]:
+        return None
+
+    @property
+    def schema(self) -> bigframes.core.schema.ArraySchema:
+        return self._schema
+
+    def _get_arrow_batches(self) -> Iterator[pyarrow.RecordBatch]:
+        return iter(self._data.to_batches())
+
+
+class EmptyExecuteResult(ExecuteResult):
+    def __init__(
+        self,
+        bf_schema: bigframes.core.schema.ArraySchema,
+        query_job: Optional[bigquery.QueryJob] = None,
+    ):
+        self._schema = bf_schema
+        self._query_job = query_job
+
+    @property
+    def query_job(self) -> Optional[bigquery.QueryJob]:
+        return self._query_job
+
+    @property
+    def total_bytes(self) -> Optional[int]:
+        return None
+
+    @property
+    def total_rows(self) -> Optional[int]:
+        return 0
+
+    @property
+    def total_bytes_processed(self) -> Optional[int]:
+        if self.query_job:
+            return self.query_job.total_bytes_processed
+        return None
+
+    @property
+    def schema(self) -> bigframes.core.schema.ArraySchema:
+        return self._schema
+
+    def _get_arrow_batches(self) -> Iterator[pyarrow.RecordBatch]:
+        return iter([])
+
+
+class BQTableExecuteResult(ExecuteResult):
+    def __init__(
+        self,
+        data: bigquery.TableReference,
+        bf_schema: bigframes.core.schema.ArraySchema,
+        bq_client: bigquery.Client,
+        storage_client: bigquery_storage_v1.BigQueryReadClient,
+        *,
+        query_job: Optional[bigquery.QueryJob] = None,
+        snapshot_time: Optional[datetime.datetime] = None,
+        limit: Optional[int] = None,
+        selected_fields: Optional[Sequence[str]] = None,
+        sql_predicate: Optional[str] = None,
+    ):
+        self._data = data
+        self._schema = bf_schema
+        self._query_job = query_job
+        self._bqclient = bq_client
+        self._storage_client = storage_client
+        self._snapshot_time = snapshot_time
+        self._limit = limit
+        self._selected_fields = selected_fields
+        self._predicate = sql_predicate
+
+    @property
+    def query_job(self) -> Optional[bigquery.QueryJob]:
+        return self._query_job
+
+    @property
+    def total_bytes(self) -> Optional[int]:
+        return None
+
+    @property
+    def total_rows(self) -> Optional[int]:
+        return self._get_table_metadata().num_rows
+
+    @functools.cache
+    def _get_table_metadata(self) -> bigquery.Table:
+        return self._bqclient.get_table(self._data)
+
+    @property
+    def total_bytes_processed(self) -> Optional[int]:
+        if self.query_job:
+            return self.query_job.total_bytes_processed
+        return None
+
+    @property
+    def schema(self) -> bigframes.core.schema.ArraySchema:
+        return self._schema
+
+    def _get_arrow_batches(self) -> Iterator[pyarrow.RecordBatch]:
+        import google.cloud.bigquery_storage_v1.types as bq_storage_types
+        from google.protobuf import timestamp_pb2
+
+        table_mod_options = {}
+        read_options_dict: dict[str, Any] = {}
+        if self._selected_fields:
+            read_options_dict["selected_fields"] = list(self._selected_fields)
+        if self._predicate:
+            read_options_dict["row_restriction"] = self._predicate
+        read_options = bq_storage_types.ReadSession.TableReadOptions(
+            **read_options_dict
+        )
+
+        if self._snapshot_time:
+            snapshot_time = timestamp_pb2.Timestamp()
+            snapshot_time.FromDatetime(self._snapshot_time)
+            table_mod_options["snapshot_time"] = snapshot_time
+        table_mods = bq_storage_types.ReadSession.TableModifiers(**table_mod_options)
+
+        requested_session = bq_storage_types.stream.ReadSession(
+            table=self._data.to_bqstorage(),
+            data_format=bq_storage_types.DataFormat.ARROW,
+            read_options=read_options,
+            table_modifiers=table_mods,
+        )
+        # Single stream to maintain ordering
+        request = bq_storage_types.CreateReadSessionRequest(
+            parent=f"projects/{self._data.project}",
+            read_session=requested_session,
+            max_stream_count=1,
+        )
+        session = self._storage_client.create_read_session(request=request)
+
+        if not session.streams:
+            batches: Iterator[pa.RecordBatch] = iter([])
+        else:
+            reader = self._storage_client.read_rows(session.streams[0].name)
+            rowstream = reader.rows()
+
+            def process_page(page):
+                pa_batch = page.to_arrow()
+                return pa.RecordBatch.from_arrays(
+                    pa_batch.columns, names=self.schema.names
+                )
+
+            batches = map(process_page, rowstream.pages)
+
+        return batches
+
+
 @dataclasses.dataclass(frozen=True)
 class HierarchicalKey:
     columns: tuple[str, ...]
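Together, these three concrete classes are what the commit title means by a reusable, sampleable ExecuteResult: consumers program against the abstract interface, and a BQTableExecuteResult translates selected_fields, sql_predicate, and snapshot_time into Storage Read API read options each time its batches are requested, so the same destination table can be re-read or sampled without re-running the query. Two minimal, hypothetical sketches follow; neither helper is part of this commit, and the column names and predicate are placeholders.

```python
import pyarrow as pa
from google.cloud import bigquery, bigquery_storage_v1

from bigframes.session import executor


def result_to_table(result) -> pa.Table:
    # Works with LocalExecuteResult, EmptyExecuteResult, or BQTableExecuteResult;
    # the caller no longer needs to know which. arrow_batches casts each batch
    # to the result's pyarrow schema before yielding it, so concatenating
    # against that schema is safe even when the iterator is empty.
    return pa.Table.from_batches(
        result.arrow_batches, schema=result.schema.to_pyarrow()
    )


def sample_result(
    table_ref: bigquery.TableReference,
    bf_schema,
    bq_client: bigquery.Client,
    storage_client: bigquery_storage_v1.BigQueryReadClient,
) -> executor.BQTableExecuteResult:
    # Re-read a previous query's destination table, restricted to two columns
    # and a predicate, without re-running the query.
    return executor.BQTableExecuteResult(
        data=table_ref,
        bf_schema=bf_schema,
        bq_client=bq_client,
        storage_client=storage_client,
        selected_fields=["user_id", "score"],  # becomes ReadSession selected_fields
        sql_predicate="score > 0.5",  # becomes ReadSession row_restriction
    )
```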

bigframes/session/local_scan_executor.py

Lines changed: 3 additions & 6 deletions
@@ -57,10 +57,7 @@ def execute(
         if (peek is not None) and (total_rows is not None):
             total_rows = min(peek, total_rows)

-        return executor.ExecuteResult(
-            _arrow_batches=arrow_table.to_batches(),
-            schema=plan.schema,
-            query_job=None,
-            total_bytes=None,
-            total_rows=total_rows,
+        return executor.LocalExecuteResult(
+            data=arrow_table,
+            bf_schema=plan.schema,
         )

bigframes/session/polars_executor.py

Lines changed: 3 additions & 5 deletions
@@ -153,11 +153,9 @@ def execute(
         if peek is not None:
             lazy_frame = lazy_frame.limit(peek)
         pa_table = lazy_frame.collect().to_arrow()
-        return executor.ExecuteResult(
-            _arrow_batches=iter(map(self._adapt_batch, pa_table.to_batches())),
-            schema=plan.schema,
-            total_bytes=pa_table.nbytes,
-            total_rows=pa_table.num_rows,
+        return executor.LocalExecuteResult(
+            data=pa.Table.from_batches(map(self._adapt_batch, pa_table.to_batches())),
+            bf_schema=plan.schema,
         )

     def _can_execute(self, plan: bigframe_node.BigFrameNode):
