Commit 703fd7e

refactor materialization type normalize steps
1 parent 5afbb2d commit 703fd7e

File tree: 6 files changed (+16, -34 lines)

bigframes/core/bq_data.py
Lines changed: 2 additions & 1 deletion

@@ -30,6 +30,7 @@
 from google.protobuf import timestamp_pb2
 import pyarrow as pa
 
+from bigframes.core import pyarrow_utils
 import bigframes.core.schema
 
 if typing.TYPE_CHECKING:
@@ -197,7 +198,7 @@ def get_arrow_batches(
     batches = _iter_streams(session.streams, storage_read_client)
 
     def process_batch(pa_batch):
-        return pa.RecordBatch.from_arrays(pa_batch.columns, names=data.schema.names)
+        return pyarrow_utils.cast_batch(pa_batch, data.schema.to_pyarrow())
 
     batches = map(process_batch, batches)
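
Note on the replacement: pyarrow_utils.cast_batch is an existing bigframes helper whose implementation is not part of this commit; judging from the call sites, it casts each batch to the target pyarrow schema instead of merely rebuilding it with new column names. A minimal sketch of what such a helper plausibly does (the name cast_batch_sketch and its body are illustrative, not the actual bigframes code):

    import pyarrow as pa

    def cast_batch_sketch(batch: pa.RecordBatch, target: pa.Schema) -> pa.RecordBatch:
        # Cast every column to the type of the matching target field, so
        # downstream consumers see one consistent schema.
        arrays = [
            column if column.type == field.type else column.cast(field.type)
            for column, field in zip(batch.columns, target)
        ]
        return pa.RecordBatch.from_arrays(arrays, schema=target)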

bigframes/session/bq_caching_executor.py
Lines changed: 2 additions & 5 deletions

@@ -24,7 +24,6 @@
 import google.cloud.bigquery.job as bq_job
 import google.cloud.bigquery.table as bq_table
 import google.cloud.bigquery_storage_v1
-import pyarrow as pa
 
 import bigframes
 from bigframes import exceptions as bfe
@@ -321,7 +320,7 @@ def _export_gbq(
 
         # TODO(swast): plumb through the api_name of the user-facing api that
         # caused this query.
-        row_iter, query_job = self._run_execute_query(
+        _, query_job = self._run_execute_query(
             sql=sql,
             job_config=job_config,
         )
@@ -688,9 +687,7 @@ def _execute_plan_gbq(
            )
        else:
            return executor.LocalExecuteResult(
-                data=pa.Table.from_batches(
-                    iterator.to_arrow_iterable(), plan.schema.to_pyarrow()
-                ),
+                data=iterator.to_arrow(),
                bf_schema=plan.schema,
            )
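
Two independent changes here: _export_gbq now discards the row iterator it apparently never used, and _execute_plan_gbq lets the BigQuery client materialize the table directly. RowIterator.to_arrow() returns a fully built pyarrow.Table, making the manual pa.Table.from_batches(...) assembly unnecessary. A standalone sketch of the new call (client setup and query are illustrative only):

    from google.cloud import bigquery

    client = bigquery.Client()
    row_iter = client.query("SELECT 1 AS x, 'hello' AS y").result()
    # Materializes the whole result set as a pyarrow.Table in one call.
    table = row_iter.to_arrow()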

bigframes/session/direct_gbq_execution.py
Lines changed: 1 addition & 2 deletions

@@ -18,7 +18,6 @@
 from google.cloud import bigquery
 import google.cloud.bigquery.job as bq_job
 import google.cloud.bigquery.table as bq_table
-import pyarrow as pa
 
 from bigframes.core import compile, nodes
 from bigframes.core.compile import sqlglot
@@ -67,7 +66,7 @@ def execute(
 
         # just immediately downlaod everything for simplicity
         return executor.LocalExecuteResult(
-            data=pa.Table.from_batches(iterator.to_arrow_iterable()),
+            data=iterator.to_arrow(),
             bf_schema=plan.schema,
         )
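
The same simplification as in bq_caching_executor.py above. For reference, the two forms produce the same table for fully materialized results (sketch, with iterator a RowIterator as in the diff; a RowIterator is consumed once, so the lines below are alternatives, not a sequence):

    import pyarrow as pa

    # Before: hand-assemble the Table from streamed record batches.
    table = pa.Table.from_batches(iterator.to_arrow_iterable())
    # After: the client materializes the Table in one call.
    table = iterator.to_arrow()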

bigframes/session/executor.py
Lines changed: 9 additions & 8 deletions

@@ -27,7 +27,7 @@
 
 import bigframes
 import bigframes.core
-from bigframes.core import bq_data, pyarrow_utils
+from bigframes.core import bq_data, local_data, pyarrow_utils
 import bigframes.core.schema
 import bigframes.session._io.pandas as io_pandas
 import bigframes.session.execution_spec as ex_spec
@@ -70,7 +70,6 @@ def arrow_batches(self) -> Iterator[pyarrow.RecordBatch]:
         result_rows = 0
 
         for batch in self._batches:
-            batch = pyarrow_utils.cast_batch(batch, self._schema.to_pyarrow())
             result_rows += batch.num_rows
 
             maximum_result_rows = bigframes.options.compute.maximum_result_rows
@@ -162,8 +161,10 @@ def batches(self) -> ResultsIterator:
 
 class LocalExecuteResult(ExecuteResult):
     def __init__(self, data: pa.Table, bf_schema: bigframes.core.schema.ArraySchema):
-        self._data = data
-        self._schema = bf_schema
+        self._data = local_data.ManagedArrowTable(
+            data.cast(bf_schema.to_pyarrow()), bf_schema
+        )
+        self._data.validate()
 
     @property
     def query_job(self) -> Optional[bigquery.QueryJob]:
@@ -175,14 +176,14 @@ def total_bytes_processed(self) -> Optional[int]:
 
     @property
     def schema(self) -> bigframes.core.schema.ArraySchema:
-        return self._schema
+        return self._data.schema
 
     def batches(self) -> ResultsIterator:
         return ResultsIterator(
-            iter(self._data.to_batches()),
+            iter(self._data.to_arrow()[1]),
             self.schema,
-            self._data.num_rows,
-            self._data.nbytes,
+            self._data.metadata.row_count,
+            self._data.metadata.total_bytes,
         )
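
With the cast moved into LocalExecuteResult.__init__, every local result is normalized and validated once at construction, and ResultsIterator no longer re-casts each batch. Consumers read batches via batches().arrow_batches, as the updated test further below shows. A usage sketch (assumes result is an already-built ExecuteResult):

    import pyarrow as pa

    results_iter = result.batches()
    # Batches already match result.schema; no per-batch casting is needed.
    table = pa.Table.from_batches(
        results_iter.arrow_batches, result.schema.to_pyarrow()
    )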

bigframes/session/polars_executor.py
Lines changed: 1 addition & 17 deletions

@@ -16,14 +16,11 @@
 import itertools
 from typing import Optional, TYPE_CHECKING
 
-import pyarrow as pa
-
 from bigframes.core import (
     agg_expressions,
     array_value,
     bigframe_node,
     expression,
-    local_data,
     nodes,
 )
 import bigframes.operations
@@ -154,22 +151,9 @@ def execute(
            lazy_frame = lazy_frame.limit(peek)
        pa_table = lazy_frame.collect().to_arrow()
        return executor.LocalExecuteResult(
-            data=pa.Table.from_batches(
-                map(self._adapt_batch, pa_table.to_batches()), plan.schema.to_pyarrow()
-            ),
+            data=pa_table,
            bf_schema=plan.schema,
        )
 
    def _can_execute(self, plan: bigframe_node.BigFrameNode):
        return all(_is_node_polars_executable(node) for node in plan.unique_nodes())
-
-    def _adapt_array(self, array: pa.Array) -> pa.Array:
-        target_type = local_data.logical_type_replacements(array.type)
-        if target_type != array.type:
-            # Safe is false to handle weird polars decimal scaling
-            return array.cast(target_type, safe=False)
-        return array
-
-    def _adapt_batch(self, batch: pa.RecordBatch) -> pa.RecordBatch:
-        new_arrays = [self._adapt_array(arr) for arr in batch.columns]
-        return pa.RecordBatch.from_arrays(new_arrays, names=batch.column_names)
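
The deleted _adapt_array/_adapt_batch pair existed because Polars can hand back Arrow decimals at a different precision/scale than the BigFrames logical type expects, hence the safe=False cast; that normalization now happens centrally in LocalExecuteResult. An isolated illustration of the kind of cast involved (values and types are made up):

    from decimal import Decimal

    import pyarrow as pa

    # A decimal column at one precision/scale...
    arr = pa.array([Decimal("1.5"), Decimal("2.25")], type=pa.decimal128(38, 10))
    # ...rescaled toward the target logical type; safe=False tolerates the change.
    rescaled = arr.cast(pa.decimal128(38, 9), safe=False)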

tests/unit/session/test_local_scan_executor.py
Lines changed: 1 addition & 1 deletion

@@ -73,7 +73,7 @@ def test_local_scan_executor_with_slice(start, stop, expected_rows, object_under
    )
 
    result = object_under_test.execute(plan, ordered=True)
-    result_table = pyarrow.Table.from_batches(result.arrow_batches)
+    result_table = pyarrow.Table.from_batches(result.batches().arrow_batches)
    assert result_table.num_rows == expected_rows