
Commit 6d16001

refactor data source classes
1 parent ca51638 commit 6d16001

File tree

11 files changed: +176 −151 lines

bigframes/core/array_value.py

Lines changed: 7 additions & 6 deletions
@@ -23,7 +23,7 @@
 import pandas
 import pyarrow as pa
 
-from bigframes.core import agg_expressions
+from bigframes.core import agg_expressions, bq_data
 import bigframes.core.expression as ex
 import bigframes.core.guid
 import bigframes.core.identifiers as ids
@@ -63,7 +63,7 @@ def from_pyarrow(cls, arrow_table: pa.Table, session: Session):
     def from_managed(cls, source: local_data.ManagedArrowTable, session: Session):
         scan_list = nodes.ScanList(
             tuple(
-                nodes.ScanItem(ids.ColumnId(item.column), item.dtype, item.column)
+                nodes.ScanItem(ids.ColumnId(item.column), item.column)
                 for item in source.schema.items
             )
         )
@@ -100,7 +100,7 @@ def from_table(
         if offsets_col and primary_key:
             raise ValueError("must set at most one of 'offests', 'primary_key'")
         # define data source only for needed columns, this makes row-hashing cheaper
-        table_def = nodes.GbqTable.from_table(table, columns=schema.names)
+        table_def = bq_data.GbqTable.from_table(table, columns=schema.names)
 
         # create ordering from info
         ordering = None
@@ -114,12 +114,13 @@ def from_table(
         # Scan all columns by default, we define this list as it can be pruned while preserving source_def
         scan_list = nodes.ScanList(
             tuple(
-                nodes.ScanItem(ids.ColumnId(item.column), item.dtype, item.column)
+                nodes.ScanItem(ids.ColumnId(item.column), item.column)
                 for item in schema.items
             )
         )
-        source_def = nodes.BigqueryDataSource(
+        source_def = bq_data.BigqueryDataSource(
             table=table_def,
+            schema=schema,
             at_time=at_time,
             sql_predicate=predicate,
             ordering=ordering,
@@ -130,7 +131,7 @@ def from_table(
     @classmethod
     def from_bq_data_source(
         cls,
-        source: nodes.BigqueryDataSource,
+        source: bq_data.BigqueryDataSource,
         scan_list: nodes.ScanList,
         session: Session,
     ):
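The net effect at these call sites: `ScanItem` no longer carries a dtype, and the logical schema now travels with the data source itself. A minimal sketch of the resulting construction pattern (the standalone helper and its signature are illustrative, not part of the commit):

```python
from bigframes.core import bq_data, nodes
import bigframes.core.identifiers as ids


def build_source(table, schema, at_time=None, predicate=None, ordering=None):
    # ScanItem now records only the bigframes column id and the source column name.
    scan_list = nodes.ScanList(
        tuple(
            nodes.ScanItem(ids.ColumnId(item.column), item.column)
            for item in schema.items
        )
    )
    # The logical schema is attached to the BigQuery data source directly.
    source_def = bq_data.BigqueryDataSource(
        table=bq_data.GbqTable.from_table(table, columns=schema.names),
        schema=schema,
        at_time=at_time,
        sql_predicate=predicate,
        ordering=ordering,
    )
    return source_def, scan_list
```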

bigframes/core/bq_data.py

Lines changed: 84 additions & 0 deletions
@@ -0,0 +1,84 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import dataclasses
+import datetime
+import functools
+import typing
+from typing import Optional, Sequence, Tuple
+
+import google.cloud.bigquery as bq
+
+import bigframes.core.schema
+
+if typing.TYPE_CHECKING:
+    import bigframes.core.ordering as orderings
+
+
+@dataclasses.dataclass(frozen=True)
+class GbqTable:
+    project_id: str = dataclasses.field()
+    dataset_id: str = dataclasses.field()
+    table_id: str = dataclasses.field()
+    physical_schema: Tuple[bq.SchemaField, ...] = dataclasses.field()
+    is_physically_stored: bool = dataclasses.field()
+    cluster_cols: typing.Optional[Tuple[str, ...]]
+
+    @staticmethod
+    def from_table(table: bq.Table, columns: Sequence[str] = ()) -> GbqTable:
+        # Subsetting fields with columns can reduce cost of row-hash default ordering
+        if columns:
+            schema = tuple(item for item in table.schema if item.name in columns)
+        else:
+            schema = tuple(table.schema)
+        return GbqTable(
+            project_id=table.project,
+            dataset_id=table.dataset_id,
+            table_id=table.table_id,
+            physical_schema=schema,
+            is_physically_stored=(table.table_type in ["TABLE", "MATERIALIZED_VIEW"]),
+            cluster_cols=None
+            if table.clustering_fields is None
+            else tuple(table.clustering_fields),
+        )
+
+    def get_table_ref(self) -> bq.TableReference:
+        return bq.TableReference(
+            bq.DatasetReference(self.project_id, self.dataset_id), self.table_id
+        )
+
+    @property
+    @functools.cache
+    def schema_by_id(self):
+        return {col.name: col for col in self.physical_schema}
+
+
+@dataclasses.dataclass(frozen=True)
+class BigqueryDataSource:
+    """
+    Google BigQuery Data source.
+
+    This should not be modified once defined, as all attributes contribute to the default ordering.
+    """
+
+    table: GbqTable
+    schema: bigframes.core.schema.ArraySchema
+    at_time: typing.Optional[datetime.datetime] = None
+    # Added for backwards compatibility, not validated
+    sql_predicate: typing.Optional[str] = None
+    ordering: typing.Optional[orderings.RowOrdering] = None
+    # Optimization field
+    n_rows: Optional[int] = None
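A rough usage sketch of the relocated classes; the BigQuery client call and the table name are assumptions for illustration, not part of the commit:

```python
import google.cloud.bigquery as bq

from bigframes.core import bq_data

client = bq.Client()  # hypothetical client; requires ambient credentials
table = client.get_table("my-project.my_dataset.my_table")  # hypothetical table

gbq_table = bq_data.GbqTable.from_table(table, columns=("a", "b"))
print(gbq_table.get_table_ref())    # TableReference rebuilt from the stored ids
print(gbq_table.cluster_cols)       # clustering columns as a tuple, or None
print(gbq_table.schema_by_id["a"])  # physical SchemaField looked up by column name
```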

bigframes/core/compile/ibis_compiler/ibis_compiler.py

Lines changed: 3 additions & 3 deletions
@@ -24,7 +24,7 @@
 import bigframes_vendored.ibis.expr.types as ibis_types
 
 from bigframes import dtypes, operations
-from bigframes.core import expression, pyarrow_utils
+from bigframes.core import bq_data, expression, pyarrow_utils
 import bigframes.core.compile.compiled as compiled
 import bigframes.core.compile.concat as concat_impl
 import bigframes.core.compile.configs as configs
@@ -186,7 +186,7 @@ def compile_readtable(node: nodes.ReadTableNode, *args):
     # TODO(b/395912450): Remove workaround solution once b/374784249 got resolved.
     for scan_item in node.scan_list.items:
         if (
-            scan_item.dtype == dtypes.JSON_DTYPE
+            node.source.schema.get_type(scan_item.source_id) == dtypes.JSON_DTYPE
             and ibis_table[scan_item.source_id].type() == ibis_dtypes.string
         ):
             json_column = scalar_op_registry.parse_json(
@@ -204,7 +204,7 @@ def compile_readtable(node: nodes.ReadTableNode, *args):
 
 
 def _table_to_ibis(
-    source: nodes.BigqueryDataSource,
+    source: bq_data.BigqueryDataSource,
     scan_cols: typing.Sequence[str],
 ) -> ibis_types.Table:
     full_table_name = (
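With the dtype gone from `ScanItem`, the compiler resolves logical types from the data source's schema, keyed by the scanned column's source id. A hedged sketch of the lookup the JSON workaround now performs (the helper function and the `ibis_dtypes` import path are assumptions mirroring the module's existing alias):

```python
import bigframes_vendored.ibis.expr.datatypes as ibis_dtypes  # assumed vendored alias

from bigframes import dtypes


def needs_json_parse(node, scan_item, ibis_table) -> bool:
    # The logical type comes from the data source schema rather than the scan item.
    logical_type = node.source.schema.get_type(scan_item.source_id)
    physical_type = ibis_table[scan_item.source_id].type()
    return logical_type == dtypes.JSON_DTYPE and physical_type == ibis_dtypes.string
```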

bigframes/core/nodes.py

Lines changed: 16 additions & 69 deletions
@@ -16,7 +16,6 @@
 
 import abc
 import dataclasses
-import datetime
 import functools
 import itertools
 import typing
@@ -31,9 +30,7 @@
     Tuple,
 )
 
-import google.cloud.bigquery as bq
-
-from bigframes.core import agg_expressions, identifiers, local_data, sequences
+from bigframes.core import agg_expressions, bq_data, identifiers, local_data, sequences
 from bigframes.core.bigframe_node import BigFrameNode, COLUMN_SET
 import bigframes.core.expression as ex
 from bigframes.core.field import Field
@@ -599,14 +596,13 @@ def transform_children(self, t: Callable[[BigFrameNode], BigFrameNode]) -> LeafN
 
 class ScanItem(typing.NamedTuple):
     id: identifiers.ColumnId
-    dtype: bigframes.dtypes.Dtype  # Might be multiple logical types for a given physical source type
     source_id: str  # Flexible enough for both local data and bq data
 
     def with_id(self, id: identifiers.ColumnId) -> ScanItem:
-        return ScanItem(id, self.dtype, self.source_id)
+        return ScanItem(id, self.source_id)
 
     def with_source_id(self, source_id: str) -> ScanItem:
-        return ScanItem(self.id, self.dtype, source_id)
+        return ScanItem(self.id, source_id)
 
 
 @dataclasses.dataclass(frozen=True)
@@ -661,7 +657,7 @@ def remap_source_ids(
     def append(
         self, source_id: str, dtype: bigframes.dtypes.Dtype, id: identifiers.ColumnId
    ) -> ScanList:
-        return ScanList((*self.items, ScanItem(id, dtype, source_id)))
+        return ScanList((*self.items, ScanItem(id, source_id)))
 
 
 @dataclasses.dataclass(frozen=True, eq=False)
@@ -677,8 +673,10 @@ class ReadLocalNode(LeafNode):
     @property
     def fields(self) -> Sequence[Field]:
         fields = tuple(
-            Field(col_id, dtype) for col_id, dtype, _ in self.scan_list.items
+            Field(col_id, self.local_data_source.schema.get_type(source_id))
+            for col_id, source_id in self.scan_list.items
         )
+
         if self.offsets_col is not None:
             return tuple(
                 itertools.chain(
@@ -726,7 +724,7 @@ def remap_vars(
     ) -> ReadLocalNode:
         new_scan_list = ScanList(
             tuple(
-                ScanItem(mappings.get(item.id, item.id), item.dtype, item.source_id)
+                ScanItem(mappings.get(item.id, item.id), item.source_id)
                 for item in self.scan_list.items
             )
         )
@@ -745,64 +743,10 @@ def remap_refs(
         return self
 
 
-@dataclasses.dataclass(frozen=True)
-class GbqTable:
-    project_id: str = dataclasses.field()
-    dataset_id: str = dataclasses.field()
-    table_id: str = dataclasses.field()
-    physical_schema: Tuple[bq.SchemaField, ...] = dataclasses.field()
-    is_physically_stored: bool = dataclasses.field()
-    cluster_cols: typing.Optional[Tuple[str, ...]]
-
-    @staticmethod
-    def from_table(table: bq.Table, columns: Sequence[str] = ()) -> GbqTable:
-        # Subsetting fields with columns can reduce cost of row-hash default ordering
-        if columns:
-            schema = tuple(item for item in table.schema if item.name in columns)
-        else:
-            schema = tuple(table.schema)
-        return GbqTable(
-            project_id=table.project,
-            dataset_id=table.dataset_id,
-            table_id=table.table_id,
-            physical_schema=schema,
-            is_physically_stored=(table.table_type in ["TABLE", "MATERIALIZED_VIEW"]),
-            cluster_cols=None
-            if table.clustering_fields is None
-            else tuple(table.clustering_fields),
-        )
-
-    def get_table_ref(self) -> bq.TableReference:
-        return bq.TableReference(
-            bq.DatasetReference(self.project_id, self.dataset_id), self.table_id
-        )
-
-    @property
-    @functools.cache
-    def schema_by_id(self):
-        return {col.name: col for col in self.physical_schema}
-
-
-@dataclasses.dataclass(frozen=True)
-class BigqueryDataSource:
-    """
-    Google BigQuery Data source.
-
-    This should not be modified once defined, as all attributes contribute to the default ordering.
-    """
-
-    table: GbqTable
-    at_time: typing.Optional[datetime.datetime] = None
-    # Added for backwards compatibility, not validated
-    sql_predicate: typing.Optional[str] = None
-    ordering: typing.Optional[orderings.RowOrdering] = None
-    n_rows: Optional[int] = None
-
-
 ## Put ordering in here or just add order_by node above?
 @dataclasses.dataclass(frozen=True, eq=False)
 class ReadTableNode(LeafNode):
-    source: BigqueryDataSource
+    source: bq_data.BigqueryDataSource
     # Subset of physical schema column
     # Mapping of table schema ids to bfet id.
     scan_list: ScanList
@@ -826,8 +770,12 @@ def session(self):
     @property
     def fields(self) -> Sequence[Field]:
         return tuple(
-            Field(col_id, dtype, self.source.table.schema_by_id[source_id].is_nullable)
-            for col_id, dtype, source_id in self.scan_list.items
+            Field(
+                col_id,
+                self.source.schema.get_type(source_id),
+                self.source.table.schema_by_id[source_id].is_nullable,
+            )
+            for col_id, source_id in self.scan_list.items
         )
 
     @property
@@ -886,7 +834,7 @@ def remap_vars(
     ) -> ReadTableNode:
         new_scan_list = ScanList(
             tuple(
-                ScanItem(mappings.get(item.id, item.id), item.dtype, item.source_id)
+                ScanItem(mappings.get(item.id, item.id), item.source_id)
                 for item in self.scan_list.items
             )
         )
@@ -907,7 +855,6 @@ def with_order_cols(self):
         new_scan_cols = [
             ScanItem(
                 identifiers.ColumnId.unique(),
-                dtype=bigframes.dtypes.convert_schema_field(field)[1],
                 source_id=field.name,
             )
             for field in self.source.table.physical_schema
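After the move, `ScanItem` is a two-field NamedTuple and node fields derive their dtypes from the owning data source rather than from the scan list. A minimal sketch of how a `ReadTableNode` now assembles its logical fields (a free-standing restatement of the property above, for illustration only):

```python
from bigframes.core.field import Field


def read_table_fields(node):
    # dtype comes from the logical schema on the data source; nullability still
    # comes from the physical BigQuery schema, keyed by source column name.
    return tuple(
        Field(
            col_id,
            node.source.schema.get_type(source_id),
            node.source.table.schema_by_id[source_id].is_nullable,
        )
        for col_id, source_id in node.scan_list.items  # two-field ScanItem
    )
```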

bigframes/core/rewrite/fold_row_count.py

Lines changed: 1 addition & 5 deletions
@@ -15,7 +15,6 @@
 
 import pyarrow as pa
 
-from bigframes import dtypes
 from bigframes.core import local_data, nodes
 from bigframes.operations import aggregations
 
@@ -34,10 +33,7 @@ def fold_row_counts(node: nodes.BigFrameNode) -> nodes.BigFrameNode:
         pa.table({"count": pa.array([node.child.row_count], type=pa.int64())})
     )
     scan_list = nodes.ScanList(
-        tuple(
-            nodes.ScanItem(out_id, dtypes.INT_DTYPE, "count")
-            for _, out_id in node.aggregations
-        )
+        tuple(nodes.ScanItem(out_id, "count") for _, out_id in node.aggregations)
     )
     return nodes.ReadLocalNode(
         local_data_source=local_data_source, scan_list=scan_list, session=node.session
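The rewrite no longer spells out `INT_DTYPE` for the synthetic count column; the int64 type is implied by the pyarrow-backed local data source. A sketch under the assumption that the enclosing call (not shown in this hunk) is `local_data.ManagedArrowTable.from_pyarrow`:

```python
import pyarrow as pa

from bigframes.core import local_data, nodes


def count_source(node):
    # One-row arrow table holding the child's row count; its int64 column type
    # supplies the dtype that ScanItem previously carried explicitly.
    local_data_source = local_data.ManagedArrowTable.from_pyarrow(
        pa.table({"count": pa.array([node.child.row_count], type=pa.int64())})
    )
    scan_list = nodes.ScanList(
        tuple(nodes.ScanItem(out_id, "count") for _, out_id in node.aggregations)
    )
    return local_data_source, scan_list
```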
