1616
1717import abc
1818import dataclasses
19- import datetime
2019import functools
2120import itertools
2221import typing
3130 Tuple ,
3231)
3332
34- import google .cloud .bigquery as bq
35-
36- from bigframes .core import agg_expressions , identifiers , local_data , sequences
33+ from bigframes .core import agg_expressions , bq_data , identifiers , local_data , sequences
3734from bigframes .core .bigframe_node import BigFrameNode , COLUMN_SET
3835import bigframes .core .expression as ex
3936from bigframes .core .field import Field
@@ -599,14 +596,13 @@ def transform_children(self, t: Callable[[BigFrameNode], BigFrameNode]) -> LeafN
599596
class ScanItem(typing.NamedTuple):
    """Pairs an output column id with the source column it is read from.

    The item carries no dtype; the ``fields`` properties visible in this file
    look the type up from the data source schema by ``source_id``.
    """

    id: identifiers.ColumnId
    source_id: str  # Flexible enough for both local data and bq data

    def with_id(self, id: identifiers.ColumnId) -> ScanItem:
        """Return a copy of this item with ``id`` replaced."""
        return self._replace(id=id)

    def with_source_id(self, source_id: str) -> ScanItem:
        """Return a copy of this item with ``source_id`` replaced."""
        return self._replace(source_id=source_id)
611607
612608@dataclasses .dataclass (frozen = True )
@@ -661,7 +657,7 @@ def remap_source_ids(
661657 def append (
662658 self , source_id : str , dtype : bigframes .dtypes .Dtype , id : identifiers .ColumnId
663659 ) -> ScanList :
664- return ScanList ((* self .items , ScanItem (id , dtype , source_id )))
660+ return ScanList ((* self .items , ScanItem (id , source_id )))
665661
666662
667663@dataclasses .dataclass (frozen = True , eq = False )
@@ -677,8 +673,10 @@ class ReadLocalNode(LeafNode):
677673 @property
678674 def fields (self ) -> Sequence [Field ]:
679675 fields = tuple (
680- Field (col_id , dtype ) for col_id , dtype , _ in self .scan_list .items
676+ Field (col_id , self .local_data_source .schema .get_type (source_id ))
677+ for col_id , source_id in self .scan_list .items
681678 )
679+
682680 if self .offsets_col is not None :
683681 return tuple (
684682 itertools .chain (
@@ -726,7 +724,7 @@ def remap_vars(
726724 ) -> ReadLocalNode :
727725 new_scan_list = ScanList (
728726 tuple (
729- ScanItem (mappings .get (item .id , item .id ), item .dtype , item . source_id )
727+ ScanItem (mappings .get (item .id , item .id ), item .source_id )
730728 for item in self .scan_list .items
731729 )
732730 )
@@ -745,64 +743,10 @@ def remap_refs(
745743 return self
746744
747745
748- @dataclasses .dataclass (frozen = True )
749- class GbqTable :
750- project_id : str = dataclasses .field ()
751- dataset_id : str = dataclasses .field ()
752- table_id : str = dataclasses .field ()
753- physical_schema : Tuple [bq .SchemaField , ...] = dataclasses .field ()
754- is_physically_stored : bool = dataclasses .field ()
755- cluster_cols : typing .Optional [Tuple [str , ...]]
756-
757- @staticmethod
758- def from_table (table : bq .Table , columns : Sequence [str ] = ()) -> GbqTable :
759- # Subsetting fields with columns can reduce cost of row-hash default ordering
760- if columns :
761- schema = tuple (item for item in table .schema if item .name in columns )
762- else :
763- schema = tuple (table .schema )
764- return GbqTable (
765- project_id = table .project ,
766- dataset_id = table .dataset_id ,
767- table_id = table .table_id ,
768- physical_schema = schema ,
769- is_physically_stored = (table .table_type in ["TABLE" , "MATERIALIZED_VIEW" ]),
770- cluster_cols = None
771- if table .clustering_fields is None
772- else tuple (table .clustering_fields ),
773- )
774-
775- def get_table_ref (self ) -> bq .TableReference :
776- return bq .TableReference (
777- bq .DatasetReference (self .project_id , self .dataset_id ), self .table_id
778- )
779-
780- @property
781- @functools .cache
782- def schema_by_id (self ):
783- return {col .name : col for col in self .physical_schema }
784-
785-
786- @dataclasses .dataclass (frozen = True )
787- class BigqueryDataSource :
788- """
789- Google BigQuery Data source.
790-
791- This should not be modified once defined, as all attributes contribute to the default ordering.
792- """
793-
794- table : GbqTable
795- at_time : typing .Optional [datetime .datetime ] = None
796- # Added for backwards compatibility, not validated
797- sql_predicate : typing .Optional [str ] = None
798- ordering : typing .Optional [orderings .RowOrdering ] = None
799- n_rows : Optional [int ] = None
800-
801-
802746## Put ordering in here or just add order_by node above?
803747@dataclasses .dataclass (frozen = True , eq = False )
804748class ReadTableNode (LeafNode ):
805- source : BigqueryDataSource
749+ source : bq_data . BigqueryDataSource
806750 # Subset of physical schema column
807751 # Mapping of table schema ids to bfet id.
808752 scan_list : ScanList
@@ -826,8 +770,12 @@ def session(self):
826770 @property
827771 def fields (self ) -> Sequence [Field ]:
828772 return tuple (
829- Field (col_id , dtype , self .source .table .schema_by_id [source_id ].is_nullable )
830- for col_id , dtype , source_id in self .scan_list .items
773+ Field (
774+ col_id ,
775+ self .source .schema .get_type (source_id ),
776+ self .source .table .schema_by_id [source_id ].is_nullable ,
777+ )
778+ for col_id , source_id in self .scan_list .items
831779 )
832780
833781 @property
@@ -886,7 +834,7 @@ def remap_vars(
886834 ) -> ReadTableNode :
887835 new_scan_list = ScanList (
888836 tuple (
889- ScanItem (mappings .get (item .id , item .id ), item .dtype , item . source_id )
837+ ScanItem (mappings .get (item .id , item .id ), item .source_id )
890838 for item in self .scan_list .items
891839 )
892840 )
@@ -907,7 +855,6 @@ def with_order_cols(self):
907855 new_scan_cols = [
908856 ScanItem (
909857 identifiers .ColumnId .unique (),
910- dtype = bigframes .dtypes .convert_schema_field (field )[1 ],
911858 source_id = field .name ,
912859 )
913860 for field in self .source .table .physical_schema
0 commit comments