 from pyiceberg.utils.concurrent import ExecutorFactory
 from pyiceberg.utils.config import Config
 from pyiceberg.utils.datetime import millis_to_datetime
+from pyiceberg.utils.decimal import unscaled_to_decimal
 from pyiceberg.utils.deprecated import deprecation_message
 from pyiceberg.utils.properties import get_first_property_value, property_as_bool, property_as_int
 from pyiceberg.utils.singleton import Singleton
@@ -1384,7 +1385,6 @@ def _task_to_record_batches(
     positional_deletes: Optional[List[ChunkedArray]],
     case_sensitive: bool,
     name_mapping: Optional[NameMapping] = None,
-    use_large_types: bool = True,
     partition_spec: Optional[PartitionSpec] = None,
 ) -> Iterator[pa.RecordBatch]:
     _, _, path = _parse_location(task.file.file_path)
@@ -1420,13 +1420,7 @@ def _task_to_record_batches(
 
     fragment_scanner = ds.Scanner.from_fragment(
         fragment=fragment,
-        # With PyArrow 16.0.0 there is an issue with casting record-batches:
-        # https://github.com/apache/arrow/issues/41884
-        # https://github.com/apache/arrow/issues/43183
-        # Would be good to remove this later on
-        schema=_pyarrow_schema_ensure_large_types(physical_schema)
-        if use_large_types
-        else (_pyarrow_schema_ensure_small_types(physical_schema)),
+        schema=physical_schema,
         # This will push down the query to Arrow.
         # But in case there are positional deletes, we have to apply them first
         filter=pyarrow_filter if not positional_deletes else None,
@@ -1461,7 +1455,6 @@ def _task_to_record_batches(
             file_project_schema,
             current_batch,
             downcast_ns_timestamp_to_us=True,
-            use_large_types=use_large_types,
         )
 
         # Inject projected column values if available
@@ -1555,14 +1548,6 @@ def __init__(
         self._case_sensitive = case_sensitive
         self._limit = limit
 
-    @property
-    def _use_large_types(self) -> bool:
-        """Whether to represent data as large arrow types.
-
-        Defaults to True.
-        """
-        return property_as_bool(self._io.properties, PYARROW_USE_LARGE_TYPES_ON_READ, True)
-
     def to_table(self, tasks: Iterable[FileScanTask]) -> pa.Table:
         """Scan the Iceberg table and return a pa.Table.
 
@@ -1618,11 +1603,21 @@ def _table_from_scan_task(task: FileScanTask) -> Optional[pa.Table]:
 
         tables = [f.result() for f in completed_futures if f.result()]
 
+        arrow_schema = schema_to_pyarrow(self._projected_schema, include_field_ids=False)
+
         if len(tables) < 1:
-            return pa.Table.from_batches([], schema=schema_to_pyarrow(self._projected_schema, include_field_ids=False))
+            return pa.Table.from_batches([], schema=arrow_schema)
 
         result = pa.concat_tables(tables, promote_options="permissive")
 
+        if property_as_bool(self._io.properties, PYARROW_USE_LARGE_TYPES_ON_READ, False):
+            deprecation_message(
+                deprecated_in="0.10.0",
+                removed_in="0.11.0",
+                help_message=f"Property `{PYARROW_USE_LARGE_TYPES_ON_READ}` will be removed.",
+            )
+            result = result.cast(arrow_schema)
+
         if self._limit is not None:
             return result.slice(0, self._limit)
 
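With this change the fragment scanner always reads using the file's physical schema, and the deprecated `PYARROW_USE_LARGE_TYPES_ON_READ` property only triggers a single cast of the concatenated table to the projected Arrow schema. A minimal sketch of that cast mechanism, using a hypothetical one-column table rather than pyiceberg's own schema conversion:

import pyarrow as pa

# Stand-in for the concatenated scan result; strings come back as plain (small) types.
result = pa.table({"s": pa.array(["a", "b"], type=pa.string())})

# Stand-in for the projected Arrow schema when large types are requested.
large_schema = pa.schema([pa.field("s", pa.large_string())])

# The opt-in path is now just a whole-table cast after concatenation.
result = result.cast(large_schema)
print(result.schema)  # s: large_string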
@@ -1666,7 +1661,6 @@ def _record_batches_from_scan_tasks_and_deletes(
                 deletes_per_file.get(task.file.file_path),
                 self._case_sensitive,
                 self._table_metadata.name_mapping(),
-                self._use_large_types,
                 self._table_metadata.spec(),
             )
             for batch in batches:
@@ -1685,13 +1679,12 @@ def _to_requested_schema(
     batch: pa.RecordBatch,
     downcast_ns_timestamp_to_us: bool = False,
     include_field_ids: bool = False,
-    use_large_types: bool = True,
 ) -> pa.RecordBatch:
     # We could reuse some of these visitors
     struct_array = visit_with_partner(
         requested_schema,
         batch,
-        ArrowProjectionVisitor(file_schema, downcast_ns_timestamp_to_us, include_field_ids, use_large_types),
+        ArrowProjectionVisitor(file_schema, downcast_ns_timestamp_to_us, include_field_ids),
         ArrowAccessor(file_schema),
     )
     return pa.RecordBatch.from_struct_array(struct_array)
@@ -1701,20 +1694,27 @@ class ArrowProjectionVisitor(SchemaWithPartnerVisitor[pa.Array, Optional[pa.Array]]):
     _file_schema: Schema
     _include_field_ids: bool
     _downcast_ns_timestamp_to_us: bool
-    _use_large_types: bool
+    _use_large_types: Optional[bool]
 
     def __init__(
         self,
         file_schema: Schema,
         downcast_ns_timestamp_to_us: bool = False,
         include_field_ids: bool = False,
-        use_large_types: bool = True,
+        use_large_types: Optional[bool] = None,
     ) -> None:
         self._file_schema = file_schema
         self._include_field_ids = include_field_ids
         self._downcast_ns_timestamp_to_us = downcast_ns_timestamp_to_us
         self._use_large_types = use_large_types
 
+        if use_large_types is not None:
+            deprecation_message(
+                deprecated_in="0.10.0",
+                removed_in="0.11.0",
+                help_message="Argument `use_large_types` will be removed from ArrowProjectionVisitor",
+            )
+
     def _cast_if_needed(self, field: NestedField, values: pa.Array) -> pa.Array:
         file_field = self._file_schema.find_field(field.field_id)
 
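The switch to `Optional[bool] = None` is a sentinel-default pattern: only callers that still pass `use_large_types` explicitly trigger the deprecation message, while the new default stays silent and leaves the Arrow types as read. A small standalone sketch of the idea, not pyiceberg's actual classes:

from typing import Optional
import warnings

class Visitor:
    def __init__(self, use_large_types: Optional[bool] = None) -> None:
        self._use_large_types = use_large_types
        if use_large_types is not None:
            # Warn only when the caller opted in explicitly.
            warnings.warn("`use_large_types` is deprecated", DeprecationWarning, stacklevel=2)

Visitor()                       # silent: default behaviour
Visitor(use_large_types=False)  # warns; False later forces small Arrow types when casting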
@@ -1723,7 +1723,7 @@ def _cast_if_needed(self, field: NestedField, values: pa.Array) -> pa.Array:
                 target_schema = schema_to_pyarrow(
                     promote(file_field.field_type, field.field_type), include_field_ids=self._include_field_ids
                 )
-                if not self._use_large_types:
+                if self._use_large_types is False:
                     target_schema = _pyarrow_schema_ensure_small_types(target_schema)
                 return values.cast(target_schema)
             elif (target_type := schema_to_pyarrow(field.field_type, include_field_ids=self._include_field_ids)) != values.type:
@@ -1784,7 +1784,7 @@ def struct(
                 field_arrays.append(array)
                 fields.append(self._construct_field(field, array.type))
             elif field.optional:
-                arrow_type = schema_to_pyarrow(field.field_type, include_field_ids=False)
+                arrow_type = schema_to_pyarrow(field.field_type, include_field_ids=self._include_field_ids)
                 field_arrays.append(pa.nulls(len(struct_array), type=arrow_type))
                 fields.append(self._construct_field(field, arrow_type))
             else:
@@ -1896,7 +1896,7 @@ def visit_fixed(self, fixed_type: FixedType) -> str:
         return "FIXED_LEN_BYTE_ARRAY"
 
     def visit_decimal(self, decimal_type: DecimalType) -> str:
-        return "FIXED_LEN_BYTE_ARRAY"
+        return "INT32" if decimal_type.precision <= 9 else "INT64" if decimal_type.precision <= 18 else "FIXED_LEN_BYTE_ARRAY"
 
     def visit_boolean(self, boolean_type: BooleanType) -> str:
         return "BOOLEAN"
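The new `visit_decimal` mirrors the Parquet rule for decimals: an unscaled value of up to 9 digits fits in 32 bits, up to 18 digits fits in 64 bits, and anything wider needs a fixed-length byte array. A standalone illustration of the same rule (the function name here is illustrative, not part of the PR):

def decimal_physical_type(precision: int) -> str:
    # 10**9 - 1 < 2**31 - 1 and 10**18 - 1 < 2**63 - 1, so narrow decimals fit native ints.
    if precision <= 9:
        return "INT32"
    if precision <= 18:
        return "INT64"
    return "FIXED_LEN_BYTE_ARRAY"

assert decimal_physical_type(6) == "INT32"    # e.g. decimal(6, 2)
assert decimal_physical_type(16) == "INT64"   # e.g. decimal(16, 4)
assert decimal_physical_type(38) == "FIXED_LEN_BYTE_ARRAY"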
@@ -2370,8 +2370,13 @@ def data_file_statistics_from_parquet_metadata(
                             stats_col.iceberg_type, statistics.physical_type, stats_col.mode.length
                         )
 
-                        col_aggs[field_id].update_min(statistics.min)
-                        col_aggs[field_id].update_max(statistics.max)
+                        if isinstance(stats_col.iceberg_type, DecimalType) and statistics.physical_type != "FIXED_LEN_BYTE_ARRAY":
+                            scale = stats_col.iceberg_type.scale
+                            col_aggs[field_id].update_min(unscaled_to_decimal(statistics.min_raw, scale))
+                            col_aggs[field_id].update_max(unscaled_to_decimal(statistics.max_raw, scale))
+                        else:
+                            col_aggs[field_id].update_min(statistics.min)
+                            col_aggs[field_id].update_max(statistics.max)
 
                 except pyarrow.lib.ArrowNotImplementedError as e:
                     invalidate_col.add(field_id)
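For decimals stored as INT32/INT64, the decoded `statistics.min`/`max` come back as plain unscaled integers, so the new branch in this hunk rebuilds proper `Decimal` bounds from the raw values. Roughly what `unscaled_to_decimal` does, sketched here under the assumption that it is a plain scale shift:

from decimal import Decimal

def unscaled_to_decimal(unscaled: int, scale: int) -> Decimal:
    # An unscaled value of 12345 with scale 2 represents 123.45.
    return Decimal(unscaled).scaleb(-scale)

assert unscaled_to_decimal(12345, 2) == Decimal("123.45")
assert unscaled_to_decimal(-7, 3) == Decimal("-0.007")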