@@ -175,6 +175,7 @@
 from pyiceberg.utils.concurrent import ExecutorFactory
 from pyiceberg.utils.config import Config
 from pyiceberg.utils.datetime import millis_to_datetime
+from pyiceberg.utils.deprecated import deprecation_message
 from pyiceberg.utils.properties import get_first_property_value, property_as_bool, property_as_int
 from pyiceberg.utils.singleton import Singleton
 from pyiceberg.utils.truncate import truncate_upper_bound_binary_string, truncate_upper_bound_text_string
@@ -1385,7 +1386,6 @@ def _task_to_record_batches(
     positional_deletes: Optional[List[ChunkedArray]],
     case_sensitive: bool,
     name_mapping: Optional[NameMapping] = None,
-    use_large_types: bool = True,
     partition_spec: Optional[PartitionSpec] = None,
 ) -> Iterator[pa.RecordBatch]:
     _, _, path = _parse_location(task.file.file_path)
@@ -1415,13 +1415,7 @@ def _task_to_record_batches(

         fragment_scanner = ds.Scanner.from_fragment(
             fragment=fragment,
-            # With PyArrow 16.0.0 there is an issue with casting record-batches:
-            # https://github.com/apache/arrow/issues/41884
-            # https://github.com/apache/arrow/issues/43183
-            # Would be good to remove this later on
-            schema=_pyarrow_schema_ensure_large_types(physical_schema)
-            if use_large_types
-            else (_pyarrow_schema_ensure_small_types(physical_schema)),
+            schema=physical_schema,
             # This will push down the query to Arrow.
             # But in case there are positional deletes, we have to apply them first
             filter=pyarrow_filter if not positional_deletes else None,
@@ -1456,7 +1450,6 @@ def _task_to_record_batches(
                 file_project_schema,
                 current_batch,
                 downcast_ns_timestamp_to_us=True,
-                use_large_types=use_large_types,
             )

             # Inject projected column values if available
@@ -1542,14 +1535,6 @@ def __init__(
         self._case_sensitive = case_sensitive
         self._limit = limit

-    @property
-    def _use_large_types(self) -> bool:
-        """Whether to represent data as large arrow types.
-
-        Defaults to True.
-        """
-        return property_as_bool(self._io.properties, PYARROW_USE_LARGE_TYPES_ON_READ, True)
-
     @property
     def _projected_field_ids(self) -> Set[int]:
         """Set of field IDs that should be projected from the data files."""
@@ -1611,11 +1596,21 @@ def _table_from_scan_task(task: FileScanTask) -> pa.Table:

         tables = [f.result() for f in completed_futures if f.result()]

+        arrow_schema = schema_to_pyarrow(self._projected_schema, include_field_ids=False)
+
         if len(tables) < 1:
-            return pa.Table.from_batches([], schema=schema_to_pyarrow(self._projected_schema, include_field_ids=False))
+            return pa.Table.from_batches([], schema=arrow_schema)

         result = pa.concat_tables(tables, promote_options="permissive")

+        if property_as_bool(self._io.properties, PYARROW_USE_LARGE_TYPES_ON_READ, False):
+            deprecation_message(
+                deprecated_in="0.10.0",
+                removed_in="0.11.0",
+                help_message=f"Property `{PYARROW_USE_LARGE_TYPES_ON_READ}` will be removed.",
+            )
+            result = result.cast(arrow_schema)
+
         if self._limit is not None:
             return result.slice(0, self._limit)

@@ -1658,7 +1653,6 @@ def _record_batches_from_scan_tasks_and_deletes(
                 deletes_per_file.get(task.file.file_path),
                 self._case_sensitive,
                 self._table_metadata.name_mapping(),
-                self._use_large_types,
                 self._table_metadata.spec(),
             )
             for batch in batches:
@@ -1677,13 +1671,12 @@ def _to_requested_schema(
     batch: pa.RecordBatch,
     downcast_ns_timestamp_to_us: bool = False,
     include_field_ids: bool = False,
-    use_large_types: bool = True,
 ) -> pa.RecordBatch:
     # We could reuse some of these visitors
     struct_array = visit_with_partner(
         requested_schema,
         batch,
-        ArrowProjectionVisitor(file_schema, downcast_ns_timestamp_to_us, include_field_ids, use_large_types),
+        ArrowProjectionVisitor(file_schema, downcast_ns_timestamp_to_us, include_field_ids),
         ArrowAccessor(file_schema),
     )
     return pa.RecordBatch.from_struct_array(struct_array)
@@ -1693,20 +1686,27 @@ class ArrowProjectionVisitor(SchemaWithPartnerVisitor[pa.Array, Optional[pa.Array]]):
     _file_schema: Schema
     _include_field_ids: bool
     _downcast_ns_timestamp_to_us: bool
-    _use_large_types: bool
+    _use_large_types: Optional[bool]

     def __init__(
         self,
         file_schema: Schema,
         downcast_ns_timestamp_to_us: bool = False,
         include_field_ids: bool = False,
-        use_large_types: bool = True,
+        use_large_types: Optional[bool] = None,
     ) -> None:
         self._file_schema = file_schema
         self._include_field_ids = include_field_ids
         self._downcast_ns_timestamp_to_us = downcast_ns_timestamp_to_us
         self._use_large_types = use_large_types

+        if use_large_types is not None:
+            deprecation_message(
+                deprecated_in="0.10.0",
+                removed_in="0.11.0",
+                help_message="Argument `use_large_types` will be removed from ArrowProjectionVisitor",
+            )
+
     def _cast_if_needed(self, field: NestedField, values: pa.Array) -> pa.Array:
         file_field = self._file_schema.find_field(field.field_id)

@@ -1715,7 +1715,7 @@ def _cast_if_needed(self, field: NestedField, values: pa.Array) -> pa.Array:
                 target_schema = schema_to_pyarrow(
                     promote(file_field.field_type, field.field_type), include_field_ids=self._include_field_ids
                 )
-                if not self._use_large_types:
+                if self._use_large_types is False:
                     target_schema = _pyarrow_schema_ensure_small_types(target_schema)
                 return values.cast(target_schema)
             elif (target_type := schema_to_pyarrow(field.field_type, include_field_ids=self._include_field_ids)) != values.type:
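
Usage note (not part of the diff): a minimal sketch of how the read path behaves after this change. The catalog name "default" and table identifier "examples.events" are hypothetical, and the string key is assumed to match the PYARROW_USE_LARGE_TYPES_ON_READ constant referenced above.

    from pyiceberg.catalog import load_catalog

    # Reads no longer force large Arrow types up front; batches keep the
    # physical types of the underlying Parquet files. Setting the deprecated
    # property restores the old behaviour by casting the final result to the
    # projected Arrow schema, and now also emits the 0.10.0 -> 0.11.0
    # deprecation message added in this diff.
    catalog = load_catalog("default", **{"pyarrow.use-large-types-on-read": "true"})
    table = catalog.load_table("examples.events")  # hypothetical identifier
    arrow_table = table.scan().to_arrow()          # warns that the property will be removed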