
Commit d9c8ddf

Merge branch 'main' of github.com:apache/iceberg-python into fd-integrate-residuals
2 parents: 93debf1 + 4d4714a

File tree

19 files changed: +581 / -358 lines

poetry.lock

Lines changed: 233 additions & 230 deletions
Generated file; the diff is not rendered by default.

pyiceberg/expressions/__init__.py

Lines changed: 44 additions & 3 deletions
@@ -18,11 +18,13 @@
 from __future__ import annotations
 
 from abc import ABC, abstractmethod
-from functools import cached_property, reduce
+from functools import cached_property
 from typing import (
     Any,
+    Callable,
     Generic,
     Iterable,
+    Sequence,
     Set,
     Tuple,
     Type,
@@ -79,6 +81,45 @@ def __or__(self, other: BooleanExpression) -> BooleanExpression:
         return Or(self, other)
 
 
+def _build_balanced_tree(
+    operator_: Callable[[BooleanExpression, BooleanExpression], BooleanExpression], items: Sequence[BooleanExpression]
+) -> BooleanExpression:
+    """
+    Recursively constructs a balanced binary tree of BooleanExpressions using the provided binary operator.
+
+    This function is a safer and more scalable alternative to:
+        reduce(operator_, items)
+
+    Using `reduce` creates a deeply nested, unbalanced tree (e.g., operator_(a, operator_(b, operator_(c, ...)))),
+    which grows linearly with the number of items. This can lead to RecursionError exceptions in Python
+    when the number of expressions is large (e.g., >1000).
+
+    In contrast, this function builds a balanced binary tree with logarithmic depth (O(log n)),
+    helping avoid recursion issues and ensuring that expression trees remain stable, predictable,
+    and safe to traverse — especially in tools like PyIceberg that operate on large logical trees.
+
+    Parameters:
+        operator_ (Callable): A binary operator function (e.g., pyiceberg.expressions.Or, And) that takes two
+            BooleanExpressions and returns a combined BooleanExpression.
+        items (Sequence[BooleanExpression]): A sequence of BooleanExpression objects to combine.
+
+    Returns:
+        BooleanExpression: The balanced combination of all input BooleanExpressions.
+
+    Raises:
+        ValueError: If the input sequence is empty.
+    """
+    if not items:
+        raise ValueError("No expressions to combine")
+    if len(items) == 1:
+        return items[0]
+    mid = len(items) // 2
+
+    left = _build_balanced_tree(operator_, items[:mid])
+    right = _build_balanced_tree(operator_, items[mid:])
+    return operator_(left, right)
+
+
 class Term(Generic[L], ABC):
     """A simple expression that evaluates to a value."""
 
@@ -214,7 +255,7 @@ class And(BooleanExpression):
 
     def __new__(cls, left: BooleanExpression, right: BooleanExpression, *rest: BooleanExpression) -> BooleanExpression:  # type: ignore
         if rest:
-            return reduce(And, (left, right, *rest))
+            return _build_balanced_tree(And, (left, right, *rest))
         if left is AlwaysFalse() or right is AlwaysFalse():
             return AlwaysFalse()
         elif left is AlwaysTrue():
@@ -257,7 +298,7 @@ class Or(BooleanExpression):
 
     def __new__(cls, left: BooleanExpression, right: BooleanExpression, *rest: BooleanExpression) -> BooleanExpression:  # type: ignore
         if rest:
-            return reduce(Or, (left, right, *rest))
+            return _build_balanced_tree(Or, (left, right, *rest))
         if left is AlwaysTrue() or right is AlwaysTrue():
             return AlwaysTrue()
         elif left is AlwaysFalse():
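
The practical effect is easiest to see with a wide predicate. A minimal sketch, not part of the commit (the column name and values are illustrative):

    from pyiceberg.expressions import EqualTo, Or

    predicates = [EqualTo("id", i) for i in range(5_000)]

    # With reduce() this produced a nested chain ~5000 levels deep, which expression
    # visitors could only walk by recursing once per level; with _build_balanced_tree
    # the result is a balanced tree of depth ~log2(5000) ≈ 13.
    expr = Or(*predicates)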

pyiceberg/io/pyarrow.py

Lines changed: 33 additions & 28 deletions
@@ -175,6 +175,7 @@
 from pyiceberg.utils.concurrent import ExecutorFactory
 from pyiceberg.utils.config import Config
 from pyiceberg.utils.datetime import millis_to_datetime
+from pyiceberg.utils.decimal import unscaled_to_decimal
 from pyiceberg.utils.deprecated import deprecation_message
 from pyiceberg.utils.properties import get_first_property_value, property_as_bool, property_as_int
 from pyiceberg.utils.singleton import Singleton
@@ -1384,7 +1385,6 @@ def _task_to_record_batches(
     positional_deletes: Optional[List[ChunkedArray]],
     case_sensitive: bool,
     name_mapping: Optional[NameMapping] = None,
-    use_large_types: bool = True,
     partition_spec: Optional[PartitionSpec] = None,
 ) -> Iterator[pa.RecordBatch]:
     _, _, path = _parse_location(task.file.file_path)
@@ -1420,13 +1420,7 @@
 
     fragment_scanner = ds.Scanner.from_fragment(
         fragment=fragment,
-        # With PyArrow 16.0.0 there is an issue with casting record-batches:
-        # https://github.com/apache/arrow/issues/41884
-        # https://github.com/apache/arrow/issues/43183
-        # Would be good to remove this later on
-        schema=_pyarrow_schema_ensure_large_types(physical_schema)
-        if use_large_types
-        else (_pyarrow_schema_ensure_small_types(physical_schema)),
+        schema=physical_schema,
         # This will push down the query to Arrow.
         # But in case there are positional deletes, we have to apply them first
         filter=pyarrow_filter if not positional_deletes else None,
@@ -1461,7 +1455,6 @@
             file_project_schema,
             current_batch,
             downcast_ns_timestamp_to_us=True,
-            use_large_types=use_large_types,
         )
 
         # Inject projected column values if available
@@ -1555,14 +1548,6 @@ def __init__(
         self._case_sensitive = case_sensitive
         self._limit = limit
 
-    @property
-    def _use_large_types(self) -> bool:
-        """Whether to represent data as large arrow types.
-
-        Defaults to True.
-        """
-        return property_as_bool(self._io.properties, PYARROW_USE_LARGE_TYPES_ON_READ, True)
-
     def to_table(self, tasks: Iterable[FileScanTask]) -> pa.Table:
         """Scan the Iceberg table and return a pa.Table.
 
@@ -1618,11 +1603,21 @@ def _table_from_scan_task(task: FileScanTask) -> Optional[pa.Table]:
 
         tables = [f.result() for f in completed_futures if f.result()]
 
+        arrow_schema = schema_to_pyarrow(self._projected_schema, include_field_ids=False)
+
         if len(tables) < 1:
-            return pa.Table.from_batches([], schema=schema_to_pyarrow(self._projected_schema, include_field_ids=False))
+            return pa.Table.from_batches([], schema=arrow_schema)
 
         result = pa.concat_tables(tables, promote_options="permissive")
 
+        if property_as_bool(self._io.properties, PYARROW_USE_LARGE_TYPES_ON_READ, False):
+            deprecation_message(
+                deprecated_in="0.10.0",
+                removed_in="0.11.0",
+                help_message=f"Property `{PYARROW_USE_LARGE_TYPES_ON_READ}` will be removed.",
+            )
+            result = result.cast(arrow_schema)
+
         if self._limit is not None:
             return result.slice(0, self._limit)
 
@@ -1666,7 +1661,6 @@ def _record_batches_from_scan_tasks_and_deletes(
                 deletes_per_file.get(task.file.file_path),
                 self._case_sensitive,
                 self._table_metadata.name_mapping(),
-                self._use_large_types,
                 self._table_metadata.spec(),
             )
             for batch in batches:
@@ -1685,13 +1679,12 @@ def _to_requested_schema(
     batch: pa.RecordBatch,
     downcast_ns_timestamp_to_us: bool = False,
     include_field_ids: bool = False,
-    use_large_types: bool = True,
 ) -> pa.RecordBatch:
     # We could reuse some of these visitors
     struct_array = visit_with_partner(
         requested_schema,
         batch,
-        ArrowProjectionVisitor(file_schema, downcast_ns_timestamp_to_us, include_field_ids, use_large_types),
+        ArrowProjectionVisitor(file_schema, downcast_ns_timestamp_to_us, include_field_ids),
         ArrowAccessor(file_schema),
    )
    return pa.RecordBatch.from_struct_array(struct_array)
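
With the per-batch large/small-type coercion gone, the read path scans with the file's physical schema and only casts the concatenated result when the deprecated PYARROW_USE_LARGE_TYPES_ON_READ property is set. A caller who still wants large Arrow types can cast explicitly instead; a minimal sketch, with illustrative catalog and table names:

    import pyarrow as pa
    from pyiceberg.catalog import load_catalog

    tbl = load_catalog("default").load_table("examples.events")
    result = tbl.scan().to_arrow()

    # Rebuild the schema with large variants where desired (here: strings only)
    # and cast the table once, rather than relying on the deprecated read property.
    large_schema = pa.schema(
        pa.field(f.name, pa.large_string() if pa.types.is_string(f.type) else f.type, f.nullable)
        for f in result.schema
    )
    result = result.cast(large_schema)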
@@ -1701,20 +1694,27 @@ class ArrowProjectionVisitor(SchemaWithPartnerVisitor[pa.Array, Optional[pa.Arra
     _file_schema: Schema
     _include_field_ids: bool
     _downcast_ns_timestamp_to_us: bool
-    _use_large_types: bool
+    _use_large_types: Optional[bool]
 
     def __init__(
         self,
         file_schema: Schema,
         downcast_ns_timestamp_to_us: bool = False,
         include_field_ids: bool = False,
-        use_large_types: bool = True,
+        use_large_types: Optional[bool] = None,
     ) -> None:
         self._file_schema = file_schema
         self._include_field_ids = include_field_ids
         self._downcast_ns_timestamp_to_us = downcast_ns_timestamp_to_us
         self._use_large_types = use_large_types
 
+        if use_large_types is not None:
+            deprecation_message(
+                deprecated_in="0.10.0",
+                removed_in="0.11.0",
+                help_message="Argument `use_large_types` will be removed from ArrowProjectionVisitor",
+            )
+
     def _cast_if_needed(self, field: NestedField, values: pa.Array) -> pa.Array:
         file_field = self._file_schema.find_field(field.field_id)
 
@@ -1723,7 +1723,7 @@ def _cast_if_needed(self, field: NestedField, values: pa.Array) -> pa.Array:
             target_schema = schema_to_pyarrow(
                 promote(file_field.field_type, field.field_type), include_field_ids=self._include_field_ids
             )
-            if not self._use_large_types:
+            if self._use_large_types is False:
                 target_schema = _pyarrow_schema_ensure_small_types(target_schema)
             return values.cast(target_schema)
         elif (target_type := schema_to_pyarrow(field.field_type, include_field_ids=self._include_field_ids)) != values.type:
@@ -1784,7 +1784,7 @@ def struct(
                 field_arrays.append(array)
                 fields.append(self._construct_field(field, array.type))
             elif field.optional:
-                arrow_type = schema_to_pyarrow(field.field_type, include_field_ids=False)
+                arrow_type = schema_to_pyarrow(field.field_type, include_field_ids=self._include_field_ids)
                 field_arrays.append(pa.nulls(len(struct_array), type=arrow_type))
                 fields.append(self._construct_field(field, arrow_type))
             else:
@@ -1896,7 +1896,7 @@ def visit_fixed(self, fixed_type: FixedType) -> str:
         return "FIXED_LEN_BYTE_ARRAY"
 
     def visit_decimal(self, decimal_type: DecimalType) -> str:
-        return "FIXED_LEN_BYTE_ARRAY"
+        return "INT32" if decimal_type.precision <= 9 else "INT64" if decimal_type.precision <= 18 else "FIXED_LEN_BYTE_ARRAY"
 
     def visit_boolean(self, boolean_type: BooleanType) -> str:
         return "BOOLEAN"
@@ -2370,8 +2370,13 @@ def data_file_statistics_from_parquet_metadata(
                         stats_col.iceberg_type, statistics.physical_type, stats_col.mode.length
                     )
 
-                col_aggs[field_id].update_min(statistics.min)
-                col_aggs[field_id].update_max(statistics.max)
+                if isinstance(stats_col.iceberg_type, DecimalType) and statistics.physical_type != "FIXED_LEN_BYTE_ARRAY":
+                    scale = stats_col.iceberg_type.scale
+                    col_aggs[field_id].update_min(unscaled_to_decimal(statistics.min_raw, scale))
+                    col_aggs[field_id].update_max(unscaled_to_decimal(statistics.max_raw, scale))
+                else:
+                    col_aggs[field_id].update_min(statistics.min)
+                    col_aggs[field_id].update_max(statistics.max)
 
             except pyarrow.lib.ArrowNotImplementedError as e:
                 invalidate_col.add(field_id)
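
The last two hunks go together: small decimals are now written with INT32/INT64 physical types, and their Parquet column statistics arrive as raw unscaled integers that must be turned back into Decimal values. A minimal sketch of both rules, with helper names and values that are illustrative only (the real conversion lives in pyiceberg.utils.decimal.unscaled_to_decimal):

    from decimal import Decimal

    def parquet_physical_type(precision: int) -> str:
        # Mirrors visit_decimal above: small precisions fit in INT32/INT64,
        # everything else falls back to FIXED_LEN_BYTE_ARRAY.
        if precision <= 9:
            return "INT32"
        if precision <= 18:
            return "INT64"
        return "FIXED_LEN_BYTE_ARRAY"

    def unscaled_to_decimal(unscaled: int, scale: int) -> Decimal:
        # Re-apply the scale to the raw integer statistic, e.g. 12345 at scale 2 -> 123.45.
        return Decimal(unscaled).scaleb(-scale)

    assert parquet_physical_type(9) == "INT32"
    assert parquet_physical_type(18) == "INT64"
    assert unscaled_to_decimal(12345, 2) == Decimal("123.45")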

pyiceberg/table/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1793,7 +1793,7 @@ def to_arrow_batch_reader(self) -> pa.RecordBatchReader:
         return pa.RecordBatchReader.from_batches(
             target_schema,
             batches,
-        )
+        ).cast(target_schema)
 
     def to_pandas(self, **kwargs: Any) -> pd.DataFrame:
         """Read a Pandas DataFrame eagerly from this Iceberg table.

pyiceberg/table/upsert_util.py

Lines changed: 17 additions & 2 deletions
@@ -26,6 +26,7 @@
     BooleanExpression,
     EqualTo,
     In,
+    Or,
 )
 
 
@@ -39,7 +40,12 @@ def create_match_filter(df: pyarrow_table, join_cols: list[str]) -> BooleanExpre
         functools.reduce(operator.and_, [EqualTo(col, row[col]) for col in join_cols]) for row in unique_keys.to_pylist()
     ]
 
-    return AlwaysFalse() if len(filters) == 0 else functools.reduce(operator.or_, filters)
+    if len(filters) == 0:
+        return AlwaysFalse()
+    elif len(filters) == 1:
+        return filters[0]
+    else:
+        return Or(*filters)
 
 
 def has_duplicate_rows(df: pyarrow_table, join_cols: list[str]) -> bool:
@@ -65,7 +71,16 @@ def get_rows_to_update(source_table: pa.Table, target_table: pa.Table, join_cols
         # When the target table is empty, there is nothing to update :)
         return source_table.schema.empty_table()
 
-    diff_expr = functools.reduce(operator.or_, [pc.field(f"{col}-lhs") != pc.field(f"{col}-rhs") for col in non_key_cols])
+    diff_expr = functools.reduce(
+        operator.or_,
+        [
+            pc.or_kleene(
+                pc.not_equal(pc.field(f"{col}-lhs"), pc.field(f"{col}-rhs")),
+                pc.is_null(pc.not_equal(pc.field(f"{col}-lhs"), pc.field(f"{col}-rhs"))),
+            )
+            for col in non_key_cols
+        ],
+    )
 
     return (
         source_table
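
The or_kleene/is_null pair treats a NULL on either side as a difference, which a plain != would silently drop. A minimal standalone illustration (array values are made up):

    import pyarrow as pa
    import pyarrow.compute as pc

    lhs = pa.array([1, None, 3])
    rhs = pa.array([1, 2, None])

    plain = pc.not_equal(lhs, rhs)                      # [false, null, null] -> nulls vanish in a filter
    null_safe = pc.or_kleene(plain, pc.is_null(plain))  # [false, true, true] -> null mismatches are kept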

pyiceberg/transforms.py

Lines changed: 1 addition & 1 deletion
@@ -703,7 +703,7 @@ def hour_func(v: Any) -> int:
 
     elif isinstance(source, (TimestampNanoType, TimestamptzNanoType)):
 
-        def day_func(v: Any) -> int:
+        def hour_func(v: Any) -> int:
             # python datetime has no nanoseconds support.
             # nanosecond datetimes will be expressed as int as a workaround
             return datetime.nanos_to_hours(v)
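
The inner function for nanosecond sources was misnamed day_func; the fix restores the intended hour_func. A minimal sketch of the arithmetic it delegates to (constant and value are illustrative; the real helper is pyiceberg.utils.datetime.nanos_to_hours):

    NANOS_PER_HOUR = 60 * 60 * 1_000_000_000

    def nanos_to_hours(nanos: int) -> int:
        # hours since the epoch for a nanosecond timestamp
        return nanos // NANOS_PER_HOUR

    # 2024-01-01T01:30:00+00:00 expressed as nanoseconds since 1970-01-01
    assert nanos_to_hours(1_704_072_600_000_000_000) == 473_353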

pyiceberg/types.py

Lines changed: 20 additions & 1 deletion
@@ -47,6 +47,7 @@
     Field,
     PrivateAttr,
     SerializeAsAny,
+    field_validator,
     model_serializer,
     model_validator,
 )
@@ -310,6 +311,14 @@ class NestedField(IcebergType):
         ...     doc="Just a long"
         ... ))
         '2: bar: required long (Just a long)'
+        >>> str(NestedField(
+        ...     field_id=3,
+        ...     name='baz',
+        ...     field_type="string",
+        ...     required=True,
+        ...     doc="A string field"
+        ... ))
+        '3: baz: required string (A string field)'
     """
 
     field_id: int = Field(alias="id")
@@ -320,11 +329,21 @@ class NestedField(IcebergType):
     initial_default: Optional[Any] = Field(alias="initial-default", default=None, repr=False)
     write_default: Optional[L] = Field(alias="write-default", default=None, repr=False)  # type: ignore
 
+    @field_validator("field_type", mode="before")
+    def convert_field_type(cls, v: Any) -> IcebergType:
+        """Convert string values into IcebergType instances."""
+        if isinstance(v, str):
+            try:
+                return IcebergType.handle_primitive_type(v, None)
+            except ValueError as e:
+                raise ValueError(f"Unsupported field type: '{v}'") from e
+        return v
+
     def __init__(
         self,
         field_id: Optional[int] = None,
         name: Optional[str] = None,
-        field_type: Optional[IcebergType] = None,
+        field_type: Optional[IcebergType | str] = None,
         required: bool = False,
         doc: Optional[str] = None,
         initial_default: Optional[Any] = None,
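
With the before-mode validator in place, primitive type names can be passed as plain strings and are converted to the corresponding IcebergType. A minimal sketch (field names are illustrative):

    from pyiceberg.types import NestedField, StringType

    # These two definitions are equivalent; the validator turns "string" into StringType().
    a = NestedField(field_id=1, name="name", field_type="string", required=True)
    b = NestedField(field_id=1, name="name", field_type=StringType(), required=True)
    assert a == b

    # An unknown name (e.g. "strng") is rejected with a validation error.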

pyproject.toml

Lines changed: 5 additions & 5 deletions
@@ -53,7 +53,7 @@ python = "^3.9.2, !=3.9.7"
 mmh3 = ">=4.0.0,<6.0.0"
 requests = ">=2.20.0,<3.0.0"
 click = ">=7.1.1,<9.0.0"
-rich = ">=10.11.0,<14.0.0"
+rich = ">=10.11.0,<15.0.0"
 strictyaml = ">=1.7.0,<2.0.0"  # CVE-2020-14343 was fixed in 5.4.
 pydantic = ">=2.0,<3.0,!=2.4.0,!=2.4.1"  # 2.4.0, 2.4.1 has a critical bug
 sortedcontainers = "2.4.0"
@@ -98,20 +98,20 @@ pytest-mock = "3.14.0"
 pyspark = "3.5.5"
 cython = "3.0.12"
 deptry = ">=0.14,<0.24"
-datafusion = ">=44,<46"
+datafusion = ">=44,<47"
 docutils = "!=0.21.post1"  # https://github.com/python-poetry/poetry/issues/9248#issuecomment-2026240520
 
 [tool.poetry.group.docs.dependencies]
 # for mkdocs
 mkdocs = "1.6.1"
-griffe = "1.6.3"
+griffe = "1.7.1"
 jinja2 = "3.1.6"
-mkdocstrings = "0.29.0"
+mkdocstrings = "0.29.1"
 mkdocstrings-python = "1.16.8"
 mkdocs-literate-nav = "0.6.2"
 mkdocs-autorefs = "1.4.1"
 mkdocs-gen-files = "0.5.0"
-mkdocs-material = "9.6.9"
+mkdocs-material = "9.6.10"
 mkdocs-material-extensions = "1.3.1"
 mkdocs-section-index = "0.3.9"