From 2262b51da29444cd4f50ea566f6c5d8ee820a7d4 Mon Sep 17 00:00:00 2001 From: Kristofer Date: Wed, 20 Aug 2025 14:23:10 -0400 Subject: [PATCH 1/2] Document null field handling for PyArrow --- pyiceberg/io/pyarrow.py | 5 +++++ pyiceberg/schema.py | 21 +++++++++++++++++++-- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index 7779a422f0..b49d58ffd3 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -779,6 +779,9 @@ def visit_uuid(self, _: UUIDType) -> pa.DataType: return pa.uuid() def visit_unknown(self, _: UnknownType) -> pa.DataType: + """ + UnknownType can be promoted to any primitive type in V3+ tables per the Iceberg spec + """ return pa.null() def visit_binary(self, _: BinaryType) -> pa.DataType: @@ -1358,6 +1361,8 @@ def primitive(self, primitive: pa.DataType) -> PrimitiveType: primitive = cast(pa.FixedSizeBinaryType, primitive) return FixedType(primitive.byte_width) elif pa.types.is_null(primitive): + # PyArrow null type (pa.null()) is converted to Iceberg UnknownType + # UnknownType can be promoted to any primitive type in V3+ tables per the Iceberg spec return UnknownType() elif isinstance(primitive, pa.UuidType): return UUIDType() diff --git a/pyiceberg/schema.py b/pyiceberg/schema.py index 1eadc58361..0f335c2a66 100644 --- a/pyiceberg/schema.py +++ b/pyiceberg/schema.py @@ -1692,6 +1692,15 @@ def _(file_type: FixedType, read_type: IcebergType) -> IcebergType: raise ResolveError(f"Cannot promote {file_type} to {read_type}") +@promote.register(UnknownType) +def _(file_type: UnknownType, read_type: IcebergType) -> IcebergType: + # Per V3 Spec, "Unknown" can be promoted to any Primitive type + if isinstance(read_type, PrimitiveType): + return read_type + else: + raise ResolveError(f"Cannot promote {file_type} to {read_type}") + + def _check_schema_compatible(requested_schema: Schema, provided_schema: Schema) -> None: """ Check if the `provided_schema` is compatible with `requested_schema`. @@ -1760,8 +1769,16 @@ def _is_field_compatible(self, lhs: NestedField) -> bool: promote(rhs.field_type, lhs.field_type) self.rich_table.add_row("✅", str(lhs), str(rhs)) return True - except ResolveError: - self.rich_table.add_row("❌", str(lhs), str(rhs)) + except ResolveError as e: + # UnknownType can only be promoted to Primitive types + if isinstance(rhs.field_type, UnknownType): + if isinstance(lhs.field_type, (ListType, MapType, StructType)): + error_msg = f"PyArrow null type (UnknownType) cannot be promoted to non-primitive type {lhs.field_type}. UnknownType can only be promoted to primitive types (string, int, boolean, etc.) in V3+ tables." + else: + error_msg = f"PyArrow null type (UnknownType) cannot be promoted to {lhs.field_type}. This may be due to table format version limitations (V1/V2 tables don't support UnknownType promotion)." + self.rich_table.add_row("❌", str(lhs), f"{str(rhs)} - {error_msg}") + else: + self.rich_table.add_row("❌", str(lhs), str(rhs)) return False def schema(self, schema: Schema, struct_result: Callable[[], bool]) -> bool: From db62f03b8ab5e554b55163955cf5d404c6bef3d9 Mon Sep 17 00:00:00 2001 From: Kristofer Date: Wed, 20 Aug 2025 17:26:36 -0400 Subject: [PATCH 2/2] Add unit test and address comments --- pyiceberg/io/pyarrow.py | 4 +--- pyiceberg/schema.py | 8 ++++---- tests/test_schema.py | 31 +++++++++++++++++++++++++++++++ 3 files changed, 36 insertions(+), 7 deletions(-) diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index b49d58ffd3..304021af44 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -779,9 +779,7 @@ def visit_uuid(self, _: UUIDType) -> pa.DataType: return pa.uuid() def visit_unknown(self, _: UnknownType) -> pa.DataType: - """ - UnknownType can be promoted to any primitive type in V3+ tables per the Iceberg spec - """ + """Type `UnknownType` can be promoted to any primitive type in V3+ tables per the Iceberg spec.""" return pa.null() def visit_binary(self, _: BinaryType) -> pa.DataType: diff --git a/pyiceberg/schema.py b/pyiceberg/schema.py index 0f335c2a66..d9c2d7ddfc 100644 --- a/pyiceberg/schema.py +++ b/pyiceberg/schema.py @@ -1769,13 +1769,13 @@ def _is_field_compatible(self, lhs: NestedField) -> bool: promote(rhs.field_type, lhs.field_type) self.rich_table.add_row("✅", str(lhs), str(rhs)) return True - except ResolveError as e: + except ResolveError: # UnknownType can only be promoted to Primitive types if isinstance(rhs.field_type, UnknownType): - if isinstance(lhs.field_type, (ListType, MapType, StructType)): - error_msg = f"PyArrow null type (UnknownType) cannot be promoted to non-primitive type {lhs.field_type}. UnknownType can only be promoted to primitive types (string, int, boolean, etc.) in V3+ tables." + if not isinstance(lhs.field_type, PrimitiveType): + error_msg = f"Null type (UnknownType) cannot be promoted to non-primitive type {lhs.field_type}. UnknownType can only be promoted to primitive types (string, int, boolean, etc.) in V3+ tables." else: - error_msg = f"PyArrow null type (UnknownType) cannot be promoted to {lhs.field_type}. This may be due to table format version limitations (V1/V2 tables don't support UnknownType promotion)." + error_msg = f"Null type (UnknownType) cannot be promoted to {lhs.field_type}. This may be due to table format version limitations (V1/V2 tables don't support UnknownType promotion)." self.rich_table.add_row("❌", str(lhs), f"{str(rhs)} - {error_msg}") else: self.rich_table.add_row("❌", str(lhs), str(rhs)) diff --git a/tests/test_schema.py b/tests/test_schema.py index 3ca74c4027..ed8f8622dc 100644 --- a/tests/test_schema.py +++ b/tests/test_schema.py @@ -54,6 +54,7 @@ TimestampType, TimestamptzType, TimeType, + UnknownType, UUIDType, ) @@ -919,6 +920,36 @@ def test_promotion(file_type: IcebergType, read_type: IcebergType) -> None: promote(file_type, read_type) +def test_unknown_type_promotion_to_primitive() -> None: + """Test that UnknownType can be promoted to primitive types (V3+ behavior)""" + unknown_type = UnknownType() + + assert promote(unknown_type, StringType()) == StringType() + assert promote(unknown_type, IntegerType()) == IntegerType() + assert promote(unknown_type, BooleanType()) == BooleanType() + assert promote(unknown_type, FloatType()) == FloatType() + + +def test_unknown_type_promotion_to_non_primitive_raises_resolve_error() -> None: + """Test that UnknownType cannot be promoted to non-primitive types and raises ResolveError""" + unknown_type = UnknownType() + + with pytest.raises(ResolveError) as exc_info: + promote(unknown_type, ListType(element_id=1, element_type=StringType(), element_required=False)) + + assert "Cannot promote unknown to list" in str(exc_info.value) + + with pytest.raises(ResolveError) as exc_info: + promote(unknown_type, MapType(key_id=1, key_type=StringType(), value_id=2, value_type=StringType(), value_required=False)) + + assert "Cannot promote unknown to map" in str(exc_info.value) + + with pytest.raises(ResolveError) as exc_info: + promote(unknown_type, StructType(NestedField(field_id=1, name="field", field_type=StringType(), required=False))) + + assert "Cannot promote unknown to struct<1: field: optional string>" in str(exc_info.value) + + @pytest.fixture() def primitive_fields() -> List[NestedField]: return [