@promote.register(UnknownType)
def _(file_type: UnknownType, read_type: IcebergType) -> IcebergType:
    """Resolve the promotion of an ``unknown`` file type to the requested read type.

    Args:
        file_type: The UnknownType instance being promoted.
        read_type: The type the reader asked for.

    Returns:
        ``read_type`` itself, when it is a primitive type.

    Raises:
        ResolveError: If ``read_type`` is not a PrimitiveType.
    """
    # Reject anything that is not a primitive target up front; only primitive
    # targets are accepted as a promotion of "unknown" here.
    if not isinstance(read_type, PrimitiveType):
        raise ResolveError(f"Cannot promote {file_type} to {read_type}")
    return read_type
def test_unknown_type_promotion_to_primitive() -> None:
    """UnknownType promoted to a primitive type yields that primitive type (V3+ behavior)."""
    source = UnknownType()

    # Each sampled primitive target must come back unchanged from promote().
    for target in (StringType(), IntegerType(), BooleanType(), FloatType()):
        assert promote(source, target) == target


def test_unknown_type_promotion_to_non_primitive_raises_resolve_error() -> None:
    """UnknownType promoted to a list, map, or struct raises ResolveError naming the target."""
    source = UnknownType()

    # (non-primitive target, substring expected in the ResolveError message)
    rejected_targets = (
        (
            ListType(element_id=1, element_type=StringType(), element_required=False),
            "Cannot promote unknown to list",
        ),
        (
            MapType(key_id=1, key_type=StringType(), value_id=2, value_type=StringType(), value_required=False),
            "Cannot promote unknown to map",
        ),
        (
            StructType(NestedField(field_id=1, name="field", field_type=StringType(), required=False)),
            "Cannot promote unknown to struct<1: field: optional string>",
        ),
    )

    for target, expected_message in rejected_targets:
        with pytest.raises(ResolveError) as exc_info:
            promote(source, target)

        assert expected_message in str(exc_info.value)