Skip to content

Commit 950fc71

Browse files
authored
Document null field handling for PyArrow (#2365)
<!-- Thanks for opening a pull request! -->
<!-- In the case this PR will resolve an issue, please replace ${GITHUB_ISSUE_ID} below with the actual Github issue id. -->
<!-- Closes #${2119} -->

Closes #2119

# Rationale for this change

Document how null fields are handled for PyArrow.

# Are these changes tested?

NA

# Are there any user-facing changes?

NA

<!-- In the case of user-facing changes, please add the changelog label. -->
1 parent 5a781df commit 950fc71

File tree

3 files changed

+52
-1
lines changed

3 files changed

+52
-1
lines changed

pyiceberg/io/pyarrow.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -779,6 +779,7 @@ def visit_uuid(self, _: UUIDType) -> pa.DataType:
779779
return pa.uuid()
780780

781781
def visit_unknown(self, _: UnknownType) -> pa.DataType:
    """Map an Iceberg `UnknownType` to the PyArrow null type (`pa.null()`).

    Per the Iceberg spec, `UnknownType` can be promoted to any primitive type in V3+ tables.
    """
    return pa.null()
783784

784785
def visit_binary(self, _: BinaryType) -> pa.DataType:
@@ -1358,6 +1359,8 @@ def primitive(self, primitive: pa.DataType) -> PrimitiveType:
13581359
primitive = cast(pa.FixedSizeBinaryType, primitive)
13591360
return FixedType(primitive.byte_width)
13601361
elif pa.types.is_null(primitive):
1362+
# PyArrow null type (pa.null()) is converted to Iceberg UnknownType
1363+
# UnknownType can be promoted to any primitive type in V3+ tables per the Iceberg spec
13611364
return UnknownType()
13621365
elif isinstance(primitive, pa.UuidType):
13631366
return UUIDType()

pyiceberg/schema.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1692,6 +1692,15 @@ def _(file_type: FixedType, read_type: IcebergType) -> IcebergType:
16921692
raise ResolveError(f"Cannot promote {file_type} to {read_type}")
16931693

16941694

1695+
@promote.register(UnknownType)
def _(file_type: UnknownType, read_type: IcebergType) -> IcebergType:
    """Resolve the promotion of an `UnknownType` file field to the requested read type.

    Per the V3 spec, "Unknown" can be promoted to any primitive type.

    Args:
        file_type: The unknown type of the field as written.
        read_type: The type the field is requested to be read as.

    Returns:
        `read_type` itself, when it is a primitive type.

    Raises:
        ResolveError: If `read_type` is not a primitive type.
    """
    # Guard clause: anything that is not a primitive type is rejected up front.
    if not isinstance(read_type, PrimitiveType):
        raise ResolveError(f"Cannot promote {file_type} to {read_type}")
    return read_type
1702+
1703+
16951704
def _check_schema_compatible(requested_schema: Schema, provided_schema: Schema) -> None:
16961705
"""
16971706
Check if the `provided_schema` is compatible with `requested_schema`.
@@ -1761,7 +1770,15 @@ def _is_field_compatible(self, lhs: NestedField) -> bool:
17611770
self.rich_table.add_row("✅", str(lhs), str(rhs))
17621771
return True
17631772
except ResolveError:
1764-
self.rich_table.add_row("❌", str(lhs), str(rhs))
1773+
# UnknownType can only be promoted to Primitive types
1774+
if isinstance(rhs.field_type, UnknownType):
1775+
if not isinstance(lhs.field_type, PrimitiveType):
1776+
error_msg = f"Null type (UnknownType) cannot be promoted to non-primitive type {lhs.field_type}. UnknownType can only be promoted to primitive types (string, int, boolean, etc.) in V3+ tables."
1777+
else:
1778+
error_msg = f"Null type (UnknownType) cannot be promoted to {lhs.field_type}. This may be due to table format version limitations (V1/V2 tables don't support UnknownType promotion)."
1779+
self.rich_table.add_row("❌", str(lhs), f"{str(rhs)} - {error_msg}")
1780+
else:
1781+
self.rich_table.add_row("❌", str(lhs), str(rhs))
17651782
return False
17661783

17671784
def schema(self, schema: Schema, struct_result: Callable[[], bool]) -> bool:

tests/test_schema.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@
5555
TimestampType,
5656
TimestamptzType,
5757
TimeType,
58+
UnknownType,
5859
UUIDType,
5960
)
6061

@@ -920,6 +921,36 @@ def test_promotion(file_type: IcebergType, read_type: IcebergType) -> None:
920921
promote(file_type, read_type)
921922

922923

924+
def test_unknown_type_promotion_to_primitive() -> None:
    """Check that promoting UnknownType to a primitive type yields that primitive type (V3+ behavior)."""
    source_type = UnknownType()

    # Same four primitive targets as before, checked in the same order.
    for target_type in (StringType(), IntegerType(), BooleanType(), FloatType()):
        assert promote(source_type, target_type) == target_type
932+
933+
934+
def test_unknown_type_promotion_to_non_primitive_raises_resolve_error() -> None:
    """Check that promoting UnknownType to a list, map, or struct raises ResolveError."""
    source_type = UnknownType()

    # Each entry pairs a non-primitive target with the message the ResolveError must contain.
    rejected_targets = [
        (
            ListType(element_id=1, element_type=StringType(), element_required=False),
            "Cannot promote unknown to list<string>",
        ),
        (
            MapType(key_id=1, key_type=StringType(), value_id=2, value_type=StringType(), value_required=False),
            "Cannot promote unknown to map<string, string>",
        ),
        (
            StructType(NestedField(field_id=1, name="field", field_type=StringType(), required=False)),
            "Cannot promote unknown to struct<1: field: optional string>",
        ),
    ]

    for target_type, expected_message in rejected_targets:
        with pytest.raises(ResolveError) as exc_info:
            promote(source_type, target_type)

        assert expected_message in str(exc_info.value)
952+
953+
923954
@pytest.fixture()
924955
def primitive_fields() -> List[NestedField]:
925956
return [

0 commit comments

Comments
 (0)