From 35cf34a90cbd83ea7597106e7f20004e8b4908a6 Mon Sep 17 00:00:00 2001
From: Louis Santos
Date: Thu, 26 Jun 2025 14:13:13 -0700
Subject: [PATCH 1/2] Adding promotion for UnknownType per V3+ spec

---
 pyiceberg/schema.py              |  3 +++
 tests/io/test_pyarrow_visitor.py | 23 +++++++++++++++++++++++
 tests/test_schema.py             |  4 ++++
 3 files changed, 30 insertions(+)

diff --git a/pyiceberg/schema.py b/pyiceberg/schema.py
index 6333ace6e2..6ad43439f1 100644
--- a/pyiceberg/schema.py
+++ b/pyiceberg/schema.py
@@ -1617,6 +1617,9 @@ def promote(file_type: IcebergType, read_type: IcebergType) -> IcebergType:
     else:
         raise ResolveError(f"Cannot promote {file_type} to {read_type}")
 
+@promote.register(UnknownType)
+def _(file_type: UnknownType, read_type: IcebergType) -> IcebergType:
+    return read_type # Per V3 Spec, "Unknown" can be promoted to any type
 
 @promote.register(IntegerType)
 def _(file_type: IntegerType, read_type: IcebergType) -> IcebergType:
diff --git a/tests/io/test_pyarrow_visitor.py b/tests/io/test_pyarrow_visitor.py
index 9d5772d01c..0611c1ef21 100644
--- a/tests/io/test_pyarrow_visitor.py
+++ b/tests/io/test_pyarrow_visitor.py
@@ -34,6 +34,7 @@
 from pyiceberg.expressions.literals import literal
 from pyiceberg.io.pyarrow import (
     UnsupportedPyArrowTypeException,
+    _check_pyarrow_schema_compatible,
     _ConvertToArrowSchema,
     _ConvertToIceberg,
     _ConvertToIcebergWithoutIDs,
@@ -313,6 +314,28 @@ def test_pyarrow_dictionary_encoded_type_to_iceberg(value_type: pa.DataType, exp
     assert visit_pyarrow(pyarrow_dict, _ConvertToIceberg()) == expected_result
 
 
+def test_schema_check_null_column(table_schema_simple: Schema) -> None:
+    pyarrow_schema: pa.Schema = schema_to_pyarrow(table_schema_simple)
+    new_field = pyarrow_schema.field(0).with_type(pa.null())  # Make the optional string field null for testing
+    pyarrow_schema = pyarrow_schema.set(0, new_field)
+    assert pyarrow_schema.field(0).type == pa.null()
+    _check_pyarrow_schema_compatible(table_schema_simple, pyarrow_schema)
+
+
+def test_schema_conversion_null_column(table_schema_simple: Schema) -> None:
+    pyarrow_schema: pa.Schema = schema_to_pyarrow(table_schema_simple)
+    new_field = pyarrow_schema.field(2).with_type(pa.null())  # Make the optional boolean field null for testing
+    pyarrow_schema = pyarrow_schema.set(2, new_field)
+    assert pyarrow_schema.field(2).type == pa.null()
+    actual = str(pyarrow_to_schema(pyarrow_schema))
+    expected = """table {
+  1: foo: optional string
+  2: bar: required int
+  3: baz: optional unknown
+}"""
+    assert actual == expected
+
+
 def test_round_schema_conversion_simple(table_schema_simple: Schema) -> None:
     actual = str(pyarrow_to_schema(schema_to_pyarrow(table_schema_simple)))
     expected = """table {
diff --git a/tests/test_schema.py b/tests/test_schema.py
index 3ca74c4027..87c0ab0e0e 100644
--- a/tests/test_schema.py
+++ b/tests/test_schema.py
@@ -55,6 +55,7 @@
     TimestamptzType,
     TimeType,
     UUIDType,
+    UnknownType
 )
 
 TEST_PRIMITIVE_TYPES = [
@@ -74,6 +75,7 @@
     FixedType(16),
     FixedType(20),
     UUIDType(),
+    UnknownType()
 ]
 
 
@@ -855,6 +857,8 @@ def should_promote(file_type: IcebergType, read_type: IcebergType) -> bool:
         return file_type.precision <= read_type.precision and file_type.scale == file_type.scale
     if isinstance(file_type, FixedType) and isinstance(read_type, UUIDType) and len(file_type) == 16:
         return True
+    if isinstance(file_type, UnknownType):
+        return True
     return False
 
 

From 531e526997d5f82b6fbb2fbcd47072c1fa6309de Mon Sep 17 00:00:00 2001
From: Louis Santos
Date: Wed, 9 Jul 2025 13:15:29 -0700
Subject: [PATCH 2/2] fix linting error

---
 pyiceberg/schema.py  | 4 +++-
 tests/test_schema.py | 4 ++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/pyiceberg/schema.py b/pyiceberg/schema.py
index 6ad43439f1..0da9677b50 100644
--- a/pyiceberg/schema.py
+++ b/pyiceberg/schema.py
@@ -1617,9 +1617,11 @@ def promote(file_type: IcebergType, read_type: IcebergType) -> IcebergType:
     else:
         raise ResolveError(f"Cannot promote {file_type} to {read_type}")
 
+
 @promote.register(UnknownType)
 def _(file_type: UnknownType, read_type: IcebergType) -> IcebergType:
-    return read_type # Per V3 Spec, "Unknown" can be promoted to any type
+    return read_type  # Per V3 Spec, "Unknown" can be promoted to any type
+
 
 @promote.register(IntegerType)
 def _(file_type: IntegerType, read_type: IcebergType) -> IcebergType:
diff --git a/tests/test_schema.py b/tests/test_schema.py
index 87c0ab0e0e..248ae9c97a 100644
--- a/tests/test_schema.py
+++ b/tests/test_schema.py
@@ -54,8 +54,8 @@
     TimestampType,
     TimestamptzType,
     TimeType,
+    UnknownType,
     UUIDType,
-    UnknownType
 )
 
 TEST_PRIMITIVE_TYPES = [
@@ -75,7 +75,7 @@
     FixedType(16),
     FixedType(20),
     UUIDType(),
-    UnknownType()
+    UnknownType(),
 ]