From 2de947a1f5c671d795e07e7326deca3f64bf22ff Mon Sep 17 00:00:00 2001 From: gmweaver Date: Wed, 24 Sep 2025 15:20:56 -0700 Subject: [PATCH 1/5] fix checking physical type for Decimal to handle INT32/INT64 storage based on size --- pyiceberg/io/pyarrow.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index b6ad5659b1..a90f18e066 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -2089,6 +2089,9 @@ def __init__(self, iceberg_type: PrimitiveType, physical_type_string: str, trunc physical_type_string == "FLOAT" and expected_physical_type == "DOUBLE" ): pass + # Allow DECIMAL to be stored as FIXED_LEN_BYTE_ARRAY + if (physical_type_string == "FIXED_LEN_BYTE_ARRAY" and expected_physical_type in ("INT32", "INT64")): + pass else: raise ValueError( f"Unexpected physical type {physical_type_string} for {iceberg_type}, expected {expected_physical_type}" From a30b7e9c16c20a6d17ae47080efa94d2ed4f3802 Mon Sep 17 00:00:00 2001 From: gmweaver Date: Thu, 25 Sep 2025 09:57:29 -0700 Subject: [PATCH 2/5] add test --- pyiceberg/io/pyarrow.py | 2 +- tests/io/test_pyarrow.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index a90f18e066..cabad25318 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -2090,7 +2090,7 @@ def __init__(self, iceberg_type: PrimitiveType, physical_type_string: str, trunc ): pass # Allow DECIMAL to be stored as FIXED_LEN_BYTE_ARRAY - if (physical_type_string == "FIXED_LEN_BYTE_ARRAY" and expected_physical_type in ("INT32", "INT64")): + elif physical_type_string == "FIXED_LEN_BYTE_ARRAY" and expected_physical_type in ("INT32", "INT64"): pass else: raise ValueError( diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py index 09cd2421ea..6e90e61573 100644 --- a/tests/io/test_pyarrow.py +++ b/tests/io/test_pyarrow.py @@ -2188,6 +2188,36 @@ def test_stats_aggregator_update_max(vals: List[Any], primitive_type: PrimitiveT assert stats.current_max == expected_result +@pytest.mark.parametrize( + "iceberg_type, physical_type_string, should_succeed", + [ + # Exact match + (IntegerType(), "INT32", True), + # Allowed INT32 -> INT64 promotion + (LongType(), "INT32", True), + # Allowed FLOAT -> DOUBLE promotion + (DoubleType(), "FLOAT", True), + # Allowed FIXED_LEN_BYTE_ARRAY -> INT32 + (DecimalType(precision=2, scale=2), "FIXED_LEN_BYTE_ARRAY", True), + # Allowed FIXED_LEN_BYTE_ARRAY -> INT64 + (DecimalType(precision=12, scale=2), "FIXED_LEN_BYTE_ARRAY", True), + # Fail case: INT64 cannot be cast to INT32 + (IntegerType(), "INT64", False), + ], +) +def test_stats_aggregator_conditionally_allowed_types( + iceberg_type: PrimitiveType, physical_type_string: str, should_succeed: bool +) -> None: + if should_succeed: + stats = StatsAggregator(iceberg_type, physical_type_string) + assert stats.primitive_type == iceberg_type + assert stats.current_min is None + assert stats.current_max is None + else: + with pytest.raises(ValueError, match="Unexpected physical type"): + StatsAggregator(iceberg_type, physical_type_string) + + def test_bin_pack_arrow_table(arrow_table_with_null: pa.Table) -> None: # default packs to 1 bin since the table is small bin_packed = bin_pack_arrow_table( From fe09b2f4a5182545f302c458aa186462468b3c59 Mon Sep 17 00:00:00 2001 From: gmweaver Date: Fri, 26 Sep 2025 14:43:42 -0700 Subject: [PATCH 3/5] comments --- tests/io/test_pyarrow.py | 41 +++++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py index 6e90e61573..114c63a670 100644 --- a/tests/io/test_pyarrow.py +++ b/tests/io/test_pyarrow.py @@ -2189,33 +2189,40 @@ def test_stats_aggregator_update_max(vals: List[Any], primitive_type: PrimitiveT @pytest.mark.parametrize( - "iceberg_type, physical_type_string, should_succeed", + "iceberg_type, physical_type_string", [ # Exact match - (IntegerType(), "INT32", True), + (IntegerType(), "INT32"), # Allowed INT32 -> INT64 promotion - (LongType(), "INT32", True), + (LongType(), "INT32"), # Allowed FLOAT -> DOUBLE promotion - (DoubleType(), "FLOAT", True), + (DoubleType(), "FLOAT"), # Allowed FIXED_LEN_BYTE_ARRAY -> INT32 - (DecimalType(precision=2, scale=2), "FIXED_LEN_BYTE_ARRAY", True), + (DecimalType(precision=2, scale=2), "FIXED_LEN_BYTE_ARRAY"), # Allowed FIXED_LEN_BYTE_ARRAY -> INT64 - (DecimalType(precision=12, scale=2), "FIXED_LEN_BYTE_ARRAY", True), + (DecimalType(precision=12, scale=2), "FIXED_LEN_BYTE_ARRAY"), + ], +) +def test_stats_aggregator_conditionally_allowed_types_pass(iceberg_type: PrimitiveType, physical_type_string: str) -> None: + stats = StatsAggregator(iceberg_type, physical_type_string) + + assert stats.primitive_type == iceberg_type + assert stats.current_min is None + assert stats.current_max is None + + +@pytest.mark.parametrize( + "iceberg_type, physical_type_string", + [ # Fail case: INT64 cannot be cast to INT32 - (IntegerType(), "INT64", False), + (IntegerType(), "INT64"), ], ) -def test_stats_aggregator_conditionally_allowed_types( - iceberg_type: PrimitiveType, physical_type_string: str, should_succeed: bool +def test_stats_aggregator_physical_type_does_not_match_expected_raise_error( + iceberg_type: PrimitiveType, physical_type_string: str ) -> None: - if should_succeed: - stats = StatsAggregator(iceberg_type, physical_type_string) - assert stats.primitive_type == iceberg_type - assert stats.current_min is None - assert stats.current_max is None - else: - with pytest.raises(ValueError, match="Unexpected physical type"): - StatsAggregator(iceberg_type, physical_type_string) + with pytest.raises(ValueError, match="Unexpected physical type"): + StatsAggregator(iceberg_type, physical_type_string) def test_bin_pack_arrow_table(arrow_table_with_null: pa.Table) -> None: From fa6ae2942251a8f9c59efa62162654838bd747ce Mon Sep 17 00:00:00 2001 From: gmweaver Date: Fri, 26 Sep 2025 14:48:46 -0700 Subject: [PATCH 4/5] add TODO --- pyiceberg/io/pyarrow.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index cabad25318..b8834f03c9 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -2082,6 +2082,8 @@ def __init__(self, iceberg_type: PrimitiveType, physical_type_string: str, trunc self.trunc_length = trunc_length expected_physical_type = _primitive_to_physical(iceberg_type) + + # TODO: Refactor to use promotion logic if expected_physical_type != physical_type_string: # Allow promotable physical types # INT32 -> INT64 and FLOAT -> DOUBLE are safe type casts @@ -2509,12 +2511,16 @@ def data_file_statistics_from_parquet_metadata( if isinstance(stats_col.iceberg_type, DecimalType) and statistics.physical_type != "FIXED_LEN_BYTE_ARRAY": scale = stats_col.iceberg_type.scale - col_aggs[field_id].update_min( - unscaled_to_decimal(statistics.min_raw, scale) - ) if statistics.min_raw is not None else None - col_aggs[field_id].update_max( - unscaled_to_decimal(statistics.max_raw, scale) - ) if statistics.max_raw is not None else None + ( + col_aggs[field_id].update_min(unscaled_to_decimal(statistics.min_raw, scale)) + if statistics.min_raw is not None + else None + ) + ( + col_aggs[field_id].update_max(unscaled_to_decimal(statistics.max_raw, scale)) + if statistics.max_raw is not None + else None + ) else: col_aggs[field_id].update_min(statistics.min) col_aggs[field_id].update_max(statistics.max) From ccb110858862f49c62a6b6ba0afd57cdb8a417f8 Mon Sep 17 00:00:00 2001 From: gmweaver Date: Fri, 26 Sep 2025 14:49:17 -0700 Subject: [PATCH 5/5] fix comment --- pyiceberg/io/pyarrow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index b8834f03c9..e42c130779 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -2091,7 +2091,7 @@ def __init__(self, iceberg_type: PrimitiveType, physical_type_string: str, trunc physical_type_string == "FLOAT" and expected_physical_type == "DOUBLE" ): pass - # Allow DECIMAL to be stored as FIXED_LEN_BYTE_ARRAY + # Allow DECIMAL to be stored as FIXED_LEN_BYTE_ARRAY, INT32 or INT64 elif physical_type_string == "FIXED_LEN_BYTE_ARRAY" and expected_physical_type in ("INT32", "INT64"): pass else: