From 580c986fd7a61f389e0a6e5568ce62bf8e0259db Mon Sep 17 00:00:00 2001 From: redpheonixx Date: Mon, 17 Mar 2025 21:57:57 +0530 Subject: [PATCH] handle decimal physical type mapping --- pyiceberg/io/pyarrow.py | 24 +++++++++++++++++++----- tests/io/test_pyarrow_stats.py | 28 ++++++++++++++++++++++------ 2 files changed, 41 insertions(+), 11 deletions(-) diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index d9f84a42ba..890412f131 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -175,6 +175,7 @@ from pyiceberg.utils.properties import get_first_property_value, property_as_bool, property_as_int from pyiceberg.utils.singleton import Singleton from pyiceberg.utils.truncate import truncate_upper_bound_binary_string, truncate_upper_bound_text_string +from decimal import Decimal, Context if TYPE_CHECKING: from pyiceberg.table import FileScanTask, WriteTask @@ -194,7 +195,7 @@ UTC_ALIASES = {"UTC", "+00:00", "Etc/UTC", "Z"} T = TypeVar("T") - +DECIMAL_REGEX = re.compile(r"decimal\((\d+),\s*(\d+)\)") @lru_cache def _cached_resolve_s3_region(bucket: str) -> Optional[str]: @@ -1868,7 +1869,11 @@ def visit_fixed(self, fixed_type: FixedType) -> str: return "FIXED_LEN_BYTE_ARRAY" def visit_decimal(self, decimal_type: DecimalType) -> str: - return "FIXED_LEN_BYTE_ARRAY" + return ( + "INT32" if decimal_type.precision <= 9 + else "INT64" if decimal_type.precision <= 18 + else "FIXED_LEN_BYTE_ARRAY" + ) def visit_boolean(self, boolean_type: BooleanType) -> str: return "BOOLEAN" @@ -2335,9 +2340,18 @@ def data_file_statistics_from_parquet_metadata( col_aggs[field_id] = StatsAggregator( stats_col.iceberg_type, statistics.physical_type, stats_col.mode.length ) - - col_aggs[field_id].update_min(statistics.min) - col_aggs[field_id].update_max(statistics.max) + matches=DECIMAL_REGEX.search(str(stats_col.iceberg_type)) + if matches and statistics.physical_type != "FIXED_LEN_BYTE_ARRAY": + precision=int(matches.group(1)) + scale=int(matches.group(2)) +
local_context = Context(prec=precision) + decoded_min = local_context.create_decimal(Decimal(statistics.min_raw)/ (10 ** scale)) + decoded_max = local_context.create_decimal(Decimal(statistics.max_raw)/ (10 ** scale)) + col_aggs[field_id].update_min(decoded_min) + col_aggs[field_id].update_max(decoded_max) + else: + col_aggs[field_id].update_min(statistics.min) + col_aggs[field_id].update_max(statistics.max) except pyarrow.lib.ArrowNotImplementedError as e: invalidate_col.add(field_id) diff --git a/tests/io/test_pyarrow_stats.py b/tests/io/test_pyarrow_stats.py index 788891711e..00bc8137cc 100644 --- a/tests/io/test_pyarrow_stats.py +++ b/tests/io/test_pyarrow_stats.py @@ -72,7 +72,7 @@ StringType, ) from pyiceberg.utils.datetime import date_to_days, datetime_to_micros, time_to_micros - +from decimal import Decimal @dataclass(frozen=True) class TestStruct: @@ -446,6 +446,9 @@ def construct_test_table_primitive_types() -> Tuple[pq.FileMetaData, Union[Table {"id": 10, "name": "strings", "required": False, "type": "string"}, {"id": 11, "name": "uuids", "required": False, "type": "uuid"}, {"id": 12, "name": "binaries", "required": False, "type": "binary"}, + {"id": 13, "name": "decimal8", "required": False, "type": "decimal(8, 2)"}, + {"id": 14, "name": "decimal16", "required": False, "type": "decimal(16, 6)"}, + {"id": 15, "name": "decimal32", "required": False, "type": "decimal(20, 6)"}, ], }, ], @@ -470,6 +473,9 @@ def construct_test_table_primitive_types() -> Tuple[pq.FileMetaData, Union[Table strings = ["hello", "world"] uuids = [uuid.uuid3(uuid.NAMESPACE_DNS, "foo").bytes, uuid.uuid3(uuid.NAMESPACE_DNS, "bar").bytes] binaries = [b"hello", b"world"] + decimal8 = [Decimal("123.45"), Decimal("678.91")] + decimal16 = [Decimal("123456789.123456"), Decimal("678912345.678912")] + decimal32 = [Decimal("12345678901234.123456"), Decimal("98765432109870.654321")] table = pa.Table.from_pydict( { @@ -485,6 +491,9 @@ def construct_test_table_primitive_types() -> 
Tuple[pq.FileMetaData, Union[Table "strings": strings, "uuids": uuids, "binaries": binaries, + "decimal8": decimal8, + "decimal16": decimal16, + "decimal32": decimal32, }, schema=arrow_schema, ) @@ -492,7 +501,7 @@ def construct_test_table_primitive_types() -> Tuple[pq.FileMetaData, Union[Table metadata_collector: List[Any] = [] with pa.BufferOutputStream() as f: - with pq.ParquetWriter(f, table.schema, metadata_collector=metadata_collector) as writer: + with pq.ParquetWriter(f, table.schema, metadata_collector=metadata_collector, store_decimal_as_integer=True) as writer: writer.write_table(table) return metadata_collector[0], table_metadata @@ -510,13 +519,13 @@ def test_metrics_primitive_types() -> None: ) datafile = DataFile(**statistics.to_serialized_dict()) - assert len(datafile.value_counts) == 12 - assert len(datafile.null_value_counts) == 12 + assert len(datafile.value_counts) == 15 + assert len(datafile.null_value_counts) == 15 assert len(datafile.nan_value_counts) == 0 tz = timezone(timedelta(seconds=19800)) - assert len(datafile.lower_bounds) == 12 + assert len(datafile.lower_bounds) == 15 assert datafile.lower_bounds[1] == STRUCT_BOOL.pack(False) assert datafile.lower_bounds[2] == STRUCT_INT32.pack(23) assert datafile.lower_bounds[3] == STRUCT_INT64.pack(2) @@ -529,8 +538,12 @@ def test_metrics_primitive_types() -> None: assert datafile.lower_bounds[10] == b"he" assert datafile.lower_bounds[11] == uuid.uuid3(uuid.NAMESPACE_DNS, "foo").bytes assert datafile.lower_bounds[12] == b"he" + assert int.from_bytes(datafile.lower_bounds[13], byteorder="big", signed=True) == int(12345) + assert int.from_bytes(datafile.lower_bounds[14], byteorder="big", signed=True) == int(123456789123456) + assert int.from_bytes(datafile.lower_bounds[15], byteorder="big", signed=True) == int(12345678901234123456) + - assert len(datafile.upper_bounds) == 12 + assert len(datafile.upper_bounds) == 15 assert datafile.upper_bounds[1] == STRUCT_BOOL.pack(True) assert 
datafile.upper_bounds[2] == STRUCT_INT32.pack(89) assert datafile.upper_bounds[3] == STRUCT_INT64.pack(54) @@ -543,6 +556,9 @@ def test_metrics_primitive_types() -> None: assert datafile.upper_bounds[10] == b"wp" assert datafile.upper_bounds[11] == uuid.uuid3(uuid.NAMESPACE_DNS, "bar").bytes assert datafile.upper_bounds[12] == b"wp" + assert int.from_bytes(datafile.upper_bounds[13], byteorder="big", signed=True) == int(67891) + assert int.from_bytes(datafile.upper_bounds[14], byteorder="big", signed=True) == int(678912345678912) + assert int.from_bytes(datafile.upper_bounds[15], byteorder="big", signed=True) == int(98765432109870654321) def construct_test_table_invalid_upper_bound() -> Tuple[pq.FileMetaData, Union[TableMetadataV1, TableMetadataV2]]: