Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion pyiceberg/io/pyarrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -636,6 +636,12 @@ def visit_fixed(self, fixed_type: FixedType) -> pa.DataType:
return pa.binary(len(fixed_type))

def visit_decimal(self, decimal_type: DecimalType) -> pa.DataType:
# It looks like decimal{32,64} is not fully implemented:
# https://github.com/apache/arrow/issues/25483
# https://github.com/apache/arrow/issues/43956
# However, if we keep it as 128 in memory, and based on the
# precision/scale Arrow will map it to INT{32,64}
# https://github.com/apache/arrow/blob/598938711a8376cbfdceaf5c77ab0fd5057e6c02/cpp/src/parquet/arrow/schema.cc#L380-L392
return pa.decimal128(decimal_type.precision, decimal_type.scale)

def visit_boolean(self, _: BooleanType) -> pa.DataType:
Expand Down Expand Up @@ -2442,7 +2448,9 @@ def write_parquet(task: WriteTask) -> DataFile:
)
fo = io.new_output(file_path)
with fo.create(overwrite=True) as fos:
with pq.ParquetWriter(fos, schema=arrow_table.schema, **parquet_writer_kwargs) as writer:
with pq.ParquetWriter(
fos, schema=arrow_table.schema, store_decimal_as_integer=True, **parquet_writer_kwargs
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"""
By default, this is DISABLED and all decimal types annotate fixed_len_byte_array. When enabled, the writer will use the following physical types to store decimals:

  • int32: for 1 <= precision <= 9.
  • int64: for 10 <= precision <= 18.
  • fixed_len_byte_array: for precision > 18.
    """

from https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html
Screenshot 2025-05-09 at 8 35 41 AM

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this matches the parquet data type mapping for decimal
https://iceberg.apache.org/spec/#parquet

Screenshot 2025-05-09 at 8 36 40 AM

) as writer:
writer.write(arrow_table, row_group_size=row_group_size)
statistics = data_file_statistics_from_parquet_metadata(
parquet_metadata=writer.writer.metadata,
Expand Down
31 changes: 31 additions & 0 deletions tests/integration/test_writes/test_writes.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import random
import time
from datetime import date, datetime, timedelta
from decimal import Decimal
from pathlib import Path
from typing import Any, Dict
from urllib.parse import urlparse
Expand Down Expand Up @@ -50,6 +51,7 @@
from pyiceberg.transforms import DayTransform, HourTransform, IdentityTransform
from pyiceberg.types import (
DateType,
DecimalType,
DoubleType,
IntegerType,
ListType,
Expand Down Expand Up @@ -1810,3 +1812,32 @@ def test_evolve_and_write(
)

assert session_catalog.load_table(identifier).scan().to_arrow().column(0).combine_chunks() == numbers


@pytest.mark.integration
def test_read_write_decimals(session_catalog: Catalog) -> None:
    """Roundtrip decimal types to make sure that we correctly write them as ints"""
    identifier = "default.test_read_write_decimals"

    # One column per physical-type bucket of the Parquet decimal mapping:
    # precision 8 -> int32, precision 16 -> int64, precision 19 -> fixed_len_byte_array.
    column_specs = {
        "decimal8": (8, 2, [Decimal("123.45"), Decimal("678.91")]),
        "decimal16": (16, 6, [Decimal("12345679.123456"), Decimal("67891234.678912")]),
        "decimal19": (19, 6, [Decimal("1234567890123.123456"), Decimal("9876543210703.654321")]),
    }

    arrow_table = pa.Table.from_pydict(
        {
            name: pa.array(values, pa.decimal128(precision, scale))
            for name, (precision, scale, values) in column_specs.items()
        },
    )

    iceberg_schema = Schema(
        *(
            NestedField(field_id, name, DecimalType(precision, scale))
            for field_id, (name, (precision, scale, _)) in enumerate(column_specs.items(), start=1)
        ),
    )

    tbl = _create_table(
        session_catalog,
        identifier,
        properties={"format-version": 2},
        schema=iceberg_schema,
    )

    tbl.append(arrow_table)

    # Reading back must yield exactly what was written.
    assert tbl.scan().to_arrow() == arrow_table