Skip to content

Commit 245a1ef

Browse files
committed
Add simple read/write test cases for the UUID partition transform
1 parent bedf777 commit 245a1ef

File tree

2 files changed

+93
-2
lines changed

2 files changed

+93
-2
lines changed

tests/integration/test_writes/test_writes.py

Lines changed: 56 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
import os
2020
import random
2121
import time
22+
import uuid
2223
from datetime import date, datetime, timedelta
2324
from decimal import Decimal
2425
from pathlib import Path
@@ -48,7 +49,7 @@
4849
from pyiceberg.schema import Schema
4950
from pyiceberg.table import TableProperties
5051
from pyiceberg.table.sorting import SortDirection, SortField, SortOrder
51-
from pyiceberg.transforms import DayTransform, HourTransform, IdentityTransform
52+
from pyiceberg.transforms import BucketTransform, DayTransform, HourTransform, IdentityTransform
5253
from pyiceberg.types import (
5354
DateType,
5455
DecimalType,
@@ -58,6 +59,7 @@
5859
LongType,
5960
NestedField,
6061
StringType,
62+
UUIDType,
6163
)
6264
from utils import _create_table
6365

@@ -1841,3 +1843,56 @@ def test_read_write_decimals(session_catalog: Catalog) -> None:
18411843
tbl.append(arrow_table)
18421844

18431845
assert tbl.scan().to_arrow() == arrow_table
1846+
1847+
1848+
@pytest.mark.integration
def test_read_write_uuids_partitioned(session_catalog: Catalog) -> None:
    """Test reading and writing partitioned UUID data with the supported transforms.

    Covers the two transforms that support UUID source columns:
    - BucketTransform
    - IdentityTransform
    """
    identifier = "default.test_read_write_uuids"
    # The trailing None exercises the null-value path for both partition transforms.
    uuids = [
        uuid.UUID("ec9b663b-062f-4200-a130-8de19c21b800").bytes,
        uuid.UUID("5f473c64-dbeb-449b-bdfa-b6b4185b1bde").bytes,
        None,
    ]

    # UUIDs are represented as 16-byte fixed-width binary in Arrow.
    arrow_table = pa.Table.from_pydict(
        {
            "uuid_1": pa.array(uuids, type=pa.binary(16)),
            "uuid_2": pa.array(uuids, type=pa.binary(16)),
        }
    )

    tbl = _create_table(
        session_catalog,
        identifier,
        properties={"format-version": 2},
        schema=Schema(
            NestedField(field_id=1, name="uuid_1", field_type=UUIDType(), required=False),
            NestedField(field_id=2, name="uuid_2", field_type=UUIDType(), required=False),
        ),
        partition_spec=PartitionSpec(
            PartitionField(source_id=1, field_id=1001, transform=BucketTransform(2), name="uuid_bucket"),
            # Fixed typo: partition field was previously named "uuid_indentity".
            PartitionField(source_id=2, field_id=1002, transform=IdentityTransform(), name="uuid_identity"),
        ),
    )

    tbl.append(arrow_table)
    assert tbl.scan().to_arrow() == arrow_table

    # Check BucketTransform partition filtering.
    assert tbl.scan(row_filter=f"uuid_1 == '{uuid.UUID(bytes=uuids[0])}'").to_arrow() == pa.Table.from_pydict(
        {
            "uuid_1": pa.array([uuids[0]], type=pa.binary(16)),
            "uuid_2": pa.array([uuids[0]], type=pa.binary(16)),
        }
    )
    # Check IdentityTransform partition filtering.
    assert tbl.scan(row_filter=f"uuid_2 == '{uuid.UUID(bytes=uuids[1])}'").to_arrow() == pa.Table.from_pydict(
        {
            "uuid_1": pa.array([uuids[1]], type=pa.binary(16)),
            "uuid_2": pa.array([uuids[1]], type=pa.binary(16)),
        }
    )

tests/table/test_partitioning.py

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,14 +21,15 @@
2121

2222
import pytest
2323

24-
from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionField, PartitionSpec
24+
from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionField, PartitionFieldValue, PartitionKey, PartitionSpec
2525
from pyiceberg.schema import Schema
2626
from pyiceberg.transforms import (
2727
BucketTransform,
2828
DayTransform,
2929
HourTransform,
3030
IdentityTransform,
3131
MonthTransform,
32+
Transform,
3233
TruncateTransform,
3334
YearTransform,
3435
)
@@ -217,6 +218,41 @@ def test_transform_consistency_with_pyarrow_transform(source_type: PrimitiveType
217218
raise
218219

219220

221+
@pytest.mark.parametrize(
    "source_type, _transform, input_value, expected_value",
    [
        # BucketTransform consumes the raw 16 bytes of the UUID.
        (UUIDType(), BucketTransform(2), UUID("ec9b663b-062f-4200-a130-8de19c21b800").bytes, 0),
        (
            UUIDType(),
            IdentityTransform(),
            UUID("ec9b663b-062f-4200-a130-8de19c21b800"),
            UUID("ec9b663b-062f-4200-a130-8de19c21b800"),
        ),
        # TruncateTransform cannot transform UUIDType, so the transform branch is skipped.
        (UUIDType(), TruncateTransform(1), UUID("ec9b663b-062f-4200-a130-8de19c21b800").bytes, None),
    ],
)
def test_transform_uuid_partition_key(
    # NOTE: input_value is Any (not UUID): the bucket/truncate cases pass UUID(...).bytes,
    # while the identity case passes a UUID instance.
    source_type: PrimitiveType, _transform: Transform[Any, Any], input_value: Any, expected_value: Any
) -> None:
    """
    Tests that UUID values can be correctly transformed and used as partition keys with various transformation functions.
    """
    schema = Schema(NestedField(field_id=1, name="uuid", field_type=source_type, required=True))
    partition_field = PartitionField(source_id=1, field_id=1001, transform=_transform, name="uuid_partition")
    spec = PartitionSpec(partition_field)

    if _transform.can_transform(source_type):
        transformer = _transform.transform(source=source_type)

        value = transformer(input_value)
        assert value == expected_value

        partition_field_value = PartitionFieldValue(field=partition_field, value=value)
        partition_key = PartitionKey(field_values=[partition_field_value], partition_spec=spec, schema=schema)
        assert partition_key.field_values[0].field == partition_field
        assert partition_key.field_values[0].value == expected_value
254+
255+
220256
def test_deserialize_partition_field_v2() -> None:
221257
json_partition_spec = """{"source-id": 1, "field-id": 1000, "transform": "truncate[19]", "name": "str_truncate"}"""
222258

0 commit comments

Comments
 (0)