Skip to content

Commit 665d543

Browse files
committed
allow field names with dots and add test
1 parent cac298b commit 665d543

File tree

2 files changed

+46
-6
lines changed

2 files changed

+46
-6
lines changed

pyiceberg/io/pyarrow.py

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2770,19 +2770,29 @@ def _determine_partitions(spec: PartitionSpec, schema: Schema, arrow_table: pa.T
27702770

27712771

27722772
def _get_field_from_arrow_table(arrow_table: pa.Table, field_path: str) -> pa.Array:
    """Get a field from an Arrow table by literal column name or by nested field path.

    Resolution order:

    1. ``field_path`` is an exact top-level column name. This covers literal
       names that contain dots (e.g. ``"some.id"``).
    2. A dotted prefix of ``field_path`` is a top-level column and the rest of
       the path addresses children of that struct column (e.g. ``"bar.baz"``).
       Prefixes are tried shortest first, so ``"bar.baz"`` keeps resolving
       through column ``"bar"`` whenever that column exists; longer prefixes
       (e.g. column ``"a.b"`` for path ``"a.b.c"``) are only considered when
       the shorter ones are not columns.

    Args:
        arrow_table: The Arrow table containing the field.
        field_path: Field name or dot-separated path.

    Returns:
        The field as a PyArrow Array.

    Raises:
        KeyError: If no top-level column matches ``field_path`` or any of its
            dotted prefixes. Errors raised by ``pc.struct_field`` while
            navigating into the struct are propagated unchanged.
    """
    column_names = arrow_table.column_names

    # Exact column name match wins, so names containing literal dots work.
    if field_path in column_names:
        return arrow_table[field_path]

    # Otherwise split into "<struct column>.<child path>", trying the shortest
    # column prefix first to keep the established nested-path behaviour.
    path_parts = field_path.split(".")
    for split_at in range(1, len(path_parts)):
        column_name = ".".join(path_parts[:split_at])
        if column_name in column_names:
            # Navigate into the struct using the remaining path parts.
            return pc.struct_field(arrow_table[column_name], path_parts[split_at:])

    raise KeyError(f"Cannot resolve field path {field_path!r}; available columns: {column_names}")

tests/io/test_pyarrow.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2416,6 +2416,36 @@ def test_partition_for_deep_nested_field() -> None:
24162416
assert partition_values == {"data-1", "data-2"}
24172417

24182418

2419+
def test_inspect_partition_for_nested_field(catalog: InMemoryCatalog) -> None:
    """Partition on a struct child and check the values listed by ``inspect.partitions()``."""
    # Field id 3 ("baz") sits inside the "bar" struct and is the partition source below.
    bar_struct = StructType(
        NestedField(id=3, name="baz", field_type=StringType(), required=False),
        NestedField(id=4, name="qux", field_type=IntegerType(), required=False),
    )
    schema = Schema(
        NestedField(id=1, name="foo", field_type=StringType(), required=True),
        NestedField(id=2, name="bar", field_type=bar_struct, required=True),
    )
    nested_partition = PartitionField(source_id=3, field_id=1000, transform=IdentityTransform(), name="part")
    spec = PartitionSpec(nested_partition)

    catalog.create_namespace("default")
    table = catalog.create_table("default.test_partition_in_struct", schema=schema, partition_spec=spec)

    # Two rows with distinct "bar.baz" values.
    rows = [
        {"foo": "a", "bar": {"baz": "data-a", "qux": 1}},
        {"foo": "b", "bar": {"baz": "data-b", "qux": 2}},
    ]
    table.append(pa.Table.from_pylist(rows, schema=table.schema().as_arrow()))

    partitions = table.inspect.partitions()["partition"].to_pylist()

    assert len(partitions) == 2
    assert {part["part"] for part in partitions} == {"data-a", "data-b"}
2447+
2448+
24192449
def test_identity_partition_on_multi_columns() -> None:
24202450
test_pa_schema = pa.schema([("born_year", pa.int64()), ("n_legs", pa.int64()), ("animal", pa.string())])
24212451
test_schema = Schema(

0 commit comments

Comments
 (0)