From a72900d43b5f060c1f200ae95471f53e20b744d0 Mon Sep 17 00:00:00 2001
From: David Li
Date: Wed, 5 Mar 2025 17:09:13 +0900
Subject: [PATCH] Inject columns of proper length for missing fields
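Fixes #1766.
Previously the value for a projected missing field was set as a
one-element column (`[value]`) regardless of the batch size; it is now
repeated with `pa.repeat` to `result_batch.num_rows` so the injected
column has the same length as the rest of the record batch.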
---
pyiceberg/io/pyarrow.py | 3 ++-
tests/io/test_pyarrow.py | 17 ++++++++---------
2 files changed, 10 insertions(+), 10 deletions(-)
diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py
index eab26b0c57..7c8aaaab1b 100644
--- a/pyiceberg/io/pyarrow.py
+++ b/pyiceberg/io/pyarrow.py
@@ -1438,7 +1438,8 @@ def _task_to_record_batches(
for name, value in projected_missing_fields.items():
index = result_batch.schema.get_field_index(name)
if index != -1:
- result_batch = result_batch.set_column(index, name, [value])
+ arr = pa.repeat(value, result_batch.num_rows)
+ result_batch = result_batch.set_column(index, name, arr)
yield result_batch
diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py
index 3f43d9215a..e883e38cb8 100644
--- a/tests/io/test_pyarrow.py
+++ b/tests/io/test_pyarrow.py
@@ -1153,7 +1153,7 @@ def test_identity_transform_column_projection(tmp_path: str, catalog: InMemoryCa
properties={TableProperties.DEFAULT_NAME_MAPPING: create_mapping_from_schema(schema).model_dump_json()},
)
- file_data = pa.array(["foo"], type=pa.string())
+ file_data = pa.array(["foo", "bar", "baz"], type=pa.string())
file_loc = f"{tmp_path}/test.parquet"
pq.write_table(pa.table([file_data], names=["other_field"]), file_loc)
@@ -1181,14 +1181,13 @@ def test_identity_transform_column_projection(tmp_path: str, catalog: InMemoryCa
with transaction.update_snapshot().overwrite() as update:
update.append_data_file(unpartitioned_file)
- assert (
- str(table.scan().to_arrow())
- == """pyarrow.Table
-other_field: large_string
-partition_id: int64
-----
-other_field: [["foo"]]
-partition_id: [[1]]"""
+ schema = pa.schema([("other_field", pa.large_string()), ("partition_id", pa.int64())])
+ assert table.scan().to_arrow() == pa.table(
+ {
+ "other_field": ["foo", "bar", "baz"],
+ "partition_id": [1, 1, 1],
+ },
+ schema=schema,
)