From a72900d43b5f060c1f200ae95471f53e20b744d0 Mon Sep 17 00:00:00 2001 From: David Li Date: Wed, 5 Mar 2025 17:09:13 +0900 Subject: [PATCH] Inject columns of proper length for missing fields Fixes #1766. --- pyiceberg/io/pyarrow.py | 3 ++- tests/io/test_pyarrow.py | 17 ++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index eab26b0c57..7c8aaaab1b 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -1438,7 +1438,8 @@ def _task_to_record_batches( for name, value in projected_missing_fields.items(): index = result_batch.schema.get_field_index(name) if index != -1: - result_batch = result_batch.set_column(index, name, [value]) + arr = pa.repeat(value, result_batch.num_rows) + result_batch = result_batch.set_column(index, name, arr) yield result_batch diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py index 3f43d9215a..e883e38cb8 100644 --- a/tests/io/test_pyarrow.py +++ b/tests/io/test_pyarrow.py @@ -1153,7 +1153,7 @@ def test_identity_transform_column_projection(tmp_path: str, catalog: InMemoryCa properties={TableProperties.DEFAULT_NAME_MAPPING: create_mapping_from_schema(schema).model_dump_json()}, ) - file_data = pa.array(["foo"], type=pa.string()) + file_data = pa.array(["foo", "bar", "baz"], type=pa.string()) file_loc = f"{tmp_path}/test.parquet" pq.write_table(pa.table([file_data], names=["other_field"]), file_loc) @@ -1181,14 +1181,13 @@ def test_identity_transform_column_projection(tmp_path: str, catalog: InMemoryCa with transaction.update_snapshot().overwrite() as update: update.append_data_file(unpartitioned_file) - assert ( - str(table.scan().to_arrow()) - == """pyarrow.Table -other_field: large_string -partition_id: int64 ----- -other_field: [["foo"]] -partition_id: [[1]]""" + schema = pa.schema([("other_field", pa.large_string()), ("partition_id", pa.int64())]) + assert table.scan().to_arrow() == pa.table( + { + "other_field": ["foo", "bar", "baz"], + "partition_id": [1, 1, 1], + }, + schema=schema, )