Arrow: Infer the types when reading #1669
Changes from all commits: fa9b3ca, 0384b4e, 6dd9308, 2817c61, d6fbca9, fff7414, 0d19987, 7382112, 6526cc2, d9d4fda, dd1c5d4
@@ -1785,7 +1785,7 @@ def to_arrow_batch_reader(self) -> pa.RecordBatchReader:
         return pa.RecordBatchReader.from_batches(
             target_schema,
             batches,
-        )
+        ).cast(target_schema)
 
     def to_pandas(self, **kwargs: Any) -> pd.DataFrame:
         """Read a Pandas DataFrame eagerly from this Iceberg table.

Contributor (Author) commented on the added `.cast(target_schema)` line:

This will still return
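For context on the `.cast(target_schema)` call, here is a minimal sketch, not PyIceberg's actual code path, of how casting a `RecordBatchReader` behaves. It assumes a PyArrow version that ships `RecordBatchReader.cast` (16.0+); the column name and data are made up for illustration:

```python
import pyarrow as pa

# Build a reader over batches that use the "small" string type.
batch = pa.RecordBatch.from_arrays(
    [pa.array(["a", "b"], type=pa.string())],
    names=["name"],
)

# Target schema asking for the large variant of the same column.
target_schema = pa.schema([pa.field("name", pa.large_string())])

# cast() returns a new reader whose batches are cast to the target schema
# as they are consumed (assumes RecordBatchReader.cast is available, PyArrow 16+).
reader = pa.RecordBatchReader.from_batches(batch.schema, [batch]).cast(target_schema)

print(reader.schema)      # name: large_string
print(reader.read_all())  # the data round-trips with the requested types
```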
@@ -831,7 +831,16 @@ def test_configure_row_group_batch_size(session_catalog: Catalog) -> None:
 
 @pytest.mark.integration
 @pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")])
-def test_table_scan_default_to_large_types(catalog: Catalog) -> None:
+def test_table_scan_keep_types(catalog: Catalog) -> None:
+    expected_schema = pa.schema(
+        [
+            pa.field("string", pa.string()),
+            pa.field("string-to-binary", pa.large_binary()),
+            pa.field("binary", pa.binary()),
+            pa.field("list", pa.list_(pa.large_string())),
+        ]
+    )
+
     identifier = "default.test_table_scan_default_to_large_types"
     arrow_table = pa.Table.from_arrays(
         [
@@ -840,7 +849,7 @@ def test_table_scan_default_to_large_types(catalog: Catalog) -> None:
             pa.array([b"a", b"b", b"c"]),
             pa.array([["a", "b"], ["c", "d"], ["e", "f"]]),
         ],
-        names=["string", "string-to-binary", "binary", "list"],
+        schema=expected_schema,
     )
 
     try:
@@ -859,15 +868,6 @@ def test_table_scan_default_to_large_types(catalog: Catalog) -> None:
         update_schema.update_column("string-to-binary", BinaryType())
 
     result_table = tbl.scan().to_arrow()
 
-    expected_schema = pa.schema(
-        [
-            pa.field("string", pa.large_string()),
-            pa.field("string-to-binary", pa.large_binary()),
-            pa.field("binary", pa.large_binary()),
-            pa.field("list", pa.large_list(pa.large_string())),
-        ]
-    )
-
     assert result_table.schema.equals(expected_schema)
@@ -906,7 +906,7 @@ def test_table_scan_override_with_small_types(catalog: Catalog) -> None:
     expected_schema = pa.schema(
         [
             pa.field("string", pa.string()),
-            pa.field("string-to-binary", pa.binary()),
+            pa.field("string-to-binary", pa.large_binary()),
             pa.field("binary", pa.binary()),
             pa.field("list", pa.list_(pa.string())),
         ]

Contributor commented on the `pa.field("string-to-binary", pa.large_binary())` line:

@Fokko is this right? Type promotion for string -> binary results in a large_binary type (iceberg-python/pyiceberg/io/pyarrow.py, lines 687 to 688 in 7a56ddb).

I found these 3 places where
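The behaviour being questioned can be illustrated with plain PyArrow. This is a small sketch, not PyIceberg code, showing that a string column casts cleanly to either binary flavour, so surfacing the promoted column as binary versus large_binary is a schema-conversion choice rather than a cast limitation; the sample data is invented:

```python
import pyarrow as pa

# A string column such as the "string-to-binary" field before promotion.
col = pa.array(["a", "b", "c"], type=pa.string())

# Both casts succeed; the difference is only the offset width of the result.
print(col.cast(pa.binary()).type)        # binary
print(col.cast(pa.large_binary()).type)  # large_binary
```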