From b2a41277faf01dd2e0d259bb382a8a1ad39eefe2 Mon Sep 17 00:00:00 2001 From: frankzfli Date: Sat, 24 May 2025 12:58:44 +0800 Subject: [PATCH] Update hive storage descriptor after commit schema change --- dev/hive/core-site.xml | 5 ++++ pyiceberg/catalog/hive.py | 6 +++++ tests/integration/test_writes/test_writes.py | 24 ++++++++++++++++++++ 3 files changed, 35 insertions(+) diff --git a/dev/hive/core-site.xml b/dev/hive/core-site.xml index b77332b83b..f5a9473b51 100644 --- a/dev/hive/core-site.xml +++ b/dev/hive/core-site.xml @@ -50,4 +50,9 @@ fs.s3a.path.style.access true + + hive.metastore.disallow.incompatible.col.type.changes + false + + diff --git a/pyiceberg/catalog/hive.py b/pyiceberg/catalog/hive.py index 5a9387577b..23b148e9b9 100644 --- a/pyiceberg/catalog/hive.py +++ b/pyiceberg/catalog/hive.py @@ -542,6 +542,12 @@ def commit_table( metadata_location=updated_staged_table.metadata_location, previous_metadata_location=current_table.metadata_location, ) + # Update hive's schema and properties + hive_table.sd = _construct_hive_storage_descriptor( + updated_staged_table.schema(), + updated_staged_table.location(), + property_as_bool(updated_staged_table.properties, HIVE2_COMPATIBLE, HIVE2_COMPATIBLE_DEFAULT), + ) open_client.alter_table_with_environment_context( dbname=database_name, tbl_name=table_name, diff --git a/tests/integration/test_writes/test_writes.py b/tests/integration/test_writes/test_writes.py index 150d2b750c..94a3d3775f 100644 --- a/tests/integration/test_writes/test_writes.py +++ b/tests/integration/test_writes/test_writes.py @@ -1157,6 +1157,30 @@ def test_hive_catalog_storage_descriptor( assert spark.sql("SELECT * FROM hive.default.test_storage_descriptor").count() == 3 +@pytest.mark.integration +@pytest.mark.parametrize("format_version", [1, 2]) +def test_hive_catalog_storage_descriptor_has_changed( + session_catalog_hive: HiveCatalog, + pa_schema: pa.Schema, + arrow_table_with_null: pa.Table, + spark: SparkSession, + format_version: int, +) -> None: + tbl = _create_table( + session_catalog_hive, "default.test_storage_descriptor", {"format-version": format_version}, [arrow_table_with_null] + ) + + with tbl.transaction() as tx: + with tx.update_schema() as schema: + schema.update_column("string_long", doc="this is string_long") + schema.update_column("binary", doc="this is binary") + + with session_catalog_hive._client as open_client: + hive_table = session_catalog_hive._get_hive_table(open_client, "default", "test_storage_descriptor") + assert "this is string_long" in str(hive_table.sd) + assert "this is binary" in str(hive_table.sd) + + @pytest.mark.integration @pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_sanitize_character_partitioned(catalog: Catalog) -> None: