From e6438ad2087217609e0bc1e22d7382d5da468901 Mon Sep 17 00:00:00 2001 From: frankzfli Date: Mon, 5 May 2025 10:56:01 +0800 Subject: [PATCH] Hive: Add DO_NOT_UPDATE_STATS in alter_table --- pyiceberg/catalog/hive.py | 17 +++++++++++++++-- tests/catalog/test_hive.py | 25 +++++++++++++++++++------ 2 files changed, 34 insertions(+), 8 deletions(-) diff --git a/pyiceberg/catalog/hive.py b/pyiceberg/catalog/hive.py index 75a63e0ae7..5a9387577b 100644 --- a/pyiceberg/catalog/hive.py +++ b/pyiceberg/catalog/hive.py @@ -36,6 +36,7 @@ from hive_metastore.ttypes import ( AlreadyExistsException, CheckLockRequest, + EnvironmentContext, FieldSchema, InvalidOperationException, LockComponent, @@ -135,6 +136,8 @@ DEFAULT_LOCK_CHECK_MIN_WAIT_TIME = 0.1 # 100 milliseconds DEFAULT_LOCK_CHECK_MAX_WAIT_TIME = 60 # 1 min DEFAULT_LOCK_CHECK_RETRIES = 4 +DO_NOT_UPDATE_STATS = "DO_NOT_UPDATE_STATS" +DO_NOT_UPDATE_STATS_DEFAULT = "true" logger = logging.getLogger(__name__) @@ -539,7 +542,12 @@ def commit_table( metadata_location=updated_staged_table.metadata_location, previous_metadata_location=current_table.metadata_location, ) - open_client.alter_table(dbname=database_name, tbl_name=table_name, new_tbl=hive_table) + open_client.alter_table_with_environment_context( + dbname=database_name, + tbl_name=table_name, + new_tbl=hive_table, + environment_context=EnvironmentContext(properties={DO_NOT_UPDATE_STATS: DO_NOT_UPDATE_STATS_DEFAULT}), + ) else: # Table does not exist, create it. hive_table = self._convert_iceberg_into_hive( @@ -626,7 +634,12 @@ def rename_table(self, from_identifier: Union[str, Identifier], to_identifier: U tbl = open_client.get_table(dbname=from_database_name, tbl_name=from_table_name) tbl.dbName = to_database_name tbl.tableName = to_table_name - open_client.alter_table(dbname=from_database_name, tbl_name=from_table_name, new_tbl=tbl) + open_client.alter_table_with_environment_context( + dbname=from_database_name, + tbl_name=from_table_name, + new_tbl=tbl, + environment_context=EnvironmentContext(properties={DO_NOT_UPDATE_STATS: DO_NOT_UPDATE_STATS_DEFAULT}), + ) except NoSuchObjectException as e: raise NoSuchTableError(f"Table does not exist: {from_table_name}") from e except InvalidOperationException as e: diff --git a/tests/catalog/test_hive.py b/tests/catalog/test_hive.py index 99d1c67cb4..497ff99924 100644 --- a/tests/catalog/test_hive.py +++ b/tests/catalog/test_hive.py @@ -29,6 +29,7 @@ import thrift.transport.TSocket from hive_metastore.ttypes import ( AlreadyExistsException, + EnvironmentContext, FieldSchema, InvalidOperationException, LockResponse, @@ -44,6 +45,8 @@ from pyiceberg.catalog import PropertiesUpdateSummary from pyiceberg.catalog.hive import ( + DO_NOT_UPDATE_STATS, + DO_NOT_UPDATE_STATS_DEFAULT, HIVE_KERBEROS_AUTH, LOCK_CHECK_MAX_WAIT_TIME, LOCK_CHECK_MIN_WAIT_TIME, @@ -874,7 +877,7 @@ def test_rename_table(hive_table: HiveTable) -> None: catalog._client = MagicMock() catalog._client.__enter__().get_table.side_effect = [hive_table, renamed_table] - catalog._client.__enter__().alter_table.return_value = None + catalog._client.__enter__().alter_table_with_environment_context.return_value = None from_identifier = ("default", "new_tabl2e") to_identifier = ("default", "new_tabl3e") @@ -884,7 +887,12 @@ def test_rename_table(hive_table: HiveTable) -> None: calls = [call(dbname="default", tbl_name="new_tabl2e"), call(dbname="default", tbl_name="new_tabl3e")] catalog._client.__enter__().get_table.assert_has_calls(calls) - catalog._client.__enter__().alter_table.assert_called_with(dbname="default", tbl_name="new_tabl2e", new_tbl=renamed_table) + catalog._client.__enter__().alter_table_with_environment_context.assert_called_with( + dbname="default", + tbl_name="new_tabl2e", + new_tbl=renamed_table, + environment_context=EnvironmentContext(properties={DO_NOT_UPDATE_STATS: DO_NOT_UPDATE_STATS_DEFAULT}), + ) def test_rename_table_from_self_identifier(hive_table: HiveTable) -> None: @@ -902,7 +910,7 @@ def test_rename_table_from_self_identifier(hive_table: HiveTable) -> None: renamed_table.tableName = "new_tabl3e" catalog._client.__enter__().get_table.side_effect = [hive_table, renamed_table] - catalog._client.__enter__().alter_table.return_value = None + catalog._client.__enter__().alter_table_with_environment_context.return_value = None to_identifier = ("default", "new_tabl3e") table = catalog.rename_table(from_table.name(), to_identifier) @@ -910,14 +918,19 @@ def test_rename_table_from_self_identifier(hive_table: HiveTable) -> None: calls = [call(dbname="default", tbl_name="new_tabl2e"), call(dbname="default", tbl_name="new_tabl3e")] catalog._client.__enter__().get_table.assert_has_calls(calls) - catalog._client.__enter__().alter_table.assert_called_with(dbname="default", tbl_name="new_tabl2e", new_tbl=renamed_table) + catalog._client.__enter__().alter_table_with_environment_context.assert_called_with( + dbname="default", + tbl_name="new_tabl2e", + new_tbl=renamed_table, + environment_context=EnvironmentContext(properties={DO_NOT_UPDATE_STATS: DO_NOT_UPDATE_STATS_DEFAULT}), + ) def test_rename_table_from_does_not_exists() -> None: catalog = HiveCatalog(HIVE_CATALOG_NAME, uri=HIVE_METASTORE_FAKE_URL) catalog._client = MagicMock() - catalog._client.__enter__().alter_table.side_effect = NoSuchObjectException( + catalog._client.__enter__().alter_table_with_environment_context.side_effect = NoSuchObjectException( message="hive.default.does_not_exists table not found" ) @@ -931,7 +944,7 @@ def test_rename_table_to_namespace_does_not_exists() -> None: catalog = HiveCatalog(HIVE_CATALOG_NAME, uri=HIVE_METASTORE_FAKE_URL) catalog._client = MagicMock() - catalog._client.__enter__().alter_table.side_effect = InvalidOperationException( + catalog._client.__enter__().alter_table_with_environment_context.side_effect = InvalidOperationException( message="Unable to change partition or table. Database default does not exist Check metastore logs for detailed stack.does_not_exists" )