From f86b85752fb5dbc05686b33371e2735c41881b25 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Mon, 2 Jun 2025 19:27:10 -0700 Subject: [PATCH 1/3] remove .db for all but dynamo/glue/hive --- pyiceberg/catalog/__init__.py | 2 +- pyiceberg/catalog/dynamodb.py | 15 +++++++++++++++ pyiceberg/catalog/glue.py | 14 ++++++++++++++ pyiceberg/catalog/hive.py | 14 ++++++++++++++ 4 files changed, 44 insertions(+), 1 deletion(-) diff --git a/pyiceberg/catalog/__init__.py b/pyiceberg/catalog/__init__.py index cf649ba7d6..e8085bc066 100644 --- a/pyiceberg/catalog/__init__.py +++ b/pyiceberg/catalog/__init__.py @@ -930,7 +930,7 @@ def _get_default_warehouse_location(self, database_name: str, table_name: str) - if warehouse_path := self.properties.get(WAREHOUSE_LOCATION): warehouse_path = warehouse_path.rstrip("/") - return f"{warehouse_path}/{database_name}.db/{table_name}" + return f"{warehouse_path}/{database_name}/{table_name}" raise ValueError("No default path is set, please specify a location when creating a table") diff --git a/pyiceberg/catalog/dynamodb.py b/pyiceberg/catalog/dynamodb.py index 63466b0142..abcf8d173f 100644 --- a/pyiceberg/catalog/dynamodb.py +++ b/pyiceberg/catalog/dynamodb.py @@ -32,9 +32,11 @@ from pyiceberg.catalog import ( BOTOCORE_SESSION, ICEBERG, + LOCATION, METADATA_LOCATION, PREVIOUS_METADATA_LOCATION, TABLE_TYPE, + WAREHOUSE_LOCATION, MetastoreCatalog, PropertiesUpdateSummary, ) @@ -652,6 +654,19 @@ def _convert_dynamo_table_item_to_iceberg_table(self, dynamo_table_item: Dict[st catalog=self, ) + def _get_default_warehouse_location(self, database_name: str, table_name: str) -> str: + """Return the default warehouse location following the Hive convention of `warehousePath/databaseName.db/tableName`.""" + database_properties = self.load_namespace_properties(database_name) + if database_location := database_properties.get(LOCATION): + database_location = database_location.rstrip("/") + return f"{database_location}/{table_name}" + + if warehouse_path := self.properties.get(WAREHOUSE_LOCATION): + warehouse_path = warehouse_path.rstrip("/") + return f"{warehouse_path}/{database_name}.db/{table_name}" + + raise ValueError("No default path is set, please specify a location when creating a table") + def _get_create_table_item(database_name: str, table_name: str, properties: Properties, metadata_location: str) -> Dict[str, Any]: current_timestamp_ms = str(round(time() * 1000)) diff --git a/pyiceberg/catalog/glue.py b/pyiceberg/catalog/glue.py index 01bb0e9b05..c1940791c0 100644 --- a/pyiceberg/catalog/glue.py +++ b/pyiceberg/catalog/glue.py @@ -48,6 +48,7 @@ METADATA_LOCATION, PREVIOUS_METADATA_LOCATION, TABLE_TYPE, + WAREHOUSE_LOCATION, MetastoreCatalog, PropertiesUpdateSummary, ) @@ -808,3 +809,16 @@ def view_exists(self, identifier: Union[str, Identifier]) -> bool: @staticmethod def __is_iceberg_table(table: TableTypeDef) -> bool: return table.get("Parameters", {}).get(TABLE_TYPE, "").lower() == ICEBERG + + def _get_default_warehouse_location(self, database_name: str, table_name: str) -> str: + """Return the default warehouse location following the Hive convention of `warehousePath/databaseName.db/tableName`.""" + database_properties = self.load_namespace_properties(database_name) + if database_location := database_properties.get(LOCATION): + database_location = database_location.rstrip("/") + return f"{database_location}/{table_name}" + + if warehouse_path := self.properties.get(WAREHOUSE_LOCATION): + warehouse_path = warehouse_path.rstrip("/") + return f"{warehouse_path}/{database_name}.db/{table_name}" + + raise ValueError("No default path is set, please specify a location when creating a table") diff --git a/pyiceberg/catalog/hive.py b/pyiceberg/catalog/hive.py index 5a9387577b..817aef4f8f 100644 --- a/pyiceberg/catalog/hive.py +++ b/pyiceberg/catalog/hive.py @@ -63,6 +63,7 @@ LOCATION, METADATA_LOCATION, TABLE_TYPE, + WAREHOUSE_LOCATION, MetastoreCatalog, PropertiesUpdateSummary, ) @@ -790,3 +791,16 @@ def update_namespace_properties( def drop_view(self, identifier: Union[str, Identifier]) -> None: raise NotImplementedError + + def _get_default_warehouse_location(self, database_name: str, table_name: str) -> str: + """Return the default warehouse location following the Hive convention of `warehousePath/databaseName.db/tableName`.""" + database_properties = self.load_namespace_properties(database_name) + if database_location := database_properties.get(LOCATION): + database_location = database_location.rstrip("/") + return f"{database_location}/{table_name}" + + if warehouse_path := self.properties.get(WAREHOUSE_LOCATION): + warehouse_path = warehouse_path.rstrip("/") + return f"{warehouse_path}/{database_name}.db/{table_name}" + + raise ValueError("No default path is set, please specify a location when creating a table") From f393c384e4fb4571387829671ec660dcc5f31f4e Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Mon, 2 Jun 2025 19:28:08 -0700 Subject: [PATCH 2/3] remove .db in sql and console --- tests/catalog/test_sql.py | 30 +++++++++++++++--------------- tests/cli/test_console.py | 4 ++-- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/tests/catalog/test_sql.py b/tests/catalog/test_sql.py index 8c3047b2ca..235951484f 100644 --- a/tests/catalog/test_sql.py +++ b/tests/catalog/test_sql.py @@ -72,7 +72,7 @@ def catalog_name() -> str: @pytest.fixture(name="random_table_identifier") def fixture_random_table_identifier(warehouse: Path, database_name: str, table_name: str) -> Identifier: - os.makedirs(f"{warehouse}/{database_name}.db/{table_name}/metadata/", exist_ok=True) + os.makedirs(f"{warehouse}/{database_name}/{table_name}/metadata/", exist_ok=True) return database_name, table_name @@ -80,13 +80,13 @@ def fixture_random_table_identifier(warehouse: Path, database_name: str, table_n def fixture_another_random_table_identifier(warehouse: Path, database_name: str, table_name: str) -> Identifier: database_name = database_name + "_new" table_name = table_name + "_new" - os.makedirs(f"{warehouse}/{database_name}.db/{table_name}/metadata/", exist_ok=True) + os.makedirs(f"{warehouse}/{database_name}/{table_name}/metadata/", exist_ok=True) return database_name, table_name @pytest.fixture(name="random_hierarchical_identifier") def fixture_random_hierarchical_identifier(warehouse: Path, hierarchical_namespace_name: str, table_name: str) -> Identifier: - os.makedirs(f"{warehouse}/{hierarchical_namespace_name}.db/{table_name}/metadata/", exist_ok=True) + os.makedirs(f"{warehouse}/{hierarchical_namespace_name}/{table_name}/metadata/", exist_ok=True) return Catalog.identifier_to_tuple(".".join((hierarchical_namespace_name, table_name))) @@ -96,7 +96,7 @@ def fixture_another_random_hierarchical_identifier( ) -> Identifier: hierarchical_namespace_name = hierarchical_namespace_name + "_new" table_name = table_name + "_new" - os.makedirs(f"{warehouse}/{hierarchical_namespace_name}.db/{table_name}/metadata/", exist_ok=True) + os.makedirs(f"{warehouse}/{hierarchical_namespace_name}/{table_name}/metadata/", exist_ok=True) return Catalog.identifier_to_tuple(".".join((hierarchical_namespace_name, table_name))) @@ -115,7 +115,7 @@ def catalog_memory(catalog_name: str, warehouse: Path) -> Generator[SqlCatalog, @pytest.fixture(scope="module") def catalog_sqlite(catalog_name: str, warehouse: Path) -> Generator[SqlCatalog, None, None]: props = { - "uri": f"sqlite:////{warehouse}/sql-catalog.db", + "uri": f"sqlite:////{warehouse}/sql-catalog", "warehouse": f"file://{warehouse}", } catalog = SqlCatalog(catalog_name, **props) @@ -126,7 +126,7 @@ def catalog_sqlite(catalog_name: str, warehouse: Path) -> Generator[SqlCatalog, @pytest.fixture(scope="module") def catalog_uri(warehouse: Path) -> str: - return f"sqlite:////{warehouse}/sql-catalog.db" + return f"sqlite:////{warehouse}/sql-catalog" @pytest.fixture(scope="module") @@ -137,7 +137,7 @@ def alchemy_engine(catalog_uri: str) -> Engine: @pytest.fixture(scope="module") def catalog_sqlite_without_rowcount(catalog_name: str, warehouse: Path) -> Generator[SqlCatalog, None, None]: props = { - "uri": f"sqlite:////{warehouse}/sql-catalog.db", + "uri": f"sqlite:////{warehouse}/sql-catalog", "warehouse": f"file://{warehouse}", } catalog = SqlCatalog(catalog_name, **props) @@ -150,7 +150,7 @@ def catalog_sqlite_without_rowcount(catalog_name: str, warehouse: Path) -> Gener @pytest.fixture(scope="module") def catalog_sqlite_fsspec(catalog_name: str, warehouse: Path) -> Generator[SqlCatalog, None, None]: props = { - "uri": f"sqlite:////{warehouse}/sql-catalog.db", + "uri": f"sqlite:////{warehouse}/sql-catalog", "warehouse": f"file://{warehouse}", PY_IO_IMPL: FSSPEC_FILE_IO, } @@ -176,7 +176,7 @@ def test_creation_with_echo_parameter(catalog_name: str, warehouse: Path) -> Non for echo_param, expected_echo_value in test_cases: props = { - "uri": f"sqlite:////{warehouse}/sql-catalog.db", + "uri": f"sqlite:////{warehouse}/sql-catalog", "warehouse": f"file://{warehouse}", } # None is for default value @@ -199,7 +199,7 @@ def test_creation_with_pool_pre_ping_parameter(catalog_name: str, warehouse: Pat for pool_pre_ping_param, expected_pool_pre_ping_value in test_cases: props = { - "uri": f"sqlite:////{warehouse}/sql-catalog.db", + "uri": f"sqlite:////{warehouse}/sql-catalog", "warehouse": f"file://{warehouse}", } # None is for default value @@ -219,7 +219,7 @@ def test_creation_from_impl(catalog_name: str, warehouse: Path) -> None: catalog_name, **{ "py-catalog-impl": "pyiceberg.catalog.sql.SqlCatalog", - "uri": f"sqlite:////{warehouse}/sql-catalog.db", + "uri": f"sqlite:////{warehouse}/sql-catalog", "warehouse": f"file://{warehouse}", }, ), @@ -493,7 +493,7 @@ def test_create_table_with_given_location_removes_trailing_slash( identifier_tuple = Catalog.identifier_to_tuple(table_identifier) namespace = Catalog.namespace_from(table_identifier) table_name = Catalog.table_name_from(identifier_tuple) - location = f"file://{warehouse}/{catalog.name}.db/{table_name}-given" + location = f"file://{warehouse}/{catalog.name}/{table_name}-given" catalog.create_namespace(namespace) catalog.create_table(table_identifier, table_schema_nested, location=f"{location}/") table = catalog.load_table(table_identifier) @@ -1235,7 +1235,7 @@ def test_load_namespace_properties(catalog: SqlCatalog, namespace: str) -> None: warehouse_location = "/test/location" test_properties = { "comment": "this is a test description", - "location": f"{warehouse_location}/{namespace}.db", + "location": f"{warehouse_location}/{namespace}", "test_property1": "1", "test_property2": "2", "test_property3": "3", @@ -1286,7 +1286,7 @@ def test_update_namespace_properties(catalog: SqlCatalog, namespace: str) -> Non warehouse_location = "/test/location" test_properties = { "comment": "this is a test description", - "location": f"{warehouse_location}/{namespace}.db", + "location": f"{warehouse_location}/{namespace}", "test_property1": "1", "test_property2": "2", "test_property3": "3", @@ -1306,7 +1306,7 @@ def test_update_namespace_properties(catalog: SqlCatalog, namespace: str) -> Non "comment": "updated test description", "test_property4": "4", "test_property5": "5", - "location": f"{warehouse_location}/{namespace}.db", + "location": f"{warehouse_location}/{namespace}", } diff --git a/tests/cli/test_console.py b/tests/cli/test_console.py index 70e04071ad..fe375eb276 100644 --- a/tests/cli/test_console.py +++ b/tests/cli/test_console.py @@ -271,7 +271,7 @@ def test_location(catalog: InMemoryCatalog) -> None: runner = CliRunner() result = runner.invoke(run, ["location", "default.my_table"]) assert result.exit_code == 0 - assert result.output == f"""{catalog._warehouse_location}/default.db/my_table\n""" + assert result.output == f"""{catalog._warehouse_location}/default/my_table\n""" def test_location_does_not_exists(catalog: InMemoryCatalog) -> None: @@ -700,7 +700,7 @@ def test_json_location(catalog: InMemoryCatalog) -> None: runner = CliRunner() result = runner.invoke(run, ["--output=json", "location", "default.my_table"]) assert result.exit_code == 0 - assert result.output == f'"{catalog._warehouse_location}/default.db/my_table"\n' + assert result.output == f'"{catalog._warehouse_location}/default/my_table"\n' def test_json_location_does_not_exists(catalog: InMemoryCatalog) -> None: From 9f28fe49bd543a3db30339497807bc023fea7ee4 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sat, 14 Jun 2025 10:46:36 -0700 Subject: [PATCH 3/3] dry --- pyiceberg/catalog/__init__.py | 14 ++++++++++++++ pyiceberg/catalog/dynamodb.py | 15 ++------------- pyiceberg/catalog/glue.py | 14 ++------------ pyiceberg/catalog/hive.py | 14 ++------------ 4 files changed, 20 insertions(+), 37 deletions(-) diff --git a/pyiceberg/catalog/__init__.py b/pyiceberg/catalog/__init__.py index e8085bc066..4b78b1cce4 100644 --- a/pyiceberg/catalog/__init__.py +++ b/pyiceberg/catalog/__init__.py @@ -923,6 +923,7 @@ def _resolve_table_location(self, location: Optional[str], database_name: str, t return location.rstrip("/") def _get_default_warehouse_location(self, database_name: str, table_name: str) -> str: + """Return the default warehouse location using the convention of `warehousePath/databaseName/tableName`.""" database_properties = self.load_namespace_properties(database_name) if database_location := database_properties.get(LOCATION): database_location = database_location.rstrip("/") @@ -934,6 +935,19 @@ def _get_default_warehouse_location(self, database_name: str, table_name: str) - raise ValueError("No default path is set, please specify a location when creating a table") + def _get_hive_style_warehouse_location(self, database_name: str, table_name: str) -> str: + """Return the default warehouse location following the Hive convention of `warehousePath/databaseName.db/tableName`.""" + database_properties = self.load_namespace_properties(database_name) + if database_location := database_properties.get(LOCATION): + database_location = database_location.rstrip("/") + return f"{database_location}/{table_name}" + + if warehouse_path := self.properties.get(WAREHOUSE_LOCATION): + warehouse_path = warehouse_path.rstrip("/") + return f"{warehouse_path}/{database_name}.db/{table_name}" + + raise ValueError("No default path is set, please specify a location when creating a table") + @staticmethod def _write_metadata(metadata: TableMetadata, io: FileIO, metadata_path: str) -> None: ToOutputFile.table_metadata(metadata, io.new_output(metadata_path)) diff --git a/pyiceberg/catalog/dynamodb.py b/pyiceberg/catalog/dynamodb.py index abcf8d173f..bfd5a14453 100644 --- a/pyiceberg/catalog/dynamodb.py +++ b/pyiceberg/catalog/dynamodb.py @@ -32,11 +32,9 @@ from pyiceberg.catalog import ( BOTOCORE_SESSION, ICEBERG, - LOCATION, METADATA_LOCATION, PREVIOUS_METADATA_LOCATION, TABLE_TYPE, - WAREHOUSE_LOCATION, MetastoreCatalog, PropertiesUpdateSummary, ) @@ -655,17 +653,8 @@ def _convert_dynamo_table_item_to_iceberg_table(self, dynamo_table_item: Dict[st ) def _get_default_warehouse_location(self, database_name: str, table_name: str) -> str: - """Return the default warehouse location following the Hive convention of `warehousePath/databaseName.db/tableName`.""" - database_properties = self.load_namespace_properties(database_name) - if database_location := database_properties.get(LOCATION): - database_location = database_location.rstrip("/") - return f"{database_location}/{table_name}" - - if warehouse_path := self.properties.get(WAREHOUSE_LOCATION): - warehouse_path = warehouse_path.rstrip("/") - return f"{warehouse_path}/{database_name}.db/{table_name}" - - raise ValueError("No default path is set, please specify a location when creating a table") + """Override the default warehouse location to follow Hive-style conventions.""" + return self._get_hive_style_warehouse_location(database_name, table_name) def _get_create_table_item(database_name: str, table_name: str, properties: Properties, metadata_location: str) -> Dict[str, Any]: diff --git a/pyiceberg/catalog/glue.py b/pyiceberg/catalog/glue.py index c1940791c0..bfc493a2c3 100644 --- a/pyiceberg/catalog/glue.py +++ b/pyiceberg/catalog/glue.py @@ -48,7 +48,6 @@ METADATA_LOCATION, PREVIOUS_METADATA_LOCATION, TABLE_TYPE, - WAREHOUSE_LOCATION, MetastoreCatalog, PropertiesUpdateSummary, ) @@ -811,14 +810,5 @@ def __is_iceberg_table(table: TableTypeDef) -> bool: return table.get("Parameters", {}).get(TABLE_TYPE, "").lower() == ICEBERG def _get_default_warehouse_location(self, database_name: str, table_name: str) -> str: - """Return the default warehouse location following the Hive convention of `warehousePath/databaseName.db/tableName`.""" - database_properties = self.load_namespace_properties(database_name) - if database_location := database_properties.get(LOCATION): - database_location = database_location.rstrip("/") - return f"{database_location}/{table_name}" - - if warehouse_path := self.properties.get(WAREHOUSE_LOCATION): - warehouse_path = warehouse_path.rstrip("/") - return f"{warehouse_path}/{database_name}.db/{table_name}" - - raise ValueError("No default path is set, please specify a location when creating a table") + """Override the default warehouse location to follow Hive-style conventions.""" + return self._get_hive_style_warehouse_location(database_name, table_name) diff --git a/pyiceberg/catalog/hive.py b/pyiceberg/catalog/hive.py index 817aef4f8f..4374a6fd25 100644 --- a/pyiceberg/catalog/hive.py +++ b/pyiceberg/catalog/hive.py @@ -63,7 +63,6 @@ LOCATION, METADATA_LOCATION, TABLE_TYPE, - WAREHOUSE_LOCATION, MetastoreCatalog, PropertiesUpdateSummary, ) @@ -793,14 +792,5 @@ def drop_view(self, identifier: Union[str, Identifier]) -> None: raise NotImplementedError def _get_default_warehouse_location(self, database_name: str, table_name: str) -> str: - """Return the default warehouse location following the Hive convention of `warehousePath/databaseName.db/tableName`.""" - database_properties = self.load_namespace_properties(database_name) - if database_location := database_properties.get(LOCATION): - database_location = database_location.rstrip("/") - return f"{database_location}/{table_name}" - - if warehouse_path := self.properties.get(WAREHOUSE_LOCATION): - warehouse_path = warehouse_path.rstrip("/") - return f"{warehouse_path}/{database_name}.db/{table_name}" - - raise ValueError("No default path is set, please specify a location when creating a table") + """Override the default warehouse location to follow Hive-style conventions.""" + return self._get_hive_style_warehouse_location(database_name, table_name)