From 1bcd0ad85121842145fa93fab468a51039671299 Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Sat, 20 Jul 2024 09:46:10 -0700 Subject: [PATCH 1/5] Support loading custom catalog impl --- mkdocs/docs/configuration.md | 20 +++++++++++++++++++- pyiceberg/catalog/__init__.py | 23 +++++++++++++++++++++++ tests/catalog/test_base.py | 20 +++++++++++++++----- tests/catalog/test_sql.py | 13 +++++++++++++ 4 files changed, 70 insertions(+), 6 deletions(-) diff --git a/mkdocs/docs/configuration.md b/mkdocs/docs/configuration.md index 985b8a775f..0199bef6f1 100644 --- a/mkdocs/docs/configuration.md +++ b/mkdocs/docs/configuration.md @@ -139,7 +139,13 @@ For the FileIO there are several configuration options available: ## Catalogs -PyIceberg currently has native support for REST, SQL, Hive, Glue and DynamoDB. +PyIceberg currently has native catalog type support for REST, SQL, Hive, Glue and DynamoDB. +You can also set the catalog implementation explicitly: + +| Key | Example | Description | +| --------------- | ---------------------------- | ------------------------------------------------------------------------------------------------ | +| type | rest | Type of catalog, one of `rest`, `sql`, `hive`, `glue`, `dymamodb`. Default to `rest` | +| py-catalog-impl | mypackage.mymodule.MyCatalog | Sets the catalog explicitly to an implementation, and will fail explicitly if it can't be loaded | There are three ways to pass in configuration: @@ -379,6 +385,18 @@ catalog: +### Custom Catalog Implementations + +If you want to load any custom catalog implementation, you can set catalog configurations like the following: + +```yaml +catalog: + default: + py-catalog-impl: mypackage.mymodule.MyCatalog + custom-key1: value1 + custom-key2: value2 +``` + ## Unified AWS Credentials You can explicitly set the AWS credentials for both Glue/DynamoDB Catalog and S3 FileIO by configuring `client.*` properties. For example: diff --git a/pyiceberg/catalog/__init__.py b/pyiceberg/catalog/__init__.py index a84bde0d0c..6c9a527951 100644 --- a/pyiceberg/catalog/__init__.py +++ b/pyiceberg/catalog/__init__.py @@ -17,6 +17,7 @@ from __future__ import annotations +import importlib import logging import re import uuid @@ -77,6 +78,7 @@ TOKEN = "token" TYPE = "type" +PY_CATALOG_IMPL = "py-catalog-impl" ICEBERG = "iceberg" TABLE_TYPE = "table_type" WAREHOUSE_LOCATION = "warehouse" @@ -230,6 +232,13 @@ def load_catalog(name: Optional[str] = None, **properties: Optional[str]) -> Cat env = _ENV_CONFIG.get_catalog_config(name) conf: RecursiveDict = merge_config(env or {}, cast(RecursiveDict, properties)) + if catalog_impl := properties.get(PY_CATALOG_IMPL): + if catalog := _import_catalog(name, catalog_impl, properties): + logger.info("Loaded Catalog: %s", catalog_impl) + return catalog + else: + raise ValueError(f"Could not initialize Catalog: {catalog_impl}") + catalog_type: Optional[CatalogType] provided_catalog_type = conf.get(TYPE) @@ -283,6 +292,20 @@ def delete_data_files(io: FileIO, manifests_to_delete: List[ManifestFile]) -> No deleted_files[path] = True +def _import_catalog(name: str, catalog_impl: str, properties: Properties) -> Optional[Catalog]: + try: + path_parts = catalog_impl.split(".") + if len(path_parts) < 2: + raise ValueError(f"py-catalog-impl should be full path (module.CustomCatalog), got: {catalog_impl}") + module_name, class_name = ".".join(path_parts[:-1]), path_parts[-1] + module = importlib.import_module(module_name) + class_ = getattr(module, class_name) + return class_(name, **properties) + except ModuleNotFoundError: + logger.warning("Could not initialize Catalog: %s", catalog_impl) + return None + + @dataclass class PropertiesUpdateSummary: removed: List[str] diff --git a/tests/catalog/test_base.py b/tests/catalog/test_base.py index 06e9a8a3aa..52a6a302e7 100644 --- a/tests/catalog/test_base.py +++ b/tests/catalog/test_base.py @@ -32,11 +32,7 @@ from pydantic_core import ValidationError from pytest_lazyfixture import lazy_fixture -from pyiceberg.catalog import ( - Catalog, - MetastoreCatalog, - PropertiesUpdateSummary, -) +from pyiceberg.catalog import PY_CATALOG_IMPL, Catalog, MetastoreCatalog, PropertiesUpdateSummary, load_catalog from pyiceberg.exceptions import ( NamespaceAlreadyExistsError, NamespaceNotEmptyError, @@ -295,6 +291,20 @@ def given_catalog_has_a_table( ) +def test_load_catalog_impl_not_full_path() -> None: + with pytest.raises(ValueError) as exc_info: + load_catalog("catalog", **{PY_CATALOG_IMPL: "CustomCatalog"}) + + assert "py-catalog-impl should be full path (module.CustomCatalog), got: CustomCatalog" in str(exc_info.value) + + +def test_load_catalog_impl_does_not_exist() -> None: + with pytest.raises(ValueError) as exc_info: + load_catalog("catalog", **{PY_CATALOG_IMPL: "pyiceberg.does.not.exist.Catalog"}) + + assert "Could not initialize Catalog: pyiceberg.does.not.exist.Catalog" in str(exc_info.value) + + def test_namespace_from_tuple() -> None: # Given identifier = ("com", "organization", "department", "my_table") diff --git a/tests/catalog/test_sql.py b/tests/catalog/test_sql.py index f887b1ea3b..82b6b0a48d 100644 --- a/tests/catalog/test_sql.py +++ b/tests/catalog/test_sql.py @@ -27,6 +27,7 @@ from pyiceberg.catalog import ( Catalog, + load_catalog, ) from pyiceberg.catalog.sql import DEFAULT_ECHO_VALUE, DEFAULT_POOL_PRE_PING_VALUE, SqlCatalog from pyiceberg.exceptions import ( @@ -210,6 +211,18 @@ def test_creation_with_pool_pre_ping_parameter(catalog_name: str, warehouse: Pat ) +def test_creation_from_impl(catalog_name: str, warehouse: Path) -> None: + assert isinstance( + load_catalog( + catalog_name, + py_catalog_impl="pyiceberg.catalog.sql.SqlCatalog", + uri=f"sqlite:////{warehouse}/sql-catalog.db", + warehouse=f"file://{warehouse}", + ), + SqlCatalog, + ) + + @pytest.mark.parametrize( "catalog", [ From 0f3ff72f43a33da518f170163348d8a9a7d1f515 Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Sat, 20 Jul 2024 12:02:00 -0700 Subject: [PATCH 2/5] add more tests --- mkdocs/docs/configuration.md | 4 ++-- pyiceberg/catalog/__init__.py | 12 +++++++++--- tests/catalog/test_base.py | 16 +++++++++++++--- tests/catalog/test_sql.py | 8 +++++--- 4 files changed, 29 insertions(+), 11 deletions(-) diff --git a/mkdocs/docs/configuration.md b/mkdocs/docs/configuration.md index 0199bef6f1..afe5674c1b 100644 --- a/mkdocs/docs/configuration.md +++ b/mkdocs/docs/configuration.md @@ -140,10 +140,10 @@ For the FileIO there are several configuration options available: ## Catalogs PyIceberg currently has native catalog type support for REST, SQL, Hive, Glue and DynamoDB. -You can also set the catalog implementation explicitly: +Alternatively, you can also directly set the catalog implementation: | Key | Example | Description | -| --------------- | ---------------------------- | ------------------------------------------------------------------------------------------------ | +| --------------- | ---------------------------- |--------------------------------------------------------------------------------------------------| | type | rest | Type of catalog, one of `rest`, `sql`, `hive`, `glue`, `dymamodb`. Default to `rest` | | py-catalog-impl | mypackage.mymodule.MyCatalog | Sets the catalog explicitly to an implementation, and will fail explicitly if it can't be loaded | diff --git a/pyiceberg/catalog/__init__.py b/pyiceberg/catalog/__init__.py index 6c9a527951..a423e20fad 100644 --- a/pyiceberg/catalog/__init__.py +++ b/pyiceberg/catalog/__init__.py @@ -232,16 +232,22 @@ def load_catalog(name: Optional[str] = None, **properties: Optional[str]) -> Cat env = _ENV_CONFIG.get_catalog_config(name) conf: RecursiveDict = merge_config(env or {}, cast(RecursiveDict, properties)) + catalog_type: Optional[CatalogType] + provided_catalog_type = conf.get(TYPE) + if catalog_impl := properties.get(PY_CATALOG_IMPL): + if provided_catalog_type: + raise ValueError( + f"Must not set both catalog type and py-catalog-impl configuration, " + f"but found type {provided_catalog_type} and impl {catalog_impl}" + ) + if catalog := _import_catalog(name, catalog_impl, properties): logger.info("Loaded Catalog: %s", catalog_impl) return catalog else: raise ValueError(f"Could not initialize Catalog: {catalog_impl}") - catalog_type: Optional[CatalogType] - provided_catalog_type = conf.get(TYPE) - catalog_type = None if provided_catalog_type and isinstance(provided_catalog_type, str): catalog_type = CatalogType[provided_catalog_type.upper()] diff --git a/tests/catalog/test_base.py b/tests/catalog/test_base.py index 52a6a302e7..4928d31e0d 100644 --- a/tests/catalog/test_base.py +++ b/tests/catalog/test_base.py @@ -32,7 +32,7 @@ from pydantic_core import ValidationError from pytest_lazyfixture import lazy_fixture -from pyiceberg.catalog import PY_CATALOG_IMPL, Catalog, MetastoreCatalog, PropertiesUpdateSummary, load_catalog +from pyiceberg.catalog import Catalog, MetastoreCatalog, PropertiesUpdateSummary, load_catalog from pyiceberg.exceptions import ( NamespaceAlreadyExistsError, NamespaceNotEmptyError, @@ -293,18 +293,28 @@ def given_catalog_has_a_table( def test_load_catalog_impl_not_full_path() -> None: with pytest.raises(ValueError) as exc_info: - load_catalog("catalog", **{PY_CATALOG_IMPL: "CustomCatalog"}) + load_catalog("catalog", **{"py-catalog-impl": "CustomCatalog"}) assert "py-catalog-impl should be full path (module.CustomCatalog), got: CustomCatalog" in str(exc_info.value) def test_load_catalog_impl_does_not_exist() -> None: with pytest.raises(ValueError) as exc_info: - load_catalog("catalog", **{PY_CATALOG_IMPL: "pyiceberg.does.not.exist.Catalog"}) + load_catalog("catalog", **{"py-catalog-impl": "pyiceberg.does.not.exist.Catalog"}) assert "Could not initialize Catalog: pyiceberg.does.not.exist.Catalog" in str(exc_info.value) +def test_load_catalog_has_type_and_impl() -> None: + with pytest.raises(ValueError) as exc_info: + load_catalog("catalog", **{"py-catalog-impl": "pyiceberg.does.not.exist.Catalog", "type": "sql"}) + + assert ( + "Must not set both catalog type and py-catalog-impl configuration, " + "but found type sql and impl pyiceberg.does.not.exist.Catalog" in str(exc_info.value) + ) + + def test_namespace_from_tuple() -> None: # Given identifier = ("com", "organization", "department", "my_table") diff --git a/tests/catalog/test_sql.py b/tests/catalog/test_sql.py index 82b6b0a48d..7b48b08a53 100644 --- a/tests/catalog/test_sql.py +++ b/tests/catalog/test_sql.py @@ -215,9 +215,11 @@ def test_creation_from_impl(catalog_name: str, warehouse: Path) -> None: assert isinstance( load_catalog( catalog_name, - py_catalog_impl="pyiceberg.catalog.sql.SqlCatalog", - uri=f"sqlite:////{warehouse}/sql-catalog.db", - warehouse=f"file://{warehouse}", + **{ + "py-catalog-impl": "pyiceberg.catalog.sql.SqlCatalog", + "uri": f"sqlite:////{warehouse}/sql-catalog.db", + "warehouse": f"file://{warehouse}", + }, ), SqlCatalog, ) From 927c0ed8481c3e8d2c7eecd3cbf7c901b3b822b0 Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Sat, 20 Jul 2024 12:05:01 -0700 Subject: [PATCH 3/5] fix lint --- mkdocs/docs/configuration.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mkdocs/docs/configuration.md b/mkdocs/docs/configuration.md index afe5674c1b..ff3741656a 100644 --- a/mkdocs/docs/configuration.md +++ b/mkdocs/docs/configuration.md @@ -143,7 +143,7 @@ PyIceberg currently has native catalog type support for REST, SQL, Hive, Glue an Alternatively, you can also directly set the catalog implementation: | Key | Example | Description | -| --------------- | ---------------------------- |--------------------------------------------------------------------------------------------------| +| --------------- | ---------------------------- | ------------------------------------------------------------------------------------------------ | | type | rest | Type of catalog, one of `rest`, `sql`, `hive`, `glue`, `dymamodb`. Default to `rest` | | py-catalog-impl | mypackage.mymodule.MyCatalog | Sets the catalog explicitly to an implementation, and will fail explicitly if it can't be loaded | From 6b7df05e40294099af8cff06dc4da2386f32dfc8 Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Sat, 20 Jul 2024 12:06:28 -0700 Subject: [PATCH 4/5] fix typo --- pyiceberg/catalog/__init__.py | 4 ++-- tests/catalog/test_base.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pyiceberg/catalog/__init__.py b/pyiceberg/catalog/__init__.py index a423e20fad..c6496fba90 100644 --- a/pyiceberg/catalog/__init__.py +++ b/pyiceberg/catalog/__init__.py @@ -238,8 +238,8 @@ def load_catalog(name: Optional[str] = None, **properties: Optional[str]) -> Cat if catalog_impl := properties.get(PY_CATALOG_IMPL): if provided_catalog_type: raise ValueError( - f"Must not set both catalog type and py-catalog-impl configuration, " - f"but found type {provided_catalog_type} and impl {catalog_impl}" + f"Must not set both catalog type and py-catalog-impl configurations, " + f"but found type {provided_catalog_type} and py-catalog-impl {catalog_impl}" ) if catalog := _import_catalog(name, catalog_impl, properties): diff --git a/tests/catalog/test_base.py b/tests/catalog/test_base.py index 4928d31e0d..2f53a6c27c 100644 --- a/tests/catalog/test_base.py +++ b/tests/catalog/test_base.py @@ -310,8 +310,8 @@ def test_load_catalog_has_type_and_impl() -> None: load_catalog("catalog", **{"py-catalog-impl": "pyiceberg.does.not.exist.Catalog", "type": "sql"}) assert ( - "Must not set both catalog type and py-catalog-impl configuration, " - "but found type sql and impl pyiceberg.does.not.exist.Catalog" in str(exc_info.value) + "Must not set both catalog type and py-catalog-impl configurations, " + "but found type sql and py-catalog-impl pyiceberg.does.not.exist.Catalog" in str(exc_info.value) ) From ea9acad8e41bfc55747b31e97c4b1218e9df5f81 Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Sat, 20 Jul 2024 12:07:35 -0700 Subject: [PATCH 5/5] fix typo 2 --- pyiceberg/catalog/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyiceberg/catalog/__init__.py b/pyiceberg/catalog/__init__.py index c6496fba90..35f0a5b164 100644 --- a/pyiceberg/catalog/__init__.py +++ b/pyiceberg/catalog/__init__.py @@ -238,7 +238,7 @@ def load_catalog(name: Optional[str] = None, **properties: Optional[str]) -> Cat if catalog_impl := properties.get(PY_CATALOG_IMPL): if provided_catalog_type: raise ValueError( - f"Must not set both catalog type and py-catalog-impl configurations, " + "Must not set both catalog type and py-catalog-impl configurations, " f"but found type {provided_catalog_type} and py-catalog-impl {catalog_impl}" )