From 072f39214be814be6e6258ba2128627487c871ac Mon Sep 17 00:00:00 2001 From: Yothin Muangsommuk Date: Mon, 27 May 2024 10:33:51 +0700 Subject: [PATCH 1/3] Implement Kerberos authentication support for Hive Catalog --- mkdocs/docs/configuration.md | 20 +++++++-------- mkdocs/docs/index.md | 33 ++++++++++++------------ poetry.lock | 49 +++++++++++++++++++++++++++++++++--- pyiceberg/catalog/hive.py | 21 +++++++++++++--- pyproject.toml | 3 +++ 5 files changed, 92 insertions(+), 34 deletions(-) diff --git a/mkdocs/docs/configuration.md b/mkdocs/docs/configuration.md index dc67b79044..7e8d1e24b7 100644 --- a/mkdocs/docs/configuration.md +++ b/mkdocs/docs/configuration.md @@ -268,19 +268,19 @@ catalog: catalog: default: uri: thrift://localhost:9083 - s3.endpoint: http://localhost:9000 - s3.access-key-id: admin - s3.secret-access-key: password + hive: + hive2-compatible: true + use-kerberos: true ``` -When using Hive 2.x, make sure to set the compatibility flag: + -```yaml -catalog: - default: -... - hive.hive2-compatible: true -``` +| Key | Example | Description | +| --------------------- | ------- | --------------------------------- | +| hive.hive2-compatible | true | Using Hive 2.x compatibility mode | +| hive.use-kerberos | true | Using authentication via Kerberos | + + ### Glue Catalog diff --git a/mkdocs/docs/index.md b/mkdocs/docs/index.md index 1fee9cc69b..61bffe442c 100644 --- a/mkdocs/docs/index.md +++ b/mkdocs/docs/index.md @@ -40,22 +40,23 @@ pip install "pyiceberg[s3fs,hive]" You can mix and match optional dependencies depending on your needs: -| Key | Description: | -| ------------ | -------------------------------------------------------------------- | -| hive | Support for the Hive metastore | -| glue | Support for AWS Glue | -| dynamodb | Support for AWS DynamoDB | -| sql-postgres | Support for SQL Catalog backed by Postgresql | -| sql-sqlite | Support for SQL Catalog backed by SQLite | -| pyarrow | PyArrow as a FileIO implementation to interact with the object store | -| pandas | Installs both PyArrow and Pandas | -| duckdb | Installs both PyArrow and DuckDB | -| ray | Installs PyArrow, Pandas, and Ray | -| daft | Installs Daft | -| s3fs | S3FS as a FileIO implementation to interact with the object store | -| adlfs | ADLFS as a FileIO implementation to interact with the object store | -| snappy | Support for snappy Avro compression | -| gcsfs | GCSFS as a FileIO implementation to interact with the object store | +| Key | Description: | +| ------------- | -------------------------------------------------------------------- | +| hive | Support for the Hive metastore | +| hive-kerberos | Support for Hive metastore in Kerberos environment | +| glue | Support for AWS Glue | +| dynamodb | Support for AWS DynamoDB | +| sql-postgres | Support for SQL Catalog backed by Postgresql | +| sql-sqlite | Support for SQL Catalog backed by SQLite | +| pyarrow | PyArrow as a FileIO implementation to interact with the object store | +| pandas | Installs both PyArrow and Pandas | +| duckdb | Installs both PyArrow and DuckDB | +| ray | Installs PyArrow, Pandas, and Ray | +| daft | Installs Daft | +| s3fs | S3FS as a FileIO implementation to interact with the object store | +| adlfs | ADLFS as a FileIO implementation to interact with the object store | +| snappy | Support for snappy Avro compression | +| gcsfs | GCSFS as a FileIO implementation to interact with the object store | You either need to install `s3fs`, `adlfs`, `gcsfs`, or `pyarrow` to be able to fetch files from an object store. diff --git a/poetry.lock b/poetry.lock index e2119764f3..0b279bfde0 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1989,6 +1989,19 @@ files = [ importlib-resources = {version = ">=1.4.0", markers = "python_version < \"3.9\""} referencing = ">=0.31.0" +[[package]] +name = "kerberos" +version = "1.3.1" +description = "Kerberos high-level interface" +optional = true +python-versions = "*" +files = [ + {file = "kerberos-1.3.1-cp27-cp27m-macosx_11_1_x86_64.whl", hash = "sha256:98a695c072efef535cb2b5f98e474d00671588859a94ec96c2c1508a113ff3aa"}, + {file = "kerberos-1.3.1-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:2e03c6a9d201d4aab5f899bfb8150de15335955bfce8ca43bfe9a41d7aae54dc"}, + {file = "kerberos-1.3.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2002b3b1541fc51e2c081ee7048f55e5d9ca63dd09f0d7b951c263920db3a0bb"}, + {file = "kerberos-1.3.1.tar.gz", hash = "sha256:cdd046142a4e0060f96a00eb13d82a5d9ebc0f2d7934393ed559bac773460a2c"}, +] + [[package]] name = "lazy-object-proxy" version = "1.10.0" @@ -2884,7 +2897,6 @@ files = [ {file = "psycopg2_binary-2.9.9-cp311-cp311-win32.whl", hash = "sha256:dc4926288b2a3e9fd7b50dc6a1909a13bbdadfc67d93f3374d984e56f885579d"}, {file = "psycopg2_binary-2.9.9-cp311-cp311-win_amd64.whl", hash = "sha256:b76bedd166805480ab069612119ea636f5ab8f8771e640ae103e05a4aae3e417"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:8532fd6e6e2dc57bcb3bc90b079c60de896d2128c5d9d6f24a63875a95a088cf"}, - {file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b0605eaed3eb239e87df0d5e3c6489daae3f7388d455d0c0b4df899519c6a38d"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f8544b092a29a6ddd72f3556a9fcf249ec412e10ad28be6a0c0d948924f2212"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2d423c8d8a3c82d08fe8af900ad5b613ce3632a1249fd6a223941d0735fce493"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2e5afae772c00980525f6d6ecf7cbca55676296b580c0e6abb407f15f3706996"}, @@ -2893,8 +2905,6 @@ files = [ {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:cb16c65dcb648d0a43a2521f2f0a2300f40639f6f8c1ecbc662141e4e3e1ee07"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:911dda9c487075abd54e644ccdf5e5c16773470a6a5d3826fda76699410066fb"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:57fede879f08d23c85140a360c6a77709113efd1c993923c59fde17aa27599fe"}, - {file = "psycopg2_binary-2.9.9-cp312-cp312-win32.whl", hash = "sha256:64cf30263844fa208851ebb13b0732ce674d8ec6a0c86a4e160495d299ba3c93"}, - {file = "psycopg2_binary-2.9.9-cp312-cp312-win_amd64.whl", hash = "sha256:81ff62668af011f9a48787564ab7eded4e9fb17a4a6a74af5ffa6a457400d2ab"}, {file = "psycopg2_binary-2.9.9-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:2293b001e319ab0d869d660a704942c9e2cce19745262a8aba2115ef41a0a42a"}, {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:03ef7df18daf2c4c07e2695e8cfd5ee7f748a1d54d802330985a78d2a5a6dca9"}, {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a602ea5aff39bb9fac6308e9c9d82b9a35c2bf288e184a816002c9fae930b77"}, @@ -2932,6 +2942,20 @@ files = [ {file = "psycopg2_binary-2.9.9-cp39-cp39-win_amd64.whl", hash = "sha256:f7ae5d65ccfbebdfa761585228eb4d0df3a8b15cfb53bd953e713e09fbb12957"}, ] +[[package]] +name = "pure-sasl" +version = "0.6.2" +description = "Pure Python client SASL implementation" +optional = true +python-versions = "*" +files = [ + {file = "pure-sasl-0.6.2.tar.gz", hash = "sha256:53c1355f5da95e2b85b2cc9a6af435518edc20c81193faa0eea65fdc835138f4"}, + {file = "pure_sasl-0.6.2-py2-none-any.whl", hash = "sha256:edb33b1a46eb3c602c0166de0442c0fb41f5ac2bfccbde4775183b105ad89ab2"}, +] + +[package.extras] +gssapi = ["kerberos (>=1.3.0)"] + [[package]] name = "py-partiql-parser" version = "0.5.5" @@ -4113,6 +4137,22 @@ all = ["tornado (>=4.0)", "twisted"] tornado = ["tornado (>=4.0)"] twisted = ["twisted"] +[[package]] +name = "thrift-sasl" +version = "0.4.3" +description = "Thrift SASL Python module that implements SASL transports for Thrift (`TSaslClientTransport`)." +optional = true +python-versions = "*" +files = [ + {file = "thrift_sasl-0.4.3-py2.py3-none-any.whl", hash = "sha256:d24b49140115e6e2a96d08335cff225a27a28ea71866fb1b2bdb30ca5afca64e"}, + {file = "thrift_sasl-0.4.3.tar.gz", hash = "sha256:5bdd5b760d90a13d9b3abfce873db0425861aa8d6bf25912d3cc0467a4f773da"}, +] + +[package.dependencies] +pure-sasl = ">=0.6.2" +six = ">=1.13.0" +thrift = {version = ">=0.10.0", markers = "python_version >= \"3.0\""} + [[package]] name = "tomli" version = "2.0.1" @@ -4564,6 +4604,7 @@ dynamodb = ["boto3"] gcsfs = ["gcsfs"] glue = ["boto3", "mypy-boto3-glue"] hive = ["thrift"] +hive-kerberos = ["kerberos", "thrift", "thrift-sasl"] pandas = ["numpy", "pandas", "pyarrow"] pyarrow = ["numpy", "pyarrow"] ray = ["numpy", "pandas", "pyarrow", "ray", "ray"] @@ -4576,4 +4617,4 @@ zstandard = ["zstandard"] [metadata] lock-version = "2.0" python-versions = ">=3.8, <3.12, !=3.9.7" -content-hash = "4cdc5aec21e4d18b1934406a3a8857555ba6b0d723681216423c74049630fd96" +content-hash = "0046c8fa470dc5ad9ba4f8917421596adc237fde7a6eb046755988e5a10c9ca8" diff --git a/pyiceberg/catalog/hive.py b/pyiceberg/catalog/hive.py index 755cd34c80..b2e1f77d09 100644 --- a/pyiceberg/catalog/hive.py +++ b/pyiceberg/catalog/hive.py @@ -121,6 +121,9 @@ HIVE2_COMPATIBLE = "hive.hive2-compatible" HIVE2_COMPATIBLE_DEFAULT = False +HIVE_KERBEROS_AUTH = "hive.use-kerberos" +HIVE_KERBEROS_AUTH_DEFAULT = False + LOCK_CHECK_MIN_WAIT_TIME = "lock-check-min-wait-time" LOCK_CHECK_MAX_WAIT_TIME = "lock-check-max-wait-time" LOCK_CHECK_RETRIES = "lock-check-retries" @@ -138,11 +141,17 @@ class _HiveClient: _client: Client _ugi: Optional[List[str]] - def __init__(self, uri: str, ugi: Optional[str] = None): + def __init__(self, uri: str, ugi: Optional[str] = None, use_kerberos: Optional[bool] = HIVE_KERBEROS_AUTH_DEFAULT): url_parts = urlparse(uri) + transport = TSocket.TSocket(url_parts.hostname, url_parts.port) - self._transport = TTransport.TBufferedTransport(transport) - protocol = TBinaryProtocol.TBinaryProtocol(transport) + + if not use_kerberos: + self._transport = TTransport.TBufferedTransport(transport) + else: + self._transport = TTransport.TSaslClientTransport(transport, host=url_parts.hostname, service="hive") + + protocol = TBinaryProtocol.TBinaryProtocol(self._transport) self._client = Client(protocol) self._ugi = ugi.split(":") if ugi else None @@ -257,7 +266,11 @@ class HiveCatalog(MetastoreCatalog): def __init__(self, name: str, **properties: str): super().__init__(name, **properties) - self._client = _HiveClient(properties["uri"], properties.get("ugi")) + self._client = _HiveClient( + properties["uri"], + properties.get("ugi"), + PropertyUtil.property_as_bool(properties, HIVE_KERBEROS_AUTH, HIVE_KERBEROS_AUTH_DEFAULT), + ) self._lock_check_min_wait_time = property_as_float(properties, LOCK_CHECK_MIN_WAIT_TIME, DEFAULT_LOCK_CHECK_MIN_WAIT_TIME) self._lock_check_max_wait_time = property_as_float(properties, LOCK_CHECK_MAX_WAIT_TIME, DEFAULT_LOCK_CHECK_MAX_WAIT_TIME) diff --git a/pyproject.toml b/pyproject.toml index 205d5921e6..352330147a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -79,6 +79,8 @@ psycopg2-binary = { version = ">=2.9.6", optional = true } sqlalchemy = { version = "^2.0.18", optional = true } getdaft = { version = ">=0.2.12", optional = true } numpy = { version = "^1.22.4", optional = true } +thrift-sasl = { version = ">=0.4.3", optional = true } +kerberos = { version = "1.3.1", optional = true } [tool.poetry.group.dev.dependencies] pytest = "7.4.4" @@ -587,6 +589,7 @@ ray = ["ray", "pyarrow", "pandas", "numpy"] daft = ["getdaft"] snappy = ["python-snappy"] hive = ["thrift"] +hive-kerberos = ["thrift", "thrift_sasl", "kerberos"] s3fs = ["s3fs"] glue = ["boto3", "mypy-boto3-glue"] adlfs = ["adlfs"] From 8db504fe453a1d451d01406ece28cd9b9316bb35 Mon Sep 17 00:00:00 2001 From: Yothin Muangsommuk Date: Tue, 11 Jun 2024 13:43:44 +0700 Subject: [PATCH 2/3] feat: Change hive kerberos auuthorization config key --- mkdocs/docs/configuration.md | 18 +++++++----------- pyiceberg/catalog/hive.py | 6 +++--- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/mkdocs/docs/configuration.md b/mkdocs/docs/configuration.md index 7e8d1e24b7..e6d7caac95 100644 --- a/mkdocs/docs/configuration.md +++ b/mkdocs/docs/configuration.md @@ -268,19 +268,15 @@ catalog: catalog: default: uri: thrift://localhost:9083 - hive: - hive2-compatible: true - use-kerberos: true + hive.hive2-compatible: true + hive.kerberos-authorization: true + s3.endpoint: http://localhost:9000 ``` - - -| Key | Example | Description | -| --------------------- | ------- | --------------------------------- | -| hive.hive2-compatible | true | Using Hive 2.x compatibility mode | -| hive.use-kerberos | true | Using authentication via Kerberos | - - +| Key | Example | Description | +| --------------------------- | ------- | --------------------------------- | +| hive.hive2-compatible | true | Using Hive 2.x compatibility mode | +| hive.kerberos-authorization | true | Using authentication via Kerberos | ### Glue Catalog diff --git a/pyiceberg/catalog/hive.py b/pyiceberg/catalog/hive.py index b2e1f77d09..bafe51d0b7 100644 --- a/pyiceberg/catalog/hive.py +++ b/pyiceberg/catalog/hive.py @@ -121,7 +121,7 @@ HIVE2_COMPATIBLE = "hive.hive2-compatible" HIVE2_COMPATIBLE_DEFAULT = False -HIVE_KERBEROS_AUTH = "hive.use-kerberos" +HIVE_KERBEROS_AUTH = "hive.kerberos-authorization" HIVE_KERBEROS_AUTH_DEFAULT = False LOCK_CHECK_MIN_WAIT_TIME = "lock-check-min-wait-time" @@ -141,12 +141,12 @@ class _HiveClient: _client: Client _ugi: Optional[List[str]] - def __init__(self, uri: str, ugi: Optional[str] = None, use_kerberos: Optional[bool] = HIVE_KERBEROS_AUTH_DEFAULT): + def __init__(self, uri: str, ugi: Optional[str] = None, kerberos_auth: Optional[bool] = HIVE_KERBEROS_AUTH_DEFAULT): url_parts = urlparse(uri) transport = TSocket.TSocket(url_parts.hostname, url_parts.port) - if not use_kerberos: + if not kerberos_auth: self._transport = TTransport.TBufferedTransport(transport) else: self._transport = TTransport.TSaslClientTransport(transport, host=url_parts.hostname, service="hive") From 322d143d69c52fcc67a256b028821b1da9fc30b6 Mon Sep 17 00:00:00 2001 From: Yothin Muangsommuk Date: Tue, 11 Jun 2024 15:37:12 +0700 Subject: [PATCH 3/3] fix: Handle negotiation failed on kerberos sasl connection --- pyiceberg/catalog/hive.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/pyiceberg/catalog/hive.py b/pyiceberg/catalog/hive.py index bafe51d0b7..c78fc2a0c2 100644 --- a/pyiceberg/catalog/hive.py +++ b/pyiceberg/catalog/hive.py @@ -142,22 +142,33 @@ class _HiveClient: _ugi: Optional[List[str]] def __init__(self, uri: str, ugi: Optional[str] = None, kerberos_auth: Optional[bool] = HIVE_KERBEROS_AUTH_DEFAULT): - url_parts = urlparse(uri) + self._uri = uri + self._kerberos_auth = kerberos_auth + self._ugi = ugi.split(":") if ugi else None + + self._init_thrift_client() + + def _init_thrift_client(self): + url_parts = urlparse(self._uri) - transport = TSocket.TSocket(url_parts.hostname, url_parts.port) + socket = TSocket.TSocket(url_parts.hostname, url_parts.port) - if not kerberos_auth: - self._transport = TTransport.TBufferedTransport(transport) + if not self._kerberos_auth: + self._transport = TTransport.TBufferedTransport(socket) else: - self._transport = TTransport.TSaslClientTransport(transport, host=url_parts.hostname, service="hive") + self._transport = TTransport.TSaslClientTransport(socket, host=url_parts.hostname, service="hive") protocol = TBinaryProtocol.TBinaryProtocol(self._transport) self._client = Client(protocol) - self._ugi = ugi.split(":") if ugi else None def __enter__(self) -> Client: - self._transport.open() + if not self._kerberos_auth: + self._transport.open() + else: + self._init_thrift_client() + self._transport.open() + if self._ugi: self._client.set_ugi(*self._ugi) return self._client