apache · yothinix · May 27, 2024 · Jun 11, 2024 · Jun 11, 2024 · kevinjqliu
diff --git a/mkdocs/docs/configuration.md b/mkdocs/docs/configuration.md
@@ -268,19 +268,15 @@ catalog:
 catalog:
   default:
     uri: thrift://localhost:9083
+    hive.hive2-compatible: true
+    hive.kerberos-authentication: true
     s3.endpoint: http://localhost:9000
-    s3.access-key-id: admin
-    s3.secret-access-key: password
 ```
 
-When using Hive 2.x, make sure to set the compatibility flag:
-
-```yaml
-catalog:
-  default:
-...
-    hive.hive2-compatible: true
-```
+| Key                          | Example | Description                       |
+| ---------------------------- | ------- | --------------------------------- |
+| hive.hive2-compatible        | true    | Using Hive 2.x compatibility mode |
+| hive.kerberos-authentication | true    | Using authentication via Kerberos |
 
 ### Glue Catalog
 

diff --git a/mkdocs/docs/index.md b/mkdocs/docs/index.md
@@ -40,22 +40,23 @@ pip install "pyiceberg[s3fs,hive]"
 
 You can mix and match optional dependencies depending on your needs:
 
-| Key          | Description:                                                         |
-| ------------ | -------------------------------------------------------------------- |
-| hive         | Support for the Hive metastore                                       |
-| glue         | Support for AWS Glue                                                 |
-| dynamodb     | Support for AWS DynamoDB                                             |
-| sql-postgres | Support for SQL Catalog backed by Postgresql                         |
-| sql-sqlite   | Support for SQL Catalog backed by SQLite                             |
-| pyarrow      | PyArrow as a FileIO implementation to interact with the object store |
-| pandas       | Installs both PyArrow and Pandas                                     |
-| duckdb       | Installs both PyArrow and DuckDB                                     |
-| ray          | Installs PyArrow, Pandas, and Ray                                    |
-| daft         | Installs Daft                                                        |
-| s3fs         | S3FS as a FileIO implementation to interact with the object store    |
-| adlfs        | ADLFS as a FileIO implementation to interact with the object store   |
-| snappy       | Support for snappy Avro compression                                  |
-| gcsfs        | GCSFS as a FileIO implementation to interact with the object store   |
+| Key           | Description:                                                         |
+| ------------- | -------------------------------------------------------------------- |
+| hive          | Support for the Hive metastore                                       |
+| hive-kerberos | Support for the Hive metastore in a Kerberos environment             |
+| glue          | Support for AWS Glue                                                 |
+| dynamodb      | Support for AWS DynamoDB                                             |
+| sql-postgres  | Support for SQL Catalog backed by Postgresql                         |
+| sql-sqlite    | Support for SQL Catalog backed by SQLite                             |
+| pyarrow       | PyArrow as a FileIO implementation to interact with the object store |
+| pandas        | Installs both PyArrow and Pandas                                     |
+| duckdb        | Installs both PyArrow and DuckDB                                     |
+| ray           | Installs PyArrow, Pandas, and Ray                                    |
+| daft          | Installs Daft                                                        |
+| s3fs          | S3FS as a FileIO implementation to interact with the object store    |
+| adlfs         | ADLFS as a FileIO implementation to interact with the object store   |
+| snappy        | Support for snappy Avro compression                                  |
+| gcsfs         | GCSFS as a FileIO implementation to interact with the object store   |
 
 You either need to install `s3fs`, `adlfs`, `gcsfs`, or `pyarrow` to be able to fetch files from an object store.
 

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyiceberg/catalog/hive.py b/pyiceberg/catalog/hive.py
@@ -121,6 +121,9 @@
 HIVE2_COMPATIBLE = "hive.hive2-compatible"
 HIVE2_COMPATIBLE_DEFAULT = False
 
+HIVE_KERBEROS_AUTH = "hive.kerberos-authentication"
+HIVE_KERBEROS_AUTH_DEFAULT = False
+
 LOCK_CHECK_MIN_WAIT_TIME = "lock-check-min-wait-time"
 LOCK_CHECK_MAX_WAIT_TIME = "lock-check-max-wait-time"
 LOCK_CHECK_RETRIES = "lock-check-retries"
@@ -138,17 +141,34 @@ class _HiveClient:
     _client: Client
     _ugi: Optional[List[str]]
 
-    def __init__(self, uri: str, ugi: Optional[str] = None):
-        url_parts = urlparse(uri)
-        transport = TSocket.TSocket(url_parts.hostname, url_parts.port)
-        self._transport = TTransport.TBufferedTransport(transport)
-        protocol = TBinaryProtocol.TBinaryProtocol(transport)
+    def __init__(self, uri: str, ugi: Optional[str] = None, kerberos_auth: Optional[bool] = HIVE_KERBEROS_AUTH_DEFAULT):
+        self._uri = uri
+        self._kerberos_auth = kerberos_auth
+        self._ugi = ugi.split(":") if ugi else None
+
+        self._init_thrift_client()
+
+    def _init_thrift_client(self):
+        url_parts = urlparse(self._uri)
+
+        socket = TSocket.TSocket(url_parts.hostname, url_parts.port)
+
+        if not self._kerberos_auth:
+            self._transport = TTransport.TBufferedTransport(socket)
+        else:
+            self._transport = TTransport.TSaslClientTransport(socket, host=url_parts.hostname, service="hive")
+
+        protocol = TBinaryProtocol.TBinaryProtocol(self._transport)
 
         self._client = Client(protocol)
-        self._ugi = ugi.split(":") if ugi else None
 
     def __enter__(self) -> Client:
-        self._transport.open()
+        if not self._kerberos_auth:
+            self._transport.open()
+        else:
+            self._init_thrift_client()
+            self._transport.open()
+
         if self._ugi:
             self._client.set_ugi(*self._ugi)
         return self._client
@@ -257,7 +277,11 @@ class HiveCatalog(MetastoreCatalog):
 
     def __init__(self, name: str, **properties: str):
         super().__init__(name, **properties)
-        self._client = _HiveClient(properties["uri"], properties.get("ugi"))
+        self._client = _HiveClient(
+            properties["uri"],
+            properties.get("ugi"),
+            PropertyUtil.property_as_bool(properties, HIVE_KERBEROS_AUTH, HIVE_KERBEROS_AUTH_DEFAULT),
+        )
 
         self._lock_check_min_wait_time = property_as_float(properties, LOCK_CHECK_MIN_WAIT_TIME, DEFAULT_LOCK_CHECK_MIN_WAIT_TIME)
         self._lock_check_max_wait_time = property_as_float(properties, LOCK_CHECK_MAX_WAIT_TIME, DEFAULT_LOCK_CHECK_MAX_WAIT_TIME)

diff --git a/pyproject.toml b/pyproject.toml
@@ -79,6 +79,8 @@ psycopg2-binary = { version = ">=2.9.6", optional = true }
 sqlalchemy = { version = "^2.0.18", optional = true }
 getdaft = { version = ">=0.2.12", optional = true }
 numpy = { version = "^1.22.4", optional = true }
+thrift-sasl = { version = ">=0.4.3", optional = true }
+kerberos = { version = "1.3.1", optional = true }
 
 [tool.poetry.group.dev.dependencies]
 pytest = "7.4.4"
@@ -587,6 +589,7 @@ ray = ["ray", "pyarrow", "pandas", "numpy"]
 daft = ["getdaft"]
 snappy = ["python-snappy"]
 hive = ["thrift"]
+hive-kerberos = ["thrift", "thrift_sasl", "kerberos"]
 s3fs = ["s3fs"]
 glue = ["boto3", "mypy-boto3-glue"]
 adlfs = ["adlfs"]