From 4ef5a97613426b093a2d27ec183e8547a5c94760 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sun, 10 Aug 2025 13:58:24 -0700 Subject: [PATCH 1/5] pass down args --- pyiceberg/io/pyarrow.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index ab85893ab4..2d36ad2fd8 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -87,9 +87,12 @@ ADLS_ACCOUNT_NAME, ADLS_BLOB_STORAGE_AUTHORITY, ADLS_BLOB_STORAGE_SCHEME, + ADLS_CLIENT_ID, + ADLS_CLIENT_SECRET, ADLS_DFS_STORAGE_AUTHORITY, ADLS_DFS_STORAGE_SCHEME, ADLS_SAS_TOKEN, + ADLS_TENANT_ID, AWS_ACCESS_KEY_ID, AWS_REGION, AWS_ROLE_ARN, @@ -535,6 +538,13 @@ def _initialize_azure_fs(self) -> FileSystem: if sas_token := self.properties.get(ADLS_SAS_TOKEN): client_kwargs["sas_token"] = sas_token + if client_id := self.properties.get(ADLS_CLIENT_ID): + client_kwargs["client_id"] = client_id + if client_secret := self.properties.get(ADLS_CLIENT_SECRET): + client_kwargs["client_secret"] = client_secret + if tenant_id := self.properties.get(ADLS_TENANT_ID): + client_kwargs["tenant_id"] = tenant_id + return AzureFileSystem(**client_kwargs) def _initialize_hdfs_fs(self, scheme: str, netloc: Optional[str]) -> FileSystem: From 21f762ae46d1782271e9315ae0bd7f93db972895 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sun, 10 Aug 2025 14:00:09 -0700 Subject: [PATCH 2/5] add link --- pyiceberg/io/pyarrow.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index 2d36ad2fd8..104f362f84 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -504,6 +504,7 @@ def _initialize_s3_fs(self, netloc: Optional[str]) -> FileSystem: return S3FileSystem(**client_kwargs) def _initialize_azure_fs(self) -> FileSystem: + # https://arrow.apache.org/docs/python/generated/pyarrow.fs.AzureFileSystem.html from packaging import version MIN_PYARROW_VERSION_SUPPORTING_AZURE_FS = "20.0.0" From 7c89a9bd5f0fa7fc8e783dd6d5193dad78d882b7 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sun, 10 Aug 2025 14:11:08 -0700 Subject: [PATCH 3/5] add docs --- mkdocs/docs/configuration.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mkdocs/docs/configuration.md b/mkdocs/docs/configuration.md index 40cfc0b8c9..59dbd2f853 100644 --- a/mkdocs/docs/configuration.md +++ b/mkdocs/docs/configuration.md @@ -153,14 +153,14 @@ For the FileIO there are several configuration options available: | adls.account-name | devstoreaccount1 | The account that you want to connect to | | adls.account-key | Eby8vdM02xNOcqF... | The key to authentication against the account. | | adls.sas-token | NuHOuuzdQN7VRM%2FOpOeqBlawRCA845IY05h9eu1Yte4%3D | The shared access signature | -| adls.tenant-id | ad667be4-b811-11ed-afa1-0242ac120002 | The tenant-id | -| adls.client-id | ad667be4-b811-11ed-afa1-0242ac120002 | The client-id | -| adls.client-secret | oCA3R6P\*ka#oa1Sms2J74z... | The client-secret | | adls.account-host | accountname1.blob.core.windows.net | The storage account host. See [AzureBlobFileSystem](https://github.com/fsspec/adlfs/blob/adb9c53b74a0d420625b86dd00fbe615b43201d2/adlfs/spec.py#L125) for reference | | adls.blob-storage-authority | .blob.core.windows.net | The hostname[:port] of the Blob Service. Defaults to `.blob.core.windows.net`. Useful for connecting to a local emulator, like [azurite](https://github.com/azure/azurite). See [AzureFileSystem](https://arrow.apache.org/docs/python/filesystems.html#azure-storage-file-system) for reference | | adls.dfs-storage-authority | .dfs.core.windows.net | The hostname[:port] of the Data Lake Gen 2 Service. Defaults to `.dfs.core.windows.net`. Useful for connecting to a local emulator, like [azurite](https://github.com/azure/azurite). See [AzureFileSystem](https://arrow.apache.org/docs/python/filesystems.html#azure-storage-file-system) for reference | | adls.blob-storage-scheme | https | Either `http` or `https`. Defaults to `https`. Useful for connecting to a local emulator, like [azurite](https://github.com/azure/azurite). See [AzureFileSystem](https://arrow.apache.org/docs/python/filesystems.html#azure-storage-file-system) for reference | | adls.dfs-storage-scheme | https | Either `http` or `https`. Defaults to `https`. Useful for connecting to a local emulator, like [azurite](https://github.com/azure/azurite). See [AzureFileSystem](https://arrow.apache.org/docs/python/filesystems.html#azure-storage-file-system) for reference | +| adls.tenant-id | ad667be4-b811-11ed-afa1-0242ac120002 | Tenant ID for Azure Active Directory authentication. | +| adls.client-id | ad667be4-b811-11ed-afa1-0242ac120002 | The client ID (Application ID) for Azure Active Directory authentication. It is the Application (client) ID of your registered Azure AD application (Service Principal). | +| adls.client-secret | oCA3R6P\*ka#oa1Sms2J74z... | Client secret for Azure Active Directory authentication. | From ad28d25f48f5cf3f15341d155895381561aba943 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sun, 10 Aug 2025 14:12:01 -0700 Subject: [PATCH 4/5] Revert "add docs" This reverts commit 7c89a9bd5f0fa7fc8e783dd6d5193dad78d882b7. --- mkdocs/docs/configuration.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mkdocs/docs/configuration.md b/mkdocs/docs/configuration.md index 59dbd2f853..40cfc0b8c9 100644 --- a/mkdocs/docs/configuration.md +++ b/mkdocs/docs/configuration.md @@ -153,14 +153,14 @@ For the FileIO there are several configuration options available: | adls.account-name | devstoreaccount1 | The account that you want to connect to | | adls.account-key | Eby8vdM02xNOcqF... | The key to authentication against the account. | | adls.sas-token | NuHOuuzdQN7VRM%2FOpOeqBlawRCA845IY05h9eu1Yte4%3D | The shared access signature | +| adls.tenant-id | ad667be4-b811-11ed-afa1-0242ac120002 | The tenant-id | +| adls.client-id | ad667be4-b811-11ed-afa1-0242ac120002 | The client-id | +| adls.client-secret | oCA3R6P\*ka#oa1Sms2J74z... | The client-secret | | adls.account-host | accountname1.blob.core.windows.net | The storage account host. See [AzureBlobFileSystem](https://github.com/fsspec/adlfs/blob/adb9c53b74a0d420625b86dd00fbe615b43201d2/adlfs/spec.py#L125) for reference | | adls.blob-storage-authority | .blob.core.windows.net | The hostname[:port] of the Blob Service. Defaults to `.blob.core.windows.net`. Useful for connecting to a local emulator, like [azurite](https://github.com/azure/azurite). See [AzureFileSystem](https://arrow.apache.org/docs/python/filesystems.html#azure-storage-file-system) for reference | | adls.dfs-storage-authority | .dfs.core.windows.net | The hostname[:port] of the Data Lake Gen 2 Service. Defaults to `.dfs.core.windows.net`. Useful for connecting to a local emulator, like [azurite](https://github.com/azure/azurite). See [AzureFileSystem](https://arrow.apache.org/docs/python/filesystems.html#azure-storage-file-system) for reference | | adls.blob-storage-scheme | https | Either `http` or `https`. Defaults to `https`. Useful for connecting to a local emulator, like [azurite](https://github.com/azure/azurite). See [AzureFileSystem](https://arrow.apache.org/docs/python/filesystems.html#azure-storage-file-system) for reference | | adls.dfs-storage-scheme | https | Either `http` or `https`. Defaults to `https`. Useful for connecting to a local emulator, like [azurite](https://github.com/azure/azurite). See [AzureFileSystem](https://arrow.apache.org/docs/python/filesystems.html#azure-storage-file-system) for reference | -| adls.tenant-id | ad667be4-b811-11ed-afa1-0242ac120002 | Tenant ID for Azure Active Directory authentication. | -| adls.client-id | ad667be4-b811-11ed-afa1-0242ac120002 | The client ID (Application ID) for Azure Active Directory authentication. It is the Application (client) ID of your registered Azure AD application (Service Principal). | -| adls.client-secret | oCA3R6P\*ka#oa1Sms2J74z... | Client secret for Azure Active Directory authentication. | From 42c30a4a6744befa43bfa83d638a282538fe0440 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sun, 10 Aug 2025 17:18:05 -0700 Subject: [PATCH 5/5] validate that all three are provided together --- pyiceberg/io/pyarrow.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index 104f362f84..c756487c32 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -546,6 +546,17 @@ def _initialize_azure_fs(self) -> FileSystem: if tenant_id := self.properties.get(ADLS_TENANT_ID): client_kwargs["tenant_id"] = tenant_id + # Validate that all three are provided together for ClientSecretCredential + credential_keys = ["client_id", "client_secret", "tenant_id"] + provided_keys = [key for key in credential_keys if key in client_kwargs] + if provided_keys and len(provided_keys) != len(credential_keys): + missing_keys = [key for key in credential_keys if key not in client_kwargs] + raise ValueError( + f"client_id, client_secret, and tenant_id must all be provided together " + f"to use ClientSecretCredential for Azure authentication. " + f"Provided: {provided_keys}, Missing: {missing_keys}" + ) + return AzureFileSystem(**client_kwargs) def _initialize_hdfs_fs(self, scheme: str, netloc: Optional[str]) -> FileSystem: