Skip to content

Commit 2cba3b7

Browse files
authored
fsspec: Support token in ADLS (#2331)
<!-- Thanks for opening a pull request! --> <!-- In the case this PR will resolve an issue, please replace ${GITHUB_ISSUE_ID} below with the actual Github issue id. --> <!-- Closes #${GITHUB_ISSUE_ID} --> # Rationale for this change Closes #2328 Pass `token` string as `credential` in `AzureBlobFileSystem`, https://fsspec.github.io/adlfs/api/ This is a known workaround as described by Azure/azure-sdk-for-python#9075 (comment) I've also made a feature request for the Java implementation apache/iceberg#13818 Note, [`pyarrow.fs.AzureFileSystem`](https://arrow.apache.org/docs/python/generated/pyarrow.fs.AzureFileSystem.html) does not currently expose credential as a parameter # Are these changes tested? Yes, manually, since Azurite does not integrate with Entra. Here's a repro script ``` from azure.identity import DefaultAzureCredential import pyarrow as pa from pyiceberg.catalog import load_catalog credential = DefaultAzureCredential() token = credential.get_token("https://storage.azure.com/.default").token warehouse = "abfss://kevinliu_demo_ws@daily-onelake.blob.fabric.microsoft.com/kevinliu_demo_lh.Lakehouse/Files" account_name = "daily-onelake" account_host = f"{account_name}.blob.fabric.microsoft.com" catalog = load_catalog("default", **{ "type": "in-memory", "warehouse": warehouse, "adls.account-name": account_name, "adls.account-host": account_host, "adls.token": token, }) catalog.create_namespace_if_not_exists("default") try: catalog.drop_table("default.test") except Exception: ... TEST_DATA = { "id": [1, 2, 3, 1, 1], "name": ["AB", "CD", "EF", "CD", "EF"], } arrow_table = pa.Table.from_pydict(TEST_DATA) tbl = catalog.create_table_if_not_exists("default.test", schema=arrow_table.schema) tbl.append(arrow_table) tbl.scan().to_arrow() ``` # Are there any user-facing changes? <!-- In the case of user-facing changes, please add the changelog label. -->
1 parent 8013545 commit 2cba3b7

File tree

3 files changed

+26
-1
lines changed

3 files changed

+26
-1
lines changed

mkdocs/docs/configuration.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,7 @@ For the FileIO there are several configuration options available:
162162
| adls.dfs-storage-authority | .dfs.core.windows.net | The hostname[:port] of the Data Lake Gen 2 Service. Defaults to `.dfs.core.windows.net`. Useful for connecting to a local emulator, like [azurite](https://github.com/azure/azurite). See [AzureFileSystem](https://arrow.apache.org/docs/python/filesystems.html#azure-storage-file-system) for reference |
163163
| adls.blob-storage-scheme | https | Either `http` or `https`. Defaults to `https`. Useful for connecting to a local emulator, like [azurite](https://github.com/azure/azurite). See [AzureFileSystem](https://arrow.apache.org/docs/python/filesystems.html#azure-storage-file-system) for reference |
164164
| adls.dfs-storage-scheme | https | Either `http` or `https`. Defaults to `https`. Useful for connecting to a local emulator, like [azurite](https://github.com/azure/azurite). See [AzureFileSystem](https://arrow.apache.org/docs/python/filesystems.html#azure-storage-file-system) for reference |
165+
| adls.token | eyJ0eXAiOiJKV1QiLCJhbGci... | Static access token for authenticating with ADLS. Used for OAuth2 flows. |
165166

166167
<!-- markdown-link-check-enable-->
167168

pyiceberg/io/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@
8787
ADLS_DFS_STORAGE_AUTHORITY = "adls.dfs-storage-authority"
8888
ADLS_BLOB_STORAGE_SCHEME = "adls.blob-storage-scheme"
8989
ADLS_DFS_STORAGE_SCHEME = "adls.dfs-storage-scheme"
90+
ADLS_TOKEN = "adls.token"
9091
GCS_TOKEN = "gcs.oauth2.token"
9192
GCS_TOKEN_EXPIRES_AT_MS = "gcs.oauth2.token-expires-at"
9293
GCS_PROJECT_ID = "gcs.project-id"

pyiceberg/io/fsspec.py

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
ADLS_CREDENTIAL,
4949
ADLS_SAS_TOKEN,
5050
ADLS_TENANT_ID,
51+
ADLS_TOKEN,
5152
AWS_ACCESS_KEY_ID,
5253
AWS_REGION,
5354
AWS_SECRET_ACCESS_KEY,
@@ -197,7 +198,11 @@ def _gs(properties: Properties) -> AbstractFileSystem:
197198

198199

199200
def _adls(properties: Properties) -> AbstractFileSystem:
201+
# https://fsspec.github.io/adlfs/api/
202+
200203
from adlfs import AzureBlobFileSystem
204+
from azure.core.credentials import AccessToken
205+
from azure.core.credentials_async import AsyncTokenCredential
201206

202207
for key, sas_token in {
203208
key.replace(f"{ADLS_SAS_TOKEN}.", ""): value for key, value in properties.items() if key.startswith(ADLS_SAS_TOKEN)
@@ -207,9 +212,27 @@ def _adls(properties: Properties) -> AbstractFileSystem:
207212
if ADLS_SAS_TOKEN not in properties:
208213
properties[ADLS_SAS_TOKEN] = sas_token
209214

215+
class StaticTokenCredential(AsyncTokenCredential):
216+
_DEFAULT_EXPIRY_SECONDS = 3600
217+
218+
def __init__(self, token_string: str) -> None:
219+
self._token = token_string
220+
221+
async def get_token(self, *scopes: str, **kwargs: Any) -> AccessToken:
222+
import time
223+
224+
# Set expiration 1 hour from now
225+
expires_on = int(time.time()) + self._DEFAULT_EXPIRY_SECONDS
226+
return AccessToken(self._token, expires_on)
227+
228+
if token := properties.get(ADLS_TOKEN):
229+
credential = StaticTokenCredential(token)
230+
else:
231+
credential = properties.get(ADLS_CREDENTIAL) # type: ignore
232+
210233
return AzureBlobFileSystem(
211234
connection_string=properties.get(ADLS_CONNECTION_STRING),
212-
credential=properties.get(ADLS_CREDENTIAL),
235+
credential=credential,
213236
account_name=properties.get(ADLS_ACCOUNT_NAME),
214237
account_key=properties.get(ADLS_ACCOUNT_KEY),
215238
sas_token=properties.get(ADLS_SAS_TOKEN),

0 commit comments

Comments
 (0)