From 8931d90bd26c32b291351fa4d0915ccdfccf0cdf Mon Sep 17 00:00:00 2001
From: Fokko
Date: Sat, 22 Feb 2025 21:20:25 +0100
Subject: [PATCH 1/3] Arrow: Suppress warning and cache bucket location

Attempt to remove the unnecessary warning, and cache the location of the
bucket independently of the FileIO.

Fixes #1705
Fixes #1708
---
 pyiceberg/io/pyarrow.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py
index c0d078abc7..cfc5126406 100644
--- a/pyiceberg/io/pyarrow.py
+++ b/pyiceberg/io/pyarrow.py
@@ -194,6 +194,13 @@
 T = TypeVar("T")


+@lru_cache
+def _cached_resolve_s3_region(bucket: str) -> str:
+    from pyarrow.fs import resolve_s3_region
+
+    return resolve_s3_region(bucket=bucket)
+
+
 class UnsupportedPyArrowTypeException(Exception):
     """Cannot convert PyArrow type to corresponding Iceberg type."""

@@ -414,19 +421,19 @@ def _initialize_oss_fs(self) -> FileSystem:
         return S3FileSystem(**client_kwargs)

     def _initialize_s3_fs(self, netloc: Optional[str]) -> FileSystem:
-        from pyarrow.fs import S3FileSystem, resolve_s3_region
+        from pyarrow.fs import S3FileSystem

         # Resolve region from netloc(bucket), fallback to user-provided region
         provided_region = get_first_property_value(self.properties, S3_REGION, AWS_REGION)

         try:
-            bucket_region = resolve_s3_region(bucket=netloc)
+            bucket_region = _cached_resolve_s3_region(bucket=netloc)
         except (OSError, TypeError):
             bucket_region = None
             logger.warning(f"Unable to resolve region for bucket {netloc}, using default region {provided_region}")

         bucket_region = bucket_region or provided_region
-        if bucket_region != provided_region:
+        if provided_region is not None and bucket_region != provided_region:
             logger.warning(
                 f"PyArrow FileIO overriding S3 bucket region for bucket {netloc}: "
                 f"provided region {provided_region}, actual region {bucket_region}"

From 1e3e360c2856cb723cdbc88f954087f52400da05 Mon Sep 17 00:00:00 2001
From: Fokko
Date: Sun, 23 Feb 2025 21:58:42 +0100
Subject: [PATCH 2/3] Thanks Hussein

---
 pyiceberg/io/pyarrow.py  | 16 +++++++---------
 tests/io/test_pyarrow.py |  2 +-
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py
index cfc5126406..765dbc7358 100644
--- a/pyiceberg/io/pyarrow.py
+++ b/pyiceberg/io/pyarrow.py
@@ -195,10 +195,14 @@


 @lru_cache
-def _cached_resolve_s3_region(bucket: str) -> str:
+def _cached_resolve_s3_region(bucket: str) -> Optional[str]:
     from pyarrow.fs import resolve_s3_region

-    return resolve_s3_region(bucket=bucket)
+    try:
+        return resolve_s3_region(bucket=bucket)
+    except (OSError, TypeError):
+        logger.warning(f"Unable to resolve region for bucket {bucket}")
+        return None


 class UnsupportedPyArrowTypeException(Exception):
@@ -425,14 +429,8 @@ def _initialize_s3_fs(self, netloc: Optional[str]) -> FileSystem:

         # Resolve region from netloc(bucket), fallback to user-provided region
         provided_region = get_first_property_value(self.properties, S3_REGION, AWS_REGION)
+        bucket_region = _cached_resolve_s3_region(bucket=netloc) or provided_region

-        try:
-            bucket_region = _cached_resolve_s3_region(bucket=netloc)
-        except (OSError, TypeError):
-            bucket_region = None
-            logger.warning(f"Unable to resolve region for bucket {netloc}, using default region {provided_region}")
-
-        bucket_region = bucket_region or provided_region
         if provided_region is not None and bucket_region != provided_region:
             logger.warning(
                 f"PyArrow FileIO overriding S3 bucket region for bucket {netloc}: "
                 f"provided region {provided_region}, actual region {bucket_region}"
             )
diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py
index e2be7872a9..cf4ed67920 100644
--- a/tests/io/test_pyarrow.py
+++ b/tests/io/test_pyarrow.py
@@ -2292,7 +2292,7 @@ def _s3_region_map(bucket: str) -> str:
     # The region is set to provided region if bucket region cannot be resolved
     with caplog.at_level(logging.WARNING):
         assert pyarrow_file_io.new_input("s3://non-exist-bucket/path/to/file")._filesystem.region == user_provided_region
-        assert f"Unable to resolve region for bucket non-exist-bucket, using default region {user_provided_region}" in caplog.text
+        assert "Unable to resolve region for bucket non-exist-bucket" in caplog.text

     for bucket_region in bucket_regions:
         # For s3 scheme, region is overwritten by resolved bucket region if different from user provided region

From 64375021a3a0e23c939fb833dd7b87619a42c1f8 Mon Sep 17 00:00:00 2001
From: Fokko
Date: Mon, 24 Feb 2025 07:13:31 +0100
Subject: [PATCH 3/3] Disable resolve by default

---
 mkdocs/docs/configuration.md | 33 +++++++++++++++++----------------
 pyiceberg/io/__init__.py     |  1 +
 pyiceberg/io/pyarrow.py      | 20 +++++++++++++-------
 tests/io/test_pyarrow.py     |  2 +-
 4 files changed, 32 insertions(+), 24 deletions(-)

diff --git a/mkdocs/docs/configuration.md b/mkdocs/docs/configuration.md
index c7c26c4912..bf81ff07f0 100644
--- a/mkdocs/docs/configuration.md
+++ b/mkdocs/docs/configuration.md
@@ -108,22 +108,23 @@ For the FileIO there are several configuration options available:
-| Key | Example | Description |
-|----------------------|----------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| s3.endpoint | | Configure an alternative endpoint of the S3 service for the FileIO to access. This could be used to use S3FileIO with any s3-compatible object storage service that has a different endpoint, or access a private S3 endpoint in a virtual private cloud. |
-| s3.access-key-id | admin | Configure the static access key id used to access the FileIO. |
-| s3.secret-access-key | password | Configure the static secret access key used to access the FileIO. |
-| s3.session-token | AQoDYXdzEJr... | Configure the static session token used to access the FileIO. |
-| s3.role-session-name | session | An optional identifier for the assumed role session. |
-| s3.role-arn | arn:aws:... | AWS Role ARN. If provided instead of access_key and secret_key, temporary credentials will be fetched by assuming this role. |
-| s3.signer | bearer | Configure the signature version of the FileIO. |
-| s3.signer.uri | | Configure the remote signing uri if it differs from the catalog uri. Remote signing is only implemented for `FsspecFileIO`. The final request is sent to `/`. |
-| s3.signer.endpoint | v1/main/s3-sign | Configure the remote signing endpoint. Remote signing is only implemented for `FsspecFileIO`. The final request is sent to `/`. (default : v1/aws/s3/sign). |
-| s3.region | us-west-2 | Configure the default region used to initialize an `S3FileSystem`. `PyArrowFileIO` attempts to automatically resolve the region for each S3 bucket, falling back to this value if resolution fails. |
-| s3.proxy-uri | | Configure the proxy server to be used by the FileIO. |
-| s3.connect-timeout | 60.0 | Configure socket connection timeout, in seconds. |
-| s3.request-timeout | 60.0 | Configure socket read timeouts on Windows and macOS, in seconds. |
-| s3.force-virtual-addressing | False | Whether to use virtual addressing of buckets. If true, then virtual addressing is always enabled. If false, then virtual addressing is only enabled if endpoint_override is empty. This can be used for non-AWS backends that only support virtual hosted-style access. |
+| Key | Example | Description |
+|-----------------------------|----------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| s3.endpoint | | Configure an alternative endpoint of the S3 service for the FileIO to access. This could be used to use S3FileIO with any s3-compatible object storage service that has a different endpoint, or access a private S3 endpoint in a virtual private cloud. |
+| s3.access-key-id | admin | Configure the static access key id used to access the FileIO. |
+| s3.secret-access-key | password | Configure the static secret access key used to access the FileIO. |
+| s3.session-token | AQoDYXdzEJr... | Configure the static session token used to access the FileIO. |
+| s3.role-session-name | session | An optional identifier for the assumed role session. |
+| s3.role-arn | arn:aws:... | AWS Role ARN. If provided instead of access_key and secret_key, temporary credentials will be fetched by assuming this role. |
+| s3.signer | bearer | Configure the signature version of the FileIO. |
+| s3.signer.uri | | Configure the remote signing uri if it differs from the catalog uri. Remote signing is only implemented for `FsspecFileIO`. The final request is sent to `/`. |
+| s3.signer.endpoint | v1/main/s3-sign | Configure the remote signing endpoint. Remote signing is only implemented for `FsspecFileIO`. The final request is sent to `/`. (default : v1/aws/s3/sign). |
+| s3.region | us-west-2 | Configure the default region used to initialize an `S3FileSystem`. `PyArrowFileIO` automatically tries to resolve the region if this isn't set (only supported for AWS S3 buckets). |
+| s3.resolve-region | False | Only supported for `PyArrowFileIO`: when enabled, it will always try to resolve the region of the bucket (only supported for AWS S3 buckets). |
+| s3.proxy-uri | | Configure the proxy server to be used by the FileIO. |
+| s3.connect-timeout | 60.0 | Configure socket connection timeout, in seconds. |
+| s3.request-timeout | 60.0 | Configure socket read timeouts on Windows and macOS, in seconds. |
+| s3.force-virtual-addressing | False | Whether to use virtual addressing of buckets. If true, then virtual addressing is always enabled. If false, then virtual addressing is only enabled if endpoint_override is empty. This can be used for non-AWS backends that only support virtual hosted-style access. |
diff --git a/pyiceberg/io/__init__.py b/pyiceberg/io/__init__.py
index ac25c2d767..f4606b6f30 100644
--- a/pyiceberg/io/__init__.py
+++ b/pyiceberg/io/__init__.py
@@ -59,6 +59,7 @@
 S3_SECRET_ACCESS_KEY = "s3.secret-access-key"
 S3_SESSION_TOKEN = "s3.session-token"
 S3_REGION = "s3.region"
+S3_RESOLVE_REGION = "s3.resolve-region"
 S3_PROXY_URI = "s3.proxy-uri"
 S3_CONNECT_TIMEOUT = "s3.connect-timeout"
 S3_REQUEST_TIMEOUT = "s3.request-timeout"
diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py
index 765dbc7358..f7e3c7c082 100644
--- a/pyiceberg/io/pyarrow.py
+++ b/pyiceberg/io/pyarrow.py
@@ -107,6 +107,7 @@
     S3_PROXY_URI,
     S3_REGION,
     S3_REQUEST_TIMEOUT,
+    S3_RESOLVE_REGION,
     S3_ROLE_ARN,
     S3_ROLE_SESSION_NAME,
     S3_SECRET_ACCESS_KEY,
@@ -427,15 +428,20 @@ def _initialize_oss_fs(self) -> FileSystem:
     def _initialize_s3_fs(self, netloc: Optional[str]) -> FileSystem:
         from pyarrow.fs import S3FileSystem

-        # Resolve region from netloc(bucket), fallback to user-provided region
         provided_region = get_first_property_value(self.properties, S3_REGION, AWS_REGION)
-        bucket_region = _cached_resolve_s3_region(bucket=netloc) or provided_region

-        if provided_region is not None and bucket_region != provided_region:
-            logger.warning(
-                f"PyArrow FileIO overriding S3 bucket region for bucket {netloc}: "
-                f"provided region {provided_region}, actual region {bucket_region}"
-            )
+        # Do this when we don't provide the region at all, or when we explicitly enable it
+        if provided_region is None or property_as_bool(self.properties, S3_RESOLVE_REGION, False) is True:
+            # Resolve region from netloc(bucket), fallback to user-provided region
+            # Only supported by buckets hosted by S3
+            bucket_region = _cached_resolve_s3_region(bucket=netloc) or provided_region
+            if provided_region is not None and bucket_region != provided_region:
+                logger.warning(
+                    f"PyArrow FileIO overriding S3 bucket region for bucket {netloc}: "
+                    f"provided region {provided_region}, actual region {bucket_region}"
+                )
+        else:
+            bucket_region = provided_region

         client_kwargs: Dict[str, Any] = {
             "endpoint_override": self.properties.get(S3_ENDPOINT),
diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py
index cf4ed67920..3f43d9215a 100644
--- a/tests/io/test_pyarrow.py
+++ b/tests/io/test_pyarrow.py
@@ -2285,7 +2285,7 @@ def _s3_region_map(bucket: str) -> str:
         raise OSError("Unknown bucket")

     # For a pyarrow io instance with configured default s3 region
-    pyarrow_file_io = PyArrowFileIO({"s3.region": user_provided_region})
+    pyarrow_file_io = PyArrowFileIO({"s3.region": user_provided_region, "s3.resolve-region": "true"})
     with patch("pyarrow.fs.resolve_s3_region") as mock_s3_region_resolver:
         mock_s3_region_resolver.side_effect = _s3_region_map
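
For context, a minimal usage sketch of the behavior this series ends up with. The bucket name and path below are placeholders, and resolving a real bucket's region requires network access so that `pyarrow.fs.resolve_s3_region` can be called; this is not part of the patches themselves.

```python
from pyiceberg.io.pyarrow import PyArrowFileIO

# s3.region provided, s3.resolve-region left at its default of False:
# the configured region is used as-is, with no lookup and no warning.
io_static = PyArrowFileIO({"s3.region": "us-west-2"})

# s3.resolve-region enabled: the bucket's actual region is resolved via
# pyarrow.fs.resolve_s3_region, cached per bucket by the lru_cache helper,
# and it takes precedence over s3.region (with a warning) if the two differ.
io_resolving = PyArrowFileIO({"s3.region": "us-west-2", "s3.resolve-region": "true"})

# No region configured at all: resolution is always attempted, falling back
# to PyArrow's own defaults if the bucket region cannot be determined.
io_auto = PyArrowFileIO({})

# "my-bucket" is a placeholder; creating an input file is what triggers the
# S3FileSystem construction and therefore the region handling above.
infile = io_resolving.new_input("s3://my-bucket/path/to/file.parquet")
```

Passing the flag as the string "true" mirrors what the updated test does.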