From 0c8fa1b38635e7b33649f7c1a85c9d47c6979c65 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sat, 14 Dec 2024 14:54:27 +0100 Subject: [PATCH 01/71] feat: initial setup for S3TablesCatalog --- poetry.lock | 1 + pyiceberg/catalog/__init__.py | 4 +- pyiceberg/catalog/s3tables.py | 228 +++++++++++++++++++++++++++++++++ pyiceberg/exceptions.py | 5 + pyproject.toml | 1 + tests/catalog/test_s3tables.py | 71 ++++++++++ 6 files changed, 308 insertions(+), 2 deletions(-) create mode 100644 pyiceberg/catalog/s3tables.py create mode 100644 tests/catalog/test_s3tables.py diff --git a/poetry.lock b/poetry.lock index 7cbdb23508..766259dcda 100644 --- a/poetry.lock +++ b/poetry.lock @@ -5627,6 +5627,7 @@ pyiceberg-core = ["pyiceberg-core"] ray = ["pandas", "pyarrow", "ray", "ray"] rest-sigv4 = ["boto3"] s3fs = ["s3fs"] +s3tables = ["boto3"] snappy = ["python-snappy"] sql-postgres = ["psycopg2-binary", "sqlalchemy"] sql-sqlite = ["sqlalchemy"] diff --git a/pyiceberg/catalog/__init__.py b/pyiceberg/catalog/__init__.py index a39f6bc711..8fe73d4369 100644 --- a/pyiceberg/catalog/__init__.py +++ b/pyiceberg/catalog/__init__.py @@ -942,8 +942,8 @@ def _get_default_warehouse_location(self, database_name: str, table_name: str) - raise ValueError("No default path is set, please specify a location when creating a table") @staticmethod - def _write_metadata(metadata: TableMetadata, io: FileIO, metadata_path: str) -> None: - ToOutputFile.table_metadata(metadata, io.new_output(metadata_path)) + def _write_metadata(metadata: TableMetadata, io: FileIO, metadata_path: str, overwrite: bool = False) -> None: + ToOutputFile.table_metadata(metadata, io.new_output(metadata_path), overwrite=overwrite) @staticmethod def _get_metadata_location(location: str, new_version: int = 0) -> str: diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py new file mode 100644 index 0000000000..82cd862571 --- /dev/null +++ b/pyiceberg/catalog/s3tables.py @@ -0,0 +1,228 @@ +import re +from typing import TYPE_CHECKING +from typing import List +from typing import Optional +from typing import Set +from typing import Tuple +from typing import Union + +import boto3 + +from pyiceberg.catalog import DEPRECATED_BOTOCORE_SESSION, MetastoreCatalog +from pyiceberg.catalog import WAREHOUSE_LOCATION +from pyiceberg.catalog import Catalog +from pyiceberg.catalog import PropertiesUpdateSummary +from pyiceberg.exceptions import S3TablesError +from pyiceberg.exceptions import TableBucketNotFound +from pyiceberg.io import AWS_ACCESS_KEY_ID +from pyiceberg.io import AWS_REGION +from pyiceberg.io import AWS_SECRET_ACCESS_KEY +from pyiceberg.io import AWS_SESSION_TOKEN +from pyiceberg.io import load_file_io +from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec +from pyiceberg.schema import Schema +from pyiceberg.serializers import FromInputFile +from pyiceberg.table import CommitTableResponse +from pyiceberg.table import CreateTableTransaction +from pyiceberg.table import Table +from pyiceberg.table import TableRequirement +from pyiceberg.table.metadata import new_table_metadata +from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder +from pyiceberg.table.update import TableUpdate +from pyiceberg.typedef import EMPTY_DICT, Identifier +from pyiceberg.typedef import Properties +from pyiceberg.utils.properties import get_first_property_value + +if TYPE_CHECKING: + import pyarrow as pa + +S3TABLES_PROFILE_NAME = "s3tables.profile-name" +S3TABLES_REGION = "s3tables.region" +S3TABLES_ACCESS_KEY_ID = 
"s3tables.access-key-id" +S3TABLES_SECRET_ACCESS_KEY = "s3tables.secret-access-key" +S3TABLES_SESSION_TOKEN = "s3tables.session-token" + +S3TABLES_ENDPOINT = "s3tables.endpoint" + + +class S3TableCatalog(MetastoreCatalog): + def __init__(self, name: str, **properties: str): + super().__init__(name, **properties) + + session = boto3.Session( + profile_name=properties.get(S3TABLES_PROFILE_NAME), + region_name=get_first_property_value(properties, S3TABLES_REGION, AWS_REGION), + botocore_session=properties.get(DEPRECATED_BOTOCORE_SESSION), + aws_access_key_id=get_first_property_value(properties, S3TABLES_ACCESS_KEY_ID, AWS_ACCESS_KEY_ID), + aws_secret_access_key=get_first_property_value( + properties, S3TABLES_SECRET_ACCESS_KEY, AWS_SECRET_ACCESS_KEY + ), + aws_session_token=get_first_property_value(properties, S3TABLES_SESSION_TOKEN, AWS_SESSION_TOKEN), + ) + # TODO: s3tables client only supported from boto3>=1.35.74 so this can crash + # TODO: set a custom user-agent for api calls like the Java implementation + self.s3tables = session.client("s3tables") + # TODO: handle malformed properties instead of just raising a key error here + self.table_bucket_arn = self.properties[WAREHOUSE_LOCATION] + try: + self.s3tables.get_table_bucket(tableBucketARN=self.table_bucket_arn) + except self.s3tables.exceptions.NotFoundException as e: + raise TableBucketNotFound(e) from e + + def commit_table( + self, table: Table, requirements: Tuple[TableRequirement, ...], updates: Tuple[TableUpdate, ...] + ) -> CommitTableResponse: + return super().commit_table(table, requirements, updates) + + def create_namespace(self, namespace: Union[str, Identifier], properties: Properties = ...) -> None: + valid_namespace: str = self._validate_namespace_identifier(namespace) + self.s3tables.create_namespace(tableBucketARN=self.table_bucket_arn, namespace=[valid_namespace]) + + def _validate_namespace_identifier(self, namespace: Union[str, Identifier]) -> str: + # for naming rules see: https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-tables-buckets-naming.html + # TODO: extract into constant variables + pattern = re.compile("[a-z0-9][a-z0-9_]{2,62}") + reserved = "aws_s3_metadata" + + namespace = self.identifier_to_database(namespace) + + if not pattern.fullmatch(namespace): + ... + + if namespace == reserved: + ... + + return namespace + + def _validate_database_and_table_identifier(self, identifier: Union[str, Identifier]) -> Tuple[str, str]: + # for naming rules see: https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-tables-buckets-naming.html + # TODO: extract into constant variables + pattern = re.compile("[a-z0-9][a-z0-9_]{2,62}") + + namespace, table_name = self.identifier_to_database_and_table(identifier) + + namespace = self._validate_namespace_identifier(namespace) + + if not pattern.fullmatch(table_name): + ... 
+ + return namespace, table_name + + def create_table( + self, + identifier: Union[str, Identifier], + schema: Union[Schema, "pa.Schema"], + location: Optional[str] = None, + partition_spec: PartitionSpec = UNPARTITIONED_PARTITION_SPEC, + sort_order: SortOrder = UNSORTED_SORT_ORDER, + properties: Properties = EMPTY_DICT, + ) -> Table: + namespace, table_name = self._validate_database_and_table_identifier(identifier) + + schema: Schema = self._convert_schema_if_needed(schema) # type: ignore + + # TODO: check whether namespace exists and if it does, whether table_name already exists + self.s3tables.create_table( + tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name, format="ICEBERG" + ) + + # location is given by s3 table bucket + response = self.s3tables.get_table(tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name) + version_token = response["versionToken"] + + location = response["warehouseLocation"] + metadata_location = self._get_metadata_location(location=location) + metadata = new_table_metadata( + location=location, + schema=schema, + partition_spec=partition_spec, + sort_order=sort_order, + properties=properties, + ) + + io = load_file_io(properties=self.properties, location=metadata_location) + # TODO: this triggers unsupported list operation error, setting overwrite=True is a workaround for now + # TODO: we can perform this check manually maybe? + self._write_metadata(metadata, io, metadata_location, overwrite=True) + # TODO: after writing need to update table metadata location + # can this operation fail if the version token does not match? + self.s3tables.update_table_metadata_location( + tableBucketARN=self.table_bucket_arn, + namespace=namespace, + name=table_name, + versionToken=version_token, + metadataLocation=metadata_location, + ) + + return self.load_table(identifier=identifier) + + def create_table_transaction( + self, + identifier: Union[str, Identifier], + schema: Union[Schema, "pa.Schema"], + location: Optional[str] = None, + partition_spec: PartitionSpec = ..., + sort_order: SortOrder = ..., + properties: Properties = ..., + ) -> CreateTableTransaction: + return super().create_table_transaction(identifier, schema, location, partition_spec, sort_order, properties) + + def drop_namespace(self, namespace: Union[str, Identifier]) -> None: + return super().drop_namespace(namespace) + + def drop_table(self, identifier: Union[str, Identifier]) -> None: + return super().drop_table(identifier) + + def drop_view(self, identifier: Union[str, Identifier]) -> None: + return super().drop_view(identifier) + + def list_namespaces(self, namespace: Union[str, Identifier] = ...) 
-> List[Identifier]: + # TODO: handle pagination + response = self.s3tables.list_namespaces(tableBucketARN=self.table_bucket_arn) + return [tuple(namespace["namespace"]) for namespace in response["namespaces"]] + + def list_tables(self, namespace: Union[str, Identifier]) -> List[Identifier]: + return super().list_tables(namespace) + + def list_views(self, namespace: Union[str, Identifier]) -> List[Identifier]: + return super().list_views(namespace) + + def load_namespace_properties(self, namespace: Union[str, Identifier]) -> Properties: + return super().load_namespace_properties(namespace) + + def load_table(self, identifier: Union[str, Identifier]) -> Table: + namespace, table_name = self._validate_database_and_table_identifier(identifier) + # TODO: raise a NoSuchTableError if it does not exist + response = self.s3tables.get_table_metadata_location( + tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name + ) + # TODO: we might need to catch if table is not initialized i.e. does not have metadata setup yet + metadata_location = response["metadataLocation"] + + io = load_file_io(properties=self.properties, location=metadata_location) + file = io.new_input(metadata_location) + metadata = FromInputFile.table_metadata(file) + return Table( + identifier=(namespace, table_name), + metadata=metadata, + metadata_location=metadata_location, + io=self._load_file_io(metadata.properties, metadata_location), + catalog=self, + ) + + def purge_table(self, identifier: Union[str, Identifier]) -> None: + return super().purge_table(identifier) + + def register_table(self, identifier: Union[str, Identifier], metadata_location: str) -> Table: + return super().register_table(identifier, metadata_location) + + def rename_table(self, from_identifier: Union[str, Identifier], to_identifier: Union[str, Identifier]) -> Table: + return super().rename_table(from_identifier, to_identifier) + + def table_exists(self, identifier: Union[str, Identifier]) -> bool: + return super().table_exists(identifier) + + def update_namespace_properties( + self, namespace: Union[str, Identifier], removals: Optional[Set[str]] = None, updates: Properties = ... 
+    ) -> PropertiesUpdateSummary:
+        return super().update_namespace_properties(namespace, removals, updates)
diff --git a/pyiceberg/exceptions.py b/pyiceberg/exceptions.py
index 56574ff471..6096096f45 100644
--- a/pyiceberg/exceptions.py
+++ b/pyiceberg/exceptions.py
@@ -111,6 +111,11 @@ class ConditionalCheckFailedException(DynamoDbError):
 class GenericDynamoDbError(DynamoDbError):
     pass
 
+class S3TablesError(Exception):
+    pass
+
+class TableBucketNotFound(S3TablesError):
+    pass
 
 class CommitFailedException(Exception):
     """Commit failed, refresh and try again."""
diff --git a/pyproject.toml b/pyproject.toml
index 5775bc8fbf..b56976cd6b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1211,6 +1211,7 @@ sql-postgres = ["sqlalchemy", "psycopg2-binary"]
 sql-sqlite = ["sqlalchemy"]
 gcsfs = ["gcsfs"]
 rest-sigv4 = ["boto3"]
+s3tables = ["boto3"]
 pyiceberg-core = ["pyiceberg-core"]
 
 [tool.pytest.ini_options]
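The exception pair added above gives callers a way to tell a missing or mistyped table bucket apart from other failures: the constructor check in s3tables.py converts the client's NotFoundException into TableBucketNotFound. A minimal handling sketch, assuming valid AWS credentials in the environment; the ARN below is a placeholder, not a real bucket:

    from pyiceberg.catalog.s3tables import S3TableCatalog
    from pyiceberg.exceptions import S3TablesError, TableBucketNotFound

    try:
        catalog = S3TableCatalog(
            name="example",
            **{"warehouse": "arn:aws:s3tables:us-east-1:123456789012:bucket/missing-bucket"},
        )
    except TableBucketNotFound:
        # get_table_bucket returned NotFoundException for the configured ARN
        print("no table bucket behind the configured warehouse ARN")
    except S3TablesError:
        # base class reserved for other s3tables-specific failures
        raise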
diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py
new file mode 100644
index 0000000000..220b0ad507
--- /dev/null
+++ b/tests/catalog/test_s3tables.py
@@ -0,0 +1,71 @@
+import boto3
+from pyiceberg.schema import Schema
+import pytest
+
+from pyiceberg.catalog.s3tables import S3TableCatalog
+from pyiceberg.exceptions import TableBucketNotFound
+
+
+@pytest.fixture
+def database_name(database_name):
+    # naming rules prevent "-" in namespaces for s3 table buckets
+    return database_name.replace("-", "_")
+
+
+@pytest.fixture
+def table_name(table_name):
+    # naming rules prevent "-" in table names for s3 table buckets
+    return table_name.replace("-", "_")
+
+@pytest.fixture
+def table_bucket_arn():
+    import os
+    # since the moto library does not support s3tables as of 2024-12-14 we have to test against a real AWS endpoint
+    # in one of the supported regions.
+
+    return os.environ["ARN"]
+
+
+def test_s3tables_boto_api(table_bucket_arn):
+    client = boto3.client("s3tables")
+    response = client.list_namespaces(tableBucketARN=table_bucket_arn)
+    print(response["namespaces"])
+
+    response = client.get_table_bucket(tableBucketARN=table_bucket_arn + "abc")
+    print(response)
+
+
+
+def test_s3tables_namespaces_api(table_bucket_arn):
+    client = boto3.client("s3tables")
+    response = client.create_namespace(tableBucketARN=table_bucket_arn, namespace=["one", "two"])
+    print(response)
+    response = client.list_namespaces(tableBucketARN=table_bucket_arn)
+    print(response)
+
+def test_creating_catalog_validates_s3_table_bucket_exists(table_bucket_arn):
+    properties = {"warehouse": f"{table_bucket_arn}-modified"}
+    with pytest.raises(TableBucketNotFound):
+        S3TableCatalog(name="test_s3tables_catalog", **properties)
+
+
+def test_create_namespace(table_bucket_arn, database_name: str):
+    properties = {"warehouse": table_bucket_arn}
+    catalog = S3TableCatalog(name="test_s3tables_catalog", **properties)
+    catalog.create_namespace(namespace=database_name)
+    namespaces = catalog.list_namespaces()
+    assert (database_name,) in namespaces
+
+
+def test_create_table(table_bucket_arn, database_name: str, table_name:str, table_schema_nested: Schema):
+    properties = {"warehouse": table_bucket_arn}
+    catalog = S3TableCatalog(name="test_s3tables_catalog", **properties)
+    identifier = (database_name, table_name)
+
+    catalog.create_namespace(namespace=database_name)
+    print(database_name, table_name)
+    # this fails with
+    # OSError: When completing multiple part upload for key 'metadata/00000-55a9c37c-b822-4a81-ac0e-1efbcd145dba.metadata.json' in bucket '14e4e036-d4ae-44f8-koana45eruw
+    # Unable to parse ExceptionName: S3TablesUnsupportedHeader Message: S3 Tables does not support the following header: x-amz-api-version value: 2006-03-01
+    table = catalog.create_table(identifier=identifier, schema=table_schema_nested)
+    print(table)
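Taken together, patch 01 already gives a catalog that can create namespaces and tables against a table bucket. A minimal usage sketch built only from the properties this patch defines; the ARN and region are placeholders, and boto3 >= 1.35.74 is assumed so that the s3tables client exists (see the TODO in the constructor):

    from pyiceberg.catalog.s3tables import S3TableCatalog

    # "warehouse" carries the table bucket ARN; the s3tables.* keys override
    # the generic AWS client properties for this catalog only
    catalog = S3TableCatalog(
        name="example",
        **{
            "warehouse": "arn:aws:s3tables:us-east-1:123456789012:bucket/example-bucket",
            "s3tables.region": "us-east-1",
        },
    )
    catalog.create_namespace("example_namespace")
    print(catalog.list_namespaces())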
From 1ca5e86d556fadf9dbf3d181bff19a234d442471 Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Sat, 14 Dec 2024 17:41:40 +0100
Subject: [PATCH 02/71] feat: support create_table using FsspecFileIO

---
 pyiceberg/catalog/s3tables.py  |  7 ++++++-
 tests/catalog/test_s3tables.py | 27 ++++-----------------------
 2 files changed, 10 insertions(+), 24 deletions(-)

diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py
index 82cd862571..740e72a124 100644
--- a/pyiceberg/catalog/s3tables.py
+++ b/pyiceberg/catalog/s3tables.py
@@ -14,7 +14,7 @@ from pyiceberg.catalog import PropertiesUpdateSummary
 from pyiceberg.exceptions import S3TablesError
 from pyiceberg.exceptions import TableBucketNotFound
-from pyiceberg.io import AWS_ACCESS_KEY_ID
+from pyiceberg.io import AWS_ACCESS_KEY_ID, PY_IO_IMPL
 from pyiceberg.io import AWS_REGION
 from pyiceberg.io import AWS_SECRET_ACCESS_KEY
 from pyiceberg.io import AWS_SESSION_TOKEN
@@ -44,10 +44,15 @@
 
 S3TABLES_ENDPOINT = "s3tables.endpoint"
 
 
+# pyarrow does not support writing to S3 Table buckets as of 2024-12-14 https://github.com/apache/iceberg-python/issues/1404#issuecomment-2543174146
+S3TABLES_FILE_IO_DEFAULT = "pyiceberg.io.fsspec.FsspecFileIO"
+
 
 class S3TableCatalog(MetastoreCatalog):
     def __init__(self, name: str, **properties: str):
         super().__init__(name, **properties)
+        # TODO: implement a proper check for FileIO
+        self.properties[PY_IO_IMPL] = S3TABLES_FILE_IO_DEFAULT
 
         session = boto3.Session(
             profile_name=properties.get(S3TABLES_PROFILE_NAME),
diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py
index 220b0ad507..855b144a14 100644
--- a/tests/catalog/test_s3tables.py
+++ b/tests/catalog/test_s3tables.py
@@ -26,23 +26,6 @@ def table_bucket_arn():
     return os.environ["ARN"]
 
 
-def test_s3tables_boto_api(table_bucket_arn):
-    client = boto3.client("s3tables")
-    response = client.list_namespaces(tableBucketARN=table_bucket_arn)
-    print(response["namespaces"])
-
-    response = client.get_table_bucket(tableBucketARN=table_bucket_arn + "abc")
-    print(response)
-
-
-
-def test_s3tables_namespaces_api(table_bucket_arn):
-    client = boto3.client("s3tables")
-    response = client.create_namespace(tableBucketARN=table_bucket_arn, namespace=["one", "two"])
-    print(response)
-    response = client.list_namespaces(tableBucketARN=table_bucket_arn)
-    print(response)
-
 def test_creating_catalog_validates_s3_table_bucket_exists(table_bucket_arn):
     properties = {"warehouse": f"{table_bucket_arn}-modified"}
     with pytest.raises(TableBucketNotFound):
@@ -58,14 +41,12 @@ def test_create_namespace(table_bucket_arn, database_name: str):
 
 
 def test_create_table(table_bucket_arn, database_name: str, table_name:str, table_schema_nested: Schema):
-    properties = {"warehouse": table_bucket_arn}
+    # setting FileIO to FsspecFileIO explicitly is required as pyarrow does not work with S3 Table Buckets yet
+    properties = {"warehouse": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"}
     catalog = S3TableCatalog(name="test_s3tables_catalog", **properties)
     identifier = (database_name, table_name)
 
     catalog.create_namespace(namespace=database_name)
-    print(database_name, table_name)
-    # this fails with
-    # OSError: When completing multiple part upload for key 'metadata/00000-55a9c37c-b822-4a81-ac0e-1efbcd145dba.metadata.json' in bucket '14e4e036-d4ae-44f8-koana45eruw
-    # Unable to parse ExceptionName: S3TablesUnsupportedHeader Message: S3 Tables does not support the following header: x-amz-api-version value: 2006-03-01
     table = catalog.create_table(identifier=identifier, schema=table_schema_nested)
-    print(table)
+
+    assert table == catalog.load_table(identifier)

From e659da1a684d736a00671b335c28b23a98cd44e2 Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Sat, 14 Dec 2024 18:13:38 +0100
Subject: [PATCH 03/71] feat: implement drop_table

---
 pyiceberg/catalog/s3tables.py  | 38 ++++++++++++++++++++++++++--------
 tests/catalog/test_s3tables.py | 18 +++++++++++++++++-
 2 files changed, 46 insertions(+), 10 deletions(-)

diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py
index 740e72a124..7234e7223a 100644
--- a/pyiceberg/catalog/s3tables.py
+++ b/pyiceberg/catalog/s3tables.py
@@ -8,18 +8,22 @@
 
 import boto3
 
-from pyiceberg.catalog import DEPRECATED_BOTOCORE_SESSION, MetastoreCatalog
+from pyiceberg.catalog import DEPRECATED_BOTOCORE_SESSION
 from pyiceberg.catalog import WAREHOUSE_LOCATION
 from pyiceberg.catalog import Catalog
+from pyiceberg.catalog import MetastoreCatalog
 from pyiceberg.catalog import PropertiesUpdateSummary
+from pyiceberg.exceptions import NoSuchTableError
 from pyiceberg.exceptions import S3TablesError
 from pyiceberg.exceptions import TableBucketNotFound
-from pyiceberg.io import AWS_ACCESS_KEY_ID, PY_IO_IMPL
+from pyiceberg.io import AWS_ACCESS_KEY_ID
 from pyiceberg.io import AWS_REGION
 from pyiceberg.io import AWS_SECRET_ACCESS_KEY
 from pyiceberg.io import AWS_SESSION_TOKEN
+from pyiceberg.io import PY_IO_IMPL
 from pyiceberg.io import load_file_io
-from pyiceberg.partitioning
import UNPARTITIONED_PARTITION_SPEC +from pyiceberg.partitioning import PartitionSpec from pyiceberg.schema import Schema from pyiceberg.serializers import FromInputFile from pyiceberg.table import CommitTableResponse @@ -27,9 +31,11 @@ from pyiceberg.table import Table from pyiceberg.table import TableRequirement from pyiceberg.table.metadata import new_table_metadata -from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder +from pyiceberg.table.sorting import UNSORTED_SORT_ORDER +from pyiceberg.table.sorting import SortOrder from pyiceberg.table.update import TableUpdate -from pyiceberg.typedef import EMPTY_DICT, Identifier +from pyiceberg.typedef import EMPTY_DICT +from pyiceberg.typedef import Identifier from pyiceberg.typedef import Properties from pyiceberg.utils.properties import get_first_property_value @@ -176,7 +182,18 @@ def drop_namespace(self, namespace: Union[str, Identifier]) -> None: return super().drop_namespace(namespace) def drop_table(self, identifier: Union[str, Identifier]) -> None: - return super().drop_table(identifier) + namespace, table_name = self._validate_database_and_table_identifier(identifier) + try: + response = self.s3tables.get_table( + tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name + ) + except self.s3tables.exceptions.NotFoundException as e: + raise NoSuchTableError(f"No table with identifier {identifier} exists.") from e + + # TODO: handle conflicts due to versionToken mismatch that might occur + self.s3tables.delete_table( + tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name, versionToken=response["versionToken"] + ) def drop_view(self, identifier: Union[str, Identifier]) -> None: return super().drop_view(identifier) @@ -198,9 +215,12 @@ def load_namespace_properties(self, namespace: Union[str, Identifier]) -> Proper def load_table(self, identifier: Union[str, Identifier]) -> Table: namespace, table_name = self._validate_database_and_table_identifier(identifier) # TODO: raise a NoSuchTableError if it does not exist - response = self.s3tables.get_table_metadata_location( - tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name - ) + try: + response = self.s3tables.get_table_metadata_location( + tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name + ) + except self.s3tables.exceptions.NotFoundException as e: + raise NoSuchTableError(f"No table with identifier {identifier} exists.") from e # TODO: we might need to catch if table is not initialized i.e. 
does not have metadata setup yet
         metadata_location = response["metadataLocation"]
diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py
index 855b144a14..9a68cdb794 100644
--- a/tests/catalog/test_s3tables.py
+++ b/tests/catalog/test_s3tables.py
@@ -3,7 +3,7 @@
 import pytest
 
 from pyiceberg.catalog.s3tables import S3TableCatalog
-from pyiceberg.exceptions import TableBucketNotFound
+from pyiceberg.exceptions import NoSuchTableError, TableBucketNotFound
 
 
 @pytest.fixture
@@ -50,3 +50,19 @@ def test_create_table(table_bucket_arn, database_name: str, table_name:str, tabl
     table = catalog.create_table(identifier=identifier, schema=table_schema_nested)
 
     assert table == catalog.load_table(identifier)
+
+
+
+def test_drop_table(table_bucket_arn, database_name: str, table_name:str, table_schema_nested: Schema):
+    # setting FileIO to FsspecFileIO explicitly is required as pyarrow does not work with S3 Table Buckets yet
+    properties = {"warehouse": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"}
+    catalog = S3TableCatalog(name="test_s3tables_catalog", **properties)
+    identifier = (database_name, table_name)
+
+    catalog.create_namespace(namespace=database_name)
+    catalog.create_table(identifier=identifier, schema=table_schema_nested)
+
+    catalog.drop_table(identifier=identifier)
+
+    with pytest.raises(NoSuchTableError):
+        catalog.load_table(identifier=identifier)
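drop_table above is a get-then-delete pair: it reads the current versionToken with get_table and deletes under that token, so a concurrent commit between the two calls can invalidate the token, which is what the TODO in the hunk points at. A caller-side sketch of one way to absorb that, assuming a token mismatch surfaces as the client's ConflictException; the identifier is illustrative:

    from pyiceberg.exceptions import NoSuchTableError

    identifier = ("example_namespace", "example_table")
    try:
        catalog.drop_table(identifier=identifier)
    except NoSuchTableError:
        pass  # already gone: get_table returned NotFoundException
    except catalog.s3tables.exceptions.ConflictException:
        # versionToken went stale between get_table and delete_table;
        # a single retry re-reads a fresh token
        catalog.drop_table(identifier=identifier)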
From 9973d12ec37da5b1120564d6984f762a92123609 Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Sat, 14 Dec 2024 18:20:02 +0100
Subject: [PATCH 04/71] feat: implement drop_namespace

---
 pyiceberg/catalog/s3tables.py  | 9 +++++++--
 tests/catalog/test_s3tables.py | 9 +++++++++
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py
index 7234e7223a..fd7d7addc0 100644
--- a/pyiceberg/catalog/s3tables.py
+++ b/pyiceberg/catalog/s3tables.py
@@ -13,7 +13,7 @@
 from pyiceberg.catalog import Catalog
 from pyiceberg.catalog import MetastoreCatalog
 from pyiceberg.catalog import PropertiesUpdateSummary
-from pyiceberg.exceptions import NoSuchTableError
+from pyiceberg.exceptions import NamespaceNotEmptyError, NoSuchTableError
 from pyiceberg.exceptions import S3TablesError
 from pyiceberg.exceptions import TableBucketNotFound
 from pyiceberg.io import AWS_ACCESS_KEY_ID
@@ -179,7 +179,12 @@ def create_table_transaction(
         return super().create_table_transaction(identifier, schema, location, partition_spec, sort_order, properties)
 
     def drop_namespace(self, namespace: Union[str, Identifier]) -> None:
-        return super().drop_namespace(namespace)
+        namespace = self._validate_namespace_identifier(namespace)
+        try:
+            self.s3tables.delete_namespace(tableBucketARN=self.table_bucket_arn, namespace=namespace)
+        except self.s3tables.exceptions.ConflictException as e:
+            raise NamespaceNotEmptyError(f"Namespace {namespace} is not empty.") from e
+
 
     def drop_table(self, identifier: Union[str, Identifier]) -> None:
         namespace, table_name = self._validate_database_and_table_identifier(identifier)
diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py
index 9a68cdb794..0503fff9c1 100644
--- a/tests/catalog/test_s3tables.py
+++ b/tests/catalog/test_s3tables.py
@@ -39,6 +39,15 @@ def test_create_namespace(table_bucket_arn, database_name: str):
     namespaces = catalog.list_namespaces()
     assert (database_name,) in namespaces
 
+def test_drop_namespace(table_bucket_arn, database_name: str):
+    properties = {"warehouse": table_bucket_arn}
+    catalog = S3TableCatalog(name="test_s3tables_catalog", **properties)
+    catalog.create_namespace(namespace=database_name)
+    assert (database_name,) in catalog.list_namespaces()
+    catalog.drop_namespace(namespace=database_name)
+    assert (database_name,) not in catalog.list_namespaces()
+
+
 
 def test_create_table(table_bucket_arn, database_name: str, table_name:str, table_schema_nested: Schema):
     # setting FileIO to FsspecFileIO explicitly is required as pyarrow does not work with S3 Table Buckets yet
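The test added in the next patch pins down the primitive every mutating call above depends on: update_table_metadata_location only succeeds while the caller's versionToken is still current. A compare-and-swap sketch against the raw boto3 client; the function and variable names are illustrative, not part of the patch:

    import boto3

    client = boto3.client("s3tables")

    def try_swap_metadata_location(table_bucket_arn, namespace, name, metadata_location):
        # read the current token, then attempt the swap under that token
        token = client.get_table(tableBucketARN=table_bucket_arn, namespace=namespace, name=name)["versionToken"]
        try:
            client.update_table_metadata_location(
                tableBucketARN=table_bucket_arn,
                namespace=namespace,
                name=name,
                versionToken=token,
                metadataLocation=metadata_location,
            )
            return True
        except client.exceptions.ConflictException:
            # another writer moved the table first; the caller must rebuild
            # its new metadata on top of the current state before retrying
            return False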
From 3c36450bdc2b783f4cb5b1f9ed052933c737c97a Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Sun, 15 Dec 2024 12:01:36 +0100
Subject: [PATCH 05/71] test: validate how version conflict is handled with s3tables API

---
 tests/catalog/test_s3tables.py | 40 +++++++++++++++++++++++++++++-----
 1 file changed, 34 insertions(+), 6 deletions(-)

diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py
index 0503fff9c1..c7b9d13df8 100644
--- a/tests/catalog/test_s3tables.py
+++ b/tests/catalog/test_s3tables.py
@@ -1,9 +1,12 @@
+import uuid
+
 import boto3
-from pyiceberg.schema import Schema
 import pytest
 
 from pyiceberg.catalog.s3tables import S3TableCatalog
-from pyiceberg.exceptions import NoSuchTableError, TableBucketNotFound
+from pyiceberg.exceptions import NoSuchTableError
+from pyiceberg.exceptions import TableBucketNotFound
+from pyiceberg.schema import Schema
 
 
 @pytest.fixture
@@ -17,15 +20,41 @@ def table_name(table_name):
     # naming rules prevent "-" in table names for s3 table buckets
     return table_name.replace("-", "_")
 
+
 @pytest.fixture
 def table_bucket_arn():
     import os
+
     # since the moto library does not support s3tables as of 2024-12-14 we have to test against a real AWS endpoint
     # in one of the supported regions.
 
     return os.environ["ARN"]
 
 
+def test_s3tables_api_raises_on_conflicting_version_tokens(table_bucket_arn, database_name, table_name):
+    client = boto3.client("s3tables")
+    client.create_namespace(tableBucketARN=table_bucket_arn, namespace=[database_name])
+    response = client.create_table(
+        tableBucketARN=table_bucket_arn, namespace=database_name, name=table_name, format="ICEBERG"
+    )
+    version_token = response["versionToken"]
+    scrambled_version_token = version_token[::-1]
+
+    warehouse_location = client.get_table(tableBucketARN=table_bucket_arn, namespace=database_name, name=table_name)[
+        "warehouseLocation"
+    ]
+    metadata_location = f"{warehouse_location}/metadata/00001-{uuid.uuid4()}.metadata.json"
+
+    with pytest.raises(client.exceptions.ConflictException):
+        client.update_table_metadata_location(
+            tableBucketARN=table_bucket_arn,
+            namespace=database_name,
+            name=table_name,
+            versionToken=scrambled_version_token,
+            metadataLocation=metadata_location,
+        )
+
+
 def test_creating_catalog_validates_s3_table_bucket_exists(table_bucket_arn):
     properties = {"warehouse": f"{table_bucket_arn}-modified"}
     with pytest.raises(TableBucketNotFound):
@@ -39,6 +68,7 @@ def test_create_namespace(table_bucket_arn, database_name: str):
     namespaces = catalog.list_namespaces()
     assert (database_name,) in namespaces
 
+
 def test_drop_namespace(table_bucket_arn, database_name: str):
     properties = {"warehouse": table_bucket_arn}
     catalog = S3TableCatalog(name="test_s3tables_catalog", **properties)
@@ -48,8 +78,7 @@ def test_drop_namespace(table_bucket_arn, database_name: str):
     assert (database_name,) not in catalog.list_namespaces()
 
 
-def test_create_table(table_bucket_arn, database_name: str, table_name:str, table_schema_nested: Schema):
+def test_create_table(table_bucket_arn, database_name: str, table_name: str, table_schema_nested: Schema):
     # setting FileIO to FsspecFileIO explicitly is required as pyarrow does not work with S3 Table Buckets yet
     properties = {"warehouse": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"}
     catalog = S3TableCatalog(name="test_s3tables_catalog", **properties)
@@ -61,8 +90,7 @@ def test_create_table(table_bucket_arn, database_name: str, table_name: str, tab
     assert table == catalog.load_table(identifier)
 
 
-def test_drop_table(table_bucket_arn, database_name: str, table_name:str, table_schema_nested: Schema):
+def test_drop_table(table_bucket_arn, database_name: str, table_name: str, table_schema_nested: Schema):
     # setting FileIO to FsspecFileIO explicitly is required as pyarrow does not work with S3 Table Buckets yet
     properties = {"warehouse": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"}
     catalog = S3TableCatalog(name="test_s3tables_catalog", **properties)

From c876a3748490a0ba937f4b3d8de3a7eb22bce0a4 Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Sun, 15 Dec 2024 13:40:23 +0100
Subject: [PATCH 06/71] feat: implement commit_table

---
 pyiceberg/catalog/s3tables.py  | 52 +++++++++++++++++++++++++++++---
 tests/catalog/test_s3tables.py | 30 ++++++++++++++++++++
 2 files changed, 78 insertions(+), 4 deletions(-)

diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py
index fd7d7addc0..1fb384b21d 100644
--- a/pyiceberg/catalog/s3tables.py
+++ b/pyiceberg/catalog/s3tables.py
@@ -13,7 +13,9 @@
 from pyiceberg.catalog import Catalog
 from pyiceberg.catalog import MetastoreCatalog
 from pyiceberg.catalog import PropertiesUpdateSummary
-from pyiceberg.exceptions import NamespaceNotEmptyError, NoSuchTableError
+from pyiceberg.exceptions import CommitFailedException
+from pyiceberg.exceptions import NamespaceNotEmptyError
+from pyiceberg.exceptions import NoSuchTableError
 from pyiceberg.exceptions import S3TablesError
 from pyiceberg.exceptions import TableBucketNotFound
 from pyiceberg.io import AWS_ACCESS_KEY_ID
@@ -83,7 +85,47 @@ def __init__(self, name: str, **properties: str):
     def commit_table(
         self, table: Table, requirements: Tuple[TableRequirement, ...], updates: Tuple[TableUpdate, ...]
) -> CommitTableResponse: - return super().commit_table(table, requirements, updates) + table_identifier = table.name() + database_name, table_name = self.identifier_to_database_and_table(table_identifier, NoSuchTableError) + + # TODO: loading table and getting versionToken should be an atomic operation, otherwise + # the table can change between those two API calls without noticing + # -> change this into an internal API that returns table information along with versionToken + current_table = self.load_table(identifier=table_identifier) + version_token = self.s3tables.get_table( + tableBucketARN=self.table_bucket_arn, namespace=database_name, name=table_name + )["versionToken"] + + updated_staged_table = self._update_and_stage_table(current_table, table_identifier, requirements, updates) + if current_table and updated_staged_table.metadata == current_table.metadata: + # no changes, do nothing + return CommitTableResponse( + metadata=current_table.metadata, metadata_location=current_table.metadata_location + ) + + self._write_metadata( + metadata=updated_staged_table.metadata, + io=updated_staged_table.io, + metadata_path=updated_staged_table.metadata_location, + overwrite=True, + ) + + # try to update metadata location which will fail if the versionToken changed meanwhile + try: + self.s3tables.update_table_metadata_location( + tableBucketARN=self.table_bucket_arn, + namespace=database_name, + name=table_name, + versionToken=version_token, + metadataLocation=updated_staged_table.metadata_location, + ) + except self.s3tables.exceptions.ConflictException as e: + raise CommitFailedException( + f"Cannot commit {database_name}.{table_name} because of a concurrent update to the table version {version_token}." + ) from e + return CommitTableResponse( + metadata=updated_staged_table.metadata, metadata_location=updated_staged_table.metadata_location + ) def create_namespace(self, namespace: Union[str, Identifier], properties: Properties = ...) 
-> None:
         valid_namespace: str = self._validate_namespace_identifier(namespace)
@@ -185,7 +227,6 @@ def drop_namespace(self, namespace: Union[str, Identifier]) -> None:
         except self.s3tables.exceptions.ConflictException as e:
             raise NamespaceNotEmptyError(f"Namespace {namespace} is not empty.") from e
 
-
     def drop_table(self, identifier: Union[str, Identifier]) -> None:
         namespace, table_name = self._validate_database_and_table_identifier(identifier)
         try:
@@ -197,7 +238,10 @@ def drop_table(self, identifier: Union[str, Identifier]) -> None:
 
         # TODO: handle conflicts due to versionToken mismatch that might occur
         self.s3tables.delete_table(
-            tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name, versionToken=response["versionToken"]
+            tableBucketARN=self.table_bucket_arn,
+            namespace=namespace,
+            name=table_name,
+            versionToken=response["versionToken"],
         )
diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py
index c7b9d13df8..d185bd795d 100644
--- a/tests/catalog/test_s3tables.py
+++ b/tests/catalog/test_s3tables.py
@@ -7,6 +7,7 @@
 from pyiceberg.exceptions import NoSuchTableError
 from pyiceberg.exceptions import TableBucketNotFound
 from pyiceberg.schema import Schema
+from pyiceberg.types import IntegerType
 
 
 @pytest.fixture
@@ -103,3 +104,32 @@ def test_drop_table(table_bucket_arn, database_name: str, table_name: str, table
     with pytest.raises(NoSuchTableError):
         catalog.load_table(identifier=identifier)
+
+
+def test_commit_table(table_bucket_arn, database_name: str, table_name: str, table_schema_nested: Schema):
+    # setting FileIO to FsspecFileIO explicitly is required as pyarrow does not work with S3 Table Buckets yet
+    properties = {"warehouse": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"}
+    catalog = S3TableCatalog(name="test_s3tables_catalog", **properties)
+    identifier = (database_name, table_name)
+
+    catalog.create_namespace(namespace=database_name)
+    table = catalog.create_table(identifier=identifier, schema=table_schema_nested)
+
+    last_updated_ms = table.metadata.last_updated_ms
+    original_table_metadata_location = table.metadata_location
+    original_table_last_updated_ms = table.metadata.last_updated_ms
+
+
+
+    transaction = table.transaction()
+    update = transaction.update_schema()
+    update.add_column(path="b", field_type=IntegerType())
+    update.commit()
+    transaction.commit_transaction()
+
+    updated_table_metadata = table.metadata
+    assert updated_table_metadata.current_schema_id == 1
+    assert len(updated_table_metadata.schemas) == 2
+    assert updated_table_metadata.last_updated_ms > last_updated_ms
+    assert updated_table_metadata.metadata_log[0].metadata_file == original_table_metadata_location
+    assert updated_table_metadata.metadata_log[0].timestamp_ms == original_table_last_updated_ms
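commit_table above turns a versionToken conflict into CommitFailedException, whose docstring in exceptions.py already says what to do: refresh and try again. A retry sketch in the style of the test this patch adds; the loop bound, identifier, and column name are arbitrary choices, not part of the patch:

    from pyiceberg.exceptions import CommitFailedException
    from pyiceberg.types import IntegerType

    for _ in range(3):
        try:
            # reload so the transaction starts from the latest committed metadata
            table = catalog.load_table(("example_namespace", "example_table"))
            transaction = table.transaction()
            update = transaction.update_schema()
            update.add_column(path="c", field_type=IntegerType())
            update.commit()
            transaction.commit_transaction()
            break
        except CommitFailedException:
            # a concurrent writer moved the metadata location; rebuild and retry
            continue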
From 43deab4d38a262ff9b9c99087aafa4abb8f5d1b1 Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Wed, 18 Dec 2024 21:37:10 +0100
Subject: [PATCH 07/71] feat: implement table_exists

---
 pyiceberg/catalog/s3tables.py  |  9 ++++++++-
 tests/catalog/test_s3tables.py | 12 ++++++++++++
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py
index 1fb384b21d..947157320d 100644
--- a/pyiceberg/catalog/s3tables.py
+++ b/pyiceberg/catalog/s3tables.py
@@ -294,7 +294,14 @@ def rename_table(self, from_identifier: Union[str, Identifier], to_identifier: U
         return super().rename_table(from_identifier, to_identifier)
 
     def table_exists(self, identifier: Union[str, Identifier]) -> bool:
-        return super().table_exists(identifier)
+        namespace, table_name = self._validate_database_and_table_identifier(identifier)
+        try:
+            self.s3tables.get_table(
+                tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name
+            )
+        except self.s3tables.exceptions.NotFoundException:
+            return False
+        return True
 
     def update_namespace_properties(
         self, namespace: Union[str, Identifier], removals: Optional[Set[str]] = None, updates: Properties = ...
diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py
index d185bd795d..f33d04d953 100644
--- a/tests/catalog/test_s3tables.py
+++ b/tests/catalog/test_s3tables.py
@@ -91,6 +91,18 @@ def test_create_table(table_bucket_arn, database_name: str, table_name: str, tab
     assert table == catalog.load_table(identifier)
 
 
+def test_table_exists(table_bucket_arn, database_name: str, table_name: str, table_schema_nested: Schema):
+    # setting FileIO to FsspecFileIO explicitly is required as pyarrow does not work with S3 Table Buckets yet
+    properties = {"warehouse": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"}
+    catalog = S3TableCatalog(name="test_s3tables_catalog", **properties)
+    identifier = (database_name, table_name)
+
+    catalog.create_namespace(namespace=database_name)
+    assert not catalog.table_exists(identifier=identifier)
+    catalog.create_table(identifier=identifier, schema=table_schema_nested)
+    assert catalog.table_exists(identifier=identifier)
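list_tables in the next patch reaches for boto3's paginator rather than looping over continuation tokens by hand. For comparison, a hand-rolled version of the same loop; it assumes the continuationToken field used by the S3 Tables list APIs (the paginator hides this detail), and table_bucket_arn holds the bucket ARN as in the tests, with a placeholder namespace:

    import boto3

    client = boto3.client("s3tables")

    # what get_paginator("list_tables") automates: follow the token until exhausted
    names = []
    kwargs = {"tableBucketARN": table_bucket_arn, "namespace": "example_namespace"}
    while True:
        page = client.list_tables(**kwargs)
        names.extend(table["name"] for table in page["tables"])
        if "continuationToken" not in page:
            break
        kwargs["continuationToken"] = page["continuationToken"]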
From 907ed28d66374f0133f1ec2a3f9824f30c3c90ff Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Wed, 18 Dec 2024 21:53:07 +0100
Subject: [PATCH 08/71] feat: implement list_tables

---
 pyiceberg/catalog/s3tables.py  |  7 ++++++-
 tests/catalog/test_s3tables.py | 12 ++++++++++++
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py
index 947157320d..047c689afd 100644
--- a/pyiceberg/catalog/s3tables.py
+++ b/pyiceberg/catalog/s3tables.py
@@ -253,7 +253,12 @@ def list_namespaces(self, namespace: Union[str, Identifier] = ...) -> List[Identifier]:
         # TODO: handle pagination
         response = self.s3tables.list_namespaces(tableBucketARN=self.table_bucket_arn)
         return [tuple(namespace["namespace"]) for namespace in response["namespaces"]]
 
     def list_tables(self, namespace: Union[str, Identifier]) -> List[Identifier]:
-        return super().list_tables(namespace)
+        namespace = self._validate_namespace_identifier(namespace)
+        paginator = self.s3tables.get_paginator("list_tables")
+        tables: List[str] = []
+        for page in paginator.paginate(tableBucketARN=self.table_bucket_arn, namespace=namespace):
+            tables.extend(table["name"] for table in page["tables"])
+        return tables
 
     def list_views(self, namespace: Union[str, Identifier]) -> List[Identifier]:
         return super().list_views(namespace)
diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py
index f33d04d953..e74bddc973 100644
--- a/tests/catalog/test_s3tables.py
+++ b/tests/catalog/test_s3tables.py
@@ -103,6 +103,18 @@ def test_table_exists(table_bucket_arn, database_name: str, table_name: str, tab
     assert catalog.table_exists(identifier=identifier)
 
 
+def test_list_tables(table_bucket_arn, database_name: str, table_name: str, table_schema_nested: Schema):
+    # setting FileIO to FsspecFileIO explicitly is required as pyarrow does not work with S3 Table Buckets yet
+    properties = {"warehouse": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"}
+    catalog = S3TableCatalog(name="test_s3tables_catalog", **properties)
+    identifier = (database_name, table_name)
+
+    catalog.create_namespace(namespace=database_name)
+    assert not catalog.list_tables(namespace=database_name)
+    catalog.create_table(identifier=identifier, schema=table_schema_nested)
+    assert catalog.list_tables(namespace=database_name)
+
+
 def test_drop_table(table_bucket_arn, database_name: str, table_name: str, table_schema_nested: Schema):
     # setting FileIO to FsspecFileIO explicitly is required as pyarrow does not work with S3 Table Buckets yet
     properties = {"warehouse": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"}

From 4438c8e1a9109ccac4012fdb714bf752395ebee9 Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Wed, 18 Dec 2024 21:59:14 +0100
Subject: [PATCH 09/71] refactor: improve list_namespace

---
 pyiceberg/catalog/s3tables.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py
index 047c689afd..9e223dcc7a 100644
--- a/pyiceberg/catalog/s3tables.py
+++ b/pyiceberg/catalog/s3tables.py
@@ -247,10 +247,17 @@ def drop_table(self, identifier: Union[str, Identifier]) -> None:
     def drop_view(self, identifier: Union[str, Identifier]) -> None:
         return super().drop_view(identifier)
 
-    def list_namespaces(self, namespace: Union[str, Identifier] = ...)
-> List[Identifier]: - # TODO: handle pagination - response = self.s3tables.list_namespaces(tableBucketARN=self.table_bucket_arn) - return [tuple(namespace["namespace"]) for namespace in response["namespaces"]] + def list_namespaces(self, namespace: Union[str, Identifier] = ()) -> List[Identifier]: + # TODO: s3tables only support single level namespaces + if namespace: + namespace = self._validate_namespace_identifier(namespace) + paginator = self.s3tables.get_paginator("list_namespaces") + + namespaces: List[Identifier] = [] + for page in paginator.paginate(tableBucketARN=self.table_bucket_arn): + namespaces.extend(tuple(entry["namespace"]) for entry in page["namespaces"]) + + return namespaces def list_tables(self, namespace: Union[str, Identifier]) -> List[Identifier]: namespace = self._validate_namespace_identifier(namespace) From 34996432fd9dd333ea7a84104d41fe4774fa299b Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Wed, 18 Dec 2024 21:59:50 +0100 Subject: [PATCH 10/71] fix: return Identifier from list_tables --- pyiceberg/catalog/s3tables.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 9e223dcc7a..f974c354bd 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -262,9 +262,9 @@ def list_namespaces(self, namespace: Union[str, Identifier] = ()) -> List[Identi def list_tables(self, namespace: Union[str, Identifier]) -> List[Identifier]: namespace = self._validate_namespace_identifier(namespace) paginator = self.s3tables.get_paginator("list_tables") - tables: List[str] = [] + tables: List[Identifier] = [] for page in paginator.paginate(tableBucketARN=self.table_bucket_arn, namespace=namespace): - tables.extend(table["name"] for table in page["tables"]) + tables.extend((namespace, table["name"]) for table in page["tables"]) return tables def list_views(self, namespace: Union[str, Identifier]) -> List[Identifier]: From fef4e69a5203e391ff957f9ec29d8b4dedf765f6 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sat, 21 Dec 2024 14:22:25 +0100 Subject: [PATCH 11/71] feat: implement rename table --- pyiceberg/catalog/s3tables.py | 22 ++++++++++++++++++---- tests/catalog/test_s3tables.py | 19 +++++++++++++++++++ 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index f974c354bd..54d57e347d 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -303,14 +303,28 @@ def register_table(self, identifier: Union[str, Identifier], metadata_location: return super().register_table(identifier, metadata_location) def rename_table(self, from_identifier: Union[str, Identifier], to_identifier: Union[str, Identifier]) -> Table: - return super().rename_table(from_identifier, to_identifier) + from_namespace, from_table_name = self._validate_database_and_table_identifier(from_identifier) + to_namespace, to_table_name = self._validate_database_and_table_identifier(to_identifier) + + version_token = self.s3tables.get_table( + tableBucketARN=self.table_bucket_arn, namespace=from_namespace, name=from_table_name + )["versionToken"] + + self.s3tables.rename_table( + tableBucketARN=self.table_bucket_arn, + namespace=from_namespace, + name=from_table_name, + newNamespaceName=to_namespace, + newName=to_table_name, + versionToken=version_token + ) + + return self.load_table(to_identifier) def table_exists(self, identifier: Union[str, Identifier]) -> bool: namespace, table_name = 
self._validate_database_and_table_identifier(identifier)
         try:
-            self.s3tables.get_table(
-                tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name
-            )
+            self.s3tables.get_table(tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name)
         except self.s3tables.exceptions.NotFoundException:
             return False
         return True
diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py
index e74bddc973..3750b8a357 100644
--- a/tests/catalog/test_s3tables.py
+++ b/tests/catalog/test_s3tables.py
@@ -103,6 +103,25 @@ def test_table_exists(table_bucket_arn, database_name: str, table_name: str, tab
     assert catalog.table_exists(identifier=identifier)
 
 
+def test_rename_table(table_bucket_arn, database_name: str, table_name: str, table_schema_nested: Schema):
+    # setting FileIO to FsspecFileIO explicitly is required as pyarrow does not work with S3 Table Buckets yet
+    properties = {"warehouse": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"}
+    catalog = S3TableCatalog(name="test_s3tables_catalog", **properties)
+    identifier = (database_name, table_name)
+
+    catalog.create_namespace(namespace=database_name)
+    catalog.create_table(identifier=identifier, schema=table_schema_nested)
+
+    to_database_name = f"{database_name}new"
+    to_table_name = f"{table_name}new"
+    to_identifier = (to_database_name, to_table_name)
+    catalog.create_namespace(namespace=to_database_name)
+    catalog.rename_table(from_identifier=identifier, to_identifier=to_identifier)
+
+    assert not catalog.table_exists(identifier=identifier)
+    assert catalog.table_exists(identifier=to_identifier)
+
+
 def test_list_tables(table_bucket_arn, database_name: str, table_name: str, table_schema_nested: Schema):
     # setting FileIO to FsspecFileIO explicitly is required as pyarrow does not work with S3 Table Buckets yet
     properties = {"warehouse": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"}

From 85d9bf54792472c4cc6df944d89517d9ac89f58d Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Sat, 21 Dec 2024 14:32:03 +0100
Subject: [PATCH 12/71] feat: implement load_namespace_properties

---
 pyiceberg/catalog/s3tables.py  | 11 +++++++++--
 tests/catalog/test_s3tables.py |  9 +++++++--
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py
index 54d57e347d..c77651444b 100644
--- a/pyiceberg/catalog/s3tables.py
+++ b/pyiceberg/catalog/s3tables.py
@@ -271,7 +271,14 @@ def list_views(self, namespace: Union[str, Identifier]) -> List[Identifier]:
         return super().list_views(namespace)
 
     def load_namespace_properties(self, namespace: Union[str, Identifier]) -> Properties:
-        return super().load_namespace_properties(namespace)
+        namespace = self._validate_namespace_identifier(namespace)
+        response = self.s3tables.get_namespace(tableBucketARN=self.table_bucket_arn, namespace=namespace)
+        return {
+            "namespace": response["namespace"],
+            "createdAt": response["createdAt"],
+            "createdBy": response["createdBy"],
+            "ownerAccountId": response["ownerAccountId"],
+        }
 
     def load_table(self, identifier: Union[str, Identifier]) -> Table:
         namespace, table_name = self._validate_database_and_table_identifier(identifier)
@@ -316,7 +323,7 @@ def rename_table(self, from_identifier: Union[str, Identifier], to_identifier: U
             newName=to_table_name,
-            versionToken=version_token
+            versionToken=version_token,
         )
 
         return self.load_table(to_identifier)
diff --git a/tests/catalog/test_s3tables.py
b/tests/catalog/test_s3tables.py index 3750b8a357..5b7393b3a0 100644 --- a/tests/catalog/test_s3tables.py +++ b/tests/catalog/test_s3tables.py @@ -70,6 +70,13 @@ def test_create_namespace(table_bucket_arn, database_name: str): assert (database_name,) in namespaces +def test_load_namespace_properties(table_bucket_arn, database_name: str): + properties = {"warehouse": table_bucket_arn} + catalog = S3TableCatalog(name="test_s3tables_catalog", **properties) + catalog.create_namespace(namespace=database_name) + assert database_name in catalog.load_namespace_properties(database_name)["namespace"] + + def test_drop_namespace(table_bucket_arn, database_name: str): properties = {"warehouse": table_bucket_arn} catalog = S3TableCatalog(name="test_s3tables_catalog", **properties) @@ -162,8 +169,6 @@ def test_commit_table(table_bucket_arn, database_name: str, table_name: str, tab original_table_metadata_location = table.metadata_location original_table_last_updated_ms = table.metadata.last_updated_ms - - transaction = table.transaction() update = transaction.update_schema() update.add_column(path="b", field_type=IntegerType()) From 1360ead9c05e2f7c2d6566a582c287bf1a14b9c4 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sat, 21 Dec 2024 14:38:07 +0100 Subject: [PATCH 13/71] refactor: move some methods around --- pyiceberg/catalog/s3tables.py | 47 ++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index c77651444b..2370a5099c 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -10,7 +10,6 @@ from pyiceberg.catalog import DEPRECATED_BOTOCORE_SESSION from pyiceberg.catalog import WAREHOUSE_LOCATION -from pyiceberg.catalog import Catalog from pyiceberg.catalog import MetastoreCatalog from pyiceberg.catalog import PropertiesUpdateSummary from pyiceberg.exceptions import CommitFailedException @@ -157,6 +156,7 @@ def _validate_database_and_table_identifier(self, identifier: Union[str, Identif namespace = self._validate_namespace_identifier(namespace) if not pattern.fullmatch(table_name): + # TODO: raise proper errors for invalid table_name ... 
return namespace, table_name @@ -209,16 +209,6 @@ def create_table( return self.load_table(identifier=identifier) - def create_table_transaction( - self, - identifier: Union[str, Identifier], - schema: Union[Schema, "pa.Schema"], - location: Optional[str] = None, - partition_spec: PartitionSpec = ..., - sort_order: SortOrder = ..., - properties: Properties = ..., - ) -> CreateTableTransaction: - return super().create_table_transaction(identifier, schema, location, partition_spec, sort_order, properties) def drop_namespace(self, namespace: Union[str, Identifier]) -> None: namespace = self._validate_namespace_identifier(namespace) @@ -244,9 +234,6 @@ def drop_table(self, identifier: Union[str, Identifier]) -> None: versionToken=response["versionToken"], ) - def drop_view(self, identifier: Union[str, Identifier]) -> None: - return super().drop_view(identifier) - def list_namespaces(self, namespace: Union[str, Identifier] = ()) -> List[Identifier]: # TODO: s3tables only support single level namespaces if namespace: @@ -267,9 +254,6 @@ def list_tables(self, namespace: Union[str, Identifier]) -> List[Identifier]: tables.extend((namespace, table["name"]) for table in page["tables"]) return tables - def list_views(self, namespace: Union[str, Identifier]) -> List[Identifier]: - return super().list_views(namespace) - def load_namespace_properties(self, namespace: Union[str, Identifier]) -> Properties: namespace = self._validate_namespace_identifier(namespace) response = self.s3tables.get_namespace(tableBucketARN=self.table_bucket_arn, namespace=namespace) @@ -303,12 +287,6 @@ def load_table(self, identifier: Union[str, Identifier]) -> Table: catalog=self, ) - def purge_table(self, identifier: Union[str, Identifier]) -> None: - return super().purge_table(identifier) - - def register_table(self, identifier: Union[str, Identifier], metadata_location: str) -> Table: - return super().register_table(identifier, metadata_location) - def rename_table(self, from_identifier: Union[str, Identifier], to_identifier: Union[str, Identifier]) -> Table: from_namespace, from_table_name = self._validate_database_and_table_identifier(from_identifier) to_namespace, to_table_name = self._validate_database_and_table_identifier(to_identifier) @@ -340,3 +318,26 @@ def update_namespace_properties( self, namespace: Union[str, Identifier], removals: Optional[Set[str]] = None, updates: Properties = ... 
    ) -> PropertiesUpdateSummary:
         return super().update_namespace_properties(namespace, removals, updates)
+
+    def create_table_transaction(
+        self,
+        identifier: Union[str, Identifier],
+        schema: Union[Schema, "pa.Schema"],
+        location: Optional[str] = None,
+        partition_spec: PartitionSpec = ...,
+        sort_order: SortOrder = ...,
+        properties: Properties = ...,
+    ) -> CreateTableTransaction:
+        return super().create_table_transaction(identifier, schema, location, partition_spec, sort_order, properties)
+
+    def purge_table(self, identifier: Union[str, Identifier]) -> None:
+        return super().purge_table(identifier)
+
+    def register_table(self, identifier: Union[str, Identifier], metadata_location: str) -> Table:
+        return super().register_table(identifier, metadata_location)
+
+    def drop_view(self, identifier: Union[str, Identifier]) -> None:
+        return super().drop_view(identifier)
+
+    def list_views(self, namespace: Union[str, Identifier]) -> List[Identifier]:
+        return super().list_views(namespace)

From 695107696dfa6c7ea870e65e79f3a43b60d46399 Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Sat, 21 Dec 2024 14:39:04 +0100
Subject: [PATCH 14/71] feat: raise NotImplementedError for views functionality

---
 pyiceberg/catalog/s3tables.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py
index 2370a5099c..bb0007b796 100644
--- a/pyiceberg/catalog/s3tables.py
+++ b/pyiceberg/catalog/s3tables.py
@@ -337,7 +337,7 @@ def register_table(self, identifier: Union[str, Identifier], metadata_location:
         return super().register_table(identifier, metadata_location)

     def drop_view(self, identifier: Union[str, Identifier]) -> None:
-        return super().drop_view(identifier)
+        raise NotImplementedError

     def list_views(self, namespace: Union[str, Identifier]) -> List[Identifier]:
-        return super().list_views(namespace)
+        raise NotImplementedError

From 28ef8427a6a9d47ca13045d7a8111928920355f5 Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Sat, 21 Dec 2024 14:43:11 +0100
Subject: [PATCH 15/71] feat: raise NotImplementedError for purge_table

---
 pyiceberg/catalog/s3tables.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py
index bb0007b796..c91f76eb08 100644
--- a/pyiceberg/catalog/s3tables.py
+++ b/pyiceberg/catalog/s3tables.py
@@ -331,7 +331,9 @@ def create_table_transaction(
         return super().create_table_transaction(identifier, schema, location, partition_spec, sort_order, properties)

     def purge_table(self, identifier: Union[str, Identifier]) -> None:
-        return super().purge_table(identifier)
+        # purge is not supported as s3tables doesn't support delete operations
+        # table maintenance is automated
+        raise NotImplementedError

     def register_table(self, identifier: Union[str, Identifier], metadata_location: str) -> Table:
         return super().register_table(identifier, metadata_location)

From ff168f21dcc3377d36f837c7a66db0dab41cb215 Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Sat, 21 Dec 2024 14:52:50 +0100
Subject: [PATCH 16/71] feat: raise NotImplementedError for update_namespace_properties

---
 pyiceberg/catalog/s3tables.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py
index c91f76eb08..058d2acc59 100644
--- a/pyiceberg/catalog/s3tables.py
+++ b/pyiceberg/catalog/s3tables.py
@@ -315,9 +315,10 @@ def table_exists(self, identifier: Union[str, Identifier]) -> bool:
         return True

     def update_namespace_properties(
-        self, namespace: Union[str, Identifier], removals: Optional[Set[str]] = None, updates: Properties = ...
+        self, namespace: Union[str, Identifier], removals: Optional[Set[str]] = None, updates: Properties = EMPTY_DICT
     ) -> PropertiesUpdateSummary:
-        return super().update_namespace_properties(namespace, removals, updates)
+        # namespace properties are read only
+        raise NotImplementedError

     def create_table_transaction(
         self,

From cf244bd2fb663aee3644b840872b869954c7ee22 Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Sat, 21 Dec 2024 15:02:57 +0100
Subject: [PATCH 17/71] feat: raise NotImplementedError for register_table

---
 pyiceberg/catalog/s3tables.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py
index 058d2acc59..3c4a1efda7 100644
--- a/pyiceberg/catalog/s3tables.py
+++ b/pyiceberg/catalog/s3tables.py
@@ -337,7 +337,7 @@ def purge_table(self, identifier: Union[str, Identifier]) -> None:
         raise NotImplementedError

     def register_table(self, identifier: Union[str, Identifier], metadata_location: str) -> Table:
-        return super().register_table(identifier, metadata_location)
+        raise NotImplementedError

From f1de32b754d7ee90706852d4dbee146166446e69 Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Sat, 21 Dec 2024 15:10:51 +0100
Subject: [PATCH 18/71] fix: don't override create_table_transaction

---
 pyiceberg/catalog/s3tables.py | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py
index 3c4a1efda7..555e9783e0 100644
--- a/pyiceberg/catalog/s3tables.py
+++ b/pyiceberg/catalog/s3tables.py
@@ -320,17 +320,6 @@ def update_namespace_properties(
         # namespace properties are read only
         raise NotImplementedError

-    def create_table_transaction(
-        self,
-        identifier: Union[str, Identifier],
-        schema: Union[Schema, "pa.Schema"],
-        location: Optional[str] = None,
-        partition_spec: PartitionSpec = ...,
-        sort_order: SortOrder = ...,
-        properties: Properties = ...,
-    ) -> CreateTableTransaction:
-        return super().create_table_transaction(identifier, schema, location, partition_spec, sort_order, properties)
-
     def purge_table(self, identifier: Union[str, Identifier]) -> None:
         # purge is not supported as s3tables doesn't support delete operations
         # table maintenance is automated
         raise NotImplementedError

From 9fbc40249de9eefa1ac6342b5c9db2ad8ff67477 Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Sat, 21 Dec 2024 15:15:57 +0100
Subject: [PATCH 19/71] chore: run formatter

---
 pyiceberg/catalog/s3tables.py  | 68 ++++++++++------------------------
 pyiceberg/exceptions.py        |  3 ++
 tests/catalog/test_s3tables.py |  7 +---
 3 files changed, 25 insertions(+), 53 deletions(-)

diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py
index 555e9783e0..b4aa7283b5 100644
--- a/pyiceberg/catalog/s3tables.py
+++ b/pyiceberg/catalog/s3tables.py
@@ -1,43 +1,24 @@
 import re
-from typing import TYPE_CHECKING
-from typing import List
-from typing import Optional
-from typing import Set
-from typing import Tuple
-from typing import Union
+from typing import TYPE_CHECKING, List, Optional, Set, Tuple, Union

 import boto3

-from pyiceberg.catalog import DEPRECATED_BOTOCORE_SESSION
-from pyiceberg.catalog import WAREHOUSE_LOCATION
-from pyiceberg.catalog import MetastoreCatalog
-from pyiceberg.catalog import PropertiesUpdateSummary
-from pyiceberg.exceptions import CommitFailedException
-from pyiceberg.exceptions import NamespaceNotEmptyError
-from pyiceberg.exceptions import NoSuchTableError
-from pyiceberg.exceptions import S3TablesError
-from pyiceberg.exceptions import TableBucketNotFound
-from pyiceberg.io import AWS_ACCESS_KEY_ID
-from pyiceberg.io import AWS_REGION
-from pyiceberg.io import AWS_SECRET_ACCESS_KEY
-from pyiceberg.io import AWS_SESSION_TOKEN
-from pyiceberg.io import PY_IO_IMPL
-from pyiceberg.io import load_file_io
-from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC
-from pyiceberg.partitioning import PartitionSpec
+from pyiceberg.catalog import DEPRECATED_BOTOCORE_SESSION, WAREHOUSE_LOCATION, MetastoreCatalog, PropertiesUpdateSummary
+from pyiceberg.exceptions import (
+    CommitFailedException,
+    NamespaceNotEmptyError,
+    NoSuchTableError,
+    TableBucketNotFound,
+)
+from pyiceberg.io import AWS_ACCESS_KEY_ID, AWS_REGION, AWS_SECRET_ACCESS_KEY, AWS_SESSION_TOKEN, PY_IO_IMPL, load_file_io
+from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec
 from pyiceberg.schema import Schema
 from pyiceberg.serializers import FromInputFile
-from pyiceberg.table import CommitTableResponse
-from pyiceberg.table import CreateTableTransaction
-from pyiceberg.table import Table
-from pyiceberg.table import TableRequirement
+from pyiceberg.table import CommitTableResponse, Table, TableRequirement
 from pyiceberg.table.metadata import new_table_metadata
-from pyiceberg.table.sorting import UNSORTED_SORT_ORDER
-from pyiceberg.table.sorting import SortOrder
+from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder
 from pyiceberg.table.update import TableUpdate
-from pyiceberg.typedef import EMPTY_DICT
-from pyiceberg.typedef import Identifier
-from pyiceberg.typedef import Properties
+from pyiceberg.typedef import EMPTY_DICT, Identifier, Properties
 from pyiceberg.utils.properties import get_first_property_value
@@ -66,9 +47,7 @@ def __init__(self, name: str, **properties: str):
             region_name=get_first_property_value(properties, S3TABLES_REGION, AWS_REGION),
             botocore_session=properties.get(DEPRECATED_BOTOCORE_SESSION),
             aws_access_key_id=get_first_property_value(properties, S3TABLES_ACCESS_KEY_ID, AWS_ACCESS_KEY_ID),
-            aws_secret_access_key=get_first_property_value(
-                properties, S3TABLES_SECRET_ACCESS_KEY, AWS_SECRET_ACCESS_KEY
-            ),
+            aws_secret_access_key=get_first_property_value(properties, S3TABLES_SECRET_ACCESS_KEY, AWS_SECRET_ACCESS_KEY),
             aws_session_token=get_first_property_value(properties, S3TABLES_SESSION_TOKEN, AWS_SESSION_TOKEN),
         )
@@ -91,16 +70,14 @@ def commit_table(
         # the table can change between those two API calls without noticing
         # -> change this into an internal API that returns table information along with versionToken
         current_table = self.load_table(identifier=table_identifier)
-        version_token = self.s3tables.get_table(
-            tableBucketARN=self.table_bucket_arn, namespace=database_name, name=table_name
-        )["versionToken"]
+        version_token = self.s3tables.get_table(tableBucketARN=self.table_bucket_arn, namespace=database_name, name=table_name)[
+            "versionToken"
+        ]

         updated_staged_table = self._update_and_stage_table(current_table, table_identifier, requirements, updates)
         if current_table and updated_staged_table.metadata == current_table.metadata:
             # no changes, do nothing
-            return CommitTableResponse(
-                metadata=current_table.metadata, metadata_location=current_table.metadata_location
-            )
+            return CommitTableResponse(metadata=current_table.metadata, metadata_location=current_table.metadata_location)

         self._write_metadata(
             metadata=updated_staged_table.metadata,
@@ -175,9 +152,7 @@ def create_table(
         schema: Schema = self._convert_schema_if_needed(schema)  # type: ignore

         # TODO: check whether namespace exists and if it does, whether table_name already exists
-        self.s3tables.create_table(
-            tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name, format="ICEBERG"
-        )
+        self.s3tables.create_table(tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name, format="ICEBERG")

         # location is given by s3 table bucket
         response = self.s3tables.get_table(tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name)
@@ -209,7 +184,6 @@ def create_table(

         return self.load_table(identifier=identifier)

-
     def drop_namespace(self, namespace: Union[str, Identifier]) -> None:
         namespace = self._validate_namespace_identifier(namespace)
         try:
@@ -220,9 +194,7 @@ def drop_namespace(self, namespace: Union[str, Identifier]) -> None:
     def drop_table(self, identifier: Union[str, Identifier]) -> None:
         namespace, table_name = self._validate_database_and_table_identifier(identifier)
         try:
-            response = self.s3tables.get_table(
-                tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name
-            )
+            response = self.s3tables.get_table(tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name)
         except self.s3tables.exceptions.NotFoundException as e:
             raise NoSuchTableError(f"No table with identifier {identifier} exists.") from e

diff --git a/pyiceberg/exceptions.py b/pyiceberg/exceptions.py
index 6096096f45..f633e4c143 100644
--- a/pyiceberg/exceptions.py
+++ b/pyiceberg/exceptions.py
@@ -111,12 +111,15 @@ class ConditionalCheckFailedException(DynamoDbError):
 class GenericDynamoDbError(DynamoDbError):
     pass

+
 class S3TablesError(Exception):
     pass

+
 class TableBucketNotFound(S3TablesError):
     pass

+
 class CommitFailedException(Exception):
     """Commit failed, refresh and try again."""

diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py
index 5b7393b3a0..2cbdca15c3 100644
--- a/tests/catalog/test_s3tables.py
+++ b/tests/catalog/test_s3tables.py
@@ -4,8 +4,7 @@
 import pytest

 from pyiceberg.catalog.s3tables import S3TableCatalog
-from pyiceberg.exceptions import NoSuchTableError
-from pyiceberg.exceptions import TableBucketNotFound
+from pyiceberg.exceptions import NoSuchTableError, TableBucketNotFound
 from pyiceberg.schema import Schema
 from pyiceberg.types import IntegerType
@@ -35,9 +34,7 @@ def test_s3tables_api_raises_on_conflicting_version_tokens(table_bucket_arn, dat
     client = boto3.client("s3tables")
     client.create_namespace(tableBucketARN=table_bucket_arn, namespace=[database_name])
-    response = client.create_table(
-        tableBucketARN=table_bucket_arn, namespace=database_name, name=table_name, format="ICEBERG"
-    )
+    response = client.create_table(tableBucketARN=table_bucket_arn, namespace=database_name, name=table_name, format="ICEBERG")
     version_token = response["versionToken"]
     scrambled_version_token = version_token[::-1]

From 4639e1e78ee13f9c2d05999706d0a95625c24955 Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Mon, 23 Dec 2024 14:19:27 +0100
Subject: [PATCH 20/71] feat: raise exceptions if boto3 doesn't support s3tables

---
 pyiceberg/catalog/s3tables.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py
index b4aa7283b5..b8009191b2 100644
--- a/pyiceberg/catalog/s3tables.py
+++ b/pyiceberg/catalog/s3tables.py
@@ -2,6 +2,7 @@
 from typing import TYPE_CHECKING, List, Optional, Set, Tuple, Union

 import boto3
+from boto3.session import UnknownServiceError

 from pyiceberg.catalog import DEPRECATED_BOTOCORE_SESSION, WAREHOUSE_LOCATION, MetastoreCatalog, PropertiesUpdateSummary
 from pyiceberg.exceptions import (
@@ -9,6 +10,7 @@
     NamespaceNotEmptyError,
     NoSuchTableError,
     TableBucketNotFound,
+    S3TablesError
 )
 from pyiceberg.io import AWS_ACCESS_KEY_ID, AWS_REGION, AWS_SECRET_ACCESS_KEY, AWS_SESSION_TOKEN, PY_IO_IMPL, load_file_io
 from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec
@@ -50,9 +52,11 @@ def __init__(self, name: str, **properties: str):
             aws_secret_access_key=get_first_property_value(properties, S3TABLES_SECRET_ACCESS_KEY, AWS_SECRET_ACCESS_KEY),
             aws_session_token=get_first_property_value(properties, S3TABLES_SESSION_TOKEN, AWS_SESSION_TOKEN),
         )
-        # TODO: s3tables client only supported from boto3>=1.35.74 so this can crash
-        # TODO: set a custom user-agent for api calls like the Java implementation
-        self.s3tables = session.client("s3tables")
+        try:
+            self.s3tables = session.client("s3tables")
+        except UnknownServiceError as e:
+            raise S3TablesError(f"'s3tables' requires boto3>=1.35.74. Current version: {boto3.__version__}.") from e
+
         # TODO: handle malformed properties instead of just raising a key error here
         self.table_bucket_arn = self.properties[WAREHOUSE_LOCATION]
         try:

From 6b4d258e61d8a681c2345cb99e0fbc8a89faa952 Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Mon, 23 Dec 2024 14:24:55 +0100
Subject: [PATCH 21/71] feat: make endpoint configurable

---
 pyiceberg/catalog/s3tables.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py
index b8009191b2..ab3e77a016 100644
--- a/pyiceberg/catalog/s3tables.py
+++ b/pyiceberg/catalog/s3tables.py
@@ -53,7 +53,7 @@ def __init__(self, name: str, **properties: str):
             aws_session_token=get_first_property_value(properties, S3TABLES_SESSION_TOKEN, AWS_SESSION_TOKEN),
         )
         try:
-            self.s3tables = session.client("s3tables")
+            self.s3tables = session.client("s3tables", endpoint_url=properties.get(S3TABLES_ENDPOINT))
         except UnknownServiceError as e:
             raise S3TablesError(f"'s3tables' requires boto3>=1.35.74. Current version: {boto3.__version__}.") from e

From ed0cba62c0b13338c784ecc52507dbcd50e35c61 Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Mon, 23 Dec 2024 14:34:53 +0100
Subject: [PATCH 22/71] feat: explicitly configure tableBucketARN

---
 pyiceberg/catalog/s3tables.py  |  6 ++--
 tests/catalog/test_s3tables.py | 50 +++++++++++-----------------------
 2 files changed, 20 insertions(+), 36 deletions(-)

diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py
index ab3e77a016..4675a9cf76 100644
--- a/pyiceberg/catalog/s3tables.py
+++ b/pyiceberg/catalog/s3tables.py
@@ -32,6 +32,8 @@
 S3TABLES_SECRET_ACCESS_KEY = "s3tables.secret-access-key"
 S3TABLES_SESSION_TOKEN = "s3tables.session-token"

+S3TABLES_TABLE_BUCKET_ARN = "s3tables.table-bucket-arn"
+
 S3TABLES_ENDPOINT = "s3tables.endpoint"

 # pyarrow does not support writing to S3 Table buckets as of 2024-12-14 https://github.com/apache/iceberg-python/issues/1404#issuecomment-2543174146
@@ -44,6 +46,8 @@ def __init__(self, name: str, **properties: str):
         # TODO: implement a proper check for FileIO
         self.properties[PY_IO_IMPL] = S3TABLES_FILE_IO_DEFAULT

+        self.table_bucket_arn = self.properties[S3TABLES_TABLE_BUCKET_ARN]
+
         session = boto3.Session(
             profile_name=properties.get(S3TABLES_PROFILE_NAME),
             region_name=get_first_property_value(properties, S3TABLES_REGION, AWS_REGION),
@@ -57,8 +61,6 @@ def __init__(self, name: str, **properties: str):
         except UnknownServiceError as e:
             raise S3TablesError(f"'s3tables' requires boto3>=1.35.74. Current version: {boto3.__version__}.") from e

-        # TODO: handle malformed properties instead of just raising a key error here
-        self.table_bucket_arn = self.properties[WAREHOUSE_LOCATION]
         try:
             self.s3tables.get_table_bucket(tableBucketARN=self.table_bucket_arn)
         except self.s3tables.exceptions.NotFoundException as e:
diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py
index 2cbdca15c3..67cdeb21c8 100644
--- a/tests/catalog/test_s3tables.py
+++ b/tests/catalog/test_s3tables.py
@@ -30,6 +30,12 @@ def table_bucket_arn():
     return os.environ["ARN"]

+@pytest.fixture
+def catalog(table_bucket_arn):
+    # setting FileIO to FsspecFileIO explicitly is required as pyarrow does not work with S3 Table Buckets yet
+    properties = {"s3tables.table-bucket-arn": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"}
+    return S3TableCatalog(name="test_s3tables_catalog", **properties)
+
 def test_s3tables_api_raises_on_conflicting_version_tokens(table_bucket_arn, database_name, table_name):
@@ -54,39 +60,30 @@ def test_s3tables_api_raises_on_conflicting_version_tokens(table_bucket_arn, dat
     )

 def test_creating_catalog_validates_s3_table_bucket_exists(table_bucket_arn):
-    properties = {"warehouse": f"{table_bucket_arn}-modified"}
+    properties = {"s3tables.table-bucket-arn": f"{table_bucket_arn}-modified"}
     with pytest.raises(TableBucketNotFound):
         S3TableCatalog(name="test_s3tables_catalog", **properties)

-def test_create_namespace(table_bucket_arn, database_name: str):
-    properties = {"warehouse": table_bucket_arn}
-    catalog = S3TableCatalog(name="test_s3tables_catalog", **properties)
+def test_create_namespace(catalog, database_name: str):
     catalog.create_namespace(namespace=database_name)
     namespaces = catalog.list_namespaces()
     assert (database_name,) in namespaces

-def test_load_namespace_properties(table_bucket_arn, database_name: str):
-    properties = {"warehouse": table_bucket_arn}
-    catalog = S3TableCatalog(name="test_s3tables_catalog", **properties)
+def test_load_namespace_properties(catalog, database_name: str):
     catalog.create_namespace(namespace=database_name)
     assert database_name in catalog.load_namespace_properties(database_name)["namespace"]

-def test_drop_namespace(table_bucket_arn, database_name: str):
-    properties = {"warehouse": table_bucket_arn}
-    catalog = S3TableCatalog(name="test_s3tables_catalog", **properties)
+def test_drop_namespace(catalog, database_name: str):
     catalog.create_namespace(namespace=database_name)
     assert (database_name,) in catalog.list_namespaces()
     catalog.drop_namespace(namespace=database_name)
     assert (database_name,) not in catalog.list_namespaces()

-def test_create_table(table_bucket_arn, database_name: str, table_name: str, table_schema_nested: Schema):
-    # setting FileIO to FsspecFileIO explicitly is required as pyarrwo does not work with S3 Table Buckets yet
-    properties = {"warehouse": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"}
-    catalog = S3TableCatalog(name="test_s3tables_catalog", **properties)
+def test_create_table(catalog, database_name: str, table_name: str, table_schema_nested: Schema):
     identifier = (database_name, table_name)

     catalog.create_namespace(namespace=database_name)
@@ -95,10 +92,7 @@ def test_create_table(database_name: str, table_name: str, tab
     assert table == catalog.load_table(identifier)

-def test_table_exists(table_bucket_arn, database_name: str, table_name: str, table_schema_nested: Schema):
-    # setting FileIO to FsspecFileIO explicitly is required as pyarrwo does not work with S3 Table Buckets yet
-    properties = {"warehouse": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"}
-    catalog = S3TableCatalog(name="test_s3tables_catalog", **properties)
+def test_table_exists(catalog, database_name: str, table_name: str, table_schema_nested: Schema):
     identifier = (database_name, table_name)

     catalog.create_namespace(namespace=database_name)
@@ -107,10 +101,7 @@ def test_table_exists(database_name: str, table_name: str, tab
     assert catalog.table_exists(identifier=identifier)

-def test_rename_table(table_bucket_arn, database_name: str, table_name: str, table_schema_nested: Schema):
-    # setting FileIO to FsspecFileIO explicitly is required as pyarrwo does not work with S3 Table Buckets yet
-    properties = {"warehouse": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"}
-    catalog = S3TableCatalog(name="test_s3tables_catalog", **properties)
+def test_rename_table(catalog, database_name: str, table_name: str, table_schema_nested: Schema):
     identifier = (database_name, table_name)

     catalog.create_namespace(namespace=database_name)
@@ -126,10 +117,7 @@ def test_rename_table(database_name: str, table_name: str, tab
     assert catalog.table_exists(identifier=to_identifier)

-def test_list_tables(table_bucket_arn, database_name: str, table_name: str, table_schema_nested: Schema):
-    # setting FileIO to FsspecFileIO explicitly is required as pyarrwo does not work with S3 Table Buckets yet
-    properties = {"warehouse": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"}
-    catalog = S3TableCatalog(name="test_s3tables_catalog", **properties)
+def test_list_tables(catalog, database_name: str, table_name: str, table_schema_nested: Schema):
     identifier = (database_name, table_name)

     catalog.create_namespace(namespace=database_name)
@@ -138,10 +126,7 @@ def test_list_tables(database_name: str, table_name: str, tabl
     assert catalog.list_tables(namespace=database_name)

-def test_drop_table(table_bucket_arn, database_name: str, table_name: str, table_schema_nested: Schema):
-    # setting FileIO to FsspecFileIO explicitly is required as pyarrwo does not work with S3 Table Buckets yet
-    properties = {"warehouse": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"}
-    catalog = S3TableCatalog(name="test_s3tables_catalog", **properties)
+def test_drop_table(catalog, database_name: str, table_name: str, table_schema_nested: Schema):
     identifier = (database_name, table_name)

     catalog.create_namespace(namespace=database_name)
@@ -153,10 +138,7 @@ def test_drop_table(database_name: str, table_name: str, table
     catalog.load_table(identifier=identifier)

-def test_commit_table(table_bucket_arn, database_name: str, table_name: str, table_schema_nested: Schema):
-    # setting FileIO to FsspecFileIO explicitly is required as pyarrwo does not work with S3 Table Buckets yet
-    properties = {"warehouse": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"}
-    catalog = S3TableCatalog(name="test_s3tables_catalog", **properties)
+def test_commit_table(catalog, database_name: str, table_name: str, table_schema_nested: Schema):
     identifier = (database_name, table_name)

     catalog.create_namespace(namespace=database_name)

From 49355e2b9577c25e7639ed39ab364b9fb3004464 Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Mon, 23 Dec 2024 14:37:04 +0100
Subject: [PATCH 23/71] fix: remove defaulting to FsspecIO

---
 pyiceberg/catalog/s3tables.py  | 5 -----
 tests/catalog/test_s3tables.py | 2 +-
 2 files changed, 1 insertion(+), 6 deletions(-)

diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py
index 4675a9cf76..b8d4ebf672 100644
--- a/pyiceberg/catalog/s3tables.py
+++ b/pyiceberg/catalog/s3tables.py
@@ -36,15 +36,10 @@
 S3TABLES_ENDPOINT = "s3tables.endpoint"

-# pyarrow does not support writing to S3 Table buckets as of 2024-12-14 https://github.com/apache/iceberg-python/issues/1404#issuecomment-2543174146
-S3TABLES_FILE_IO_DEFAULT = "pyiceberg.io.fsspec.FsspecFileIO"

 class S3TableCatalog(MetastoreCatalog):
     def __init__(self, name: str, **properties: str):
         super().__init__(name, **properties)
-        # TODO: implement a proper check for FileIO
-        self.properties[PY_IO_IMPL] = S3TABLES_FILE_IO_DEFAULT

         self.table_bucket_arn = self.properties[S3TABLES_TABLE_BUCKET_ARN]

diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py
index 67cdeb21c8..c761ab79ad 100644
--- a/tests/catalog/test_s3tables.py
+++ b/tests/catalog/test_s3tables.py
@@ -32,7 +32,7 @@ def table_bucket_arn():

 @pytest.fixture
 def catalog(table_bucket_arn):
-    # setting FileIO to FsspecFileIO explicitly is required as pyarrow does not work with S3 Table Buckets yet
+    # pyarrow does not support writing to S3 Table buckets as of 2024-12-14 https://github.com/apache/iceberg-python/issues/1404#issuecomment-2543174146
     properties = {"s3tables.table-bucket-arn": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"}
     return S3TableCatalog(name="test_s3tables_catalog", **properties)

From 61c4e9cde5ec35a0c60c6a14b6f97c37d2d08a09 Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Mon, 23 Dec 2024 14:55:12 +0100
Subject: [PATCH 24/71] feat: raise exceptions for invalid namespace/table name

---
 pyiceberg/catalog/s3tables.py | 31 ++++++++++++-------------------
 pyiceberg/exceptions.py       |  7 +++++++
 2 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py
index b8d4ebf672..7a3c56ae01 100644
--- a/pyiceberg/catalog/s3tables.py
+++ b/pyiceberg/catalog/s3tables.py
@@ -4,13 +4,15 @@
 import boto3
 from boto3.session import UnknownServiceError

-from pyiceberg.catalog import DEPRECATED_BOTOCORE_SESSION, WAREHOUSE_LOCATION, MetastoreCatalog, PropertiesUpdateSummary
+from pyiceberg.catalog import DEPRECATED_BOTOCORE_SESSION, MetastoreCatalog, PropertiesUpdateSummary
 from pyiceberg.exceptions import (
     CommitFailedException,
     NamespaceNotEmptyError,
     NoSuchTableError,
     TableBucketNotFound,
-    S3TablesError
+    S3TablesError,
+    InvalidNamespaceName,
+    InvalidTableName,
 )
 from pyiceberg.io import AWS_ACCESS_KEY_ID, AWS_REGION, AWS_SECRET_ACCESS_KEY, AWS_SESSION_TOKEN, PY_IO_IMPL, load_file_io
 from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec
@@ -36,6 +38,10 @@
 S3TABLES_ENDPOINT = "s3tables.endpoint"

+# for naming rules see: https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-tables-buckets-naming.html
+S3TABLES_VALID_NAME_REGEX = re.compile("[a-z0-9][a-z0-9_]{2,62}")
+S3TABLES_RESERVED_NAMESPACE = "aws_s3_metadata"
+

 class S3TableCatalog(MetastoreCatalog):
     def __init__(self, name: str, **properties: str):
@@ -109,33 +115,20 @@ def create_namespace(self, namespace: Union[str, Identifier], properties: Proper
         self.s3tables.create_namespace(tableBucketARN=self.table_bucket_arn, namespace=[valid_namespace])

     def _validate_namespace_identifier(self, namespace: Union[str, Identifier]) -> str:
-        # for naming rules see: https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-tables-buckets-naming.html
-        # TODO: extract into constant variables
-        pattern = re.compile("[a-z0-9][a-z0-9_]{2,62}")
-        reserved = "aws_s3_metadata"
-
         namespace = self.identifier_to_database(namespace)

-        if not pattern.fullmatch(namespace):
-            ...
-
-        if namespace == reserved:
-            ...
+        if not S3TABLES_VALID_NAME_REGEX.fullmatch(namespace) or namespace == S3TABLES_RESERVED_NAMESPACE:
+            raise InvalidNamespaceName("The specified namespace name is not valid.")

         return namespace

     def _validate_database_and_table_identifier(self, identifier: Union[str, Identifier]) -> Tuple[str, str]:
-        # for naming rules see: https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-tables-buckets-naming.html
-        # TODO: extract into constant variables
-        pattern = re.compile("[a-z0-9][a-z0-9_]{2,62}")
-
         namespace, table_name = self.identifier_to_database_and_table(identifier)

         namespace = self._validate_namespace_identifier(namespace)

-        if not pattern.fullmatch(table_name):
-            ...
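+        # table names follow the same naming rules as namespaces: 3 to 63
+        # characters of [a-z0-9_], starting with a lowercase letter or a digit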
+        if not S3TABLES_VALID_NAME_REGEX.fullmatch(table_name):
+            raise InvalidTableName("The specified table name is not valid.")

         return namespace, table_name

diff --git a/pyiceberg/exceptions.py b/pyiceberg/exceptions.py
index f633e4c143..245fffd3c1 100644
--- a/pyiceberg/exceptions.py
+++ b/pyiceberg/exceptions.py
@@ -115,6 +115,13 @@ class GenericDynamoDbError(DynamoDbError):
 class S3TablesError(Exception):
     pass

+class InvalidNamespaceName(S3TablesError):
+    pass
+
+
+class InvalidTableName(S3TablesError):
+    pass
+
 class TableBucketNotFound(S3TablesError):
     pass

From a164e7737e6a55e0ea9d88082e19f58a7605113b Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Sun, 29 Dec 2024 14:48:10 +0100
Subject: [PATCH 25/71] feat: improve error handling for create_table

---
 pyiceberg/catalog/s3tables.py | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py
index 7a3c56ae01..d9ddeff5dc 100644
--- a/pyiceberg/catalog/s3tables.py
+++ b/pyiceberg/catalog/s3tables.py
@@ -164,17 +164,20 @@ def create_table(
         io = load_file_io(properties=self.properties, location=metadata_location)
         # TODO: this triggers unsupported list operation error, setting overwrite=True is a workaround for now
-        # TODO: we can perform this check manually maybe?
         self._write_metadata(metadata, io, metadata_location, overwrite=True)
-        # TODO: after writing need to update table metadata location
-        # can this operation fail if the version token does not match?
-        self.s3tables.update_table_metadata_location(
-            tableBucketARN=self.table_bucket_arn,
-            namespace=namespace,
-            name=table_name,
-            versionToken=version_token,
-            metadataLocation=metadata_location,
-        )
+
+        try:
+            self.s3tables.update_table_metadata_location(
+                tableBucketARN=self.table_bucket_arn,
+                namespace=namespace,
+                name=table_name,
+                versionToken=version_token,
+                metadataLocation=metadata_location,
+            )
+        except self.s3tables.exceptions.ConflictException as e:
+            raise CommitFailedException(
+                f"Cannot create {namespace}.{table_name} because of a concurrent update to the table version {version_token}."
+            ) from e

         return self.load_table(identifier=identifier)

From 37bd60983ddf3153b98d528acafbe0067b4bcab8 Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Sun, 29 Dec 2024 14:49:24 +0100
Subject: [PATCH 26/71] feat: improve error handling for delete_table

---
 pyiceberg/catalog/s3tables.py | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py
index d9ddeff5dc..92c8f78077 100644
--- a/pyiceberg/catalog/s3tables.py
+++ b/pyiceberg/catalog/s3tables.py
@@ -195,13 +195,18 @@ def drop_table(self, identifier: Union[str, Identifier]) -> None:
         except self.s3tables.exceptions.NotFoundException as e:
             raise NoSuchTableError(f"No table with identifier {identifier} exists.") from e

-        # TODO: handle conflicts due to versionToken mismatch that might occur
-        self.s3tables.delete_table(
-            tableBucketARN=self.table_bucket_arn,
-            namespace=namespace,
-            name=table_name,
-            versionToken=response["versionToken"],
-        )
+        version_token = response["versionToken"]
+        try:
+            self.s3tables.delete_table(
+                tableBucketARN=self.table_bucket_arn,
+                namespace=namespace,
+                name=table_name,
+                versionToken=version_token,
+            )
+        except self.s3tables.exceptions.ConflictException as e:
+            raise CommitFailedException(
+                f"Cannot delete {namespace}.{table_name} because of a concurrent update to the table version {version_token}."
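+                # a stale versionToken surfaces as a ConflictException, meaning
+                # another process updated the table in the meantime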
+            ) from e

From 63f852589a06e391021ffe6baa17f537cb8f37e4 Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Sun, 29 Dec 2024 14:50:49 +0100
Subject: [PATCH 27/71] chore: cleanup comments

---
 pyiceberg/catalog/s3tables.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py
index 92c8f78077..b95f2ba8d2 100644
--- a/pyiceberg/catalog/s3tables.py
+++ b/pyiceberg/catalog/s3tables.py
@@ -209,7 +209,6 @@ def drop_table(self, identifier: Union[str, Identifier]) -> None:
             ) from e

     def list_namespaces(self, namespace: Union[str, Identifier] = ()) -> List[Identifier]:
-        # TODO: s3tables only support single level namespaces
         if namespace:
             namespace = self._validate_namespace_identifier(namespace)
         paginator = self.s3tables.get_paginator("list_namespaces")
@@ -240,7 +239,7 @@ def load_namespace_properties(self, namespace: Union[str, Identifier]) -> Proper

     def load_table(self, identifier: Union[str, Identifier]) -> Table:
         namespace, table_name = self._validate_database_and_table_identifier(identifier)
-        # TODO: raise a NoSuchTableError if it does not exist
+
         try:
             response = self.s3tables.get_table_metadata_location(
                 tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name

From da91a11e9399ae9f08adcdf708f078fe472a535e Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Sun, 29 Dec 2024 14:55:16 +0100
Subject: [PATCH 28/71] feat: catch missing metadata for load_table

---
 pyiceberg/catalog/s3tables.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py
index b95f2ba8d2..077d93d7d0 100644
--- a/pyiceberg/catalog/s3tables.py
+++ b/pyiceberg/catalog/s3tables.py
@@ -246,8 +246,10 @@ def load_table(self, identifier: Union[str, Identifier]) -> Table:
             )
         except self.s3tables.exceptions.NotFoundException as e:
             raise NoSuchTableError(f"No table with identifier {identifier} exists.") from e
-        # TODO: we might need to catch if table is not initialized i.e. does not have metadata setup yet
-        metadata_location = response["metadataLocation"]
+
+        metadata_location = response.get("metadataLocation")
+        if not metadata_location:
+            raise S3TablesError("No table metadata found.")

         io = load_file_io(properties=self.properties, location=metadata_location)
         file = io.new_input(metadata_location)

From bd5be822d942af08c57e27b0b3a37569da0b66f8 Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Sun, 29 Dec 2024 16:02:17 +0100
Subject: [PATCH 29/71] feat: handle missing namespace and preexisting table

---
 pyiceberg/catalog/s3tables.py  | 15 ++++++++++++---
 tests/catalog/test_s3tables.py | 19 ++++++++++++++++++-
 2 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py
index 077d93d7d0..6ae2db3f32 100644
--- a/pyiceberg/catalog/s3tables.py
+++ b/pyiceberg/catalog/s3tables.py
@@ -13,6 +13,8 @@
     S3TablesError,
     InvalidNamespaceName,
     InvalidTableName,
+    TableAlreadyExistsError,
+    NoSuchNamespaceError
 )
 from pyiceberg.io import AWS_ACCESS_KEY_ID, AWS_REGION, AWS_SECRET_ACCESS_KEY, AWS_SESSION_TOKEN, PY_IO_IMPL, load_file_io
@@ -145,10 +147,17 @@ def create_table(

         schema: Schema = self._convert_schema_if_needed(schema)  # type: ignore

-        # TODO: check whether namespace exists and if it does, whether table_name already exists
-        self.s3tables.create_table(tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name, format="ICEBERG")
+        try:
+            self.s3tables.create_table(tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name, format="ICEBERG")
+        except self.s3tables.exceptions.NotFoundException as e:
+            raise NoSuchNamespaceError(
+                f"Cannot create {namespace}.{table_name} because no such namespace exists."
+            ) from e
+        except self.s3tables.exceptions.ConflictException as e:
+            raise TableAlreadyExistsError(
+                f"Cannot create {namespace}.{table_name} because a table of the same name already exists in the namespace."
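+                # the S3 Tables API reports a pre-existing table as a
+                # ConflictException on create_table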
+            ) from e

-        # location is given by s3 table bucket
         response = self.s3tables.get_table(tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name)

diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py
index c761ab79ad..cd764737f2 100644
--- a/tests/catalog/test_s3tables.py
+++ b/tests/catalog/test_s3tables.py
@@ -4,7 +4,7 @@
 import pytest

 from pyiceberg.catalog.s3tables import S3TableCatalog
-from pyiceberg.exceptions import NoSuchTableError, TableBucketNotFound
+from pyiceberg.exceptions import NoSuchTableError, TableBucketNotFound, NoSuchNamespaceError
 from pyiceberg.schema import Schema
 from pyiceberg.types import IntegerType
@@ -59,6 +59,14 @@ def test_s3tables_api_raises_on_conflicting_version_tokens(table_bucket_arn, dat
     )

+def test_s3tables_api_raises_on_preexisting_table(table_bucket_arn, database_name, table_name):
+    client = boto3.client("s3tables")
+    client.create_namespace(tableBucketARN=table_bucket_arn, namespace=[database_name])
+    client.create_table(tableBucketARN=table_bucket_arn, namespace=database_name, name=table_name, format="ICEBERG")
+    with pytest.raises(client.exceptions.ConflictException):
+        client.create_table(tableBucketARN=table_bucket_arn, namespace=database_name, name=table_name, format="ICEBERG")
+
+
 def test_creating_catalog_validates_s3_table_bucket_exists(table_bucket_arn):
     properties = {"s3tables.table-bucket-arn": f"{table_bucket_arn}-modified"}
     with pytest.raises(TableBucketNotFound):
         S3TableCatalog(name="test_s3tables_catalog", **properties)
@@ -92,6 +100,15 @@ def test_create_table(catalog, database_name: str, table_name: str, table_schema
     assert table == catalog.load_table(identifier)

+def test_create_table_in_invalid_namespace_raises_exception(catalog, database_name: str, table_name: str, table_schema_nested: Schema):
+    identifier = (database_name, table_name)
+
+    with pytest.raises(NoSuchNamespaceError):
+        catalog.create_table(identifier=identifier, schema=table_schema_nested)
+
+
+
 def test_table_exists(catalog, database_name: str, table_name: str, table_schema_nested: Schema):
     identifier = (database_name, table_name)

From c15ffdbc4f845714887a86e13f8c880ab43c1a21 Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Sun, 29 Dec 2024 16:12:21 +0100
Subject: [PATCH 30/71] feat: handle versionToken and table in an atomic operation

---
 pyiceberg/catalog/s3tables.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py
index 6ae2db3f32..e8bbebe960 100644
--- a/pyiceberg/catalog/s3tables.py
+++ b/pyiceberg/catalog/s3tables.py
@@ -75,13 +75,7 @@ def commit_table(
         table_identifier = table.name()
         database_name, table_name = self.identifier_to_database_and_table(table_identifier, NoSuchTableError)

-        # TODO: loading table and getting versionToken should be an atomic operation, otherwise
-        # the table can change between those two API calls without noticing
-        # -> change this into an internal API that returns table information along with versionToken
-        current_table = self.load_table(identifier=table_identifier)
-        version_token = self.s3tables.get_table(tableBucketARN=self.table_bucket_arn, namespace=database_name, name=table_name)[
-            "versionToken"
-        ]
+        current_table, version_token = self._load_table_and_version(identifier=table_identifier)

         updated_staged_table = self._update_and_stage_table(current_table, table_identifier, requirements, updates)
         if current_table and updated_staged_table.metadata == current_table.metadata:
             # no changes, do nothing
@@ -172,7 +166,7 @@ def create_table(
         )

         io = load_file_io(properties=self.properties, location=metadata_location)
-        # TODO: this triggers unsupported list operation error, setting overwrite=True is a workaround for now
+        # this triggers unsupported list operation error, setting overwrite=True is a workaround for now
         self._write_metadata(metadata, io, metadata_location, overwrite=True)
@@ -247,6 +241,10 @@ def load_namespace_properties(self, namespace: Union[str, Identifier]) -> Proper
         }

     def load_table(self, identifier: Union[str, Identifier]) -> Table:
+        table, _ = self._load_table_and_version(identifier)
+        return table
+
+    def _load_table_and_version(self, identifier: Union[str, Identifier]) -> Tuple[Table, str]:
         namespace, table_name = self._validate_database_and_table_identifier(identifier)

         try:
@@ -260,6 +258,8 @@ def load_table(self, identifier: Union[str, Identifier]) -> Table:
         if not metadata_location:
             raise S3TablesError("No table metadata found.")

+        version_token = response["versionToken"]
+
         io = load_file_io(properties=self.properties, location=metadata_location)
         file = io.new_input(metadata_location)
         metadata = FromInputFile.table_metadata(file)
@@ -269,7 +269,7 @@ def load_table(self, identifier: Union[str, Identifier]) -> Table:
             metadata_location=metadata_location,
             io=self._load_file_io(metadata.properties, metadata_location),
             catalog=self,
-        )
+        ), version_token

     def rename_table(self, from_identifier: Union[str, Identifier], to_identifier: Union[str, Identifier]) -> Table:
         from_namespace, from_table_name = self._validate_database_and_table_identifier(from_identifier)

From dceb55ddbda448daa1cccad75461eddbd6e24e5e Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Sun, 29 Dec 2024 16:13:00 +0100
Subject: [PATCH 31/71] chore: run formatter

---
 pyiceberg/catalog/s3tables.py  | 18 +++++++++---------
 pyiceberg/exceptions.py        |  1 +
 tests/catalog/test_s3tables.py |  9 +++++----
 3 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py
index e8bbebe960..85971d9c4b 100644
--- a/pyiceberg/catalog/s3tables.py
+++ b/pyiceberg/catalog/s3tables.py
@@ -7,16 +7,16 @@
 from pyiceberg.catalog import DEPRECATED_BOTOCORE_SESSION, MetastoreCatalog, PropertiesUpdateSummary
 from pyiceberg.exceptions import (
     CommitFailedException,
+    InvalidNamespaceName,
+    InvalidTableName,
     NamespaceNotEmptyError,
+    NoSuchNamespaceError,
     NoSuchTableError,
-    TableBucketNotFound,
     S3TablesError,
-    InvalidNamespaceName,
-    InvalidTableName,
     TableAlreadyExistsError,
-    NoSuchNamespaceError
+    TableBucketNotFound,
 )
 from pyiceberg.io import AWS_ACCESS_KEY_ID, AWS_REGION, AWS_SECRET_ACCESS_KEY, AWS_SESSION_TOKEN, load_file_io
 from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec
@@ -142,11 +142,11 @@ def create_table(
         schema: Schema = self._convert_schema_if_needed(schema)  # type: ignore

         try:
-            self.s3tables.create_table(tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name, format="ICEBERG")
+            self.s3tables.create_table(
+                tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name, format="ICEBERG"
+            )
         except self.s3tables.exceptions.NotFoundException as e:
-            raise NoSuchNamespaceError(
-                f"Cannot create {namespace}.{table_name} because no such namespace exists."
-            ) from e
+            raise NoSuchNamespaceError(f"Cannot create {namespace}.{table_name} because no such namespace exists.") from e
         except self.s3tables.exceptions.ConflictException as e:
             raise TableAlreadyExistsError(
                 f"Cannot create {namespace}.{table_name} because a table of the same name already exists in the namespace."
diff --git a/pyiceberg/exceptions.py b/pyiceberg/exceptions.py
index 245fffd3c1..8367d91b79 100644
--- a/pyiceberg/exceptions.py
+++ b/pyiceberg/exceptions.py
@@ -115,6 +115,7 @@ class GenericDynamoDbError(DynamoDbError):
 class S3TablesError(Exception):
     pass

+
 class InvalidNamespaceName(S3TablesError):
     pass

diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py
index cd764737f2..d379e73b82 100644
--- a/tests/catalog/test_s3tables.py
+++ b/tests/catalog/test_s3tables.py
@@ -4,7 +4,7 @@
 import pytest

 from pyiceberg.catalog.s3tables import S3TableCatalog
-from pyiceberg.exceptions import NoSuchTableError, TableBucketNotFound, NoSuchNamespaceError
+from pyiceberg.exceptions import NoSuchNamespaceError, NoSuchTableError, TableBucketNotFound
 from pyiceberg.schema import Schema
 from pyiceberg.types import IntegerType
@@ -30,6 +30,7 @@ def table_bucket_arn():
     return os.environ["ARN"]

+
 @pytest.fixture
 def catalog(table_bucket_arn):
@@ -100,15 +101,15 @@ def test_create_table(catalog, database_name: str, table_name: str, table_schema
     assert table == catalog.load_table(identifier)

-def test_create_table_in_invalid_namespace_raises_exception(catalog, database_name: str, table_name: str, table_schema_nested: Schema):
+def test_create_table_in_invalid_namespace_raises_exception(
+    catalog, database_name: str, table_name: str, table_schema_nested: Schema
+):
     identifier = (database_name, table_name)

     with pytest.raises(NoSuchNamespaceError):
         catalog.create_table(identifier=identifier, schema=table_schema_nested)

-
-
 def test_table_exists(catalog, database_name: str, table_name: str, table_schema_nested: Schema):
     identifier = (database_name, table_name)

From 31144248e40a8bdae588b833d20e1e760e724367 Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Sun, 29 Dec 2024 16:17:09 +0100
Subject: [PATCH 32/71] chore: add type hints for tests

---
 pyiceberg/catalog/s3tables.py  | 15 +++++++-------
 tests/catalog/test_s3tables.py | 36 +++++++++++++++++-----------------
 2 files changed, 25 insertions(+), 26 deletions(-)

diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py
index 85971d9c4b..fae82b0ee8 100644
--- a/pyiceberg/catalog/s3tables.py
+++ b/pyiceberg/catalog/s3tables.py
@@ -2,7 +2,6 @@
 from typing import TYPE_CHECKING, List, Optional, Set, Tuple, Union

 import boto3
-from boto3.session import UnknownServiceError

 from pyiceberg.catalog import DEPRECATED_BOTOCORE_SESSION, MetastoreCatalog, PropertiesUpdateSummary
 from pyiceberg.exceptions import (
@@ -20,10 +19,10 @@
 from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec
 from pyiceberg.schema import Schema
 from pyiceberg.serializers import FromInputFile
-from pyiceberg.table import CommitTableResponse, Table, TableRequirement
+from pyiceberg.table import CommitTableResponse, Table
 from pyiceberg.table.metadata import new_table_metadata
 from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder
-from pyiceberg.table.update import TableUpdate
+from pyiceberg.table.update import TableRequirement, TableUpdate
 from pyiceberg.typedef import EMPTY_DICT, Identifier, Properties
 from pyiceberg.utils.properties import get_first_property_value
@@ -61,7 +60,7 @@ def __init__(self, name: str, **properties: str):
         )
         try:
             self.s3tables = session.client("s3tables", endpoint_url=properties.get(S3TABLES_ENDPOINT))
-        except UnknownServiceError as e:
+        except boto3.session.UnknownServiceError as e:
             raise S3TablesError(f"'s3tables' requires boto3>=1.35.74. Current version: {boto3.__version__}.") from e

         try:
@@ -106,7 +105,7 @@ def commit_table(
             metadata=updated_staged_table.metadata, metadata_location=updated_staged_table.metadata_location
         )

-    def create_namespace(self, namespace: Union[str, Identifier], properties: Properties = ...) -> None:
+    def create_namespace(self, namespace: Union[str, Identifier], properties: Properties = EMPTY_DICT) -> None:
         valid_namespace: str = self._validate_namespace_identifier(namespace)
         self.s3tables.create_namespace(tableBucketARN=self.table_bucket_arn, namespace=[valid_namespace])
@@ -155,10 +154,10 @@ def create_table(
         response = self.s3tables.get_table(tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name)

         version_token = response["versionToken"]
-        location = response["warehouseLocation"]
-        metadata_location = self._get_metadata_location(location=location)
+        warehouse_location = response["warehouseLocation"]
+        metadata_location = self._get_metadata_location(location=warehouse_location)
         metadata = new_table_metadata(
-            location=location,
+            location=warehouse_location,
             schema=schema,
             partition_spec=partition_spec,
             sort_order=sort_order,
diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py
index d379e73b82..101eae485e 100644
--- a/tests/catalog/test_s3tables.py
+++ b/tests/catalog/test_s3tables.py
@@ -10,19 +10,19 @@

 @pytest.fixture
-def database_name(database_name):
+def database_name(database_name: str) -> str:
     # naming rules prevent "-" in namespaces for s3 table buckets
     return database_name.replace("-", "_")

 @pytest.fixture
-def table_name(table_name):
+def table_name(table_name: str) -> str:
     # naming rules prevent "-" in table names for s3 table buckets
     return table_name.replace("-", "_")

 @pytest.fixture
-def table_bucket_arn():
+def table_bucket_arn() -> str:
     import os

     # since the moto library does not support s3tables as of 2024-12-14 we have to test against a real AWS endpoint
@@ -32,13 +32,13 @@

 @pytest.fixture
-def catalog(table_bucket_arn):
+def catalog(table_bucket_arn: str) -> S3TableCatalog:
     # pyarrow does not support writing to S3 Table buckets as of 2024-12-14 https://github.com/apache/iceberg-python/issues/1404#issuecomment-2543174146
     properties = {"s3tables.table-bucket-arn": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"}
     return S3TableCatalog(name="test_s3tables_catalog", **properties)

-def test_s3tables_api_raises_on_conflicting_version_tokens(table_bucket_arn, database_name, table_name):
+def test_s3tables_api_raises_on_conflicting_version_tokens(table_bucket_arn: str, database_name: str, table_name: str) -> None:
     client = boto3.client("s3tables")
     client.create_namespace(tableBucketARN=table_bucket_arn, namespace=[database_name])
     response = client.create_table(tableBucketARN=table_bucket_arn, namespace=database_name, name=table_name, format="ICEBERG")
     version_token = response["versionToken"]
     scrambled_version_token = version_token[::-1]

-def test_s3tables_api_raises_on_preexisting_table(table_bucket_arn, database_name, table_name):
+def test_s3tables_api_raises_on_preexisting_table(table_bucket_arn: str, database_name: str, table_name: str) -> None:
     client = boto3.client("s3tables")
     client.create_namespace(tableBucketARN=table_bucket_arn, namespace=[database_name])
     client.create_table(tableBucketARN=table_bucket_arn, namespace=database_name, name=table_name, format="ICEBERG")
     with pytest.raises(client.exceptions.ConflictException):
         client.create_table(tableBucketARN=table_bucket_arn, namespace=database_name, name=table_name, format="ICEBERG")

-def test_creating_catalog_validates_s3_table_bucket_exists(table_bucket_arn):
+def test_creating_catalog_validates_s3_table_bucket_exists(table_bucket_arn: str) -> None:
     properties = {"s3tables.table-bucket-arn": f"{table_bucket_arn}-modified"}
     with pytest.raises(TableBucketNotFound):
         S3TableCatalog(name="test_s3tables_catalog", **properties)

-def test_create_namespace(catalog, database_name: str):
+def test_create_namespace(catalog: S3TableCatalog, database_name: str) -> None:
     catalog.create_namespace(namespace=database_name)
     namespaces = catalog.list_namespaces()
     assert (database_name,) in namespaces

-def test_load_namespace_properties(catalog, database_name: str):
+def test_load_namespace_properties(catalog: S3TableCatalog, database_name: str) -> None:
     catalog.create_namespace(namespace=database_name)
     assert database_name in catalog.load_namespace_properties(database_name)["namespace"]

-def test_drop_namespace(catalog, database_name: str):
+def test_drop_namespace(catalog: S3TableCatalog, database_name: str) -> None:
     catalog.create_namespace(namespace=database_name)
     assert (database_name,) in catalog.list_namespaces()
     catalog.drop_namespace(namespace=database_name)
     assert (database_name,) not in catalog.list_namespaces()

-def test_create_table(catalog, database_name: str, table_name: str, table_schema_nested: Schema):
+def test_create_table(catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None:
     identifier = (database_name, table_name)

     catalog.create_namespace(namespace=database_name)
@@ -102,15 +102,15 @@
 def test_create_table_in_invalid_namespace_raises_exception(
-    catalog, database_name: str, table_name: str, table_schema_nested: Schema
-):
+    catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema
+) -> None:
     identifier = (database_name, table_name)

     with pytest.raises(NoSuchNamespaceError):
         catalog.create_table(identifier=identifier, schema=table_schema_nested)

-def test_table_exists(catalog, database_name: str, table_name: str, table_schema_nested: Schema):
+def test_table_exists(catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None:
     identifier = (database_name, table_name)

     catalog.create_namespace(namespace=database_name)
@@ -119,7 +119,7 @@
     assert catalog.table_exists(identifier=identifier)

-def test_rename_table(catalog, database_name: str, table_name: str, table_schema_nested: Schema):
+def test_rename_table(catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None:
     identifier = (database_name, table_name)

     catalog.create_namespace(namespace=database_name)
@@ -135,7 +135,7 @@
     assert catalog.table_exists(identifier=to_identifier)
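+    # after a successful rename only the new identifier is expected to resolve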
-def test_list_tables(catalog, database_name: str, table_name: str, table_schema_nested: Schema):
+def test_list_tables(catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None:
     identifier = (database_name, table_name)

     catalog.create_namespace(namespace=database_name)
@@ -144,7 +144,7 @@
     assert catalog.list_tables(namespace=database_name)

-def test_drop_table(catalog, database_name: str, table_name: str, table_schema_nested: Schema):
+def test_drop_table(catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None:
     identifier = (database_name, table_name)

     catalog.create_namespace(namespace=database_name)
@@ -156,7 +156,7 @@
     catalog.load_table(identifier=identifier)

-def test_commit_table(catalog, database_name: str, table_name: str, table_schema_nested: Schema):
+def test_commit_table(catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None:
     identifier = (database_name, table_name)

     catalog.create_namespace(namespace=database_name)

From 1dda96dc5812fd6fe83df842403bbba02fb4b29d Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Sat, 4 Jan 2025 13:29:37 +0100
Subject: [PATCH 33/71] fix: no longer enforce FsspecFileIO

---
 tests/catalog/test_s3tables.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py
index 101eae485e..76663de25a 100644
--- a/tests/catalog/test_s3tables.py
+++ b/tests/catalog/test_s3tables.py
@@ -33,8 +33,7 @@ def table_bucket_arn() -> str:

 @pytest.fixture
 def catalog(table_bucket_arn: str) -> S3TableCatalog:
-    # pyarrow does not support writing to S3 Table buckets as of 2024-12-14 https://github.com/apache/iceberg-python/issues/1404#issuecomment-2543174146
-    properties = {"s3tables.table-bucket-arn": table_bucket_arn, "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"}
+    properties = {"s3tables.table-bucket-arn": table_bucket_arn}
     return S3TableCatalog(name="test_s3tables_catalog", **properties)

From 99d272d9b6ff94ae566c5ae8714dcb004d0fc6a2 Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Sat, 4 Jan 2025 13:37:01 +0100
Subject: [PATCH 34/71] test: remove tests for boto3 behavior

---
 tests/catalog/test_s3tables.py | 30 ------------------------------
 1 file changed, 30 deletions(-)

diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py
index 76663de25a..d9f05df430 100644
--- a/tests/catalog/test_s3tables.py
+++ b/tests/catalog/test_s3tables.py
@@ -1,6 +1,3 @@
-import uuid
-
-import boto3
 import pytest

 from pyiceberg.catalog.s3tables import S3TableCatalog
@@ -37,36 +37,6 @@ def catalog(table_bucket_arn: str) -> S3TableCatalog:
     return S3TableCatalog(name="test_s3tables_catalog", **properties)

-def test_s3tables_api_raises_on_conflicting_version_tokens(table_bucket_arn: str, database_name: str, table_name: str) -> None:
-    client = boto3.client("s3tables")
-    client.create_namespace(tableBucketARN=table_bucket_arn, namespace=[database_name])
-    response = client.create_table(tableBucketARN=table_bucket_arn, namespace=database_name, name=table_name, format="ICEBERG")
-    version_token = response["versionToken"]
-    scrambled_version_token = version_token[::-1]
-
-    warehouse_location = client.get_table(tableBucketARN=table_bucket_arn, namespace=database_name, name=table_name)[
-        "warehouseLocation"
-    ]
-    metadata_location = f"{warehouse_location}/metadata/00001-{uuid.uuid4()}.metadata.json"
-
-    with pytest.raises(client.exceptions.ConflictException):
-        client.update_table_metadata_location(
-            tableBucketARN=table_bucket_arn,
-            namespace=database_name,
-            name=table_name,
-            versionToken=scrambled_version_token,
-            metadataLocation=metadata_location,
-        )
-
-
-def test_s3tables_api_raises_on_preexisting_table(table_bucket_arn: str, database_name: str, table_name: str) -> None:
-    client = boto3.client("s3tables")
-    client.create_namespace(tableBucketARN=table_bucket_arn, namespace=[database_name])
-    client.create_table(tableBucketARN=table_bucket_arn, namespace=database_name, name=table_name, format="ICEBERG")
-    with pytest.raises(client.exceptions.ConflictException):
-        client.create_table(tableBucketARN=table_bucket_arn, namespace=database_name, name=table_name, format="ICEBERG")
-
-

From c060ad910652f243a4da259837bf6f7232f58633 Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Sat, 4 Jan 2025 13:44:14 +0100
Subject: [PATCH 35/71] test: verify column was created on commit

---
 tests/catalog/test_s3tables.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py
index d9f05df430..15d9ba57ed 100644
--- a/tests/catalog/test_s3tables.py
+++ b/tests/catalog/test_s3tables.py
@@ -147,3 +147,4 @@ def test_commit_table(catalog: S3TableCatalog, database_name: str, table_name: s
     assert updated_table_metadata.last_updated_ms > last_updated_ms
     assert updated_table_metadata.metadata_log[0].metadata_file == original_table_metadata_location
     assert updated_table_metadata.metadata_log[0].timestamp_ms == original_table_last_updated_ms
+    assert table.schema().columns[-1].name == "b"

From da6516b968c9bf822990f3293121915aeeb96664 Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Sat, 4 Jan 2025 13:57:59 +0100
Subject: [PATCH 36/71] test: verify new data can be committed to table

---
 tests/catalog/test_s3tables.py | 31 +++++++++++++++++++++++++++----
 1 file changed, 27 insertions(+), 4 deletions(-)

diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py
index 15d9ba57ed..f97371540b 100644
--- a/tests/catalog/test_s3tables.py
+++ b/tests/catalog/test_s3tables.py
@@ -122,7 +122,9 @@ def test_drop_table(catalog: S3TableCatalog, database_name: str, table_name: str
     catalog.load_table(identifier=identifier)

-def test_commit_table(catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None:
+def test_commit_new_column_to_table(
+    catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema
+) -> None:
     identifier = (database_name, table_name)

     catalog.create_namespace(namespace=database_name)
@@ -148,3 +150,24 @@ def test_commit_new_column_to_table(
     assert updated_table_metadata.last_updated_ms > last_updated_ms
     assert updated_table_metadata.metadata_log[0].metadata_file == original_table_metadata_location
     assert updated_table_metadata.metadata_log[0].timestamp_ms == original_table_last_updated_ms
     assert table.schema().columns[-1].name == "b"
+
+
+def test_commit_new_data_to_table(
+    catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema
+) -> None:
+    identifier = (database_name, table_name)
+
+    catalog.create_namespace(namespace=database_name)
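+    # the namespace has to exist before create_table is called, otherwise the
+    # catalog raises NoSuchNamespaceError (see PATCH 29/71)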
table = catalog.create_table(identifier=identifier, schema=table_schema_nested) + + row_count = table.scan().to_arrow().num_rows + last_updated_ms = table.metadata.last_updated_ms + original_table_metadata_location = table.metadata_location + original_table_last_updated_ms = table.metadata.last_updated_ms + + transaction = table.transaction() + transaction.append(table.scan().to_arrow()) + transaction.commit_transaction() + + updated_table_metadata = table.metadata + assert updated_table_metadata.last_updated_ms > last_updated_ms + assert updated_table_metadata.metadata_log[0].metadata_file == original_table_metadata_location + assert updated_table_metadata.metadata_log[0].timestamp_ms == original_table_last_updated_ms + assert table.scan().to_arrow().num_rows == 2 * row_count From 9f890c22d7483a5c879302d80aff336e27791706 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sun, 5 Jan 2025 17:11:33 +0100 Subject: [PATCH 37/71] docs: update documentation for create_table --- pyiceberg/catalog/s3tables.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index fae82b0ee8..0485bb3072 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -140,6 +140,9 @@ def create_table( schema: Schema = self._convert_schema_if_needed(schema) # type: ignore + # creating a new table with S3 Tables is a two step process. We first have to create an S3 Table with the + # S3 Tables API and then write the new metadata.json to the warehouseLocaiton associated with the newly + # created S3 Table. try: self.s3tables.create_table( tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name, format="ICEBERG" @@ -165,7 +168,8 @@ def create_table( ) io = load_file_io(properties=self.properties, location=metadata_location) - # this triggers unsupported list operation error, setting overwrite=True is a workaround for now + # this triggers unsupported list operation error as S3 Table Buckets only support a subset of the S3 Bucket API, + # setting overwrite=True is a workaround for now since it prevents a call to list_objects self._write_metadata(metadata, io, metadata_location, overwrite=True) try: From ee93da296cb1b81094b25ee3d91c921ecbde6a2b Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sun, 5 Jan 2025 17:25:29 +0100 Subject: [PATCH 38/71] test: set AWS regions explicitly --- tests/catalog/test_s3tables.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py index f97371540b..930b11ec9a 100644 --- a/tests/catalog/test_s3tables.py +++ b/tests/catalog/test_s3tables.py @@ -30,12 +30,12 @@ def table_bucket_arn() -> str: @pytest.fixture def catalog(table_bucket_arn: str) -> S3TableCatalog: - properties = {"s3tables.table-bucket-arn": table_bucket_arn} + properties = {"s3tables.table-bucket-arn": table_bucket_arn, "s3tables.region": "us-east-1", "s3.region": "us-east-1"} return S3TableCatalog(name="test_s3tables_catalog", **properties) def test_creating_catalog_validates_s3_table_bucket_exists(table_bucket_arn: str) -> None: - properties = {"s3tables.table-bucket-arn": f"{table_bucket_arn}-modified"} + properties = {"s3tables.table-bucket-arn": f"{table_bucket_arn}-modified", "s3tables.region": "us-east-1"} with pytest.raises(TableBucketNotFound): S3TableCatalog(name="test_s3tables_catalog", **properties) From 0952b55519e5426b8e2c34c2c3ffa28c86c6ec7a Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Mon, 6 Jan 2025 17:19:43 
+0100 Subject: [PATCH 39/71] Apply suggestions from code review Co-authored-by: Kevin Liu --- pyiceberg/catalog/s3tables.py | 2 +- tests/catalog/test_s3tables.py | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 0485bb3072..1408322ea7 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -141,7 +141,7 @@ def create_table( schema: Schema = self._convert_schema_if_needed(schema) # type: ignore # creating a new table with S3 Tables is a two step process. We first have to create an S3 Table with the - # S3 Tables API and then write the new metadata.json to the warehouseLocaiton associated with the newly + # S3 Tables API and then write the new metadata.json to the warehouseLocation associated with the newly # created S3 Table. try: self.s3tables.create_table( diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py index 930b11ec9a..d9f48e363a 100644 --- a/tests/catalog/test_s3tables.py +++ b/tests/catalog/test_s3tables.py @@ -26,11 +26,15 @@ def table_bucket_arn() -> str: # in one of the supported regions. return os.environ["ARN"] +@pytest.fixture +def aws_region() -> str: + import os + return os.environ["AWS_REGION"] @pytest.fixture -def catalog(table_bucket_arn: str) -> S3TableCatalog: - properties = {"s3tables.table-bucket-arn": table_bucket_arn, "s3tables.region": "us-east-1", "s3.region": "us-east-1"} +def catalog(table_bucket_arn: str, aws_region: str) -> S3TableCatalog: + properties = {"s3tables.table-bucket-arn": table_bucket_arn, "s3tables.region": aws_region} return S3TableCatalog(name="test_s3tables_catalog", **properties) From 27414e15b8a3a6cd3d4e52e2c0090ff36ab9bcd0 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Mon, 6 Jan 2025 17:52:34 +0100 Subject: [PATCH 40/71] test: commit new data to table --- tests/catalog/test_s3tables.py | 62 ++++++++++++++++++++++++++++++---- 1 file changed, 56 insertions(+), 6 deletions(-) diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py index d9f48e363a..d1f6e6ffd1 100644 --- a/tests/catalog/test_s3tables.py +++ b/tests/catalog/test_s3tables.py @@ -26,12 +26,15 @@ def table_bucket_arn() -> str: # in one of the supported regions. 
return os.environ["ARN"] + + @pytest.fixture def aws_region() -> str: import os return os.environ["AWS_REGION"] + @pytest.fixture def catalog(table_bucket_arn: str, aws_region: str) -> S3TableCatalog: properties = {"s3tables.table-bucket-arn": table_bucket_arn, "s3tables.region": aws_region} @@ -153,15 +156,62 @@ def test_commit_new_column_to_table( assert table.schema().columns[-1].name == "b" -def test_commit_new_data_to_table( - catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema -) -> None: +def test_write_pyarrow_table(catalog: S3TableCatalog, database_name: str, table_name: str) -> None: identifier = (database_name, table_name) + catalog.create_namespace(namespace=database_name) + import pyarrow as pa + + pyarrow_table = pa.Table.from_arrays( + [ + pa.array([None, "A", "B", "C"]), # 'foo' column + pa.array([1, 2, 3, 4]), # 'bar' column + pa.array([True, None, False, True]), # 'baz' column + pa.array([None, "A", "B", "C"]), # 'large' column + ], + schema=pa.schema( + [ + pa.field("foo", pa.large_string(), nullable=True), + pa.field("bar", pa.int32(), nullable=False), + pa.field("baz", pa.bool_(), nullable=True), + pa.field("large", pa.large_string(), nullable=True), + ] + ), + ) + table = catalog.create_table(identifier=identifier, schema=pyarrow_table.schema) + table.append(pyarrow_table) + + assert table.scan().to_arrow().num_rows == pyarrow_table.num_rows + + +def test_commit_new_data_to_table(catalog: S3TableCatalog, database_name: str, table_name: str) -> None: + identifier = (database_name, table_name) catalog.create_namespace(namespace=database_name) - table = catalog.create_table(identifier=identifier, schema=table_schema_nested) + + import pyarrow as pa + + pyarrow_table = pa.Table.from_arrays( + [ + pa.array([None, "A", "B", "C"]), # 'foo' column + pa.array([1, 2, 3, 4]), # 'bar' column + pa.array([True, None, False, True]), # 'baz' column + pa.array([None, "A", "B", "C"]), # 'large' column + ], + schema=pa.schema( + [ + pa.field("foo", pa.large_string(), nullable=True), + pa.field("bar", pa.int32(), nullable=False), + pa.field("baz", pa.bool_(), nullable=True), + pa.field("large", pa.large_string(), nullable=True), + ] + ), + ) + + table = catalog.create_table(identifier=identifier, schema=pyarrow_table.schema) + table.append(pyarrow_table) row_count = table.scan().to_arrow().num_rows + assert row_count last_updated_ms = table.metadata.last_updated_ms original_table_metadata_location = table.metadata_location original_table_last_updated_ms = table.metadata.last_updated_ms @@ -172,6 +222,6 @@ def test_commit_new_data_to_table( updated_table_metadata = table.metadata assert updated_table_metadata.last_updated_ms > last_updated_ms - assert updated_table_metadata.metadata_log[0].metadata_file == original_table_metadata_location - assert updated_table_metadata.metadata_log[0].timestamp_ms == original_table_last_updated_ms + assert updated_table_metadata.metadata_log[-1].metadata_file == original_table_metadata_location + assert updated_table_metadata.metadata_log[-1].timestamp_ms == original_table_last_updated_ms assert table.scan().to_arrow().num_rows == 2 * row_count From 2a8c5c48ab2136ac5775e5632059d236517de6b1 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Mon, 6 Jan 2025 17:56:05 +0100 Subject: [PATCH 41/71] feat: clarify update_namespace_properties error --- pyiceberg/catalog/s3tables.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 
1408322ea7..01c42f9e3b 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -305,7 +305,7 @@ def update_namespace_properties( self, namespace: Union[str, Identifier], removals: Optional[Set[str]] = None, updates: Properties = EMPTY_DICT ) -> PropertiesUpdateSummary: # namespace properties are read only - raise NotImplementedError + raise NotImplementedError("Namespace properties are read only") def purge_table(self, identifier: Union[str, Identifier]) -> None: # purge is not supported as s3tables doesn't support delete operations From 80884a6cac95727735bd1d91fbbedf4381a81895 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Mon, 6 Jan 2025 17:59:55 +0100 Subject: [PATCH 42/71] feat: raise error when setting custom namespace properties --- pyiceberg/catalog/s3tables.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 01c42f9e3b..d5f5bbc305 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -106,6 +106,8 @@ def commit_table( ) def create_namespace(self, namespace: Union[str, Identifier], properties: Properties = EMPTY_DICT) -> None: + if properties: + raise NotImplementedError("Setting namespace properties is not supported.") valid_namespace: str = self._validate_namespace_identifier(namespace) self.s3tables.create_namespace(tableBucketARN=self.table_bucket_arn, namespace=[valid_namespace]) @@ -305,7 +307,7 @@ def update_namespace_properties( self, namespace: Union[str, Identifier], removals: Optional[Set[str]] = None, updates: Properties = EMPTY_DICT ) -> PropertiesUpdateSummary: # namespace properties are read only - raise NotImplementedError("Namespace properties are read only") + raise NotImplementedError("Namespace properties are read only.") def purge_table(self, identifier: Union[str, Identifier]) -> None: # purge is not supported as s3tables doesn't support delete operations From a6a112f17ae684698941b43c6ff3bb4e17d24069 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Tue, 7 Jan 2025 08:57:31 +0100 Subject: [PATCH 43/71] refactor: change S3TableCatalog -> S3TablesCatalog --- pyiceberg/catalog/s3tables.py | 2 +- tests/catalog/test_s3tables.py | 32 ++++++++++++++++---------------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index d5f5bbc305..ed6a8027f2 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -44,7 +44,7 @@ S3TABLES_RESERVED_NAMESPACE = "aws_s3_metadata" -class S3TableCatalog(MetastoreCatalog): +class S3TablesCatalog(MetastoreCatalog): def __init__(self, name: str, **properties: str): super().__init__(name, **properties) diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py index d1f6e6ffd1..c5cf959d4a 100644 --- a/tests/catalog/test_s3tables.py +++ b/tests/catalog/test_s3tables.py @@ -1,6 +1,6 @@ import pytest -from pyiceberg.catalog.s3tables import S3TableCatalog +from pyiceberg.catalog.s3tables import S3TablesCatalog from pyiceberg.exceptions import NoSuchNamespaceError, NoSuchTableError, TableBucketNotFound from pyiceberg.schema import Schema from pyiceberg.types import IntegerType @@ -36,36 +36,36 @@ def aws_region() -> str: @pytest.fixture -def catalog(table_bucket_arn: str, aws_region: str) -> S3TableCatalog: +def catalog(table_bucket_arn: str, aws_region: str) -> S3TablesCatalog: properties = {"s3tables.table-bucket-arn": table_bucket_arn, "s3tables.region": aws_region} - return 
S3TableCatalog(name="test_s3tables_catalog", **properties) + return S3TablesCatalog(name="test_s3tables_catalog", **properties) def test_creating_catalog_validates_s3_table_bucket_exists(table_bucket_arn: str) -> None: properties = {"s3tables.table-bucket-arn": f"{table_bucket_arn}-modified", "s3tables.region": "us-east-1"} with pytest.raises(TableBucketNotFound): - S3TableCatalog(name="test_s3tables_catalog", **properties) + S3TablesCatalog(name="test_s3tables_catalog", **properties) -def test_create_namespace(catalog: S3TableCatalog, database_name: str) -> None: +def test_create_namespace(catalog: S3TablesCatalog, database_name: str) -> None: catalog.create_namespace(namespace=database_name) namespaces = catalog.list_namespaces() assert (database_name,) in namespaces -def test_load_namespace_properties(catalog: S3TableCatalog, database_name: str) -> None: +def test_load_namespace_properties(catalog: S3TablesCatalog, database_name: str) -> None: catalog.create_namespace(namespace=database_name) assert database_name in catalog.load_namespace_properties(database_name)["namespace"] -def test_drop_namespace(catalog: S3TableCatalog, database_name: str) -> None: +def test_drop_namespace(catalog: S3TablesCatalog, database_name: str) -> None: catalog.create_namespace(namespace=database_name) assert (database_name,) in catalog.list_namespaces() catalog.drop_namespace(namespace=database_name) assert (database_name,) not in catalog.list_namespaces() -def test_create_table(catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: +def test_create_table(catalog: S3TablesCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: identifier = (database_name, table_name) catalog.create_namespace(namespace=database_name) @@ -75,7 +75,7 @@ def test_create_table(catalog: S3TableCatalog, database_name: str, table_name: s def test_create_table_in_invalid_namespace_raises_exception( - catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema + catalog: S3TablesCatalog, database_name: str, table_name: str, table_schema_nested: Schema ) -> None: identifier = (database_name, table_name) @@ -83,7 +83,7 @@ def test_create_table_in_invalid_namespace_raises_exception( catalog.create_table(identifier=identifier, schema=table_schema_nested) -def test_table_exists(catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: +def test_table_exists(catalog: S3TablesCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: identifier = (database_name, table_name) catalog.create_namespace(namespace=database_name) @@ -92,7 +92,7 @@ def test_table_exists(catalog: S3TableCatalog, database_name: str, table_name: s assert catalog.table_exists(identifier=identifier) -def test_rename_table(catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: +def test_rename_table(catalog: S3TablesCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: identifier = (database_name, table_name) catalog.create_namespace(namespace=database_name) @@ -108,7 +108,7 @@ def test_rename_table(catalog: S3TableCatalog, database_name: str, table_name: s assert catalog.table_exists(identifier=to_identifier) -def test_list_tables(catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: +def test_list_tables(catalog: S3TablesCatalog, database_name: str, table_name: str, 
table_schema_nested: Schema) -> None: identifier = (database_name, table_name) catalog.create_namespace(namespace=database_name) @@ -117,7 +117,7 @@ def test_list_tables(catalog: S3TableCatalog, database_name: str, table_name: st assert catalog.list_tables(namespace=database_name) -def test_drop_table(catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: +def test_drop_table(catalog: S3TablesCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: identifier = (database_name, table_name) catalog.create_namespace(namespace=database_name) @@ -130,7 +130,7 @@ def test_drop_table(catalog: S3TableCatalog, database_name: str, table_name: str def test_commit_new_column_to_table( - catalog: S3TableCatalog, database_name: str, table_name: str, table_schema_nested: Schema + catalog: S3TablesCatalog, database_name: str, table_name: str, table_schema_nested: Schema ) -> None: identifier = (database_name, table_name) @@ -156,7 +156,7 @@ def test_commit_new_column_to_table( assert table.schema().columns[-1].name == "b" -def test_write_pyarrow_table(catalog: S3TableCatalog, database_name: str, table_name: str) -> None: +def test_write_pyarrow_table(catalog: S3TablesCatalog, database_name: str, table_name: str) -> None: identifier = (database_name, table_name) catalog.create_namespace(namespace=database_name) @@ -184,7 +184,7 @@ def test_write_pyarrow_table(catalog: S3TableCatalog, database_name: str, table_ assert table.scan().to_arrow().num_rows == pyarrow_table.num_rows -def test_commit_new_data_to_table(catalog: S3TableCatalog, database_name: str, table_name: str) -> None: +def test_commit_new_data_to_table(catalog: S3TablesCatalog, database_name: str, table_name: str) -> None: identifier = (database_name, table_name) catalog.create_namespace(namespace=database_name) From 662b5eab1463337487fdecc0034b15bd4d865323 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Tue, 7 Jan 2025 08:59:42 +0100 Subject: [PATCH 44/71] feat: raise error on specified table location --- pyiceberg/catalog/s3tables.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index ed6a8027f2..a732afcdd9 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -138,6 +138,8 @@ def create_table( sort_order: SortOrder = UNSORTED_SORT_ORDER, properties: Properties = EMPTY_DICT, ) -> Table: + if location: + raise NotImplementedError("S3 Tables does not support user specified table locations.") namespace, table_name = self._validate_database_and_table_identifier(identifier) schema: Schema = self._convert_schema_if_needed(schema) # type: ignore From 1cb6f68a18f60b3c5c59697bcf6b1733d377abfc Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Tue, 7 Jan 2025 17:57:00 +0100 Subject: [PATCH 45/71] feat: return empty list when querying a hierarchical namespace --- pyiceberg/catalog/s3tables.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index a732afcdd9..0237dcd0ae 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -220,7 +220,8 @@ def drop_table(self, identifier: Union[str, Identifier]) -> None: def list_namespaces(self, namespace: Union[str, Identifier] = ()) -> List[Identifier]: if namespace: - namespace = self._validate_namespace_identifier(namespace) + # hierarchical namespaces are not supported + return [] paginator = self.s3tables.get_paginator("list_namespaces") 
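
Patches 44 and 45 above tighten the catalog's public contract rather than add functionality: `create_table` now rejects caller-supplied locations because S3 Tables assigns table storage itself, and `list_namespaces` answers any parent namespace with an empty list because table buckets only support a single namespace level. A rough sketch of the resulting calling behavior — not part of the patch series; the `catalog` instance, the `demo` namespace, and the table name are illustrative assumptions:

    from pyiceberg.schema import Schema
    from pyiceberg.types import NestedField, StringType

    # assumes `catalog` is an already-configured S3TablesCatalog and the
    # namespace "demo" already exists
    schema = Schema(NestedField(field_id=1, name="id", field_type=StringType(), required=False))

    # caller-supplied locations are rejected; the service picks the warehouse location
    try:
        catalog.create_table(("demo", "orders"), schema=schema, location="s3://my-bucket/orders/")
    except NotImplementedError as exc:
        print(exc)  # S3 Tables does not support user specified table locations.

    # namespaces are single-level: top-level listing works, nested listing is empty
    assert ("demo",) in catalog.list_namespaces()
    assert catalog.list_namespaces("demo") == []
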
namespaces: List[Identifier] = [] From c110d71aae7ee2218f661ffe9e374330d767c257 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Tue, 7 Jan 2025 18:00:22 +0100 Subject: [PATCH 46/71] refactor: use get_table_metadata_location instead of get_table --- pyiceberg/catalog/s3tables.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 0237dcd0ae..6384823705 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -158,7 +158,7 @@ def create_table( f"Cannot create {namespace}.{table_name} because a table of the same name already exists in the namespace." ) from e - response = self.s3tables.get_table(tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name) + response = self.s3tables.get_table_metadata_location(tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name) version_token = response["versionToken"] warehouse_location = response["warehouseLocation"] From 3d2f7491ee8146312bf34a3c6b03dc2a29ff4fd8 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Tue, 7 Jan 2025 18:02:57 +0100 Subject: [PATCH 47/71] refactor: extract 'ICEBERG' table format into constant --- pyiceberg/catalog/s3tables.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 6384823705..efc22a42c3 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -43,6 +43,8 @@ S3TABLES_VALID_NAME_REGEX = pattern = re.compile("[a-z0-9][a-z0-9_]{2,62}") S3TABLES_RESERVED_NAMESPACE = "aws_s3_metadata" +S3TABLES_FORMAT = "ICEBERG" + class S3TablesCatalog(MetastoreCatalog): def __init__(self, name: str, **properties: str): @@ -149,7 +151,7 @@ def create_table( # created S3 Table. 
try: self.s3tables.create_table( - tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name, format="ICEBERG" + tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name, format=S3TABLES_FORMAT ) except self.s3tables.exceptions.NotFoundException as e: raise NoSuchNamespaceError(f"Cannot create {namespace}.{table_name} because no such namespace exists.") from e From 44d7a1fb88e153b7bdaf34641eb2d68308dca9bc Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Tue, 7 Jan 2025 18:05:44 +0100 Subject: [PATCH 48/71] feat: change s3tables.table-bucket-arn -> s3tables.warehouse --- pyiceberg/catalog/s3tables.py | 2 +- tests/catalog/test_s3tables.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index efc22a42c3..4d22807687 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -35,7 +35,7 @@ S3TABLES_SECRET_ACCESS_KEY = "s3tables.secret-access-key" S3TABLES_SESSION_TOKEN = "s3tables.session-token" -S3TABLES_TABLE_BUCKET_ARN = "s3tables.table-bucket-arn" +S3TABLES_TABLE_BUCKET_ARN = "s3tables.warehouse" S3TABLES_ENDPOINT = "s3tables.endpoint" diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py index c5cf959d4a..dc3bb88238 100644 --- a/tests/catalog/test_s3tables.py +++ b/tests/catalog/test_s3tables.py @@ -37,12 +37,12 @@ def aws_region() -> str: @pytest.fixture def catalog(table_bucket_arn: str, aws_region: str) -> S3TablesCatalog: - properties = {"s3tables.table-bucket-arn": table_bucket_arn, "s3tables.region": aws_region} + properties = {"s3tables.warehouse": table_bucket_arn, "s3tables.region": aws_region} return S3TablesCatalog(name="test_s3tables_catalog", **properties) def test_creating_catalog_validates_s3_table_bucket_exists(table_bucket_arn: str) -> None: - properties = {"s3tables.table-bucket-arn": f"{table_bucket_arn}-modified", "s3tables.region": "us-east-1"} + properties = {"s3tables.warehouse": f"{table_bucket_arn}-modified", "s3tables.region": "us-east-1"} with pytest.raises(TableBucketNotFound): S3TablesCatalog(name="test_s3tables_catalog", **properties) From 9c828e3cb5368b4abbf3cd347d74a71abb56c3e8 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Tue, 7 Jan 2025 18:09:03 +0100 Subject: [PATCH 49/71] Apply suggestions from code review Co-authored-by: Honah J. 
--- pyiceberg/catalog/s3tables.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 4d22807687..1854dbfc9b 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -40,7 +40,7 @@ S3TABLES_ENDPOINT = "s3tables.endpoint" # for naming rules see: https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-tables-buckets-naming.html -S3TABLES_VALID_NAME_REGEX = pattern = re.compile("[a-z0-9][a-z0-9_]{2,62}") +S3TABLES_VALID_NAME_REGEX = pattern = re.compile("[a-z0-9][a-z0-9_]{1,61}[a-z0-9]") S3TABLES_RESERVED_NAMESPACE = "aws_s3_metadata" S3TABLES_FORMAT = "ICEBERG" From 517f31d90e11f891299c9ffb59ada44b49bd4497 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Tue, 7 Jan 2025 18:13:19 +0100 Subject: [PATCH 50/71] feat: add link to naming-rules for invalid name errors --- pyiceberg/catalog/s3tables.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 1854dbfc9b..4d89108d8d 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -117,7 +117,7 @@ def _validate_namespace_identifier(self, namespace: Union[str, Identifier]) -> s namespace = self.identifier_to_database(namespace) if not S3TABLES_VALID_NAME_REGEX.fullmatch(namespace) or namespace == S3TABLES_RESERVED_NAMESPACE: - raise InvalidNamespaceName("The specified namespace name is not valid.") + raise InvalidNamespaceName("The specified namespace name is not valid. See https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-tables-buckets-naming.html for naming rules.") return namespace @@ -127,7 +127,7 @@ def _validate_database_and_table_identifier(self, identifier: Union[str, Identif namespace = self._validate_namespace_identifier(namespace) if not S3TABLES_VALID_NAME_REGEX.fullmatch(table_name): - raise InvalidTableName("The specified table name is not valid.") + raise InvalidTableName("The specified table name is not valid. See https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-tables-buckets-naming.html for naming rules.") return namespace, table_name From 83739d5a9e0547729e9ef52d336a6f14c4d841de Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Tue, 7 Jan 2025 18:22:14 +0100 Subject: [PATCH 51/71] feat: delete s3 table if writing new_table_metadata is unsuccessful --- pyiceberg/catalog/s3tables.py | 63 ++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 4d89108d8d..2cfdf9595b 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -150,9 +150,9 @@ def create_table( # S3 Tables API and then write the new metadata.json to the warehouseLocation associated with the newly # created S3 Table. try: - self.s3tables.create_table( + version_token = self.s3tables.create_table( tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name, format=S3TABLES_FORMAT - ) + )["versionToken"] except self.s3tables.exceptions.NotFoundException as e: raise NoSuchNamespaceError(f"Cannot create {namespace}.{table_name} because no such namespace exists.") from e except self.s3tables.exceptions.ConflictException as e: @@ -160,36 +160,39 @@ def create_table( f"Cannot create {namespace}.{table_name} because a table of the same name already exists in the namespace." 
) from e - response = self.s3tables.get_table_metadata_location(tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name) - version_token = response["versionToken"] - - warehouse_location = response["warehouseLocation"] - metadata_location = self._get_metadata_location(location=warehouse_location) - metadata = new_table_metadata( - location=warehouse_location, - schema=schema, - partition_spec=partition_spec, - sort_order=sort_order, - properties=properties, - ) - - io = load_file_io(properties=self.properties, location=metadata_location) - # this triggers unsupported list operation error as S3 Table Buckets only support a subset of the S3 Bucket API, - # setting overwrite=True is a workaround for now since it prevents a call to list_objects - self._write_metadata(metadata, io, metadata_location, overwrite=True) - try: - self.s3tables.update_table_metadata_location( - tableBucketARN=self.table_bucket_arn, - namespace=namespace, - name=table_name, - versionToken=version_token, - metadataLocation=metadata_location, + response = self.s3tables.get_table_metadata_location(tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name) + warehouse_location = response["warehouseLocation"] + + metadata_location = self._get_metadata_location(location=warehouse_location) + metadata = new_table_metadata( + location=warehouse_location, + schema=schema, + partition_spec=partition_spec, + sort_order=sort_order, + properties=properties, ) - except self.s3tables.exceptions.ConflictException as e: - raise CommitFailedException( - f"Cannot create {namespace}.{table_name} because of a concurrent update to the table version {version_token}." - ) from e + + io = load_file_io(properties=self.properties, location=metadata_location) + # this triggers unsupported list operation error as S3 Table Buckets only support a subset of the S3 Bucket API, + # setting overwrite=True is a workaround for now since it prevents a call to list_objects + self._write_metadata(metadata, io, metadata_location, overwrite=True) + + try: + self.s3tables.update_table_metadata_location( + tableBucketARN=self.table_bucket_arn, + namespace=namespace, + name=table_name, + versionToken=version_token, + metadataLocation=metadata_location, + ) + except self.s3tables.exceptions.ConflictException as e: + raise CommitFailedException( + f"Cannot create {namespace}.{table_name} because of a concurrent update to the table version {version_token}." + ) from e + except: + self.s3tables.delete_table(tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name) + raise return self.load_table(identifier=identifier) From 1475f5b8548340b937e5e70c1dbd9d65d6d68b4e Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Tue, 7 Jan 2025 18:22:38 +0100 Subject: [PATCH 52/71] chore: run linter --- pyiceberg/catalog/s3tables.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 2cfdf9595b..1e6b8585d6 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -117,7 +117,9 @@ def _validate_namespace_identifier(self, namespace: Union[str, Identifier]) -> s namespace = self.identifier_to_database(namespace) if not S3TABLES_VALID_NAME_REGEX.fullmatch(namespace) or namespace == S3TABLES_RESERVED_NAMESPACE: - raise InvalidNamespaceName("The specified namespace name is not valid. 
See https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-tables-buckets-naming.html for naming rules.") + raise InvalidNamespaceName( + "The specified namespace name is not valid. See https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-tables-buckets-naming.html for naming rules." + ) return namespace @@ -127,7 +129,9 @@ def _validate_database_and_table_identifier(self, identifier: Union[str, Identif namespace = self._validate_namespace_identifier(namespace) if not S3TABLES_VALID_NAME_REGEX.fullmatch(table_name): - raise InvalidTableName("The specified table name is not valid. See https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-tables-buckets-naming.html for naming rules.") + raise InvalidTableName( + "The specified table name is not valid. See https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-tables-buckets-naming.html for naming rules." + ) return namespace, table_name @@ -161,7 +165,9 @@ def create_table( ) from e try: - response = self.s3tables.get_table_metadata_location(tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name) + response = self.s3tables.get_table_metadata_location( + tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name + ) warehouse_location = response["warehouseLocation"] metadata_location = self._get_metadata_location(location=warehouse_location) From 9ceea4b32896dbe9ea1f38f9f86669b74408da44 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Tue, 7 Jan 2025 19:15:06 +0100 Subject: [PATCH 53/71] test: rename test_s3tables.py -> integration_test_s3tables.py --- ..._s3tables.py => integration_test_s3tables.py} | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) rename tests/catalog/{test_s3tables.py => integration_test_s3tables.py} (93%) diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/integration_test_s3tables.py similarity index 93% rename from tests/catalog/test_s3tables.py rename to tests/catalog/integration_test_s3tables.py index dc3bb88238..7b880bee31 100644 --- a/tests/catalog/test_s3tables.py +++ b/tests/catalog/integration_test_s3tables.py @@ -20,19 +20,25 @@ def table_name(table_name: str) -> str: @pytest.fixture def table_bucket_arn() -> str: + """Set the environment variable AWS_TEST_S3_TABLE_BUCKET_ARN for an S3 table bucket to test.""" import os - # since the moto library does not support s3tables as of 2024-12-14 we have to test against a real AWS endpoint - # in one of the supported regions. 
- - return os.environ["ARN"] + table_bucket_arn = os.getenv("AWS_TEST_S3_TABLE_BUCKET_ARN") + if not table_bucket_arn: + raise ValueError( + "Please specify a table bucket arn to run the test by setting environment variable AWS_TEST_S3_TABLE_BUCKET_ARN" + ) + return table_bucket_arn @pytest.fixture def aws_region() -> str: import os - return os.environ["AWS_REGION"] + aws_region = os.getenv("AWS_REGION") + if not aws_region: + raise ValueError("Please specify an AWS region to run the test by setting environment variable AWS_REGION") + return aws_region @pytest.fixture From 930cc3e96e5f13a282089d3e01710eeff924a1f0 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Wed, 8 Jan 2025 20:19:53 +0100 Subject: [PATCH 54/71] fix: add license to files --- pyiceberg/catalog/s3tables.py | 16 ++++++++++++++++ tests/catalog/integration_test_s3tables.py | 16 ++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 1e6b8585d6..a577613c92 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -1,3 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. import re from typing import TYPE_CHECKING, List, Optional, Set, Tuple, Union diff --git a/tests/catalog/integration_test_s3tables.py b/tests/catalog/integration_test_s3tables.py index 7b880bee31..351f530122 100644 --- a/tests/catalog/integration_test_s3tables.py +++ b/tests/catalog/integration_test_s3tables.py @@ -1,3 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
import pytest from pyiceberg.catalog.s3tables import S3TablesCatalog From 73cf92211a07988f8ceae75cb1015a6ce08310fe Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Thu, 9 Jan 2025 08:56:43 +0100 Subject: [PATCH 55/71] fix: raise error when creating a table during a transaction --- pyiceberg/catalog/s3tables.py | 79 ++++++++++++++-------- tests/catalog/integration_test_s3tables.py | 36 ++++++++++ 2 files changed, 86 insertions(+), 29 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index a577613c92..14c249448e 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -35,7 +35,7 @@ from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec from pyiceberg.schema import Schema from pyiceberg.serializers import FromInputFile -from pyiceberg.table import CommitTableResponse, Table +from pyiceberg.table import CommitTableResponse, CreateTableTransaction, Table from pyiceberg.table.metadata import new_table_metadata from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder from pyiceberg.table.update import TableRequirement, TableUpdate @@ -92,36 +92,57 @@ def commit_table( table_identifier = table.name() database_name, table_name = self.identifier_to_database_and_table(table_identifier, NoSuchTableError) - current_table, version_token = self._load_table_and_version(identifier=table_identifier) - - updated_staged_table = self._update_and_stage_table(current_table, table_identifier, requirements, updates) - if current_table and updated_staged_table.metadata == current_table.metadata: - # no changes, do nothing - return CommitTableResponse(metadata=current_table.metadata, metadata_location=current_table.metadata_location) - - self._write_metadata( - metadata=updated_staged_table.metadata, - io=updated_staged_table.io, - metadata_path=updated_staged_table.metadata_location, - overwrite=True, - ) - - # try to update metadata location which will fail if the versionToken changed meanwhile + current_table: Optional[Table] + version_token: Optional[str] try: - self.s3tables.update_table_metadata_location( - tableBucketARN=self.table_bucket_arn, - namespace=database_name, - name=table_name, - versionToken=version_token, - metadataLocation=updated_staged_table.metadata_location, + current_table, version_token = self._load_table_and_version(identifier=table_identifier) + except NoSuchTableError: + current_table = None + version_token = None + + if current_table: + updated_staged_table = self._update_and_stage_table(current_table, table_identifier, requirements, updates) + if updated_staged_table.metadata == current_table.metadata: + # no changes, do nothing + return CommitTableResponse(metadata=current_table.metadata, metadata_location=current_table.metadata_location) + + self._write_metadata( + metadata=updated_staged_table.metadata, + io=updated_staged_table.io, + metadata_path=updated_staged_table.metadata_location, + overwrite=True, ) - except self.s3tables.exceptions.ConflictException as e: - raise CommitFailedException( - f"Cannot commit {database_name}.{table_name} because of a concurrent update to the table version {version_token}." 
- ) from e - return CommitTableResponse( - metadata=updated_staged_table.metadata, metadata_location=updated_staged_table.metadata_location - ) + + # try to update metadata location which will fail if the versionToken changed meanwhile + try: + self.s3tables.update_table_metadata_location( + tableBucketARN=self.table_bucket_arn, + namespace=database_name, + name=table_name, + versionToken=version_token, + metadataLocation=updated_staged_table.metadata_location, + ) + except self.s3tables.exceptions.ConflictException as e: + raise CommitFailedException( + f"Cannot commit {database_name}.{table_name} because of a concurrent update to the table version {version_token}." + ) from e + return CommitTableResponse( + metadata=updated_staged_table.metadata, metadata_location=updated_staged_table.metadata_location + ) + else: + # table does not exist, create it + raise NotImplementedError("Creating a table on commit is currently not supported.") + + def create_table_transaction( + self, + identifier: Union[str, Identifier], + schema: Union[Schema, "pa.Schema"], + location: Optional[str] = None, + partition_spec: PartitionSpec = UNPARTITIONED_PARTITION_SPEC, + sort_order: SortOrder = UNSORTED_SORT_ORDER, + properties: Properties = EMPTY_DICT, + ) -> CreateTableTransaction: + raise NotImplementedError("create_table_transaction currently not supported.") def create_namespace(self, namespace: Union[str, Identifier], properties: Properties = EMPTY_DICT) -> None: if properties: diff --git a/tests/catalog/integration_test_s3tables.py b/tests/catalog/integration_test_s3tables.py index 351f530122..e803b5a2b9 100644 --- a/tests/catalog/integration_test_s3tables.py +++ b/tests/catalog/integration_test_s3tables.py @@ -18,7 +18,9 @@ from pyiceberg.catalog.s3tables import S3TablesCatalog from pyiceberg.exceptions import NoSuchNamespaceError, NoSuchTableError, TableBucketNotFound +from pyiceberg.partitioning import PartitionField, PartitionSpec from pyiceberg.schema import Schema +from pyiceberg.transforms import IdentityTransform from pyiceberg.types import IntegerType @@ -247,3 +249,37 @@ def test_commit_new_data_to_table(catalog: S3TablesCatalog, database_name: str, assert updated_table_metadata.metadata_log[-1].metadata_file == original_table_metadata_location assert updated_table_metadata.metadata_log[-1].timestamp_ms == original_table_last_updated_ms assert table.scan().to_arrow().num_rows == 2 * row_count + + +def test_create_table_transaction( + catalog: S3TablesCatalog, database_name: str, table_name: str, table_schema_nested: str +) -> None: + identifier = (database_name, table_name) + catalog.create_namespace(namespace=database_name) + + with catalog.create_table_transaction( + identifier, + table_schema_nested, + partition_spec=PartitionSpec(PartitionField(source_id=1, field_id=1000, transform=IdentityTransform(), name="foo")), + ) as txn: + last_updated_metadata = txn.table_metadata.last_updated_ms + with txn.update_schema() as update_schema: + update_schema.add_column(path="b", field_type=IntegerType()) + + with txn.update_spec() as update_spec: + update_spec.add_identity("bar") + + txn.set_properties(test_a="test_aa", test_b="test_b", test_c="test_c") + + table = catalog.load_table(identifier) + + assert table.schema().find_field("b").field_type == IntegerType() + assert table.properties == {"test_a": "test_aa", "test_b": "test_b", "test_c": "test_c"} + assert table.spec().last_assigned_field_id == 1001 + assert table.spec().fields_by_source_id(1)[0].name == "foo" + assert 
table.spec().fields_by_source_id(1)[0].field_id == 1000
+    assert table.spec().fields_by_source_id(1)[0].transform == IdentityTransform()
+    assert table.spec().fields_by_source_id(2)[0].name == "bar"
+    assert table.spec().fields_by_source_id(2)[0].field_id == 1001
+    assert table.spec().fields_by_source_id(2)[0].transform == IdentityTransform()
+    assert table.metadata.last_updated_ms > last_updated_metadata

From bbc5706f9a17d671b01f1408aa641a16a00297ad Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Thu, 9 Jan 2025 09:01:26 +0100
Subject: [PATCH 56/71] test: mark create_table_transaction test with xfail

---
 tests/catalog/integration_test_s3tables.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/catalog/integration_test_s3tables.py b/tests/catalog/integration_test_s3tables.py
index e803b5a2b9..8cfcbe7b44 100644
--- a/tests/catalog/integration_test_s3tables.py
+++ b/tests/catalog/integration_test_s3tables.py
@@ -251,6 +251,7 @@ def test_commit_new_data_to_table(catalog: S3TablesCatalog, database_name: str,
     assert table.scan().to_arrow().num_rows == 2 * row_count


+@pytest.mark.xfail(raises=NotImplementedError, reason="create_table_transaction not implemented yet")
 def test_create_table_transaction(
     catalog: S3TablesCatalog, database_name: str, table_name: str, table_schema_nested: str
 ) -> None:
     identifier = (database_name, table_name)
     catalog.create_namespace(namespace=database_name)

From bad0eb516ca7fce5a3186db15f58d809ff7b8e3d Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Sat, 1 Feb 2025 19:59:02 +0100
Subject: [PATCH 57/71] feat: raise NotImplementedError for view_exists

---
 pyiceberg/catalog/s3tables.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py
index 14c249448e..16da5346e0 100644
--- a/pyiceberg/catalog/s3tables.py
+++ b/pyiceberg/catalog/s3tables.py
@@ -373,3 +373,6 @@ def drop_view(self, identifier: Union[str, Identifier]) -> None:

     def list_views(self, namespace: Union[str, Identifier]) -> List[Identifier]:
         raise NotImplementedError
+
+    def view_exists(self, identifier: Union[str, Identifier]) -> bool:
+        raise NotImplementedError

From 38c4e6f23060c9fad4f4150b15c5a802274bc802 Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Sun, 2 Feb 2025 12:21:57 +0100
Subject: [PATCH 58/71] test: use moto server for s3tables tests

---
 pyiceberg/catalog/__init__.py  |  11 ++
 tests/catalog/test_s3tables.py | 307 +++++++++++++++++++++++++++++++++
 2 files changed, 318 insertions(+)
 create mode 100644 tests/catalog/test_s3tables.py

diff --git a/pyiceberg/catalog/__init__.py b/pyiceberg/catalog/__init__.py
index 8fe73d4369..6a11eda5ca 100644
--- a/pyiceberg/catalog/__init__.py
+++ b/pyiceberg/catalog/__init__.py
@@ -119,6 +119,7 @@ class CatalogType(Enum):
     DYNAMODB = "dynamodb"
     SQL = "sql"
     IN_MEMORY = "in-memory"
+    S3TABLES = "s3tables"


 def load_rest(name: str, conf: Properties) -> Catalog:
@@ -174,6 +175,15 @@ def load_in_memory(name: str, conf: Properties) -> Catalog:
         raise NotInstalledError("SQLAlchemy support not installed: pip install 'pyiceberg[sql-sqlite]'") from exc


+def load_s3tables(name: str, conf: Properties) -> Catalog:
+    try:
+        from pyiceberg.catalog.s3tables import S3TablesCatalog
+
+        return S3TablesCatalog(name, **conf)
+    except ImportError as exc:
+        raise NotInstalledError("AWS S3Tables support not installed: pip install 'pyiceberg[s3tables]'") from exc
+
+
 AVAILABLE_CATALOGS: dict[CatalogType, Callable[[str, Properties], Catalog]] = {
     CatalogType.REST: load_rest,
     CatalogType.HIVE: load_hive,
     CatalogType.GLUE: load_glue,
     CatalogType.DYNAMODB: 
load_dynamodb,
     CatalogType.SQL: load_sql,
     CatalogType.IN_MEMORY: load_in_memory,
+    CatalogType.S3TABLES: load_s3tables,
 }

diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py
new file mode 100644
index 0000000000..8ae0ff1da0
--- /dev/null
+++ b/tests/catalog/test_s3tables.py
@@ -0,0 +1,307 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import string
+from random import choice
+
+import boto3
+import pytest
+
+from pyiceberg.catalog import load_catalog
+from pyiceberg.catalog.s3tables import S3TablesCatalog
+from pyiceberg.exceptions import NoSuchNamespaceError, NoSuchTableError, TableBucketNotFound
+from pyiceberg.partitioning import PartitionField, PartitionSpec
+from pyiceberg.schema import Schema
+from pyiceberg.transforms import IdentityTransform
+from pyiceberg.types import IntegerType
+
+
+@pytest.fixture
+def database_name(database_name: str) -> str:
+    # naming rules prevent "-" in namespaces for s3 table buckets
+    return database_name.replace("-", "_")
+
+
+@pytest.fixture
+def table_name(table_name: str) -> str:
+    # naming rules prevent "-" in table names for s3 table buckets
+    return table_name.replace("-", "_")
+
+
+@pytest.fixture
+def aws_region() -> str:
+    return "us-east-1"
+
+
+@pytest.fixture
+def table_bucket_arn(monkeypatch: pytest.MonkeyPatch, moto_endpoint_url: str) -> str:
+    monkeypatch.setenv("AWS_ENDPOINT_URL", moto_endpoint_url)
+
+    prefix = "pyiceberg-table-bucket-"
+    random_tag = "".join(choice(string.ascii_letters) for _ in range(12))
+    name = (prefix + random_tag).lower()
+    table_bucket_arn = boto3.client("s3tables", endpoint_url=moto_endpoint_url).create_table_bucket(name=name)["arn"]
+    return table_bucket_arn
+
+
+@pytest.fixture(params=["pyiceberg.io.fsspec.FsspecFileIO", "pyiceberg.io.pyarrow.PyArrowFileIO"])
+def file_io_impl(request: pytest.FixtureRequest) -> str:
+    return request.param
+
+
+@pytest.fixture
+def catalog(table_bucket_arn: str, aws_region: str, moto_endpoint_url: str, file_io_impl: str) -> S3TablesCatalog:
+    properties = {
+        "s3tables.warehouse": table_bucket_arn,
+        "s3tables.region": aws_region,
+        "py-io-impl": file_io_impl,
+        "s3tables.endpoint": moto_endpoint_url,
+        "s3.endpoint": moto_endpoint_url,
+    }
+    return S3TablesCatalog(name="test_s3tables_catalog", **properties)
+
+
+def test_load_catalog(table_bucket_arn: str, aws_region: str, moto_endpoint_url: str) -> None:
+    properties = {
+        "type": "s3tables",
+        "s3tables.warehouse": table_bucket_arn,
+        "s3tables.region": aws_region,
+        "py-io-impl": "pyiceberg.io.pyarrow.PyArrowFileIO",
+        "s3tables.endpoint": moto_endpoint_url,
+    }
+    catalog = load_catalog(**properties)
+    assert isinstance(catalog, S3TablesCatalog)
+
+
+def test_creating_catalog_validates_s3_table_bucket_exists(table_bucket_arn: str) -> None:
+    properties = 
{"s3tables.warehouse": f"{table_bucket_arn}-modified", "s3tables.region": "us-east-1"} + with pytest.raises(TableBucketNotFound): + S3TablesCatalog(name="test_s3tables_catalog", **properties) + + +def test_create_namespace(catalog: S3TablesCatalog, database_name: str) -> None: + catalog.create_namespace(namespace=database_name) + namespaces = catalog.list_namespaces() + assert (database_name,) in namespaces + + +def test_load_namespace_properties(catalog: S3TablesCatalog, database_name: str) -> None: + catalog.create_namespace(namespace=database_name) + assert database_name in catalog.load_namespace_properties(database_name)["namespace"] + + +def test_drop_namespace(catalog: S3TablesCatalog, database_name: str) -> None: + catalog.create_namespace(namespace=database_name) + assert (database_name,) in catalog.list_namespaces() + catalog.drop_namespace(namespace=database_name) + assert (database_name,) not in catalog.list_namespaces() + + +def test_create_table(catalog: S3TablesCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: + identifier = (database_name, table_name) + + catalog.create_namespace(namespace=database_name) + table = catalog.create_table(identifier=identifier, schema=table_schema_nested) + + assert table == catalog.load_table(identifier) + + +def test_create_table_in_invalid_namespace_raises_exception( + catalog: S3TablesCatalog, database_name: str, table_name: str, table_schema_nested: Schema +) -> None: + identifier = (database_name, table_name) + + with pytest.raises(NoSuchNamespaceError): + catalog.create_table(identifier=identifier, schema=table_schema_nested) + + +def test_table_exists(catalog: S3TablesCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: + identifier = (database_name, table_name) + + catalog.create_namespace(namespace=database_name) + assert not catalog.table_exists(identifier=identifier) + catalog.create_table(identifier=identifier, schema=table_schema_nested) + assert catalog.table_exists(identifier=identifier) + + +def test_rename_table(catalog: S3TablesCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: + identifier = (database_name, table_name) + + catalog.create_namespace(namespace=database_name) + catalog.create_table(identifier=identifier, schema=table_schema_nested) + + to_database_name = f"{database_name}new" + to_table_name = f"{table_name}new" + to_identifier = (to_database_name, to_table_name) + catalog.create_namespace(namespace=to_database_name) + catalog.rename_table(from_identifier=identifier, to_identifier=to_identifier) + + assert not catalog.table_exists(identifier=identifier) + assert catalog.table_exists(identifier=to_identifier) + + +def test_list_tables(catalog: S3TablesCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: + identifier = (database_name, table_name) + + catalog.create_namespace(namespace=database_name) + assert not catalog.list_tables(namespace=database_name) + catalog.create_table(identifier=identifier, schema=table_schema_nested) + assert catalog.list_tables(namespace=database_name) + + +def test_drop_table(catalog: S3TablesCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: + identifier = (database_name, table_name) + + catalog.create_namespace(namespace=database_name) + catalog.create_table(identifier=identifier, schema=table_schema_nested) + + catalog.drop_table(identifier=identifier) + + with pytest.raises(NoSuchTableError): + 
catalog.load_table(identifier=identifier) + + +def test_commit_new_column_to_table( + catalog: S3TablesCatalog, database_name: str, table_name: str, table_schema_nested: Schema +) -> None: + identifier = (database_name, table_name) + + catalog.create_namespace(namespace=database_name) + table = catalog.create_table(identifier=identifier, schema=table_schema_nested) + + last_updated_ms = table.metadata.last_updated_ms + original_table_metadata_location = table.metadata_location + original_table_last_updated_ms = table.metadata.last_updated_ms + + transaction = table.transaction() + update = transaction.update_schema() + update.add_column(path="b", field_type=IntegerType()) + update.commit() + transaction.commit_transaction() + + updated_table_metadata = table.metadata + assert updated_table_metadata.current_schema_id == 1 + assert len(updated_table_metadata.schemas) == 2 + assert updated_table_metadata.last_updated_ms > last_updated_ms + assert updated_table_metadata.metadata_log[0].metadata_file == original_table_metadata_location + assert updated_table_metadata.metadata_log[0].timestamp_ms == original_table_last_updated_ms + assert table.schema().columns[-1].name == "b" + + +def test_write_pyarrow_table(catalog: S3TablesCatalog, database_name: str, table_name: str) -> None: + identifier = (database_name, table_name) + catalog.create_namespace(namespace=database_name) + + import pyarrow as pa + + pyarrow_table = pa.Table.from_arrays( + [ + pa.array([None, "A", "B", "C"]), # 'foo' column + pa.array([1, 2, 3, 4]), # 'bar' column + pa.array([True, None, False, True]), # 'baz' column + pa.array([None, "A", "B", "C"]), # 'large' column + ], + schema=pa.schema( + [ + pa.field("foo", pa.large_string(), nullable=True), + pa.field("bar", pa.int32(), nullable=False), + pa.field("baz", pa.bool_(), nullable=True), + pa.field("large", pa.large_string(), nullable=True), + ] + ), + ) + table = catalog.create_table(identifier=identifier, schema=pyarrow_table.schema) + table.append(pyarrow_table) + + assert table.scan().to_arrow().num_rows == pyarrow_table.num_rows + + +def test_commit_new_data_to_table(catalog: S3TablesCatalog, database_name: str, table_name: str) -> None: + identifier = (database_name, table_name) + catalog.create_namespace(namespace=database_name) + + import pyarrow as pa + + pyarrow_table = pa.Table.from_arrays( + [ + pa.array([None, "A", "B", "C"]), # 'foo' column + pa.array([1, 2, 3, 4]), # 'bar' column + pa.array([True, None, False, True]), # 'baz' column + pa.array([None, "A", "B", "C"]), # 'large' column + ], + schema=pa.schema( + [ + pa.field("foo", pa.large_string(), nullable=True), + pa.field("bar", pa.int32(), nullable=False), + pa.field("baz", pa.bool_(), nullable=True), + pa.field("large", pa.large_string(), nullable=True), + ] + ), + ) + + table = catalog.create_table(identifier=identifier, schema=pyarrow_table.schema) + table.append(pyarrow_table) + + row_count = table.scan().to_arrow().num_rows + assert row_count + last_updated_ms = table.metadata.last_updated_ms + original_table_metadata_location = table.metadata_location + original_table_last_updated_ms = table.metadata.last_updated_ms + + transaction = table.transaction() + transaction.append(table.scan().to_arrow()) + transaction.commit_transaction() + + updated_table_metadata = table.metadata + assert updated_table_metadata.last_updated_ms > last_updated_ms + assert updated_table_metadata.metadata_log[-1].metadata_file == original_table_metadata_location + assert updated_table_metadata.metadata_log[-1].timestamp_ms 
== original_table_last_updated_ms
+    assert table.scan().to_arrow().num_rows == 2 * row_count
+
+
+@pytest.mark.xfail(raises=NotImplementedError)
+def test_create_table_transaction(
+    catalog: S3TablesCatalog, database_name: str, table_name: str, table_schema_nested: Schema
+) -> None:
+    identifier = (database_name, table_name)
+    catalog.create_namespace(namespace=database_name)
+
+    with catalog.create_table_transaction(
+        identifier,
+        table_schema_nested,
+        partition_spec=PartitionSpec(PartitionField(source_id=1, field_id=1000, transform=IdentityTransform(), name="foo")),
+    ) as txn:
+        last_updated_metadata = txn.table_metadata.last_updated_ms
+        with txn.update_schema() as update_schema:
+            update_schema.add_column(path="b", field_type=IntegerType())
+
+        with txn.update_spec() as update_spec:
+            update_spec.add_identity("bar")
+
+        txn.set_properties(test_a="test_aa", test_b="test_b", test_c="test_c")
+
+    table = catalog.load_table(identifier)
+
+    assert table.schema().find_field("b").field_type == IntegerType()
+    assert table.properties == {"test_a": "test_aa", "test_b": "test_b", "test_c": "test_c"}
+    assert table.spec().last_assigned_field_id == 1001
+    assert table.spec().fields_by_source_id(1)[0].name == "foo"
+    assert table.spec().fields_by_source_id(1)[0].field_id == 1000
+    assert table.spec().fields_by_source_id(1)[0].transform == IdentityTransform()
+    assert table.spec().fields_by_source_id(2)[0].name == "bar"
+    assert table.spec().fields_by_source_id(2)[0].field_id == 1001
+    assert table.spec().fields_by_source_id(2)[0].transform == IdentityTransform()
+    assert table.metadata.last_updated_ms > last_updated_metadata

From 937d6af2c79553cededa64e769afcf6b3b18fdb1 Mon Sep 17 00:00:00 2001
From: Felix Scherz
Date: Sun, 2 Feb 2025 13:54:49 +0100
Subject: [PATCH 59/71] docs: add s3tables catalog

---
 mkdocs/docs/configuration.md | 98 ++++++++++++++++++++++++++++++++++++
 1 file changed, 98 insertions(+)

diff --git a/mkdocs/docs/configuration.md b/mkdocs/docs/configuration.md
index 4ae767b5db..07f5ce75cc 100644
--- a/mkdocs/docs/configuration.md
+++ b/mkdocs/docs/configuration.md
@@ -543,6 +543,104 @@
+### S3Tables Catalog
+
+The S3Tables Catalog uses the built-in catalog functionality of the Amazon S3 Tables service and requires an existing S3 table bucket to operate.
+
+To use Amazon S3 Tables as your catalog, you can configure PyIceberg using one of the following methods. Additionally, refer to the [AWS documentation](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html) on configuring credentials to set up your AWS account credentials locally.
+
+If you intend to use the same credentials for both the S3Tables Catalog and S3 FileIO, you can configure the [`client.*` properties](configuration.md#unified-aws-credentials) to streamline the process.
+
+Note that the S3Tables Catalog manages the underlying table locations internally, which makes it incompatible with S3-compatible storage systems such as MinIO. If you specify a custom `s3tables.endpoint`, make sure that `s3.endpoint` is configured accordingly.
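+
+For a quick start without a configuration file, the same settings can be passed to `load_catalog` at runtime. This is a minimal sketch: the bucket ARN below is a placeholder for an existing table bucket, and `type: s3tables` resolves to the S3TablesCatalog as exercised by the test suite's `test_load_catalog`.
+
+```python
+from pyiceberg.catalog import load_catalog
+
+catalog = load_catalog(
+    "default",
+    **{
+        "type": "s3tables",
+        "s3tables.warehouse": "arn:aws:s3tables:us-east-1:012345678901:bucket/pyiceberg-catalog",
+        "s3tables.region": "us-east-1",
+    },
+)
+```
+
+Alternatively, set the options in your `.pyiceberg.yaml`: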
+
+```yaml
+catalog:
+  default:
+    type: s3tables
+    warehouse: arn:aws:s3tables:us-east-1:012345678901:bucket/pyiceberg-catalog
+```
+
+If you prefer to pass the credentials explicitly to the client instead of relying on environment variables, configure them directly:
+
+```yaml
+catalog:
+  default:
+    type: s3tables
+    s3tables.access-key-id:
+    s3tables.secret-access-key:
+    s3tables.session-token:
+    s3tables.region:
+    s3tables.endpoint: http://localhost:9000
+    s3.endpoint: http://localhost:9000
+```
+
+!!! Note "Client-specific Properties"
+    `s3tables.*` properties are for the S3TablesCatalog only. If you want to use the same credentials for both the S3TablesCatalog and S3 FileIO, you can set the `client.*` properties. See the [Unified AWS Credentials](configuration.md#unified-aws-credentials) section for more details.
+
+| Key                        | Example             | Description                                                                 |
+| -------------------------- | ------------------- | --------------------------------------------------------------------------- |
+| s3tables.profile-name      | default             | Configure the static profile used to access the S3Tables Catalog            |
+| s3tables.region            | us-east-1           | Set the region of the S3Tables Catalog                                      |
+| s3tables.access-key-id     | admin               | Configure the static access key id used to access the S3Tables Catalog      |
+| s3tables.secret-access-key | password            | Configure the static secret access key used to access the S3Tables Catalog  |
+| s3tables.session-token     | AQoDYXdzEJr...      | Configure the static session token used to access the S3Tables Catalog      |
+| s3tables.endpoint          | ...                 | Configure a custom endpoint for the S3 Tables service                       |
+| s3tables.warehouse         | arn:aws:s3tables... | Set the ARN of the underlying S3 table bucket                               |
+
+!!! warning "Removed Properties"
+    The properties `profile_name`, `region_name`, `aws_access_key_id`, `aws_secret_access_key`, and `aws_session_token` were deprecated and removed in 0.8.0.
+
+An example usage of the S3Tables Catalog is shown below:
+
+```python
+from pyiceberg.catalog.s3tables import S3TablesCatalog
+import pyarrow as pa
+
+
+table_bucket_arn: str = "..."
+aws_region: str = "..."
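+# NOTE: both values above are placeholders. The table bucket must already
+# exist: the catalog calls get_table_bucket on construction and raises
+# TableBucketNotFound if the ARN cannot be resolved.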
+ +properties = {"s3tables.warehouse": table_bucket_arn, "s3tables.region": aws_region} +catalog = S3TablesCatalog(name="s3tables_catalog", **properties) + +database_name = "prod" + +catalog.create_namespace(namespace=database_name) + +pyarrow_table = pa.Table.from_arrays( + [ + pa.array([None, "A", "B", "C"]), + pa.array([1, 2, 3, 4]), + pa.array([True, None, False, True]), + pa.array([None, "A", "B", "C"]), + ], + schema=pa.schema( + [ + pa.field("foo", pa.large_string(), nullable=True), + pa.field("bar", pa.int32(), nullable=False), + pa.field("baz", pa.bool_(), nullable=True), + pa.field("large", pa.large_string(), nullable=True), + ] + ), +) + +identifier = (database_name, "orders") +table = catalog.create_table(identifier=identifier, schema=pyarrow_table.schema) +table.append(pyarrow_table) +``` + ### Custom Catalog Implementations If you want to load any custom catalog implementation, you can set catalog configurations like the following: From cf03cba575ce01988ed9c45d6a40c0107cae08f4 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Mon, 3 Feb 2025 09:05:34 +0100 Subject: [PATCH 60/71] chore: bump moto library --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b56976cd6b..4a0e8877c7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -89,7 +89,7 @@ pre-commit = "4.1.0" fastavro = "1.10.0" coverage = { version = "^7.4.2", extras = ["toml"] } requests-mock = "1.12.1" -moto = { version = "^5.0.2", extras = ["server"] } +moto = { version = "^5.0.28", extras = ["server"] } typing-extensions = "4.12.2" pytest-mock = "3.14.0" pyspark = "3.5.3" From af6bce75245f97ed99ae9148fbebe0f8599a1079 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Wed, 12 Feb 2025 08:28:04 +0100 Subject: [PATCH 61/71] test: set region when creating table bucket --- tests/catalog/test_s3tables.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py index 8ae0ff1da0..85891559f3 100644 --- a/tests/catalog/test_s3tables.py +++ b/tests/catalog/test_s3tables.py @@ -47,13 +47,15 @@ def aws_region() -> str: @pytest.fixture -def table_bucket_arn(monkeypatch: pytest.MonkeyPatch, moto_endpoint_url: str) -> str: +def table_bucket_arn(monkeypatch: pytest.MonkeyPatch, moto_endpoint_url: str, aws_region: str) -> str: monkeypatch.setenv("AWS_ENDPOINT_URL", moto_endpoint_url) prefix = "pyiceberg-table-bucket-" random_tag = "".join(choice(string.ascii_letters) for _ in range(12)) name = (prefix + random_tag).lower() - table_bucket_arn = boto3.client("s3tables", endpoint_url=moto_endpoint_url).create_table_bucket(name=name)["arn"] + table_bucket_arn = boto3.client("s3tables", endpoint_url=moto_endpoint_url, region_name=aws_region).create_table_bucket( + name=name + )["arn"] return table_bucket_arn From 6af33919bacc8493897d63396230aaadaaaae4c6 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Wed, 12 Feb 2025 15:56:00 +0100 Subject: [PATCH 62/71] test: mock aws credentials --- tests/catalog/test_s3tables.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py index 85891559f3..7120702605 100644 --- a/tests/catalog/test_s3tables.py +++ b/tests/catalog/test_s3tables.py @@ -41,8 +41,8 @@ def table_name(table_name: str) -> str: return table_name.replace("-", "_") -@pytest.fixture -def aws_region() -> str: +@pytest.fixture() +def aws_region(_aws_credentials: None) -> str: return "us-east-1" From 
018bf0b0c84b068671027f5f34f1cca4fdac6564 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Wed, 12 Feb 2025 16:17:02 +0100 Subject: [PATCH 63/71] chore: update poetry lock with s3tables --- poetry.lock | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index 766259dcda..c8ef1ba7c7 100644 --- a/poetry.lock +++ b/poetry.lock @@ -447,7 +447,7 @@ files = [ {file = "boto3-1.36.3-py3-none-any.whl", hash = "sha256:f9843a5d06f501d66ada06f5a5417f671823af2cf319e36ceefa1bafaaaaa953"}, {file = "boto3-1.36.3.tar.gz", hash = "sha256:53a5307f6a3526ee2f8590e3c45efa504a3ea4532c1bfe4926c0c19bf188d141"}, ] -markers = {main = "extra == \"glue\" or extra == \"dynamodb\" or extra == \"rest-sigv4\""} +markers = {main = "extra == \"glue\" or extra == \"dynamodb\" or extra == \"rest-sigv4\" or extra == \"s3tables\""} [package.dependencies] botocore = ">=1.36.3,<1.37.0" @@ -468,7 +468,7 @@ files = [ {file = "botocore-1.36.3-py3-none-any.whl", hash = "sha256:536ab828e6f90dbb000e3702ac45fd76642113ae2db1b7b1373ad24104e89255"}, {file = "botocore-1.36.3.tar.gz", hash = "sha256:775b835e979da5c96548ed1a0b798101a145aec3cd46541d62e27dda5a94d7f8"}, ] -markers = {main = "extra == \"glue\" or extra == \"dynamodb\" or extra == \"rest-sigv4\" or extra == \"s3fs\""} +markers = {main = "extra == \"glue\" or extra == \"dynamodb\" or extra == \"rest-sigv4\" or extra == \"s3tables\" or extra == \"s3fs\""} [package.dependencies] jmespath = ">=0.7.1,<2.0.0" @@ -2117,7 +2117,7 @@ files = [ {file = "jmespath-1.0.1-py3-none-any.whl", hash = "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980"}, {file = "jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe"}, ] -markers = {main = "extra == \"glue\" or extra == \"dynamodb\" or extra == \"rest-sigv4\" or extra == \"s3fs\""} +markers = {main = "extra == \"glue\" or extra == \"dynamodb\" or extra == \"rest-sigv4\" or extra == \"s3tables\" or extra == \"s3fs\""} [[package]] name = "joserfc" @@ -2161,6 +2161,8 @@ python-versions = "*" groups = ["dev"] files = [ {file = "jsonpath-ng-1.7.0.tar.gz", hash = "sha256:f6f5f7fd4e5ff79c785f1573b394043b39849fb2bb47bcead935d12b00beab3c"}, + {file = "jsonpath_ng-1.7.0-py2-none-any.whl", hash = "sha256:898c93fc173f0c336784a3fa63d7434297544b7198124a68f9a3ef9597b0ae6e"}, + {file = "jsonpath_ng-1.7.0-py3-none-any.whl", hash = "sha256:f3d7f9e848cba1b6da28c55b1c26ff915dc9e0b1ba7e752a53d6da8d5cbd00b6"}, ] [package.dependencies] @@ -3598,6 +3600,7 @@ files = [ {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:bb89f0a835bcfc1d42ccd5f41f04870c1b936d8507c6df12b7737febc40f0909"}, {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:f0c2d907a1e102526dd2986df638343388b94c33860ff3bbe1384130828714b1"}, {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f8157bed2f51db683f31306aa497311b560f2265998122abe1dce6428bd86567"}, + {file = "psycopg2_binary-2.9.10-cp313-cp313-win_amd64.whl", hash = "sha256:27422aa5f11fbcd9b18da48373eb67081243662f9b46e6fd07c3eb46e4535142"}, {file = "psycopg2_binary-2.9.10-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:eb09aa7f9cecb45027683bb55aebaaf45a0df8bf6de68801a6afdc7947bb09d4"}, {file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b73d6d7f0ccdad7bc43e6d34273f70d587ef62f824d7261c4ae9b8b1b6af90e8"}, {file = 
"psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ce5ab4bf46a211a8e924d307c1b1fcda82368586a19d0a24f8ae166f5c784864"}, @@ -4702,7 +4705,7 @@ files = [ {file = "s3transfer-0.11.1-py3-none-any.whl", hash = "sha256:8fa0aa48177be1f3425176dfe1ab85dcd3d962df603c3dbfc585e6bf857ef0ff"}, {file = "s3transfer-0.11.1.tar.gz", hash = "sha256:3f25c900a367c8b7f7d8f9c34edc87e300bde424f779dc9f0a8ae4f9df9264f6"}, ] -markers = {main = "extra == \"glue\" or extra == \"dynamodb\" or extra == \"rest-sigv4\""} +markers = {main = "extra == \"glue\" or extra == \"dynamodb\" or extra == \"rest-sigv4\" or extra == \"s3tables\""} [package.dependencies] botocore = ">=1.36.0,<2.0a.0" @@ -5636,4 +5639,4 @@ zstandard = ["zstandard"] [metadata] lock-version = "2.1" python-versions = "^3.9.2, !=3.9.7" -content-hash = "f8d052c8ef957bbb66f94093ef9dfa5805073cc938bba282c957a814c605a1f2" +content-hash = "28e0424e2b560e9c27dfab9515df6b880555b8ae7f56eda9335d53394aee2537" From f738e5947e0b7441fc51667ad3a320c9eca24371 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sun, 16 Feb 2025 16:21:57 -0800 Subject: [PATCH 64/71] use new locationprovider for metadata location --- pyiceberg/catalog/s3tables.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 16da5346e0..c9cc7d972a 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -36,6 +36,7 @@ from pyiceberg.schema import Schema from pyiceberg.serializers import FromInputFile from pyiceberg.table import CommitTableResponse, CreateTableTransaction, Table +from pyiceberg.table.locations import load_location_provider from pyiceberg.table.metadata import new_table_metadata from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder from pyiceberg.table.update import TableRequirement, TableUpdate @@ -207,7 +208,8 @@ def create_table( ) warehouse_location = response["warehouseLocation"] - metadata_location = self._get_metadata_location(location=warehouse_location) + provider = load_location_provider(warehouse_location, properties) + metadata_location = provider.new_table_metadata_file_location() metadata = new_table_metadata( location=warehouse_location, schema=schema, From df97205b33127c2ab83c0551dab18a73c40e41c2 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sun, 16 Feb 2025 17:00:53 -0800 Subject: [PATCH 65/71] fix test region --- tests/catalog/integration_test_s3tables.py | 4 ++-- tests/catalog/test_s3tables.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/catalog/integration_test_s3tables.py b/tests/catalog/integration_test_s3tables.py index 8cfcbe7b44..7fd134b23c 100644 --- a/tests/catalog/integration_test_s3tables.py +++ b/tests/catalog/integration_test_s3tables.py @@ -65,8 +65,8 @@ def catalog(table_bucket_arn: str, aws_region: str) -> S3TablesCatalog: return S3TablesCatalog(name="test_s3tables_catalog", **properties) -def test_creating_catalog_validates_s3_table_bucket_exists(table_bucket_arn: str) -> None: - properties = {"s3tables.warehouse": f"{table_bucket_arn}-modified", "s3tables.region": "us-east-1"} +def test_creating_catalog_validates_s3_table_bucket_exists(table_bucket_arn: str, aws_region: str) -> None: + properties = {"s3tables.warehouse": f"{table_bucket_arn}-modified", "s3tables.region": aws_region} with pytest.raises(TableBucketNotFound): S3TablesCatalog(name="test_s3tables_catalog", **properties) diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py index 
7120702605..d380b9453a 100644 --- a/tests/catalog/test_s3tables.py +++ b/tests/catalog/test_s3tables.py @@ -88,8 +88,8 @@ def test_load_catalog(table_bucket_arn: str, aws_region: str, moto_endpoint_url: assert isinstance(catalog, S3TablesCatalog) -def test_creating_catalog_validates_s3_table_bucket_exists(table_bucket_arn: str) -> None: - properties = {"s3tables.warehouse": f"{table_bucket_arn}-modified", "s3tables.region": "us-east-1"} +def test_creating_catalog_validates_s3_table_bucket_exists(table_bucket_arn: str, aws_region: str) -> None: + properties = {"s3tables.warehouse": f"{table_bucket_arn}-modified", "s3tables.region": aws_region} with pytest.raises(TableBucketNotFound): S3TablesCatalog(name="test_s3tables_catalog", **properties) From fe90a006e2758e2abbf29cd4f3dd040ce2bba498 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sat, 22 Feb 2025 14:04:57 +0100 Subject: [PATCH 66/71] fix: use f-string to raise version error --- pyiceberg/catalog/s3tables.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index c9cc7d972a..fd3ad5044b 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -80,7 +80,7 @@ def __init__(self, name: str, **properties: str): try: self.s3tables = session.client("s3tables", endpoint_url=properties.get(S3TABLES_ENDPOINT)) except boto3.session.UnknownServiceError as e: - raise S3TablesError("'s3tables' requires boto3>=1.35.74. Current version: {boto3.__version__}.") from e + raise S3TablesError(f"'s3tables' requires boto3>=1.35.74. Current version: {boto3.__version__}.") from e try: self.s3tables.get_table_bucket(tableBucketARN=self.table_bucket_arn) From 1e6c4d7a6c38d370a6032b87e603b75f89d79c5d Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sun, 23 Feb 2025 00:42:23 +0100 Subject: [PATCH 67/71] fix: raise appropriate NoSuchNamespaceError --- pyiceberg/catalog/s3tables.py | 12 +++++++++--- tests/catalog/test_s3tables.py | 10 ++++++++++ 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index fd3ad5044b..1d7b011418 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -284,13 +284,19 @@ def list_tables(self, namespace: Union[str, Identifier]) -> List[Identifier]: namespace = self._validate_namespace_identifier(namespace) paginator = self.s3tables.get_paginator("list_tables") tables: List[Identifier] = [] - for page in paginator.paginate(tableBucketARN=self.table_bucket_arn, namespace=namespace): - tables.extend((namespace, table["name"]) for table in page["tables"]) + try: + for page in paginator.paginate(tableBucketARN=self.table_bucket_arn, namespace=namespace): + tables.extend((namespace, table["name"]) for table in page["tables"]) + except self.s3tables.exceptions.NotFoundException as e: + raise NoSuchNamespaceError(f"Namespace {namespace} does not exist.") from e return tables def load_namespace_properties(self, namespace: Union[str, Identifier]) -> Properties: namespace = self._validate_namespace_identifier(namespace) - response = self.s3tables.get_namespace(tableBucketARN=self.table_bucket_arn, namespace=namespace) + try: + response = self.s3tables.get_namespace(tableBucketARN=self.table_bucket_arn, namespace=namespace) + except self.s3tables.exceptions.NotFoundException as e: + raise NoSuchNamespaceError(f"Namespace {namespace} does not exist.") from e return { "namespace": response["namespace"], "createdAt": response["createdAt"], diff --git 
a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py index d380b9453a..f391917204 100644 --- a/tests/catalog/test_s3tables.py +++ b/tests/catalog/test_s3tables.py @@ -105,6 +105,11 @@ def test_load_namespace_properties(catalog: S3TablesCatalog, database_name: str) assert database_name in catalog.load_namespace_properties(database_name)["namespace"] +def test_load_namespace_properties_for_invalid_namespace_raises_exception(catalog: S3TablesCatalog, database_name: str) -> None: + with pytest.raises(NoSuchNamespaceError): + catalog.load_namespace_properties(database_name) + + def test_drop_namespace(catalog: S3TablesCatalog, database_name: str) -> None: catalog.create_namespace(namespace=database_name) assert (database_name,) in catalog.list_namespaces() @@ -164,6 +169,11 @@ def test_list_tables(catalog: S3TablesCatalog, database_name: str, table_name: s assert catalog.list_tables(namespace=database_name) +def test_list_tables_for_invalid_namespace_raises_exception(catalog: S3TablesCatalog, database_name: str) -> None: + with pytest.raises(NoSuchNamespaceError): + catalog.list_tables(namespace=database_name) + + def test_drop_table(catalog: S3TablesCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: identifier = (database_name, table_name) From fb68b5c8952697630c026b3becc1b34e8830a0be Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sun, 23 Feb 2025 00:52:26 +0100 Subject: [PATCH 68/71] fix: check for table bucket arn set via properties --- pyiceberg/catalog/s3tables.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 1d7b011418..496426a851 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -67,6 +67,8 @@ class S3TablesCatalog(MetastoreCatalog): def __init__(self, name: str, **properties: str): super().__init__(name, **properties) + if S3TABLES_TABLE_BUCKET_ARN not in self.properties: + raise S3TablesError(f"No table bucket arn specified. 
Set it via the {S3TABLES_TABLE_BUCKET_ARN} property.") self.table_bucket_arn = self.properties[S3TABLES_TABLE_BUCKET_ARN] session = boto3.Session( From c789235c1dc832d869061da4ceda43d0ab61aba5 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Sun, 23 Feb 2025 12:52:27 +0100 Subject: [PATCH 69/71] fix: support purge_table operation in favor of delete_table --- pyiceberg/catalog/s3tables.py | 42 +++++++++++++++++----------------- tests/catalog/test_s3tables.py | 17 ++++++++++++-- 2 files changed, 36 insertions(+), 23 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 496426a851..4f5eeb3f46 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -251,24 +251,9 @@ def drop_namespace(self, namespace: Union[str, Identifier]) -> None: raise NamespaceNotEmptyError(f"Namespace {namespace} is not empty.") from e def drop_table(self, identifier: Union[str, Identifier]) -> None: - namespace, table_name = self._validate_database_and_table_identifier(identifier) - try: - response = self.s3tables.get_table(tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name) - except self.s3tables.exceptions.NotFoundException as e: - raise NoSuchTableError(f"No table with identifier {identifier} exists.") from e - - version_token = response["versionToken"] - try: - self.s3tables.delete_table( - tableBucketARN=self.table_bucket_arn, - namespace=namespace, - name=table_name, - versionToken=version_token, - ) - except self.s3tables.exceptions.ConflictException as e: - raise CommitFailedException( - f"Cannot delete {namespace}.{table_name} because of a concurrent update to the table version {version_token}." - ) from e + raise NotImplementedError( + "S3 Tables does not support the delete_table operation. You can retry with the purge_table operation." + ) def list_namespaces(self, namespace: Union[str, Identifier] = ()) -> List[Identifier]: if namespace: @@ -371,9 +356,24 @@ def update_namespace_properties( raise NotImplementedError("Namespace properties are read only.") def purge_table(self, identifier: Union[str, Identifier]) -> None: - # purge is not supported as s3tables doesn't support delete operations - # table maintenance is automated - raise NotImplementedError + namespace, table_name = self._validate_database_and_table_identifier(identifier) + try: + response = self.s3tables.get_table(tableBucketARN=self.table_bucket_arn, namespace=namespace, name=table_name) + except self.s3tables.exceptions.NotFoundException as e: + raise NoSuchTableError(f"No table with identifier {identifier} exists.") from e + + version_token = response["versionToken"] + try: + self.s3tables.delete_table( + tableBucketARN=self.table_bucket_arn, + namespace=namespace, + name=table_name, + versionToken=version_token, + ) + except self.s3tables.exceptions.ConflictException as e: + raise CommitFailedException( + f"Cannot delete {namespace}.{table_name} because of a concurrent update to the table version {version_token}." 
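+                # The versionToken implements optimistic concurrency: a
+                # ConflictException means another writer updated the table after
+                # the token was fetched, so S3 Tables rejects the delete.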
+ ) from e def register_table(self, identifier: Union[str, Identifier], metadata_location: str) -> Table: raise NotImplementedError diff --git a/tests/catalog/test_s3tables.py b/tests/catalog/test_s3tables.py index f391917204..3b744cf152 100644 --- a/tests/catalog/test_s3tables.py +++ b/tests/catalog/test_s3tables.py @@ -174,13 +174,26 @@ def test_list_tables_for_invalid_namespace_raises_exception(catalog: S3TablesCat catalog.list_tables(namespace=database_name) -def test_drop_table(catalog: S3TablesCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: +def test_delete_table_raises_not_implemented( + catalog: S3TablesCatalog, database_name: str, table_name: str, table_schema_nested: Schema +) -> None: + identifier = (database_name, table_name) + + catalog.create_namespace(namespace=database_name) + catalog.create_table(identifier=identifier, schema=table_schema_nested) + + with pytest.raises(NotImplementedError) as exc: + catalog.drop_table(identifier=identifier) + exc.match("S3 Tables does not support the delete_table operation. You can retry with the purge_table operation.") + + +def test_purge_table(catalog: S3TablesCatalog, database_name: str, table_name: str, table_schema_nested: Schema) -> None: identifier = (database_name, table_name) catalog.create_namespace(namespace=database_name) catalog.create_table(identifier=identifier, schema=table_schema_nested) - catalog.drop_table(identifier=identifier) + catalog.purge_table(identifier=identifier) with pytest.raises(NoSuchTableError): catalog.load_table(identifier=identifier) From c64370d5ee1b6eba7a79950abbe12e236525697a Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Thu, 27 Feb 2025 11:55:35 +0100 Subject: [PATCH 70/71] chore: remove deprecated_botocore_session --- pyiceberg/catalog/s3tables.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pyiceberg/catalog/s3tables.py b/pyiceberg/catalog/s3tables.py index 4f5eeb3f46..50581c11ba 100644 --- a/pyiceberg/catalog/s3tables.py +++ b/pyiceberg/catalog/s3tables.py @@ -19,7 +19,7 @@ import boto3 -from pyiceberg.catalog import DEPRECATED_BOTOCORE_SESSION, MetastoreCatalog, PropertiesUpdateSummary +from pyiceberg.catalog import MetastoreCatalog, PropertiesUpdateSummary from pyiceberg.exceptions import ( CommitFailedException, InvalidNamespaceName, @@ -74,7 +74,6 @@ def __init__(self, name: str, **properties: str): session = boto3.Session( profile_name=properties.get(S3TABLES_PROFILE_NAME), region_name=get_first_property_value(properties, S3TABLES_REGION, AWS_REGION), - botocore_session=properties.get(DEPRECATED_BOTOCORE_SESSION), aws_access_key_id=get_first_property_value(properties, S3TABLES_ACCESS_KEY_ID, AWS_ACCESS_KEY_ID), aws_secret_access_key=get_first_property_value(properties, S3TABLES_SECRET_ACCESS_KEY, AWS_SECRET_ACCESS_KEY), aws_session_token=get_first_property_value(properties, S3TABLES_SESSION_TOKEN, AWS_SESSION_TOKEN), From b5465affe99b1a174b4b1c236cca1d2f670efb48 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Fri, 14 Mar 2025 19:08:23 +0100 Subject: [PATCH 71/71] docs: database_name -> namespace to clear up the concept --- mkdocs/docs/configuration.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mkdocs/docs/configuration.md b/mkdocs/docs/configuration.md index f2cb6c2f15..645d18f645 100644 --- a/mkdocs/docs/configuration.md +++ b/mkdocs/docs/configuration.md @@ -622,9 +622,9 @@ aws_region: str = "..." 
properties = {"s3tables.warehouse": table_bucket_arn, "s3tables.region": aws_region} catalog = S3TablesCatalog(name="s3tables_catalog", **properties) -database_name = "prod" +namespace = "prod" -catalog.create_namespace(namespace=database_name) +catalog.create_namespace(namespace=namespace) pyarrow_table = pa.Table.from_arrays( [ @@ -643,7 +643,7 @@ pyarrow_table = pa.Table.from_arrays( ), ) -identifier = (database_name, "orders") +identifier = (namespace, "orders") table = catalog.create_table(identifier=identifier, schema=pyarrow_table.schema) table.append(pyarrow_table) ```
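+
+To verify the append, you can scan the table back into Arrow; this one-line check mirrors the assertion used in the test suite:
+
+```python
+assert table.scan().to_arrow().num_rows == pyarrow_table.num_rows
+```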