From 39928c6b6d05b990e25b69fe0b93b948efb3d946 Mon Sep 17 00:00:00 2001 From: Drew Gallardo Date: Mon, 10 Feb 2025 22:49:15 -0800 Subject: [PATCH 01/10] Add support for write.metadata.path --- mkdocs/docs/configuration.md | 1 + pyiceberg/catalog/__init__.py | 31 +++++++++++++++++++++++++----- pyiceberg/catalog/dynamodb.py | 2 +- pyiceberg/catalog/sql.py | 2 +- pyiceberg/table/__init__.py | 5 +++++ pyiceberg/table/update/snapshot.py | 29 +++++----------------------- 6 files changed, 39 insertions(+), 31 deletions(-) diff --git a/mkdocs/docs/configuration.md b/mkdocs/docs/configuration.md index 55a0fd9f88..50cc725e6d 100644 --- a/mkdocs/docs/configuration.md +++ b/mkdocs/docs/configuration.md @@ -68,6 +68,7 @@ Iceberg tables support table properties to configure table behavior. | `write.object-storage.partitioned-paths` | Boolean | True | Controls whether [partition values are included in file paths](configuration.md#partition-exclusion) when object storage is enabled | | `write.py-location-provider.impl` | String of form `module.ClassName` | null | Optional, [custom `LocationProvider`](configuration.md#loading-a-custom-location-provider) implementation | | `write.data.path` | String pointing to location | `{metadata.location}/data` | Sets the location under which data is written. | +| `write.metadata.path` | String pointing to location | `{metadata.location}/metadata` | Sets the location under which metadata is written. | ### Table behavior options diff --git a/pyiceberg/catalog/__init__.py b/pyiceberg/catalog/__init__.py index a39f6bc711..de6f8ae66d 100644 --- a/pyiceberg/catalog/__init__.py +++ b/pyiceberg/catalog/__init__.py @@ -759,6 +759,24 @@ def _convert_schema_if_needed(schema: Union[Schema, "pa.Schema"]) -> Schema: pass raise ValueError(f"{type(schema)=}, but it must be pyiceberg.schema.Schema or pyarrow.Schema") + @staticmethod + def metadata_file_location(table_location: str, file_name: str, properties: Properties = EMPTY_DICT) -> str: + """Get the full path for a metadata file. + + Args: + table_location (str): The base table location + file_name (str): Name of the metadata file + properties (Properties): Table properties that may contain custom metadata path + + Returns: + str: Full path where the metadata file should be stored + """ + if metadata_path := properties.get(TableProperties.WRITE_METADATA_PATH): + base_path = metadata_path.rstrip("/") + else: + base_path = f"{table_location}/metadata" + return f"{base_path}/{file_name}" + @staticmethod def _delete_old_metadata(io: FileIO, base: TableMetadata, metadata: TableMetadata) -> None: """Delete oldest metadata if config is set to true.""" @@ -857,7 +875,7 @@ def _create_staged_table( database_name, table_name = self.identifier_to_database_and_table(identifier) location = self._resolve_table_location(location, database_name, table_name) - metadata_location = self._get_metadata_location(location=location) + metadata_location = self._get_metadata_location(table_location=location, properties=properties) metadata = new_table_metadata( location=location, schema=schema, partition_spec=partition_spec, sort_order=sort_order, properties=properties ) @@ -888,7 +906,9 @@ def _update_and_stage_table( ) new_metadata_version = self._parse_metadata_version(current_table.metadata_location) + 1 if current_table else 0 - new_metadata_location = self._get_metadata_location(updated_metadata.location, new_metadata_version) + new_metadata_location = self._get_metadata_location( + updated_metadata.location, new_metadata_version, updated_metadata.properties + ) return StagedTable( identifier=table_identifier, @@ -946,11 +966,12 @@ def _write_metadata(metadata: TableMetadata, io: FileIO, metadata_path: str) -> ToOutputFile.table_metadata(metadata, io.new_output(metadata_path)) @staticmethod - def _get_metadata_location(location: str, new_version: int = 0) -> str: + def _get_metadata_location(table_location: str, new_version: int = 0, properties: Properties = EMPTY_DICT) -> str: if new_version < 0: raise ValueError(f"Table metadata version: `{new_version}` must be a non-negative integer") - version_str = f"{new_version:05d}" - return f"{location}/metadata/{version_str}-{uuid.uuid4()}.metadata.json" + + file_name = f"{new_version:05d}-{uuid.uuid4()}.metadata.json" + return Catalog.metadata_file_location(table_location, file_name, properties) @staticmethod def _parse_metadata_version(metadata_location: str) -> int: diff --git a/pyiceberg/catalog/dynamodb.py b/pyiceberg/catalog/dynamodb.py index 5ed01df7f5..a96c8e7f19 100644 --- a/pyiceberg/catalog/dynamodb.py +++ b/pyiceberg/catalog/dynamodb.py @@ -173,7 +173,7 @@ def create_table( database_name, table_name = self.identifier_to_database_and_table(identifier) location = self._resolve_table_location(location, database_name, table_name) - metadata_location = self._get_metadata_location(location=location) + metadata_location = self._get_metadata_location(table_location=location, properties=properties) metadata = new_table_metadata( location=location, schema=schema, partition_spec=partition_spec, sort_order=sort_order, properties=properties ) diff --git a/pyiceberg/catalog/sql.py b/pyiceberg/catalog/sql.py index c72587c028..cc31d3dec3 100644 --- a/pyiceberg/catalog/sql.py +++ b/pyiceberg/catalog/sql.py @@ -207,7 +207,7 @@ def create_table( namespace = Catalog.namespace_to_string(namespace_identifier) location = self._resolve_table_location(location, namespace, table_name) - metadata_location = self._get_metadata_location(location=location) + metadata_location = self._get_metadata_location(table_location=location, properties=properties) metadata = new_table_metadata( location=location, schema=schema, partition_spec=partition_spec, sort_order=sort_order, properties=properties ) diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index f16aa28844..59b3e11e65 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -200,6 +200,7 @@ class TableProperties: WRITE_OBJECT_STORE_PARTITIONED_PATHS_DEFAULT = True WRITE_DATA_PATH = "write.data.path" + WRITE_METADATA_PATH = "write.metadata.path" DELETE_MODE = "write.delete.mode" DELETE_MODE_COPY_ON_WRITE = "copy-on-write" @@ -1236,6 +1237,10 @@ def to_polars(self) -> pl.LazyFrame: return pl.scan_iceberg(self) + def metadata_file_location(self, file_name: str) -> str: + """Get the metadata file location using write.metadata.path from properties if set.""" + return self.catalog.metadata_file_location(self.metadata.location, file_name, self.metadata.properties) + class StaticTable(Table): """Load a table directly from a metadata file (i.e., without using a catalog).""" diff --git a/pyiceberg/table/update/snapshot.py b/pyiceberg/table/update/snapshot.py index c0d0056e7c..c8ee80faab 100644 --- a/pyiceberg/table/update/snapshot.py +++ b/pyiceberg/table/update/snapshot.py @@ -84,16 +84,6 @@ from pyiceberg.table import Transaction -def _new_manifest_path(location: str, num: int, commit_uuid: uuid.UUID) -> str: - return f"{location}/metadata/{commit_uuid}-m{num}.avro" - - -def _generate_manifest_list_path(location: str, snapshot_id: int, attempt: int, commit_uuid: uuid.UUID) -> str: - # Mimics the behavior in Java: - # https://github.com/apache/iceberg/blob/c862b9177af8e2d83122220764a056f3b96fd00c/core/src/main/java/org/apache/iceberg/SnapshotProducer.java#L491 - return f"{location}/metadata/snap-{snapshot_id}-{attempt}-{commit_uuid}.avro" - - class _SnapshotProducer(UpdateTableMetadata[U], Generic[U]): commit_uuid: uuid.UUID _io: FileIO @@ -243,13 +233,8 @@ def _commit(self) -> UpdatesAndRequirements: next_sequence_number = self._transaction.table_metadata.next_sequence_number() summary = self._summary(self.snapshot_properties) - - manifest_list_file_path = _generate_manifest_list_path( - location=self._transaction.table_metadata.location, - snapshot_id=self._snapshot_id, - attempt=0, - commit_uuid=self.commit_uuid, - ) + file_name = f"{self.commit_uuid}-m{self._snapshot_id}-a0.avro" + manifest_list_file_path = self._transaction._table.metadata_file_location(file_name) with write_manifest_list( format_version=self._transaction.table_metadata.format_version, output_file=self._io.new_output(manifest_list_file_path), @@ -295,13 +280,9 @@ def new_manifest_writer(self, spec: PartitionSpec) -> ManifestWriter: ) def new_manifest_output(self) -> OutputFile: - return self._io.new_output( - _new_manifest_path( - location=self._transaction.table_metadata.location, - num=next(self._manifest_num_counter), - commit_uuid=self.commit_uuid, - ) - ) + file_name = f"{self.commit_uuid}-m{next(self._manifest_num_counter)}.avro" + file_path = self._transaction._table.metadata_file_location(file_name) + return self._io.new_output(file_path) def fetch_manifest_entry(self, manifest: ManifestFile, discard_deleted: bool = True) -> List[ManifestEntry]: return manifest.fetch_manifest_entry(io=self._io, discard_deleted=discard_deleted) From f4c075c4b67059f2226ac542193aa2972339e430 Mon Sep 17 00:00:00 2001 From: Drew Gallardo Date: Tue, 11 Feb 2025 12:26:03 -0800 Subject: [PATCH 02/10] Address PR comments --- pyiceberg/catalog/__init__.py | 20 +-------------- pyiceberg/table/__init__.py | 19 ++++++++++++--- pyiceberg/table/update/snapshot.py | 26 +++++++++++++++++--- tests/catalog/test_base.py | 39 ++++++++++++++++++++++++++++++ 4 files changed, 78 insertions(+), 26 deletions(-) diff --git a/pyiceberg/catalog/__init__.py b/pyiceberg/catalog/__init__.py index de6f8ae66d..b36121dfca 100644 --- a/pyiceberg/catalog/__init__.py +++ b/pyiceberg/catalog/__init__.py @@ -759,24 +759,6 @@ def _convert_schema_if_needed(schema: Union[Schema, "pa.Schema"]) -> Schema: pass raise ValueError(f"{type(schema)=}, but it must be pyiceberg.schema.Schema or pyarrow.Schema") - @staticmethod - def metadata_file_location(table_location: str, file_name: str, properties: Properties = EMPTY_DICT) -> str: - """Get the full path for a metadata file. - - Args: - table_location (str): The base table location - file_name (str): Name of the metadata file - properties (Properties): Table properties that may contain custom metadata path - - Returns: - str: Full path where the metadata file should be stored - """ - if metadata_path := properties.get(TableProperties.WRITE_METADATA_PATH): - base_path = metadata_path.rstrip("/") - else: - base_path = f"{table_location}/metadata" - return f"{base_path}/{file_name}" - @staticmethod def _delete_old_metadata(io: FileIO, base: TableMetadata, metadata: TableMetadata) -> None: """Delete oldest metadata if config is set to true.""" @@ -971,7 +953,7 @@ def _get_metadata_location(table_location: str, new_version: int = 0, properties raise ValueError(f"Table metadata version: `{new_version}` must be a non-negative integer") file_name = f"{new_version:05d}-{uuid.uuid4()}.metadata.json" - return Catalog.metadata_file_location(table_location, file_name, properties) + return Table.metadata_file_location(table_location, file_name, properties) @staticmethod def _parse_metadata_version(metadata_location: str) -> int: diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index 59b3e11e65..ab97f76237 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -1237,9 +1237,22 @@ def to_polars(self) -> pl.LazyFrame: return pl.scan_iceberg(self) - def metadata_file_location(self, file_name: str) -> str: - """Get the metadata file location using write.metadata.path from properties if set.""" - return self.catalog.metadata_file_location(self.metadata.location, file_name, self.metadata.properties) + @staticmethod + def metadata_file_location(table_location: str, file_name: str, properties: Properties = EMPTY_DICT) -> str: + """Get the full path for a metadata file. + + Args: + table_location (str): The base table location + file_name (str): Name of the metadata file + properties (Properties): Table properties that may contain custom metadata path + + Returns: + str: Full path where the metadata file should be stored + """ + if metadata_path := properties.get(TableProperties.WRITE_METADATA_PATH): + return f"{metadata_path.rstrip("/")}/{file_name}" + + return f"{table_location}/metadata/{file_name}" class StaticTable(Table): diff --git a/pyiceberg/table/update/snapshot.py b/pyiceberg/table/update/snapshot.py index c8ee80faab..863b822839 100644 --- a/pyiceberg/table/update/snapshot.py +++ b/pyiceberg/table/update/snapshot.py @@ -84,6 +84,16 @@ from pyiceberg.table import Transaction +def _new_manifest_file_name(num: int, commit_uuid: uuid.UUID) -> str: + return f"{commit_uuid}-m{num}.avro" + + +def _generate_manifest_list_file_name(snapshot_id: int, attempt: int, commit_uuid: uuid.UUID) -> str: + # Mimics the behavior in Java: + # https://github.com/apache/iceberg/blob/c862b9177af8e2d83122220764a056f3b96fd00c/core/src/main/java/org/apache/iceberg/SnapshotProducer.java#L491 + return f"snap-{snapshot_id}-{attempt}-{commit_uuid}.avro" + + class _SnapshotProducer(UpdateTableMetadata[U], Generic[U]): commit_uuid: uuid.UUID _io: FileIO @@ -233,8 +243,14 @@ def _commit(self) -> UpdatesAndRequirements: next_sequence_number = self._transaction.table_metadata.next_sequence_number() summary = self._summary(self.snapshot_properties) - file_name = f"{self.commit_uuid}-m{self._snapshot_id}-a0.avro" - manifest_list_file_path = self._transaction._table.metadata_file_location(file_name) + table_location = self._transaction.table_metadata.location + properties = self._transaction.table_metadata.properties + file_name = _generate_manifest_list_file_name( + snapshot_id=self._snapshot_id, + attempt=0, + commit_uuid=self.commit_uuid, + ) + manifest_list_file_path = self._transaction._table.metadata_file_location(table_location, file_name, properties) with write_manifest_list( format_version=self._transaction.table_metadata.format_version, output_file=self._io.new_output(manifest_list_file_path), @@ -280,8 +296,10 @@ def new_manifest_writer(self, spec: PartitionSpec) -> ManifestWriter: ) def new_manifest_output(self) -> OutputFile: - file_name = f"{self.commit_uuid}-m{next(self._manifest_num_counter)}.avro" - file_path = self._transaction._table.metadata_file_location(file_name) + table_location = self._transaction.table_metadata.location + properties = self._transaction.table_metadata.properties + file_name = _new_manifest_file_name(num=next(self._manifest_num_counter), commit_uuid=self.commit_uuid) + file_path = self._transaction._table.metadata_file_location(table_location, file_name, properties) return self._io.new_output(file_path) def fetch_manifest_entry(self, manifest: ManifestFile, discard_deleted: bool = True) -> List[ManifestEntry]: diff --git a/tests/catalog/test_base.py b/tests/catalog/test_base.py index e3532c0372..2592478d86 100644 --- a/tests/catalog/test_base.py +++ b/tests/catalog/test_base.py @@ -35,10 +35,12 @@ TableAlreadyExistsError, ) from pyiceberg.io import WAREHOUSE +from pyiceberg.io.pyarrow import schema_to_pyarrow from pyiceberg.partitioning import PartitionField, PartitionSpec from pyiceberg.schema import Schema from pyiceberg.table import ( Table, + TableProperties, ) from pyiceberg.table.update import ( AddSchemaUpdate, @@ -563,3 +565,40 @@ def test_table_properties_raise_for_none_value(catalog: InMemoryCatalog) -> None with pytest.raises(ValidationError) as exc_info: _ = given_catalog_has_a_table(catalog, properties=property_with_none) assert "None type is not a supported value in properties: property_name" in str(exc_info.value) + + +def test_table_writes_metadata_to_custom_location(catalog: InMemoryCatalog) -> None: + metadata_path = f"{catalog._warehouse_location}/custom/path" + catalog.create_namespace(TEST_TABLE_NAMESPACE) + table = catalog.create_table( + identifier=TEST_TABLE_IDENTIFIER, + schema=TEST_TABLE_SCHEMA, + partition_spec=TEST_TABLE_PARTITION_SPEC, + properties={TableProperties.WRITE_METADATA_PATH: metadata_path}, + ) + df = pa.Table.from_pylist([{"x": 123, "y": 456, "z": 789}], schema=schema_to_pyarrow(TEST_TABLE_SCHEMA)) + table.append(df) + manifests = table.current_snapshot().manifests(table.io) # type: ignore + + assert table.metadata_file_location(table.location(), "", table.properties).startswith(metadata_path) + assert manifests[0].manifest_path.startswith(metadata_path) + assert table.location() != metadata_path + assert table.metadata_location.startswith(metadata_path) + + +def test_table_writes_metadata_to_default_path(catalog: InMemoryCatalog) -> None: + catalog.create_namespace(TEST_TABLE_NAMESPACE) + table = catalog.create_table( + identifier=TEST_TABLE_IDENTIFIER, + schema=TEST_TABLE_SCHEMA, + partition_spec=TEST_TABLE_PARTITION_SPEC, + properties=TEST_TABLE_PROPERTIES, + ) + metadata_path = f"{table.location()}/metadata" + df = pa.Table.from_pylist([{"x": 123, "y": 456, "z": 789}], schema=schema_to_pyarrow(TEST_TABLE_SCHEMA)) + table.append(df) + manifests = table.current_snapshot().manifests(table.io) # type: ignore + + assert table.metadata_file_location(table.location(), "", table.properties).startswith(metadata_path) + assert manifests[0].manifest_path.startswith(metadata_path) + assert table.metadata_location.startswith(metadata_path) From 6f88749ea4d7335e4aa4242840a8fbd1fdb0b7da Mon Sep 17 00:00:00 2001 From: Drew Gallardo Date: Tue, 11 Feb 2025 17:23:30 -0800 Subject: [PATCH 03/10] move metadata file location function to Table --- pyiceberg/catalog/__init__.py | 12 ++---------- pyiceberg/catalog/dynamodb.py | 2 +- pyiceberg/catalog/sql.py | 2 +- pyiceberg/table/__init__.py | 31 +++++++++++++++++++++++++----- pyiceberg/table/update/snapshot.py | 8 ++++---- tests/catalog/test_base.py | 4 ++-- 6 files changed, 36 insertions(+), 23 deletions(-) diff --git a/pyiceberg/catalog/__init__.py b/pyiceberg/catalog/__init__.py index b36121dfca..65c8e89452 100644 --- a/pyiceberg/catalog/__init__.py +++ b/pyiceberg/catalog/__init__.py @@ -857,7 +857,7 @@ def _create_staged_table( database_name, table_name = self.identifier_to_database_and_table(identifier) location = self._resolve_table_location(location, database_name, table_name) - metadata_location = self._get_metadata_location(table_location=location, properties=properties) + metadata_location = Table.new_table_metadata_file_location(table_location=location, properties=properties) metadata = new_table_metadata( location=location, schema=schema, partition_spec=partition_spec, sort_order=sort_order, properties=properties ) @@ -888,7 +888,7 @@ def _update_and_stage_table( ) new_metadata_version = self._parse_metadata_version(current_table.metadata_location) + 1 if current_table else 0 - new_metadata_location = self._get_metadata_location( + new_metadata_location = Table.new_table_metadata_file_location( updated_metadata.location, new_metadata_version, updated_metadata.properties ) @@ -947,14 +947,6 @@ def _get_default_warehouse_location(self, database_name: str, table_name: str) - def _write_metadata(metadata: TableMetadata, io: FileIO, metadata_path: str) -> None: ToOutputFile.table_metadata(metadata, io.new_output(metadata_path)) - @staticmethod - def _get_metadata_location(table_location: str, new_version: int = 0, properties: Properties = EMPTY_DICT) -> str: - if new_version < 0: - raise ValueError(f"Table metadata version: `{new_version}` must be a non-negative integer") - - file_name = f"{new_version:05d}-{uuid.uuid4()}.metadata.json" - return Table.metadata_file_location(table_location, file_name, properties) - @staticmethod def _parse_metadata_version(metadata_location: str) -> int: """Parse the version from the metadata location. diff --git a/pyiceberg/catalog/dynamodb.py b/pyiceberg/catalog/dynamodb.py index a96c8e7f19..629a07235f 100644 --- a/pyiceberg/catalog/dynamodb.py +++ b/pyiceberg/catalog/dynamodb.py @@ -173,7 +173,7 @@ def create_table( database_name, table_name = self.identifier_to_database_and_table(identifier) location = self._resolve_table_location(location, database_name, table_name) - metadata_location = self._get_metadata_location(table_location=location, properties=properties) + metadata_location = Table.new_table_metadata_file_location(table_location=location, properties=properties) metadata = new_table_metadata( location=location, schema=schema, partition_spec=partition_spec, sort_order=sort_order, properties=properties ) diff --git a/pyiceberg/catalog/sql.py b/pyiceberg/catalog/sql.py index cc31d3dec3..edd078f57a 100644 --- a/pyiceberg/catalog/sql.py +++ b/pyiceberg/catalog/sql.py @@ -207,7 +207,7 @@ def create_table( namespace = Catalog.namespace_to_string(namespace_identifier) location = self._resolve_table_location(location, namespace, table_name) - metadata_location = self._get_metadata_location(table_location=location, properties=properties) + metadata_location = Table.new_table_metadata_file_location(table_location=location, properties=properties) metadata = new_table_metadata( location=location, schema=schema, partition_spec=partition_spec, sort_order=sort_order, properties=properties ) diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index ab97f76237..56b29ce51e 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -1238,19 +1238,40 @@ def to_polars(self) -> pl.LazyFrame: return pl.scan_iceberg(self) @staticmethod - def metadata_file_location(table_location: str, file_name: str, properties: Properties = EMPTY_DICT) -> str: - """Get the full path for a metadata file. + def new_table_metadata_file_location(table_location: str, new_version: int = 0, properties: Properties = EMPTY_DICT) -> str: + """Return a fully-qualified metadata file location for a new table version. + + Args: + table_location (str): the base table location. + new_version (int): Version number of the metadata file. + properties (Properties): Table properties that may contain a custom metadata path. + + Returns: + str: fully-qualified URI for the new table metadata file. + + Raises: + ValueError: If the version is negative. + """ + if new_version < 0: + raise ValueError(f"Table metadata version: `{new_version}` must be a non-negative integer") + + file_name = f"{new_version:05d}-{uuid.uuid4()}.metadata.json" + return Table.new_metadata_location(table_location, file_name, properties) + + @staticmethod + def new_metadata_location(table_location: str, file_name: str, properties: Properties = EMPTY_DICT) -> str: + """Return a fully-qualified metadata file location for the given filename. Args: table_location (str): The base table location file_name (str): Name of the metadata file - properties (Properties): Table properties that may contain custom metadata path + properties (Properties): Table properties that may contain a custom metadata path Returns: - str: Full path where the metadata file should be stored + str: A fully-qualified location URI for the metadata file. """ if metadata_path := properties.get(TableProperties.WRITE_METADATA_PATH): - return f"{metadata_path.rstrip("/")}/{file_name}" + return f"{metadata_path.rstrip('/')}/{file_name}" return f"{table_location}/metadata/{file_name}" diff --git a/pyiceberg/table/update/snapshot.py b/pyiceberg/table/update/snapshot.py index 863b822839..039d707bcb 100644 --- a/pyiceberg/table/update/snapshot.py +++ b/pyiceberg/table/update/snapshot.py @@ -88,7 +88,7 @@ def _new_manifest_file_name(num: int, commit_uuid: uuid.UUID) -> str: return f"{commit_uuid}-m{num}.avro" -def _generate_manifest_list_file_name(snapshot_id: int, attempt: int, commit_uuid: uuid.UUID) -> str: +def _new_manifest_list_file_name(snapshot_id: int, attempt: int, commit_uuid: uuid.UUID) -> str: # Mimics the behavior in Java: # https://github.com/apache/iceberg/blob/c862b9177af8e2d83122220764a056f3b96fd00c/core/src/main/java/org/apache/iceberg/SnapshotProducer.java#L491 return f"snap-{snapshot_id}-{attempt}-{commit_uuid}.avro" @@ -245,12 +245,12 @@ def _commit(self) -> UpdatesAndRequirements: summary = self._summary(self.snapshot_properties) table_location = self._transaction.table_metadata.location properties = self._transaction.table_metadata.properties - file_name = _generate_manifest_list_file_name( + file_name = _new_manifest_list_file_name( snapshot_id=self._snapshot_id, attempt=0, commit_uuid=self.commit_uuid, ) - manifest_list_file_path = self._transaction._table.metadata_file_location(table_location, file_name, properties) + manifest_list_file_path = self._transaction._table.new_metadata_location(table_location, file_name, properties) with write_manifest_list( format_version=self._transaction.table_metadata.format_version, output_file=self._io.new_output(manifest_list_file_path), @@ -299,7 +299,7 @@ def new_manifest_output(self) -> OutputFile: table_location = self._transaction.table_metadata.location properties = self._transaction.table_metadata.properties file_name = _new_manifest_file_name(num=next(self._manifest_num_counter), commit_uuid=self.commit_uuid) - file_path = self._transaction._table.metadata_file_location(table_location, file_name, properties) + file_path = self._transaction._table.new_metadata_location(table_location, file_name, properties) return self._io.new_output(file_path) def fetch_manifest_entry(self, manifest: ManifestFile, discard_deleted: bool = True) -> List[ManifestEntry]: diff --git a/tests/catalog/test_base.py b/tests/catalog/test_base.py index 2592478d86..556b954c20 100644 --- a/tests/catalog/test_base.py +++ b/tests/catalog/test_base.py @@ -580,7 +580,7 @@ def test_table_writes_metadata_to_custom_location(catalog: InMemoryCatalog) -> N table.append(df) manifests = table.current_snapshot().manifests(table.io) # type: ignore - assert table.metadata_file_location(table.location(), "", table.properties).startswith(metadata_path) + assert table.new_metadata_location(table.location(), "", table.properties).startswith(metadata_path) assert manifests[0].manifest_path.startswith(metadata_path) assert table.location() != metadata_path assert table.metadata_location.startswith(metadata_path) @@ -599,6 +599,6 @@ def test_table_writes_metadata_to_default_path(catalog: InMemoryCatalog) -> None table.append(df) manifests = table.current_snapshot().manifests(table.io) # type: ignore - assert table.metadata_file_location(table.location(), "", table.properties).startswith(metadata_path) + assert table.new_metadata_location(table.location(), "", table.properties).startswith(metadata_path) assert manifests[0].manifest_path.startswith(metadata_path) assert table.metadata_location.startswith(metadata_path) From 1c9f177be4bbb5a1bef78b46f69a1a3d56de427d Mon Sep 17 00:00:00 2001 From: Drew Gallardo Date: Wed, 12 Feb 2025 18:41:28 -0800 Subject: [PATCH 04/10] Move metadata path generation to the location provider --- pyiceberg/catalog/__init__.py | 9 ++++--- pyiceberg/catalog/dynamodb.py | 5 +++- pyiceberg/catalog/sql.py | 4 ++- pyiceberg/table/__init__.py | 43 ++++-------------------------- pyiceberg/table/locations.py | 38 +++++++++++++++++++++++++- pyiceberg/table/update/snapshot.py | 10 +++---- tests/catalog/test_base.py | 24 +++++++++++++++-- tests/table/test_locations.py | 24 +++++++++++++++++ 8 files changed, 104 insertions(+), 53 deletions(-) diff --git a/pyiceberg/catalog/__init__.py b/pyiceberg/catalog/__init__.py index 65c8e89452..f2dce81131 100644 --- a/pyiceberg/catalog/__init__.py +++ b/pyiceberg/catalog/__init__.py @@ -57,6 +57,7 @@ Table, TableProperties, ) +from pyiceberg.table.locations import load_location_provider from pyiceberg.table.metadata import TableMetadata, TableMetadataV1, new_table_metadata from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder from pyiceberg.table.update import ( @@ -857,7 +858,8 @@ def _create_staged_table( database_name, table_name = self.identifier_to_database_and_table(identifier) location = self._resolve_table_location(location, database_name, table_name) - metadata_location = Table.new_table_metadata_file_location(table_location=location, properties=properties) + provider = load_location_provider(location, properties) + metadata_location = provider.new_table_metadata_file_location() metadata = new_table_metadata( location=location, schema=schema, partition_spec=partition_spec, sort_order=sort_order, properties=properties ) @@ -888,9 +890,8 @@ def _update_and_stage_table( ) new_metadata_version = self._parse_metadata_version(current_table.metadata_location) + 1 if current_table else 0 - new_metadata_location = Table.new_table_metadata_file_location( - updated_metadata.location, new_metadata_version, updated_metadata.properties - ) + provider = load_location_provider(updated_metadata.location, updated_metadata.properties) + new_metadata_location = provider.new_table_metadata_file_location(new_metadata_version) return StagedTable( identifier=table_identifier, diff --git a/pyiceberg/catalog/dynamodb.py b/pyiceberg/catalog/dynamodb.py index 629a07235f..e3ce3c0e90 100644 --- a/pyiceberg/catalog/dynamodb.py +++ b/pyiceberg/catalog/dynamodb.py @@ -54,6 +54,7 @@ from pyiceberg.schema import Schema from pyiceberg.serializers import FromInputFile from pyiceberg.table import CommitTableResponse, Table +from pyiceberg.table.locations import load_location_provider from pyiceberg.table.metadata import new_table_metadata from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder from pyiceberg.table.update import ( @@ -173,7 +174,9 @@ def create_table( database_name, table_name = self.identifier_to_database_and_table(identifier) location = self._resolve_table_location(location, database_name, table_name) - metadata_location = Table.new_table_metadata_file_location(table_location=location, properties=properties) + provider = load_location_provider(table_location=location, table_properties=properties) + metadata_location = provider.new_table_metadata_file_location() + metadata = new_table_metadata( location=location, schema=schema, partition_spec=partition_spec, sort_order=sort_order, properties=properties ) diff --git a/pyiceberg/catalog/sql.py b/pyiceberg/catalog/sql.py index edd078f57a..bc11e4f907 100644 --- a/pyiceberg/catalog/sql.py +++ b/pyiceberg/catalog/sql.py @@ -62,6 +62,7 @@ from pyiceberg.schema import Schema from pyiceberg.serializers import FromInputFile from pyiceberg.table import CommitTableResponse, Table +from pyiceberg.table.locations import load_location_provider from pyiceberg.table.metadata import new_table_metadata from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder from pyiceberg.table.update import ( @@ -207,7 +208,8 @@ def create_table( namespace = Catalog.namespace_to_string(namespace_identifier) location = self._resolve_table_location(location, namespace, table_name) - metadata_location = Table.new_table_metadata_file_location(table_location=location, properties=properties) + location_provider = load_location_provider(table_location=location, table_properties=properties) + metadata_location = location_provider.new_table_metadata_file_location() metadata = new_table_metadata( location=location, schema=schema, partition_spec=partition_spec, sort_order=sort_order, properties=properties ) diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index 56b29ce51e..e57c3edfe3 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -79,6 +79,7 @@ ) from pyiceberg.schema import Schema from pyiceberg.table.inspect import InspectTable +from pyiceberg.table.locations import LocationProvider, load_location_provider from pyiceberg.table.metadata import ( INITIAL_SEQUENCE_NUMBER, TableMetadata, @@ -1001,6 +1002,10 @@ def location(self) -> str: """Return the table's base location.""" return self.metadata.location + def location_provider(self) -> LocationProvider: + """Return the table's location provider.""" + return load_location_provider(table_location=self.metadata.location, table_properties=self.metadata.properties) + @property def last_sequence_number(self) -> int: return self.metadata.last_sequence_number @@ -1237,44 +1242,6 @@ def to_polars(self) -> pl.LazyFrame: return pl.scan_iceberg(self) - @staticmethod - def new_table_metadata_file_location(table_location: str, new_version: int = 0, properties: Properties = EMPTY_DICT) -> str: - """Return a fully-qualified metadata file location for a new table version. - - Args: - table_location (str): the base table location. - new_version (int): Version number of the metadata file. - properties (Properties): Table properties that may contain a custom metadata path. - - Returns: - str: fully-qualified URI for the new table metadata file. - - Raises: - ValueError: If the version is negative. - """ - if new_version < 0: - raise ValueError(f"Table metadata version: `{new_version}` must be a non-negative integer") - - file_name = f"{new_version:05d}-{uuid.uuid4()}.metadata.json" - return Table.new_metadata_location(table_location, file_name, properties) - - @staticmethod - def new_metadata_location(table_location: str, file_name: str, properties: Properties = EMPTY_DICT) -> str: - """Return a fully-qualified metadata file location for the given filename. - - Args: - table_location (str): The base table location - file_name (str): Name of the metadata file - properties (Properties): Table properties that may contain a custom metadata path - - Returns: - str: A fully-qualified location URI for the metadata file. - """ - if metadata_path := properties.get(TableProperties.WRITE_METADATA_PATH): - return f"{metadata_path.rstrip('/')}/{file_name}" - - return f"{table_location}/metadata/{file_name}" - class StaticTable(Table): """Load a table directly from a metadata file (i.e., without using a catalog).""" diff --git a/pyiceberg/table/locations.py b/pyiceberg/table/locations.py index 0de4dc68b2..2d604abb6c 100644 --- a/pyiceberg/table/locations.py +++ b/pyiceberg/table/locations.py @@ -16,6 +16,7 @@ # under the License. import importlib import logging +import uuid from abc import ABC, abstractmethod from typing import Optional @@ -29,7 +30,7 @@ class LocationProvider(ABC): - """A base class for location providers, that provide data file locations for a table's write tasks. + """A base class for location providers, that provide file locations for a table's write tasks. Args: table_location (str): The table's base storage location. @@ -40,6 +41,7 @@ class LocationProvider(ABC): table_properties: Properties data_path: str + metadata_path: str def __init__(self, table_location: str, table_properties: Properties): self.table_location = table_location @@ -52,6 +54,11 @@ def __init__(self, table_location: str, table_properties: Properties): else: self.data_path = f"{self.table_location.rstrip('/')}/data" + if path := table_properties.get(TableProperties.WRITE_METADATA_PATH): + self.metadata_path = path.rstrip("/") + else: + self.metadata_path = f"{self.table_location.rstrip('/')}/metadata" + @abstractmethod def new_data_location(self, data_file_name: str, partition_key: Optional[PartitionKey] = None) -> str: """Return a fully-qualified data file location for the given filename. @@ -64,6 +71,35 @@ def new_data_location(self, data_file_name: str, partition_key: Optional[Partiti str: A fully-qualified location URI for the data file. """ + def new_table_metadata_file_location(self, new_version: int = 0) -> str: + """Return a fully-qualified metadata file location for a new table version. + + Args: + new_version (int): Version number of the metadata file. + + Returns: + str: fully-qualified URI for the new table metadata file. + + Raises: + ValueError: If the version is negative. + """ + if new_version < 0: + raise ValueError(f"Table metadata version: `{new_version}` must be a non-negative integer") + + file_name = f"{new_version:05d}-{uuid.uuid4()}.metadata.json" + return self.new_metadata_location(file_name) + + def new_metadata_location(self, metadata_file_name: str) -> str: + """Return a fully-qualified metadata file location for the given filename. + + Args: + metadata_file_name (str): Name of the metadata file. + + Returns: + str: A fully-qualified location URI for the metadata file. + """ + return f"{self.metadata_path}/{metadata_file_name}" + class SimpleLocationProvider(LocationProvider): def __init__(self, table_location: str, table_properties: Properties): diff --git a/pyiceberg/table/update/snapshot.py b/pyiceberg/table/update/snapshot.py index 039d707bcb..9652d738e6 100644 --- a/pyiceberg/table/update/snapshot.py +++ b/pyiceberg/table/update/snapshot.py @@ -243,14 +243,13 @@ def _commit(self) -> UpdatesAndRequirements: next_sequence_number = self._transaction.table_metadata.next_sequence_number() summary = self._summary(self.snapshot_properties) - table_location = self._transaction.table_metadata.location - properties = self._transaction.table_metadata.properties file_name = _new_manifest_list_file_name( snapshot_id=self._snapshot_id, attempt=0, commit_uuid=self.commit_uuid, ) - manifest_list_file_path = self._transaction._table.new_metadata_location(table_location, file_name, properties) + location_provider = self._transaction._table.location_provider() + manifest_list_file_path = location_provider.new_metadata_location(file_name) with write_manifest_list( format_version=self._transaction.table_metadata.format_version, output_file=self._io.new_output(manifest_list_file_path), @@ -296,10 +295,9 @@ def new_manifest_writer(self, spec: PartitionSpec) -> ManifestWriter: ) def new_manifest_output(self) -> OutputFile: - table_location = self._transaction.table_metadata.location - properties = self._transaction.table_metadata.properties + location_provider = self._transaction._table.location_provider() file_name = _new_manifest_file_name(num=next(self._manifest_num_counter), commit_uuid=self.commit_uuid) - file_path = self._transaction._table.new_metadata_location(table_location, file_name, properties) + file_path = location_provider.new_metadata_location(file_name) return self._io.new_output(file_path) def fetch_manifest_entry(self, manifest: ManifestFile, discard_deleted: bool = True) -> List[ManifestEntry]: diff --git a/tests/catalog/test_base.py b/tests/catalog/test_base.py index 556b954c20..c00f4fde95 100644 --- a/tests/catalog/test_base.py +++ b/tests/catalog/test_base.py @@ -579,8 +579,9 @@ def test_table_writes_metadata_to_custom_location(catalog: InMemoryCatalog) -> N df = pa.Table.from_pylist([{"x": 123, "y": 456, "z": 789}], schema=schema_to_pyarrow(TEST_TABLE_SCHEMA)) table.append(df) manifests = table.current_snapshot().manifests(table.io) # type: ignore + location_provider = table.location_provider() - assert table.new_metadata_location(table.location(), "", table.properties).startswith(metadata_path) + assert location_provider.new_metadata_location("").startswith(metadata_path) assert manifests[0].manifest_path.startswith(metadata_path) assert table.location() != metadata_path assert table.metadata_location.startswith(metadata_path) @@ -598,7 +599,26 @@ def test_table_writes_metadata_to_default_path(catalog: InMemoryCatalog) -> None df = pa.Table.from_pylist([{"x": 123, "y": 456, "z": 789}], schema=schema_to_pyarrow(TEST_TABLE_SCHEMA)) table.append(df) manifests = table.current_snapshot().manifests(table.io) # type: ignore + location_provider = table.location_provider() - assert table.new_metadata_location(table.location(), "", table.properties).startswith(metadata_path) + assert location_provider.new_metadata_location("").startswith(metadata_path) assert manifests[0].manifest_path.startswith(metadata_path) assert table.metadata_location.startswith(metadata_path) + + +def test_table_metadata_writes_reflect_latest_path(catalog: InMemoryCatalog) -> None: + catalog.create_namespace(TEST_TABLE_NAMESPACE) + table = catalog.create_table( + identifier=TEST_TABLE_IDENTIFIER, + schema=TEST_TABLE_SCHEMA, + partition_spec=TEST_TABLE_PARTITION_SPEC, + ) + + initial_metadata_path = f"{table.location()}/metadata" + assert table.location_provider().new_metadata_location("metadata.json") == f"{initial_metadata_path}/metadata.json" + + # update table with new path for metadata + new_metadata_path = f"{table.location()}/custom/path" + table.transaction().set_properties({TableProperties.WRITE_METADATA_PATH: new_metadata_path}).commit_transaction() + + assert table.location_provider().new_metadata_location("metadata.json") == f"{new_metadata_path}/metadata.json" diff --git a/tests/table/test_locations.py b/tests/table/test_locations.py index 490bf7103a..d66bf18792 100644 --- a/tests/table/test_locations.py +++ b/tests/table/test_locations.py @@ -157,3 +157,27 @@ def test_simple_location_provider_write_data_path() -> None: ) assert provider.new_data_location("file.parquet") == "s3://table-location/custom/data/path/file.parquet" + + +def test_location_provider_metadata_default_location() -> None: + provider = load_location_provider(table_location="table_location", table_properties=EMPTY_DICT) + + assert provider.new_metadata_location("manifest.avro") == "table_location/metadata/manifest.avro" + + +def test_location_provider_metadata_location_with_custom_path() -> None: + provider = load_location_provider( + table_location="table_location", + table_properties={TableProperties.WRITE_METADATA_PATH: "s3://table-location/custom/path"}, + ) + + assert provider.new_metadata_location("metadata.json") == "s3://table-location/custom/path/metadata.json" + + +def test_metadata_location_with_trailing_slash() -> None: + provider = load_location_provider( + table_location="table_location", + table_properties={TableProperties.WRITE_METADATA_PATH: "s3://table-location/custom/path/"}, + ) + + assert provider.new_metadata_location("metadata.json") == "s3://table-location/custom/path/metadata.json" From 76c0480466d87a64adb3caab517c17eee823de8b Mon Sep 17 00:00:00 2001 From: Drew Gallardo Date: Thu, 13 Feb 2025 12:26:15 -0800 Subject: [PATCH 05/10] update the location provider docs --- mkdocs/docs/configuration.md | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/mkdocs/docs/configuration.md b/mkdocs/docs/configuration.md index 50cc725e6d..eab52b4105 100644 --- a/mkdocs/docs/configuration.md +++ b/mkdocs/docs/configuration.md @@ -204,12 +204,15 @@ PyIceberg uses [S3FileSystem](https://arrow.apache.org/docs/python/generated/pya ## Location Providers -Apache Iceberg uses the concept of a `LocationProvider` to manage file paths for a table's data. In PyIceberg, the +Apache Iceberg uses the concept of a `LocationProvider` to manage file paths for a table's data and metadata files. In PyIceberg, the `LocationProvider` module is designed to be pluggable, allowing customization for specific use cases. The `LocationProvider` for a table can be specified through table properties. -PyIceberg defaults to the [`ObjectStoreLocationProvider`](configuration.md#object-store-location-provider), which generates -file paths that are optimized for object storage. +While data files can leverage provider-specific optimizations, metadata files always follow a simple path configuration. Regardless of +the `LocationProvider` used, metadata files are written to the path specified by the [`write.metadata.path` table configuration](#write-options) table property. + +PyIceberg defaults to the [`ObjectStoreLocationProvider`](configuration.md#object-store-location-provider), which generates file paths for +data files that are optimized for object storage. ### Simple Location Provider From b36df41dd28f26cdb9cbc7e58194650c51d49434 Mon Sep 17 00:00:00 2001 From: Drew Gallardo Date: Thu, 13 Feb 2025 12:26:49 -0800 Subject: [PATCH 06/10] run lint --- mkdocs/docs/configuration.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mkdocs/docs/configuration.md b/mkdocs/docs/configuration.md index eab52b4105..f483ed6940 100644 --- a/mkdocs/docs/configuration.md +++ b/mkdocs/docs/configuration.md @@ -208,10 +208,10 @@ Apache Iceberg uses the concept of a `LocationProvider` to manage file paths for `LocationProvider` module is designed to be pluggable, allowing customization for specific use cases. The `LocationProvider` for a table can be specified through table properties. -While data files can leverage provider-specific optimizations, metadata files always follow a simple path configuration. Regardless of +While data files can leverage provider-specific optimizations, metadata files always follow a simple path configuration. Regardless of the `LocationProvider` used, metadata files are written to the path specified by the [`write.metadata.path` table configuration](#write-options) table property. -PyIceberg defaults to the [`ObjectStoreLocationProvider`](configuration.md#object-store-location-provider), which generates file paths for +PyIceberg defaults to the [`ObjectStoreLocationProvider`](configuration.md#object-store-location-provider), which generates file paths for data files that are optimized for object storage. ### Simple Location Provider From 8b309d250fc734cf7a19bf02c7e203bd774d920e Mon Sep 17 00:00:00 2001 From: Drew Gallardo Date: Thu, 13 Feb 2025 17:28:10 -0800 Subject: [PATCH 07/10] update docs --- mkdocs/docs/configuration.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mkdocs/docs/configuration.md b/mkdocs/docs/configuration.md index f483ed6940..b6d2d04fe0 100644 --- a/mkdocs/docs/configuration.md +++ b/mkdocs/docs/configuration.md @@ -208,8 +208,9 @@ Apache Iceberg uses the concept of a `LocationProvider` to manage file paths for `LocationProvider` module is designed to be pluggable, allowing customization for specific use cases. The `LocationProvider` for a table can be specified through table properties. -While data files can leverage provider-specific optimizations, metadata files always follow a simple path configuration. Regardless of -the `LocationProvider` used, metadata files are written to the path specified by the [`write.metadata.path` table configuration](#write-options) table property. +Both data file and metadata file locations can be customized by configuring the table properties [write.data.path and write.metadata.path](#write-options), respectively. + +For more granular control, you can override the `LocationProvider`'s `new_data_location` and `new_metadata_location` methods to define custom logic for generating file paths. PyIceberg defaults to the [`ObjectStoreLocationProvider`](configuration.md#object-store-location-provider), which generates file paths for data files that are optimized for object storage. From 577bbdded40079952272a1304602c0ecdfe7adcc Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Fri, 14 Feb 2025 12:08:35 -0500 Subject: [PATCH 08/10] Update mkdocs/docs/configuration.md Co-authored-by: Fokko Driesprong --- mkdocs/docs/configuration.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mkdocs/docs/configuration.md b/mkdocs/docs/configuration.md index b6d2d04fe0..205d525118 100644 --- a/mkdocs/docs/configuration.md +++ b/mkdocs/docs/configuration.md @@ -208,7 +208,7 @@ Apache Iceberg uses the concept of a `LocationProvider` to manage file paths for `LocationProvider` module is designed to be pluggable, allowing customization for specific use cases. The `LocationProvider` for a table can be specified through table properties. -Both data file and metadata file locations can be customized by configuring the table properties [write.data.path and write.metadata.path](#write-options), respectively. +Both data file and metadata file locations can be customized by configuring the table properties [`write.data.path` and `write.metadata.path`](#write-options), respectively. For more granular control, you can override the `LocationProvider`'s `new_data_location` and `new_metadata_location` methods to define custom logic for generating file paths. From 5b0b85b066941850615c4534ccb921b66f7c76f8 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Fri, 14 Feb 2025 12:09:31 -0500 Subject: [PATCH 09/10] Update mkdocs/docs/configuration.md Co-authored-by: smaheshwar-pltr --- mkdocs/docs/configuration.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mkdocs/docs/configuration.md b/mkdocs/docs/configuration.md index 205d525118..d80189a672 100644 --- a/mkdocs/docs/configuration.md +++ b/mkdocs/docs/configuration.md @@ -204,8 +204,8 @@ PyIceberg uses [S3FileSystem](https://arrow.apache.org/docs/python/generated/pya ## Location Providers -Apache Iceberg uses the concept of a `LocationProvider` to manage file paths for a table's data and metadata files. In PyIceberg, the -`LocationProvider` module is designed to be pluggable, allowing customization for specific use cases. The +Apache Iceberg uses the concept of a `LocationProvider` to manage file paths for a table's data files. In PyIceberg, the +`LocationProvider` module is designed to be pluggable, allowing customization for specific use cases, and to additionally determine metadata file locations. The `LocationProvider` for a table can be specified through table properties. Both data file and metadata file locations can be customized by configuring the table properties [`write.data.path` and `write.metadata.path`](#write-options), respectively. From f75642ba16402ccd8fa2df5445fa0da00227a81f Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Fri, 14 Feb 2025 12:13:46 -0500 Subject: [PATCH 10/10] Update mkdocs/docs/configuration.md --- mkdocs/docs/configuration.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mkdocs/docs/configuration.md b/mkdocs/docs/configuration.md index d80189a672..c7c26c4912 100644 --- a/mkdocs/docs/configuration.md +++ b/mkdocs/docs/configuration.md @@ -210,7 +210,7 @@ Apache Iceberg uses the concept of a `LocationProvider` to manage file paths for Both data file and metadata file locations can be customized by configuring the table properties [`write.data.path` and `write.metadata.path`](#write-options), respectively. -For more granular control, you can override the `LocationProvider`'s `new_data_location` and `new_metadata_location` methods to define custom logic for generating file paths. +For more granular control, you can override the `LocationProvider`'s `new_data_location` and `new_metadata_location` methods to define custom logic for generating file paths. See [`Loading a Custom Location Provider`](configuration.md#loading-a-custom-location-provider). PyIceberg defaults to the [`ObjectStoreLocationProvider`](configuration.md#object-store-location-provider), which generates file paths for data files that are optimized for object storage.