From 610f3666669f2fb6de97dba5f38fd6799e06e9ef Mon Sep 17 00:00:00 2001 From: Arnaud Briche Date: Mon, 7 Apr 2025 16:00:44 +0200 Subject: [PATCH 1/3] Use version-hint.text for StaticTable when metadata_location does not end with '.metadata.json' --- pyiceberg/table/__init__.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index 8f7b45f532..555fd12ecd 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -16,6 +16,7 @@ # under the License. from __future__ import annotations +import os import itertools import uuid import warnings @@ -1378,8 +1379,27 @@ def refresh(self) -> Table: """Refresh the current table metadata.""" raise NotImplementedError("To be implemented") + @classmethod + def _metadata_location_from_version_hint(cls, metadata_location: str, properties: Properties = EMPTY_DICT) -> str: + version_hint_location = os.path.join(metadata_location, 'metadata', 'version-hint.text') + io = load_file_io(properties=properties, location=version_hint_location) + file = io.new_input(version_hint_location) + + with file.open() as stream: + content = file.open().read().decode("utf-8") + + if content.endswith('.metadata.json'): + return os.path.join(metadata_location, 'metadata', content) + elif content.isnumeric(): + return os.path.join(metadata_location, 'metadata', 'v%s.metadata.json'.format(content)) + else: + return os.path.join(metadata_location, 'metadata', '%s.metadata.json'.format(content)) + @classmethod def from_metadata(cls, metadata_location: str, properties: Properties = EMPTY_DICT) -> StaticTable: + if not metadata_location.endswith('.metadata.json'): + metadata_location = StaticTable._metadata_location_from_version_hint(metadata_location, properties) + io = load_file_io(properties=properties, location=metadata_location) file = io.new_input(metadata_location) From 375d9a300b0800cb709f4379cb72319d6db48326 Mon Sep 17 00:00:00 2001 From: Arnaud Briche 
Date: Wed, 9 Apr 2025 16:19:13 +0200 Subject: [PATCH 2/3] Fix unused stream variable --- pyiceberg/table/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index 555fd12ecd..6b77f4dff0 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -1386,7 +1386,7 @@ def _metadata_location_from_version_hint(cls, metadata_location: str, properties file = io.new_input(version_hint_location) with file.open() as stream: - content = file.open().read().decode("utf-8") + content = stream.read().decode("utf-8") if content.endswith('.metadata.json'): return os.path.join(metadata_location, 'metadata', content) From 9f7aa223c030695f96dfe8b641b4ccc6815e9970 Mon Sep 17 00:00:00 2001 From: Arnaud Briche Date: Wed, 16 Apr 2025 16:01:05 +0200 Subject: [PATCH 3/3] Add test for version-hint.text --- mkdocs/docs/api.md | 11 +++++++++++ pyiceberg/table/__init__.py | 16 ++++++++-------- tests/conftest.py | 16 ++++++++++++++++ tests/table/test_init.py | 6 ++++++ 4 files changed, 41 insertions(+), 8 deletions(-) diff --git a/mkdocs/docs/api.md b/mkdocs/docs/api.md index 7978fdc9b4..d84c82ec2a 100644 --- a/mkdocs/docs/api.md +++ b/mkdocs/docs/api.md @@ -215,6 +215,17 @@ static_table = StaticTable.from_metadata( The static-table is considered read-only. +Alternatively, if your table metadata directory contains a `version-hint.text` file, you can just specify +the table root path, and the latest metadata file will be picked automatically. + +```python +from pyiceberg.table import StaticTable + +static_table = StaticTable.from_metadata( + "s3://warehouse/wh/nyc.db/taxis" +) +``` + ## Check if a table exists To check whether the `bids` table exists: diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index 6b77f4dff0..8dd52c4be2 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -16,8 +16,8 @@ # under the License. 
from __future__ import annotations -import os import itertools +import os import uuid import warnings from abc import ABC, abstractmethod @@ -1381,23 +1381,23 @@ def refresh(self) -> Table: @classmethod def _metadata_location_from_version_hint(cls, metadata_location: str, properties: Properties = EMPTY_DICT) -> str: - version_hint_location = os.path.join(metadata_location, 'metadata', 'version-hint.text') + version_hint_location = os.path.join(metadata_location, "metadata", "version-hint.text") io = load_file_io(properties=properties, location=version_hint_location) file = io.new_input(version_hint_location) - + with file.open() as stream: content = stream.read().decode("utf-8") - if content.endswith('.metadata.json'): - return os.path.join(metadata_location, 'metadata', content) + if content.endswith(".metadata.json"): + return os.path.join(metadata_location, "metadata", content) elif content.isnumeric(): - return os.path.join(metadata_location, 'metadata', 'v%s.metadata.json'.format(content)) + return os.path.join(metadata_location, "metadata", f"v{content}.metadata.json") else: - return os.path.join(metadata_location, 'metadata', '%s.metadata.json'.format(content)) + return os.path.join(metadata_location, "metadata", f"{content}.metadata.json") @classmethod def from_metadata(cls, metadata_location: str, properties: Properties = EMPTY_DICT) -> StaticTable: - if not metadata_location.endswith('.metadata.json'): + if not metadata_location.endswith(".metadata.json"): metadata_location = StaticTable._metadata_location_from_version_hint(metadata_location, properties) io = load_file_io(properties=properties, location=metadata_location) diff --git a/tests/conftest.py b/tests/conftest.py index a290b5d834..095b139a3e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1121,6 +1121,22 @@ def example_table_metadata_v3() -> Dict[str, Any]: return EXAMPLE_TABLE_METADATA_V3 +@pytest.fixture(scope="session") +def table_location(tmp_path_factory: 
pytest.TempPathFactory) -> str: + from pyiceberg.io.pyarrow import PyArrowFileIO + + metadata_filename = f"{uuid.uuid4()}.metadata.json" + metadata_location = str(tmp_path_factory.getbasetemp() / "metadata" / metadata_filename) + version_hint_location = str(tmp_path_factory.getbasetemp() / "metadata" / "version-hint.text") + metadata = TableMetadataV2(**EXAMPLE_TABLE_METADATA_V2) + ToOutputFile.table_metadata(metadata, PyArrowFileIO().new_output(location=metadata_location), overwrite=True) + + with PyArrowFileIO().new_output(location=version_hint_location).create(overwrite=True) as s: + s.write(metadata_filename.encode("utf-8")) + + return str(tmp_path_factory.getbasetemp()) + + @pytest.fixture(scope="session") def metadata_location(tmp_path_factory: pytest.TempPathFactory) -> str: from pyiceberg.io.pyarrow import PyArrowFileIO diff --git a/tests/table/test_init.py b/tests/table/test_init.py index 69bbab527e..8de6f4123d 100644 --- a/tests/table/test_init.py +++ b/tests/table/test_init.py @@ -383,6 +383,12 @@ def test_static_table_gz_same_as_table(table_v2: Table, metadata_location_gz: st assert static_table.metadata == table_v2.metadata +def test_static_table_version_hint_same_as_table(table_v2: Table, table_location: str) -> None: + static_table = StaticTable.from_metadata(table_location) + assert isinstance(static_table, Table) + assert static_table.metadata == table_v2.metadata + + def test_static_table_io_does_not_exist(metadata_location: str) -> None: with pytest.raises(ValueError): StaticTable.from_metadata(metadata_location, {PY_IO_IMPL: "pyiceberg.does.not.exist.FileIO"})