|
17 | 17 | from __future__ import annotations |
18 | 18 |
|
19 | 19 | from datetime import datetime, timezone |
20 | | -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple |
| 20 | +from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Set, Tuple |
21 | 21 |
|
22 | 22 | from pyiceberg.conversions import from_bytes |
23 | 23 | from pyiceberg.manifest import DataFile, DataFileContent, ManifestContent, PartitionFieldSummary |
24 | 24 | from pyiceberg.partitioning import PartitionSpec |
25 | 25 | from pyiceberg.table.snapshots import Snapshot, ancestors_of |
26 | 26 | from pyiceberg.types import PrimitiveType |
| 27 | +from pyiceberg.utils.concurrent import ExecutorFactory |
27 | 28 | from pyiceberg.utils.singleton import _convert_to_hashable_type |
28 | 29 |
|
29 | 30 | if TYPE_CHECKING: |
@@ -346,7 +347,7 @@ def update_partitions_map( |
346 | 347 | schema=table_schema, |
347 | 348 | ) |
348 | 349 |
|
349 | | - def manifests(self) -> "pa.Table": |
| 350 | + def _get_manifests_schema(self) -> "pa.Schema": |
350 | 351 | import pyarrow as pa |
351 | 352 |
|
352 | 353 | partition_summary_schema = pa.struct( |
@@ -374,6 +375,17 @@ def manifests(self) -> "pa.Table": |
374 | 375 | pa.field("partition_summaries", pa.list_(partition_summary_schema), nullable=False), |
375 | 376 | ] |
376 | 377 | ) |
| 378 | + return manifest_schema |
| 379 | + |
def _get_all_manifests_schema(self) -> "pa.Schema":
    """Return the manifests schema extended for the all-manifests table.

    The result is the schema produced by ``_get_manifests_schema`` with one
    extra non-nullable ``int64`` field, ``reference_snapshot_id``, appended.
    """
    import pyarrow as pa

    reference_snapshot_field = pa.field("reference_snapshot_id", pa.int64(), nullable=False)
    # Schema.append returns a new schema; the base schema object is not mutated.
    return self._get_manifests_schema().append(reference_snapshot_field)
| 386 | + |
| 387 | + def _generate_manifests_table(self, snapshot: Optional[Snapshot], is_all_manifests_table: bool = False) -> "pa.Table": |
| 388 | + import pyarrow as pa |
377 | 389 |
|
378 | 390 | def _partition_summaries_to_rows( |
379 | 391 | spec: PartitionSpec, partition_summaries: List[PartitionFieldSummary] |
@@ -412,36 +424,38 @@ def _partition_summaries_to_rows( |
412 | 424 |
|
413 | 425 | specs = self.tbl.metadata.specs() |
414 | 426 | manifests = [] |
415 | | - if snapshot := self.tbl.metadata.current_snapshot(): |
| 427 | + if snapshot: |
416 | 428 | for manifest in snapshot.manifests(self.tbl.io): |
417 | 429 | is_data_file = manifest.content == ManifestContent.DATA |
418 | 430 | is_delete_file = manifest.content == ManifestContent.DELETES |
419 | | - manifests.append( |
420 | | - { |
421 | | - "content": manifest.content, |
422 | | - "path": manifest.manifest_path, |
423 | | - "length": manifest.manifest_length, |
424 | | - "partition_spec_id": manifest.partition_spec_id, |
425 | | - "added_snapshot_id": manifest.added_snapshot_id, |
426 | | - "added_data_files_count": manifest.added_files_count if is_data_file else 0, |
427 | | - "existing_data_files_count": manifest.existing_files_count if is_data_file else 0, |
428 | | - "deleted_data_files_count": manifest.deleted_files_count if is_data_file else 0, |
429 | | - "added_delete_files_count": manifest.added_files_count if is_delete_file else 0, |
430 | | - "existing_delete_files_count": manifest.existing_files_count if is_delete_file else 0, |
431 | | - "deleted_delete_files_count": manifest.deleted_files_count if is_delete_file else 0, |
432 | | - "partition_summaries": _partition_summaries_to_rows( |
433 | | - specs[manifest.partition_spec_id], manifest.partitions |
434 | | - ) |
435 | | - if manifest.partitions |
436 | | - else [], |
437 | | - } |
438 | | - ) |
| 431 | + manifest_row = { |
| 432 | + "content": manifest.content, |
| 433 | + "path": manifest.manifest_path, |
| 434 | + "length": manifest.manifest_length, |
| 435 | + "partition_spec_id": manifest.partition_spec_id, |
| 436 | + "added_snapshot_id": manifest.added_snapshot_id, |
| 437 | + "added_data_files_count": manifest.added_files_count if is_data_file else 0, |
| 438 | + "existing_data_files_count": manifest.existing_files_count if is_data_file else 0, |
| 439 | + "deleted_data_files_count": manifest.deleted_files_count if is_data_file else 0, |
| 440 | + "added_delete_files_count": manifest.added_files_count if is_delete_file else 0, |
| 441 | + "existing_delete_files_count": manifest.existing_files_count if is_delete_file else 0, |
| 442 | + "deleted_delete_files_count": manifest.deleted_files_count if is_delete_file else 0, |
| 443 | + "partition_summaries": _partition_summaries_to_rows(specs[manifest.partition_spec_id], manifest.partitions) |
| 444 | + if manifest.partitions |
| 445 | + else [], |
| 446 | + } |
| 447 | + if is_all_manifests_table: |
| 448 | + manifest_row["reference_snapshot_id"] = snapshot.snapshot_id |
| 449 | + manifests.append(manifest_row) |
439 | 450 |
|
440 | 451 | return pa.Table.from_pylist( |
441 | 452 | manifests, |
442 | | - schema=manifest_schema, |
| 453 | + schema=self._get_all_manifests_schema() if is_all_manifests_table else self._get_manifests_schema(), |
443 | 454 | ) |
444 | 455 |
|
def manifests(self) -> "pa.Table":
    """Return the manifests table built from the table's current snapshot.

    Whatever ``self.tbl.current_snapshot()`` returns is forwarded unchanged to
    ``_generate_manifests_table``, which is called with its default
    ``is_all_manifests_table`` flag.
    """
    current = self.tbl.current_snapshot()
    return self._generate_manifests_table(current)
| 458 | + |
445 | 459 | def metadata_log_entries(self) -> "pa.Table": |
446 | 460 | import pyarrow as pa |
447 | 461 |
|
@@ -630,3 +644,16 @@ def data_files(self, snapshot_id: Optional[int] = None) -> "pa.Table": |
630 | 644 |
|
def delete_files(self, snapshot_id: Optional[int] = None) -> "pa.Table":
    """Return the files table restricted to delete files.

    Delegates to ``_files`` with the content filter set to position deletes
    and equality deletes.

    Args:
        snapshot_id: Passed through to ``_files`` unchanged; defaults to None.
    """
    delete_contents = {DataFileContent.POSITION_DELETES, DataFileContent.EQUALITY_DELETES}
    return self._files(snapshot_id, delete_contents)
| 647 | + |
def all_manifests(self) -> "pa.Table":
    """Return one table holding the manifests of every snapshot of the table.

    A per-snapshot manifests table is generated for each entry of
    ``self.tbl.snapshots()`` with ``is_all_manifests_table=True``, and the
    results are concatenated. When there are no snapshots, an empty table with
    the all-manifests schema is returned.
    """
    import pyarrow as pa

    snapshots = self.tbl.snapshots()
    if not snapshots:
        return pa.Table.from_pylist([], schema=self._get_all_manifests_schema())

    def _table_for(snapshot: Snapshot) -> "pa.Table":
        return self._generate_manifests_table(snapshot, is_all_manifests_table=True)

    # The per-snapshot work is submitted to the executor obtained from
    # ExecutorFactory; map() yields results in the same order as `snapshots`.
    per_snapshot_tables: Iterator["pa.Table"] = ExecutorFactory.get_or_create().map(_table_for, snapshots)
    return pa.concat_tables(per_snapshot_tables)
0 commit comments