Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions pyiceberg/table/snapshots.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
TOTAL_FILE_SIZE = "total-files-size"
CHANGED_PARTITION_COUNT_PROP = "changed-partition-count"
CHANGED_PARTITION_PREFIX = "partitions."
PARTITION_SUMMARY_PROP = "partition-summaries-included"
OPERATION = "operation"

INITIAL_SEQUENCE_NUMBER = 0
Expand Down Expand Up @@ -306,6 +307,8 @@ def build(self) -> Dict[str, str]:
changed_partitions_size = len(self.partition_metrics)
set_when_positive(properties, changed_partitions_size, CHANGED_PARTITION_COUNT_PROP)
if changed_partitions_size <= self.max_changed_partitions_for_summaries:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i want to point out that java also gate on the trustPartitionMetrics variable

if changed_partitions_size > 0:
properties[PARTITION_SUMMARY_PROP] = "true"
for partition_path, update_metrics_partition in self.partition_metrics.items():
if (summary := self._partition_summary(update_metrics_partition)) and len(summary) != 0:
properties[CHANGED_PARTITION_PREFIX + partition_path] = summary
Expand Down
22 changes: 22 additions & 0 deletions tests/table/test_snapshots.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,7 @@ def test_snapshot_summary_collector_with_partition() -> None:
"added-records": "100",
"deleted-records": "300",
"changed-partition-count": "2",
"partition-summaries-included": "true",
"partitions.int_field=1": "added-files-size=1234,removed-files-size=1234,added-data-files=1,deleted-data-files=1,added-records=100,deleted-records=100",
"partitions.int_field=2": "removed-files-size=4321,deleted-data-files=1,deleted-records=200",
}
Expand Down Expand Up @@ -259,11 +260,32 @@ def test_snapshot_summary_collector_with_partition_limit_in_constructor() -> Non
"added-records": "100",
"deleted-records": "300",
"changed-partition-count": "2",
"partition-summaries-included": "true",
"partitions.int_field=1": "added-files-size=1234,removed-files-size=1234,added-data-files=1,deleted-data-files=1,added-records=100,deleted-records=100",
"partitions.int_field=2": "removed-files-size=4321,deleted-data-files=1,deleted-records=200",
}


@pytest.mark.integration
def test_partition_summaries_included_not_set_when_no_change() -> None:
ssc = SnapshotSummaryCollector()
# No files added, so no partition_metrics
ssc.set_partition_summary_limit(10)
result = ssc.build()
assert "partition-summaries-included" not in result
assert result == {} # Should be empty dict


@pytest.mark.integration
def test_partition_summaries_included_not_set_when_unpartitioned_files(table_schema_simple) -> None:
ssc = SnapshotSummaryCollector()
data_file = DataFile.from_args(content=DataFileContent.DATA, record_count=100, file_size_in_bytes=1234, partition=Record())
ssc.add_file(data_file, schema=table_schema_simple)
ssc.set_partition_summary_limit(10)
result = ssc.build()
assert "partition-summaries-included" not in result


def test_merge_snapshot_summaries_empty() -> None:
assert update_snapshot_summaries(Summary(Operation.APPEND)) == Summary(
operation=Operation.APPEND,
Expand Down
Loading