From 1399e833732507dcdef802a4dfe4a6bacd4cc02e Mon Sep 17 00:00:00 2001 From: Anton-Tarazi Date: Tue, 28 Oct 2025 22:24:42 -0400 Subject: [PATCH] perf: do not copy metadata for each data file in summary --- pyiceberg/table/update/snapshot.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/pyiceberg/table/update/snapshot.py b/pyiceberg/table/update/snapshot.py index aed7ec0449..a73961b56c 100644 --- a/pyiceberg/table/update/snapshot.py +++ b/pyiceberg/table/update/snapshot.py @@ -240,8 +240,11 @@ def _write_delete_manifest() -> List[ManifestFile]: def _summary(self, snapshot_properties: Dict[str, str] = EMPTY_DICT) -> Summary: from pyiceberg.table import TableProperties + # avoid copying metadata for each data file + table_metadata = self._transaction.table_metadata + partition_summary_limit = int( - self._transaction.table_metadata.properties.get( + table_metadata.properties.get( TableProperties.WRITE_PARTITION_SUMMARY_LIMIT, TableProperties.WRITE_PARTITION_SUMMARY_LIMIT_DEFAULT ) ) @@ -250,23 +253,21 @@ def _summary(self, snapshot_properties: Dict[str, str] = EMPTY_DICT) -> Summary: for data_file in self._added_data_files: ssc.add_file( data_file=data_file, - partition_spec=self._transaction.table_metadata.spec(), - schema=self._transaction.table_metadata.schema(), + partition_spec=table_metadata.spec(), + schema=table_metadata.schema(), ) if len(self._deleted_data_files) > 0: - specs = self._transaction.table_metadata.specs() + specs = table_metadata.specs() for data_file in self._deleted_data_files: ssc.remove_file( data_file=data_file, partition_spec=specs[data_file.spec_id], - schema=self._transaction.table_metadata.schema(), + schema=table_metadata.schema(), ) previous_snapshot = ( - self._transaction.table_metadata.snapshot_by_id(self._parent_snapshot_id) - if self._parent_snapshot_id is not None - else None + 
table_metadata.snapshot_by_id(self._parent_snapshot_id) if self._parent_snapshot_id is not None else None ) return update_snapshot_summaries(