Skip to content

Commit 3b53edc

Browse files
ndrluisFokko
andauthored
Mark snapshot-id as deprecated in SetStatisticsUpdate (#1566)
Closes #1556 --------- Co-authored-by: Fokko Driesprong <fokko@apache.org>
1 parent a93e300 commit 3b53edc

File tree

4 files changed

+19
-25
lines changed

4 files changed

+19
-25
lines changed

pyiceberg/table/update/__init__.py

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,9 @@
2020
from abc import ABC, abstractmethod
2121
from datetime import datetime
2222
from functools import singledispatch
23-
from typing import TYPE_CHECKING, Any, Dict, Generic, List, Literal, Optional, Tuple, TypeVar, Union
23+
from typing import TYPE_CHECKING, Any, Dict, Generic, List, Literal, Optional, Tuple, TypeVar, Union, cast
2424

25-
from pydantic import Field, field_validator
25+
from pydantic import Field, field_validator, model_validator
2626
from typing_extensions import Annotated
2727

2828
from pyiceberg.exceptions import CommitFailedException
@@ -177,8 +177,20 @@ class RemovePropertiesUpdate(IcebergBaseModel):
177177

178178
class SetStatisticsUpdate(IcebergBaseModel):
179179
action: Literal["set-statistics"] = Field(default="set-statistics")
180-
snapshot_id: int = Field(alias="snapshot-id")
181180
statistics: StatisticsFile
181+
snapshot_id: Optional[int] = Field(
182+
None,
183+
alias="snapshot-id",
184+
description="snapshot-id is **DEPRECATED for REMOVAL** since it contains redundant information. Use `statistics.snapshot-id` field instead.",
185+
)
186+
187+
@model_validator(mode="before")
188+
def validate_snapshot_id(cls, data: Dict[str, Any]) -> Dict[str, Any]:
189+
stats = cast(StatisticsFile, data["statistics"])
190+
191+
data["snapshot_id"] = stats.snapshot_id
192+
193+
return data
182194

183195

184196
class RemoveStatisticsUpdate(IcebergBaseModel):
@@ -491,10 +503,7 @@ def _(
491503

492504
@_apply_table_update.register(SetStatisticsUpdate)
493505
def _(update: SetStatisticsUpdate, base_metadata: TableMetadata, context: _TableMetadataUpdateContext) -> TableMetadata:
494-
if update.snapshot_id != update.statistics.snapshot_id:
495-
raise ValueError("Snapshot id in statistics does not match the snapshot id in the update")
496-
497-
statistics = filter_statistics_by_snapshot_id(base_metadata.statistics, update.snapshot_id)
506+
statistics = filter_statistics_by_snapshot_id(base_metadata.statistics, update.statistics.snapshot_id)
498507
context.add_update(update)
499508

500509
return base_metadata.model_copy(update={"statistics": statistics + [update.statistics]})

pyiceberg/table/update/statistics.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,10 +52,9 @@ class UpdateStatistics(UpdateTableMetadata["UpdateStatistics"]):
5252
def __init__(self, transaction: "Transaction") -> None:
5353
super().__init__(transaction)
5454

55-
def set_statistics(self, snapshot_id: int, statistics_file: StatisticsFile) -> "UpdateStatistics":
55+
def set_statistics(self, statistics_file: StatisticsFile) -> "UpdateStatistics":
5656
self._updates += (
5757
SetStatisticsUpdate(
58-
snapshot_id=snapshot_id,
5958
statistics=statistics_file,
6059
),
6160
)

tests/integration/test_statistics_operations.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,8 @@ def create_statistics_file(snapshot_id: int, type_name: str) -> StatisticsFile:
7373
statistics_file_snap_2 = create_statistics_file(add_snapshot_id_2, "deletion-vector-v1")
7474

7575
with tbl.update_statistics() as update:
76-
update.set_statistics(add_snapshot_id_1, statistics_file_snap_1)
77-
update.set_statistics(add_snapshot_id_2, statistics_file_snap_2)
76+
update.set_statistics(statistics_file_snap_1)
77+
update.set_statistics(statistics_file_snap_2)
7878

7979
assert len(tbl.metadata.statistics) == 2
8080

tests/table/test_init.py

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1310,20 +1310,6 @@ def test_set_statistics_update(table_v2_with_statistics: Table) -> None:
13101310
assert len(updated_statistics) == 1
13111311
assert json.loads(updated_statistics[0].model_dump_json()) == json.loads(expected)
13121312

1313-
update = SetStatisticsUpdate(
1314-
snapshot_id=123456789,
1315-
statistics=statistics_file,
1316-
)
1317-
1318-
with pytest.raises(
1319-
ValueError,
1320-
match="Snapshot id in statistics does not match the snapshot id in the update",
1321-
):
1322-
update_table_metadata(
1323-
table_v2_with_statistics.metadata,
1324-
(update,),
1325-
)
1326-
13271313

13281314
def test_remove_statistics_update(table_v2_with_statistics: Table) -> None:
13291315
update = RemoveStatisticsUpdate(

0 commit comments

Comments
 (0)