
Commit 447487b

Merge branch 'main' into ehsan/bodo_support
2 parents: f99171c + 86bf71c

File tree: 10 files changed, 337 additions and 162 deletions

poetry.lock

Lines changed: 156 additions & 152 deletions
(Generated file; diff not rendered by default.)

pyiceberg/table/__init__.py

Lines changed: 4 additions & 5 deletions
@@ -1852,13 +1852,11 @@ def _build_metrics_evaluator(self) -> Callable[[DataFile], bool]:
     def _build_residual_evaluator(self, spec_id: int) -> Callable[[DataFile], ResidualEvaluator]:
         spec = self.table_metadata.specs()[spec_id]
 
+        from pyiceberg.expressions.visitors import residual_evaluator_of
+
         # The lambda created here is run in multiple threads.
         # So we avoid creating _EvaluatorExpression methods bound to a single
         # shared instance across multiple threads.
-        # return lambda data_file: (partition_schema, partition_expr, self.case_sensitive)(data_file.partition)
-        from pyiceberg.expressions.visitors import residual_evaluator_of
-
-        # assert self.row_filter == False
         return lambda datafile: (
             residual_evaluator_of(
                 spec=spec,

@@ -1868,7 +1866,8 @@ def _build_residual_evaluator(self, spec_id: int) -> Callable[[DataFile], ResidualEvaluator]:
             )
         )
 
-    def _check_sequence_number(self, min_sequence_number: int, manifest: ManifestFile) -> bool:
+    @staticmethod
+    def _check_sequence_number(min_sequence_number: int, manifest: ManifestFile) -> bool:
         """Ensure that no manifests are loaded that contain deletes that are older than the data.
 
         Args:
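
The inline comment is the point of this hunk: the returned lambda runs on multiple threads, so it constructs a fresh residual evaluator per data file rather than binding one shared instance. A minimal illustration of the hazard it avoids; _StatefulEvaluator is hypothetical and only stands in for the pattern, not the real evaluator:

    # Illustrative sketch only: _StatefulEvaluator is hypothetical and mimics an
    # evaluator that mutates internal state while evaluating.
    class _StatefulEvaluator:
        def __init__(self) -> None:
            self.current = None

        def eval(self, value):
            self.current = value  # mutable state: racy if one instance is shared
            return self.current

    shared = _StatefulEvaluator()

    def unsafe(value):
        return shared.eval(value)  # every thread mutates the same instance

    def safe(value):
        return _StatefulEvaluator().eval(value)  # fresh instance per call, like the lambda above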

pyiceberg/table/snapshots.py

Lines changed: 3 additions & 0 deletions
@@ -58,6 +58,7 @@
 TOTAL_FILE_SIZE = "total-files-size"
 CHANGED_PARTITION_COUNT_PROP = "changed-partition-count"
 CHANGED_PARTITION_PREFIX = "partitions."
+PARTITION_SUMMARY_PROP = "partition-summaries-included"
 OPERATION = "operation"
 
 INITIAL_SEQUENCE_NUMBER = 0

@@ -306,6 +307,8 @@ def build(self) -> Dict[str, str]:
         changed_partitions_size = len(self.partition_metrics)
         set_when_positive(properties, changed_partitions_size, CHANGED_PARTITION_COUNT_PROP)
         if changed_partitions_size <= self.max_changed_partitions_for_summaries:
+            if changed_partitions_size > 0:
+                properties[PARTITION_SUMMARY_PROP] = "true"
             for partition_path, update_metrics_partition in self.partition_metrics.items():
                 if (summary := self._partition_summary(update_metrics_partition)) and len(summary) != 0:
                     properties[CHANGED_PARTITION_PREFIX + partition_path] = summary
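
In effect, build() now advertises when per-partition summaries made it into the snapshot summary. A minimal sketch of the behavior, using only the collector calls exercised by the tests later in this commit:

    from pyiceberg.table.snapshots import SnapshotSummaryCollector

    ssc = SnapshotSummaryCollector()
    ssc.set_partition_summary_limit(10)

    summary = ssc.build()
    # Nothing was added, so neither the flag nor any "partitions.*" keys appear:
    assert "partition-summaries-included" not in summary

    # Once files with non-empty partitions are added (and the changed-partition
    # count stays within the limit), build() emits
    # "partition-summaries-included": "true" alongside the per-partition
    # "partitions.<path>" entries.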

pyiceberg/table/statistics.py

Lines changed: 3 additions & 3 deletions
@@ -14,7 +14,7 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
-from typing import Dict, List, Literal, Optional
+from typing import Dict, List, Literal, Optional, Union
 
 from pydantic import Field
 
@@ -48,7 +48,7 @@ class PartitionStatisticsFile(StatisticsCommonFields):
 
 
 def filter_statistics_by_snapshot_id(
-    statistics: List[StatisticsFile],
+    statistics: List[Union[StatisticsFile, PartitionStatisticsFile]],
     reject_snapshot_id: int,
-) -> List[StatisticsFile]:
+) -> List[Union[StatisticsFile, PartitionStatisticsFile]]:
     return [stat for stat in statistics if stat.snapshot_id != reject_snapshot_id]
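
The widened annotation lets callers filter PartitionStatisticsFile lists through the same helper. A short sketch, reusing the constructor arguments from the tests added below:

    from pyiceberg.table.statistics import PartitionStatisticsFile, filter_statistics_by_snapshot_id

    stats = [
        PartitionStatisticsFile(
            snapshot_id=123,
            statistics_path="s3://bucket/warehouse/stats.puffin",
            file_size_in_bytes=124,
        )
    ]
    # Every entry matching reject_snapshot_id is dropped:
    assert filter_statistics_by_snapshot_id(stats, reject_snapshot_id=123) == []
    assert filter_statistics_by_snapshot_id(stats, reject_snapshot_id=456) == stats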

pyiceberg/table/update/__init__.py

Lines changed: 40 additions & 1 deletion
@@ -36,7 +36,11 @@
     SnapshotLogEntry,
 )
 from pyiceberg.table.sorting import SortOrder
-from pyiceberg.table.statistics import StatisticsFile, filter_statistics_by_snapshot_id
+from pyiceberg.table.statistics import (
+    PartitionStatisticsFile,
+    StatisticsFile,
+    filter_statistics_by_snapshot_id,
+)
 from pyiceberg.typedef import (
     IcebergBaseModel,
     Properties,

@@ -198,6 +202,16 @@ class RemoveStatisticsUpdate(IcebergBaseModel):
     snapshot_id: int = Field(alias="snapshot-id")
 
 
+class SetPartitionStatisticsUpdate(IcebergBaseModel):
+    action: Literal["set-partition-statistics"] = Field(default="set-partition-statistics")
+    partition_statistics: PartitionStatisticsFile
+
+
+class RemovePartitionStatisticsUpdate(IcebergBaseModel):
+    action: Literal["remove-partition-statistics"] = Field(default="remove-partition-statistics")
+    snapshot_id: int = Field(alias="snapshot-id")
+
+
 TableUpdate = Annotated[
     Union[
         AssignUUIDUpdate,

@@ -217,6 +231,8 @@ class RemoveStatisticsUpdate(IcebergBaseModel):
         RemovePropertiesUpdate,
         SetStatisticsUpdate,
         RemoveStatisticsUpdate,
+        SetPartitionStatisticsUpdate,
+        RemovePartitionStatisticsUpdate,
     ],
     Field(discriminator="action"),
 ]

@@ -582,6 +598,29 @@ def _(update: RemoveStatisticsUpdate, base_metadata: TableMetadata, context: _TableMetadataUpdateContext) -> TableMetadata:
     return base_metadata.model_copy(update={"statistics": statistics})
 
 
+@_apply_table_update.register(SetPartitionStatisticsUpdate)
+def _(update: SetPartitionStatisticsUpdate, base_metadata: TableMetadata, context: _TableMetadataUpdateContext) -> TableMetadata:
+    partition_statistics = filter_statistics_by_snapshot_id(
+        base_metadata.partition_statistics, update.partition_statistics.snapshot_id
+    )
+    context.add_update(update)
+
+    return base_metadata.model_copy(update={"partition_statistics": partition_statistics + [update.partition_statistics]})
+
+
+@_apply_table_update.register(RemovePartitionStatisticsUpdate)
+def _(
+    update: RemovePartitionStatisticsUpdate, base_metadata: TableMetadata, context: _TableMetadataUpdateContext
+) -> TableMetadata:
+    if not any(part_stat.snapshot_id == update.snapshot_id for part_stat in base_metadata.partition_statistics):
+        raise ValueError(f"Partition Statistics with snapshot id {update.snapshot_id} does not exist")
+
+    statistics = filter_statistics_by_snapshot_id(base_metadata.partition_statistics, update.snapshot_id)
+    context.add_update(update)
+
+    return base_metadata.model_copy(update={"partition_statistics": statistics})
+
+
 def update_table_metadata(
     base_metadata: TableMetadata,
     updates: Tuple[TableUpdate, ...],
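
Together, the two handlers mirror the existing SetStatisticsUpdate/RemoveStatisticsUpdate pair: set replaces any entry for the same snapshot, and remove raises if no entry exists. A hedged usage sketch; set_then_remove is a hypothetical helper, and the path and size are illustrative:

    from pyiceberg.table.metadata import TableMetadata
    from pyiceberg.table.statistics import PartitionStatisticsFile
    from pyiceberg.table.update import (
        RemovePartitionStatisticsUpdate,
        SetPartitionStatisticsUpdate,
        update_table_metadata,
    )

    def set_then_remove(base_metadata: TableMetadata, snapshot_id: int) -> TableMetadata:
        stats_file = PartitionStatisticsFile(
            snapshot_id=snapshot_id,
            statistics_path="s3://bucket/warehouse/stats.puffin",  # illustrative path
            file_size_in_bytes=124,
        )
        # Set replaces any existing partition statistics for the same snapshot id.
        with_stats = update_table_metadata(base_metadata, (SetPartitionStatisticsUpdate(partition_statistics=stats_file),))
        # Remove raises ValueError if no entry exists for the given snapshot id.
        return update_table_metadata(with_stats, (RemovePartitionStatisticsUpdate(snapshot_id=snapshot_id),))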

pyiceberg/utils/schema_conversion.py

Lines changed: 2 additions & 0 deletions
@@ -69,8 +69,10 @@
 LOGICAL_FIELD_TYPE_MAPPING: Dict[Tuple[str, str], PrimitiveType] = {
     ("date", "int"): DateType(),
     ("time-micros", "long"): TimeType(),
+    ("timestamp-millis", "int"): TimestampType(),
     ("timestamp-micros", "long"): TimestampType(),
     ("uuid", "fixed"): UUIDType(),
+    ("uuid", "string"): UUIDType(),
 }
 
 AvroType = Union[str, Any]
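
Condensed from the tests added below, the new (logicalType, type) pairs resolve as follows (note that _convert_logical_type is a private helper):

    from pyiceberg.types import TimestampType, UUIDType
    from pyiceberg.utils.schema_conversion import AvroSchemaConversion

    conversion = AvroSchemaConversion()
    assert conversion._convert_logical_type({"type": "string", "logicalType": "uuid"}) == UUIDType()
    assert conversion._convert_logical_type({"type": "int", "logicalType": "timestamp-millis"}) == TimestampType()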

pyproject.toml

Lines changed: 2 additions & 0 deletions
@@ -319,6 +319,8 @@ pyiceberg-core = ["pyiceberg-core"]
 datafusion = ["datafusion"]
 
 [tool.pytest.ini_options]
+testpaths = ["tests"]
+
 markers = [
     "unmarked: marks a test as a unittest",
     "s3: marks a test as requiring access to s3 compliant storage (use with --aws-access-key-id, --aws-secret-access-key, and --endpoint args)",

tests/table/test_init.py

Lines changed: 79 additions & 1 deletion
@@ -64,7 +64,7 @@
     SortField,
     SortOrder,
 )
-from pyiceberg.table.statistics import BlobMetadata, StatisticsFile
+from pyiceberg.table.statistics import BlobMetadata, PartitionStatisticsFile, StatisticsFile
 from pyiceberg.table.update import (
     AddSnapshotUpdate,
     AddSortOrderUpdate,

@@ -76,11 +76,13 @@
     AssertLastAssignedPartitionId,
     AssertRefSnapshotId,
     AssertTableUUID,
+    RemovePartitionStatisticsUpdate,
     RemovePropertiesUpdate,
     RemoveSnapshotRefUpdate,
     RemoveSnapshotsUpdate,
     RemoveStatisticsUpdate,
     SetDefaultSortOrderUpdate,
+    SetPartitionStatisticsUpdate,
     SetPropertiesUpdate,
     SetSnapshotRefUpdate,
     SetStatisticsUpdate,

@@ -1359,3 +1361,79 @@ def test_remove_statistics_update(table_v2_with_statistics: Table) -> None:
             table_v2_with_statistics.metadata,
             (RemoveStatisticsUpdate(snapshot_id=123456789),),
         )
+
+
+def test_set_partition_statistics_update(table_v2_with_statistics: Table) -> None:
+    snapshot_id = table_v2_with_statistics.metadata.current_snapshot_id
+
+    partition_statistics_file = PartitionStatisticsFile(
+        snapshot_id=snapshot_id,
+        statistics_path="s3://bucket/warehouse/stats.puffin",
+        file_size_in_bytes=124,
+    )
+
+    update = SetPartitionStatisticsUpdate(
+        partition_statistics=partition_statistics_file,
+    )
+
+    new_metadata = update_table_metadata(
+        table_v2_with_statistics.metadata,
+        (update,),
+    )
+
+    expected = """
+    {
+        "snapshot-id": 3055729675574597004,
+        "statistics-path": "s3://bucket/warehouse/stats.puffin",
+        "file-size-in-bytes": 124
+    }"""
+
+    assert len(new_metadata.partition_statistics) == 1
+
+    updated_statistics = [stat for stat in new_metadata.partition_statistics if stat.snapshot_id == snapshot_id]
+
+    assert len(updated_statistics) == 1
+    assert json.loads(updated_statistics[0].model_dump_json()) == json.loads(expected)
+
+
+def test_remove_partition_statistics_update(table_v2_with_statistics: Table) -> None:
+    # Add partition statistics file.
+    snapshot_id = table_v2_with_statistics.metadata.current_snapshot_id
+
+    partition_statistics_file = PartitionStatisticsFile(
+        snapshot_id=snapshot_id,
+        statistics_path="s3://bucket/warehouse/stats.puffin",
+        file_size_in_bytes=124,
+    )
+
+    update = SetPartitionStatisticsUpdate(
+        partition_statistics=partition_statistics_file,
+    )
+
+    new_metadata = update_table_metadata(
+        table_v2_with_statistics.metadata,
+        (update,),
+    )
+    assert len(new_metadata.partition_statistics) == 1
+
+    # Remove the same partition statistics file.
+    remove_update = RemovePartitionStatisticsUpdate(snapshot_id=snapshot_id)
+
+    remove_metadata = update_table_metadata(
+        new_metadata,
+        (remove_update,),
+    )
+
+    assert len(remove_metadata.partition_statistics) == 0
+
+
+def test_remove_partition_statistics_update_with_invalid_snapshot_id(table_v2_with_statistics: Table) -> None:
+    # No partition statistics exist for this snapshot id, so removal must fail.
+    with pytest.raises(
+        ValueError,
+        match="Partition Statistics with snapshot id 123456789 does not exist",
+    ):
+        update_table_metadata(
+            table_v2_with_statistics.metadata,
+            (RemovePartitionStatisticsUpdate(snapshot_id=123456789),),
+        )

tests/table/test_snapshots.py

Lines changed: 22 additions & 0 deletions
@@ -224,6 +224,7 @@ def test_snapshot_summary_collector_with_partition() -> None:
         "added-records": "100",
         "deleted-records": "300",
         "changed-partition-count": "2",
+        "partition-summaries-included": "true",
         "partitions.int_field=1": "added-files-size=1234,removed-files-size=1234,added-data-files=1,deleted-data-files=1,added-records=100,deleted-records=100",
         "partitions.int_field=2": "removed-files-size=4321,deleted-data-files=1,deleted-records=200",
     }

@@ -259,11 +260,32 @@ def test_snapshot_summary_collector_with_partition_limit_in_constructor() -> None:
         "added-records": "100",
         "deleted-records": "300",
         "changed-partition-count": "2",
+        "partition-summaries-included": "true",
         "partitions.int_field=1": "added-files-size=1234,removed-files-size=1234,added-data-files=1,deleted-data-files=1,added-records=100,deleted-records=100",
         "partitions.int_field=2": "removed-files-size=4321,deleted-data-files=1,deleted-records=200",
     }
 
 
+@pytest.mark.integration
+def test_partition_summaries_included_not_set_when_no_change() -> None:
+    ssc = SnapshotSummaryCollector()
+    # No files added, so no partition_metrics
+    ssc.set_partition_summary_limit(10)
+    result = ssc.build()
+    assert "partition-summaries-included" not in result
+    assert result == {}  # Should be an empty dict
+
+
+@pytest.mark.integration
+def test_partition_summaries_included_not_set_when_unpartitioned_files(table_schema_simple: Schema) -> None:
+    ssc = SnapshotSummaryCollector()
+    data_file = DataFile.from_args(content=DataFileContent.DATA, record_count=100, file_size_in_bytes=1234, partition=Record())
+    ssc.add_file(data_file, schema=table_schema_simple)
+    ssc.set_partition_summary_limit(10)
+    result = ssc.build()
+    assert "partition-summaries-included" not in result
+
+
 def test_merge_snapshot_summaries_empty() -> None:
     assert update_snapshot_summaries(Summary(Operation.APPEND)) == Summary(
         operation=Operation.APPEND,

tests/utils/test_schema_conversion.py

Lines changed: 26 additions & 0 deletions
@@ -33,7 +33,9 @@
     NestedField,
     StringType,
     StructType,
+    TimestampType,
     UnknownType,
+    UUIDType,
 )
 from pyiceberg.utils.schema_conversion import AvroSchemaConversion

@@ -327,6 +329,30 @@ def test_convert_date_type() -> None:
     assert actual == DateType()
 
 
+def test_convert_uuid_str_type() -> None:
+    avro_logical_type = {"type": "string", "logicalType": "uuid"}
+    actual = AvroSchemaConversion()._convert_logical_type(avro_logical_type)
+    assert actual == UUIDType()
+
+
+def test_convert_uuid_fixed_type() -> None:
+    avro_logical_type = {"type": "fixed", "logicalType": "uuid"}
+    actual = AvroSchemaConversion()._convert_logical_type(avro_logical_type)
+    assert actual == UUIDType()
+
+
+def test_convert_timestamp_millis_type() -> None:
+    avro_logical_type = {"type": "int", "logicalType": "timestamp-millis"}
+    actual = AvroSchemaConversion()._convert_logical_type(avro_logical_type)
+    assert actual == TimestampType()
+
+
+def test_convert_timestamp_micros_type() -> None:
+    avro_logical_type = {"type": "int", "logicalType": "timestamp-micros"}
+    actual = AvroSchemaConversion()._convert_logical_type(avro_logical_type)
+    assert actual == TimestampType()
+
+
 def test_unknown_logical_type() -> None:
     """Test raising a ValueError when converting an unknown logical type as part of an Avro schema conversion"""
     avro_logical_type = {"type": "bytes", "logicalType": "date"}
