|
22 | 22 |
|
23 | 23 | from pyiceberg.exceptions import ValidationException |
24 | 24 | from pyiceberg.io import FileIO |
25 | | -from pyiceberg.manifest import ManifestContent, ManifestEntry, ManifestEntryStatus, ManifestFile |
| 25 | +from pyiceberg.manifest import DataFile, DataFileContent, ManifestContent, ManifestEntry, ManifestEntryStatus, ManifestFile |
26 | 26 | from pyiceberg.table import Table |
27 | 27 | from pyiceberg.table.snapshots import Operation, Snapshot, Summary |
28 | 28 | from pyiceberg.table.update.validate import ( |
29 | 29 | _added_data_files, |
| 30 | + _added_delete_files, |
30 | 31 | _deleted_data_files, |
31 | 32 | _validate_added_data_files, |
32 | 33 | _validate_deleted_data_files, |
| 34 | + _validate_no_new_delete_files, |
| 35 | + _validate_no_new_delete_files_for_data_files, |
33 | 36 | _validation_history, |
34 | 37 | ) |
35 | 38 |
|
@@ -350,3 +353,159 @@ class DummyEntry: |
350 | 353 | data_filter=None, |
351 | 354 | parent_snapshot=oldest_snapshot, |
352 | 355 | ) |
| 356 | + |
| 357 | + |
@pytest.mark.parametrize("operation", [Operation.APPEND, Operation.REPLACE])
def test_validate_added_delete_files_non_conflicting_count(
    table_v2_with_extensive_snapshots_and_manifests: tuple[Table, dict[int, list[ManifestFile]]],
    operation: Operation,
) -> None:
    """Check that `_added_delete_files` yields an empty index for the given operation.

    The summaries of the last 100 snapshots are rewritten to carry the
    parametrized ``operation``; manifest listing and manifest-entry reading
    are patched so no real I/O takes place.
    """
    table, manifests_by_snapshot = table_v2_with_extensive_snapshots_and_manifests

    snapshot_history = 100
    all_snapshots = table.snapshots()
    # Walk backwards from the newest snapshot, swapping in a copy with the new summary.
    for offset in range(1, snapshot_history + 1):
        all_snapshots[-offset] = all_snapshots[-offset].model_copy(update={"summary": Summary(operation=operation)})

    table.metadata = table.metadata.model_copy(update={"snapshots": all_snapshots})

    history_start = table.snapshots()[-snapshot_history]
    history_end = cast(Snapshot, table.current_snapshot())

    def fake_manifests(self: Snapshot, io: FileIO) -> list[ManifestFile]:
        """Look up the fixture manifests by snapshot id; unknown ids get none."""
        return manifests_by_snapshot.get(self.snapshot_id, [])

    def fake_fetch_manifest_entry(self: ManifestFile, io: FileIO, discard_deleted: bool = True) -> list[ManifestEntry]:
        """Return one ADDED entry built from the manifest's own ids (no data file set)."""
        entry = ManifestEntry.from_args(
            status=ManifestEntryStatus.ADDED,
            snapshot_id=self.added_snapshot_id,
            sequence_number=self.sequence_number,
        )
        return [entry]

    with (
        patch("pyiceberg.table.snapshots.Snapshot.manifests", new=fake_manifests),
        patch("pyiceberg.manifest.ManifestFile.fetch_manifest_entry", new=fake_fetch_manifest_entry),
    ):
        dfi = _added_delete_files(
            table=table,
            starting_snapshot=history_end,
            data_filter=None,
            parent_snapshot=history_start,
            partition_set=None,
        )

        assert dfi.is_empty()
        assert len(dfi.referenced_data_files()) == 0
| 407 | + |
| 408 | + |
@pytest.mark.parametrize("operation", [Operation.DELETE, Operation.OVERWRITE])
def test_validate_added_delete_files_conflicting_count(
    table_v2_with_extensive_snapshots_and_manifests: tuple[Table, dict[int, list[ManifestFile]]],
    operation: Operation,
) -> None:
    """Check that `_added_delete_files` picks up a position-delete file for the given operation.

    The summaries of the last 100 snapshots are rewritten to carry the
    parametrized ``operation``. Manifest listing and manifest-entry reading
    are patched so every manifest reports a single ADDED entry pointing at
    one shared position-delete file, which the resulting index must expose.
    """
    table, mock_manifests = table_v2_with_extensive_snapshots_and_manifests

    snapshot_history = 100
    snapshots = table.snapshots()
    for i in range(1, snapshot_history + 1):
        snapshots[-i] = snapshots[-i].model_copy(update={"summary": Summary(operation=operation)})

    table.metadata = table.metadata.model_copy(
        update={"snapshots": snapshots},
    )

    oldest_snapshot = table.snapshots()[-snapshot_history]
    newest_snapshot = cast(Snapshot, table.current_snapshot())

    mock_delete_file = DataFile.from_args(
        content=DataFileContent.POSITION_DELETES,
        file_path="s3://dummy/path",
    )

    mock_delete_file.spec_id = 0

    def mock_read_manifest_side_effect(self: Snapshot, io: FileIO) -> list[ManifestFile]:
        """Mock the manifests method to use the snapshot_id for lookup."""
        return mock_manifests.get(self.snapshot_id, [])

    def mock_fetch_manifest_entry(self: ManifestFile, io: FileIO, discard_deleted: bool = True) -> list[ManifestEntry]:
        """Return a single ADDED entry that references the shared delete file."""
        # The previous version first built a placeholder entry and then
        # overwrote it; only this entry was ever returned.
        # NOTE(review): sequence_number=10000 is presumably chosen to be newer
        # than anything in the fixture - confirm against the fixture.
        return [
            ManifestEntry.from_args(
                status=ManifestEntryStatus.ADDED,
                snapshot_id=self.added_snapshot_id,
                sequence_number=10000,
                data_file=mock_delete_file,
            )
        ]

    with (
        patch("pyiceberg.table.snapshots.Snapshot.manifests", new=mock_read_manifest_side_effect),
        patch("pyiceberg.manifest.ManifestFile.fetch_manifest_entry", new=mock_fetch_manifest_entry),
    ):
        dfi = _added_delete_files(
            table=table,
            starting_snapshot=newest_snapshot,
            data_filter=None,
            parent_snapshot=oldest_snapshot,
            partition_set=None,
        )

        assert not dfi.is_empty()
        assert dfi.referenced_data_files()[0] == mock_delete_file
| 474 | + |
| 475 | + |
def test_validate_no_new_delete_files_raises_on_conflict(
    table_v2_with_extensive_snapshots_and_manifests: tuple[Table, dict[int, list[ManifestFile]]],
) -> None:
    """Check that `_validate_no_new_delete_files` raises when the delete index is non-empty.

    `DeleteFileIndex.is_empty` is patched to return ``False`` so the
    validation sees a conflict across the first-to-current snapshot range.
    """
    table, _ = table_v2_with_extensive_snapshots_and_manifests
    first_snapshot = table.snapshots()[0]
    current_snapshot = cast(Snapshot, table.current_snapshot())

    with (
        patch("pyiceberg.table.update.validate.DeleteFileIndex.is_empty", return_value=False),
        pytest.raises(ValidationException),
    ):
        _validate_no_new_delete_files(
            table=table,
            starting_snapshot=current_snapshot,
            data_filter=None,
            partition_set=None,
            parent_snapshot=first_snapshot,
        )
| 492 | + |
| 493 | + |
def test_validate_no_new_delete_files_for_data_files_raises_on_conflict(
    table_v2_with_extensive_snapshots_and_manifests: tuple[Table, dict[int, list[ManifestFile]]],
) -> None:
    """Check that `_validate_no_new_delete_files_for_data_files` raises on a matching delete file.

    `DeleteFileIndex.for_data_file` is patched to return a non-empty list,
    so the data file handed to the validation appears to have a delete
    file across the first-to-current snapshot range.
    """
    table, _ = table_v2_with_extensive_snapshots_and_manifests
    first_snapshot = table.snapshots()[0]
    current_snapshot = cast(Snapshot, table.current_snapshot())

    # The same placeholder file serves as both the validated data file and the patched lookup result.
    placeholder_file = DataFile.from_args()

    with (
        patch("pyiceberg.table.update.validate.DeleteFileIndex.for_data_file", return_value=[placeholder_file]),
        pytest.raises(ValidationException),
    ):
        _validate_no_new_delete_files_for_data_files(
            table=table,
            starting_snapshot=current_snapshot,
            data_filter=None,
            data_files={placeholder_file},
            parent_snapshot=first_snapshot,
        )
0 commit comments