-
Notifications
You must be signed in to change notification settings - Fork 76
feat: implement delete file index #435
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
a23aa2b to
67d3b69
Compare
6549a62 to
9345afc
Compare
9345afc to
0b8f579
Compare
| /// \file iceberg/delete_file_index.h | ||
| /// An index of delete files by sequence number. | ||
|
|
||
| #include <algorithm> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| #include <algorithm> |
|
|
||
| std::vector<std::shared_ptr<DataFile>> PositionDeletes::ReferencedDeleteFiles() { | ||
| IndexIfNeeded(); | ||
| return std::ranges::transform_view(files_, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Use std::views::transform
| std::ranges::sort(files_, [](const auto& a, const auto& b) { | ||
| return a.sequence_number.value() < b.sequence_number.value(); | ||
| }); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| std::ranges::sort(files_, [](const auto& a, const auto& b) { | |
| return a.sequence_number.value() < b.sequence_number.value(); | |
| }); | |
| std::ranges::sort(files_, std::ranges::less{}, &ManifestEntry::sequence_number); |
| }); | ||
|
|
||
| // Build sequence number array for binary search | ||
| seqs_ = std::ranges::transform_view( |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ditto: use std::views::transform here instead of constructing std::ranges::transform_view directly.
| size_t start = FindStartIndex(seqs_, seq); | ||
| if (start >= files_.size()) { | ||
| return {}; | ||
| } | ||
|
|
||
| return files_ | std::views::drop(start) | | ||
| std::views::transform(&ManifestEntry::data_file) | | ||
| std::ranges::to<std::vector<std::shared_ptr<DataFile>>>(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| size_t start = FindStartIndex(seqs_, seq); | |
| if (start >= files_.size()) { | |
| return {}; | |
| } | |
| return files_ | std::views::drop(start) | | |
| std::views::transform(&ManifestEntry::data_file) | | |
| std::ranges::to<std::vector<std::shared_ptr<DataFile>>>(); | |
| auto iter = std::ranges::lower_bound(seqs_, seq); | |
| if (iter == seqs_.end()) { | |
| return {}; | |
| } | |
| return files_ | std::views::drop(iter - seqs_.begin()) | | |
| std::views::transform(&ManifestEntry::data_file) | | |
| std::ranges::to<std::vector<std::shared_ptr<DataFile>>>(); |
I don't think the FindStartIndex helper is needed; std::ranges::lower_bound on seqs_ can be used directly, as in the suggestion above.
| size_t start = FindStartIndex(seqs_, seq); | ||
| if (start >= files_.size()) { | ||
| return {}; | ||
| } | ||
|
|
||
| std::vector<std::shared_ptr<DataFile>> result; | ||
| result.reserve(files_.size() - start); | ||
| for (size_t i = start; i < files_.size(); ++i) { | ||
| const auto& delete_file = files_[i]; | ||
| ICEBERG_ASSIGN_OR_RAISE(bool may_contain, | ||
| CanContainEqDeletesForFile(data_file, delete_file)); | ||
| if (may_contain) { | ||
| result.push_back(delete_file.wrapped.data_file); | ||
| } | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| size_t start = FindStartIndex(seqs_, seq); | |
| if (start >= files_.size()) { | |
| return {}; | |
| } | |
| std::vector<std::shared_ptr<DataFile>> result; | |
| result.reserve(files_.size() - start); | |
| for (size_t i = start; i < files_.size(); ++i) { | |
| const auto& delete_file = files_[i]; | |
| ICEBERG_ASSIGN_OR_RAISE(bool may_contain, | |
| CanContainEqDeletesForFile(data_file, delete_file)); | |
| if (may_contain) { | |
| result.push_back(delete_file.wrapped.data_file); | |
| } | |
| } | |
| auto iter = std::ranges::lower_bound(seqs_, seq); | |
| if (iter == seqs_.end()) { | |
| return {}; | |
| } | |
| std::vector<std::shared_ptr<DataFile>> result; | |
| result.reserve(seqs_.end() - iter); | |
| for (auto& delete_file : files_ | std::views::drop(iter - seqs_.begin())) { | |
| ICEBERG_ASSIGN_OR_RAISE(bool may_contain, | |
| CanContainEqDeletesForFile(data_file, delete_file)); | |
| if (may_contain) { | |
| result.push_back(delete_file.wrapped.data_file); | |
| } | |
| } |
|
|
||
| std::vector<std::shared_ptr<DataFile>> EqualityDeletes::ReferencedDeleteFiles() { | ||
| IndexIfNeeded(); | ||
| return std::ranges::transform_view( |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ditto: use std::views::transform here instead of constructing std::ranges::transform_view directly.
| } | ||
|
|
||
| // Sort by apply sequence number | ||
| std::ranges::sort(files_, [](const auto& a, const auto& b) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
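Ditto: pass a projection to std::ranges::sort (with std::ranges::less{}) instead of a custom comparator lambda.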
| }); | ||
|
|
||
| // Build sequence number array for binary search | ||
| seqs_ = std::ranges::transform_view( |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ditto: use std::views::transform here instead of constructing std::ranges::transform_view directly.
Implemented the DeleteFileIndex and Builder to manage and efficiently filter
delete files (equality deletes, position deletes, and deletion vectors)
based on sequence numbers and partitions.
Key changes:
- `DeleteFileIndex` and `DeleteFileIndex::Builder` in `src/iceberg/delete_file_index.{h,cc}`.
- `ContentFileUtil` for helper functions related to content files and DVs.
- `ManifestReader` to support dropping stats via `TryDropStats()`.
- `src/iceberg/test/delete_file_index_test.cc`.