Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion src/iceberg/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ set(ICEBERG_SOURCES
manifest/manifest_entry.cc
manifest/manifest_list.cc
manifest/manifest_reader.cc
manifest/manifest_reader_internal.cc
manifest/manifest_writer.cc
manifest/v1_metadata.cc
manifest/v2_metadata.cc
Expand Down
152 changes: 90 additions & 62 deletions src/iceberg/manifest/manifest_entry.h
Original file line number Diff line number Diff line change
Expand Up @@ -178,94 +178,114 @@ struct ICEBERG_EXPORT DataFile {
/// present
std::optional<int64_t> content_size_in_bytes;

inline static constexpr int32_t kContentFieldId = 134;
inline static const SchemaField kContent = SchemaField::MakeOptional(
134, "content", iceberg::int32(),
kContentFieldId, "content", int32(),
"Contents of the file: 0=data, 1=position deletes, 2=equality deletes");

inline static constexpr int32_t kFilePathFieldId = 100;
inline static const SchemaField kFilePath = SchemaField::MakeRequired(
100, "file_path", iceberg::string(), "Location URI with FS scheme");
inline static const SchemaField kFileFormat = SchemaField::MakeRequired(
101, "file_format", iceberg::string(), "File format name: avro, orc, or parquet");
inline static const int32_t kPartitionFieldId = 102;
kFilePathFieldId, "file_path", string(), "Location URI with FS scheme");

inline static constexpr int32_t kFileFormatFieldId = 101;
inline static const SchemaField kFileFormat =
SchemaField::MakeRequired(kFileFormatFieldId, "file_format", string(),
"File format name: avro, orc, or parquet");

inline static constexpr int32_t kPartitionFieldId = 102;
inline static const std::string kPartitionField = "partition";
inline static const std::string kPartitionDoc =
"Partition data tuple, schema based on the partition spec";

inline static constexpr int32_t kRecordCountFieldId = 103;
inline static const SchemaField kRecordCount = SchemaField::MakeRequired(
103, "record_count", iceberg::int64(), "Number of records in the file");
kRecordCountFieldId, "record_count", int64(), "Number of records in the file");

inline static constexpr int32_t kFileSizeFieldId = 104;
inline static const SchemaField kFileSize = SchemaField::MakeRequired(
104, "file_size_in_bytes", iceberg::int64(), "Total file size in bytes");
kFileSizeFieldId, "file_size_in_bytes", int64(), "Total file size in bytes");

inline static constexpr int32_t kColumnSizesFieldId = 108;
inline static const SchemaField kColumnSizes = SchemaField::MakeOptional(
108, "column_sizes",
std::make_shared<MapType>(
SchemaField::MakeRequired(117, std::string(MapType::kKeyName),
iceberg::int32()),
SchemaField::MakeRequired(118, std::string(MapType::kValueName),
iceberg::int64())),
kColumnSizesFieldId, "column_sizes",
map(SchemaField::MakeRequired(117, std::string(MapType::kKeyName), int32()),
SchemaField::MakeRequired(118, std::string(MapType::kValueName), int64())),
"Map of column id to total size on disk");

inline static constexpr int32_t kValueCountsFieldId = 109;
inline static const SchemaField kValueCounts = SchemaField::MakeOptional(
109, "value_counts",
std::make_shared<MapType>(
SchemaField::MakeRequired(119, std::string(MapType::kKeyName),
iceberg::int32()),
SchemaField::MakeRequired(120, std::string(MapType::kValueName),
iceberg::int64())),
kValueCountsFieldId, "value_counts",
map(SchemaField::MakeRequired(119, std::string(MapType::kKeyName), int32()),
SchemaField::MakeRequired(120, std::string(MapType::kValueName), int64())),
"Map of column id to total count, including null and NaN");

inline static constexpr int32_t kNullValueCountsFieldId = 110;
inline static const SchemaField kNullValueCounts = SchemaField::MakeOptional(
110, "null_value_counts",
std::make_shared<MapType>(
SchemaField::MakeRequired(121, std::string(MapType::kKeyName),
iceberg::int32()),
SchemaField::MakeRequired(122, std::string(MapType::kValueName),
iceberg::int64())),
kNullValueCountsFieldId, "null_value_counts",
map(SchemaField::MakeRequired(121, std::string(MapType::kKeyName), int32()),
SchemaField::MakeRequired(122, std::string(MapType::kValueName), int64())),
"Map of column id to null value count");

inline static constexpr int32_t kNanValueCountsFieldId = 137;
inline static const SchemaField kNanValueCounts = SchemaField::MakeOptional(
137, "nan_value_counts",
std::make_shared<MapType>(
SchemaField::MakeRequired(138, std::string(MapType::kKeyName),
iceberg::int32()),
SchemaField::MakeRequired(139, std::string(MapType::kValueName),
iceberg::int64())),
kNanValueCountsFieldId, "nan_value_counts",
map(SchemaField::MakeRequired(138, std::string(MapType::kKeyName), int32()),
SchemaField::MakeRequired(139, std::string(MapType::kValueName), int64())),
"Map of column id to number of NaN values in the column");

inline static constexpr int32_t kLowerBoundsFieldId = 125;
inline static const SchemaField kLowerBounds = SchemaField::MakeOptional(
125, "lower_bounds",
std::make_shared<MapType>(
SchemaField::MakeRequired(126, std::string(MapType::kKeyName),
iceberg::int32()),
SchemaField::MakeRequired(127, std::string(MapType::kValueName),
iceberg::binary())),
kLowerBoundsFieldId, "lower_bounds",
map(SchemaField::MakeRequired(126, std::string(MapType::kKeyName), int32()),
SchemaField::MakeRequired(127, std::string(MapType::kValueName), binary())),
"Map of column id to lower bound");

inline static constexpr int32_t kUpperBoundsFieldId = 128;
inline static const SchemaField kUpperBounds = SchemaField::MakeOptional(
128, "upper_bounds",
std::make_shared<MapType>(
SchemaField::MakeRequired(129, std::string(MapType::kKeyName),
iceberg::int32()),
SchemaField::MakeRequired(130, std::string(MapType::kValueName),
iceberg::binary())),
kUpperBoundsFieldId, "upper_bounds",
map(SchemaField::MakeRequired(129, std::string(MapType::kKeyName), int32()),
SchemaField::MakeRequired(130, std::string(MapType::kValueName), binary())),
"Map of column id to upper bound");

inline static constexpr int32_t kKeyMetadataFieldId = 131;
inline static const SchemaField kKeyMetadata = SchemaField::MakeOptional(
131, "key_metadata", iceberg::binary(), "Encryption key metadata blob");
kKeyMetadataFieldId, "key_metadata", binary(), "Encryption key metadata blob");

inline static constexpr int32_t kSplitOffsetsFieldId = 132;
inline static const SchemaField kSplitOffsets = SchemaField::MakeOptional(
132, "split_offsets",
std::make_shared<ListType>(SchemaField::MakeRequired(
133, std::string(ListType::kElementName), iceberg::int64())),
kSplitOffsetsFieldId, "split_offsets",
list(SchemaField::MakeRequired(133, std::string(ListType::kElementName), int64())),
"Splittable offsets");

inline static constexpr int32_t kEqualityIdsFieldId = 135;
inline static const SchemaField kEqualityIds = SchemaField::MakeOptional(
135, "equality_ids",
std::make_shared<ListType>(SchemaField::MakeRequired(
136, std::string(ListType::kElementName), iceberg::int32())),
kEqualityIdsFieldId, "equality_ids",
list(SchemaField::MakeRequired(136, std::string(ListType::kElementName), int32())),
"Equality comparison field IDs");
inline static const SchemaField kSortOrderId =
SchemaField::MakeOptional(140, "sort_order_id", iceberg::int32(), "Sort order ID");
inline static const SchemaField kFirstRowId = SchemaField::MakeOptional(
142, "first_row_id", iceberg::int64(), "Starting row ID to assign to new rows");

inline static constexpr int32_t kSortOrderIdFieldId = 140;
inline static const SchemaField kSortOrderId = SchemaField::MakeOptional(
kSortOrderIdFieldId, "sort_order_id", int32(), "Sort order ID");

inline static constexpr int32_t kFirstRowIdFieldId = 142;
inline static const SchemaField kFirstRowId =
SchemaField::MakeOptional(kFirstRowIdFieldId, "first_row_id", int64(),
"Starting row ID to assign to new rows");

inline static constexpr int32_t kReferencedDataFileFieldId = 143;
inline static const SchemaField kReferencedDataFile = SchemaField::MakeOptional(
143, "referenced_data_file", iceberg::string(),
kReferencedDataFileFieldId, "referenced_data_file", string(),
"Fully qualified location (URI with FS scheme) of a data file that all deletes "
"reference");

inline static constexpr int32_t kContentOffsetFieldId = 144;
inline static const SchemaField kContentOffset =
SchemaField::MakeOptional(144, "content_offset", iceberg::int64(),
SchemaField::MakeOptional(kContentOffsetFieldId, "content_offset", int64(),
"The offset in the file where the content starts");

inline static constexpr int32_t kContentSizeFieldId = 145;
inline static const SchemaField kContentSize =
SchemaField::MakeOptional(145, "content_size_in_bytes", iceberg::int64(),
SchemaField::MakeOptional(kContentSizeFieldId, "content_size_in_bytes", int64(),
"The length of referenced content stored in the file");

bool operator==(const DataFile& other) const = default;
Expand Down Expand Up @@ -298,16 +318,24 @@ struct ICEBERG_EXPORT ManifestEntry {
/// File path, partition tuple, metrics, ...
std::shared_ptr<DataFile> data_file;

inline static constexpr int32_t kStatusFieldId = 0;
inline static const SchemaField kStatus =
SchemaField::MakeRequired(0, "status", iceberg::int32());
SchemaField::MakeRequired(kStatusFieldId, "status", int32());

inline static constexpr int32_t kSnapshotIdFieldId = 1;
inline static const SchemaField kSnapshotId =
SchemaField::MakeOptional(1, "snapshot_id", iceberg::int64());
SchemaField::MakeOptional(kSnapshotIdFieldId, "snapshot_id", int64());

inline static const int32_t kDataFileFieldId = 2;
inline static const std::string kDataFileField = "data_file";

inline static constexpr int32_t kSequenceNumberFieldId = 3;
inline static const SchemaField kSequenceNumber =
SchemaField::MakeOptional(3, "sequence_number", iceberg::int64());
inline static const SchemaField kFileSequenceNumber =
SchemaField::MakeOptional(4, "file_sequence_number", iceberg::int64());
SchemaField::MakeOptional(kSequenceNumberFieldId, "sequence_number", int64());

inline static constexpr int32_t kFileSequenceNumberFieldId = 4;
inline static const SchemaField kFileSequenceNumber = SchemaField::MakeOptional(
kFileSequenceNumberFieldId, "file_sequence_number", int64());

/// \brief Check if this manifest entry is alive (i.e. not deleted).
constexpr bool IsAlive() const {
Expand Down
14 changes: 8 additions & 6 deletions src/iceberg/manifest/manifest_list.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,22 +19,24 @@

#include "iceberg/manifest/manifest_list.h"

#include "iceberg/schema.h"
#include <memory>

#include "iceberg/type.h"

namespace iceberg {

const StructType& PartitionFieldSummary::Type() {
static const StructType kInstance{{
const std::shared_ptr<StructType>& PartitionFieldSummary::Type() {
static const auto kInstance = std::make_shared<StructType>(std::vector<SchemaField>{
PartitionFieldSummary::kContainsNull,
PartitionFieldSummary::kContainsNaN,
PartitionFieldSummary::kLowerBound,
PartitionFieldSummary::kUpperBound,
}};
});
return kInstance;
}

const std::shared_ptr<Schema>& ManifestFile::Type() {
static const auto kInstance = std::make_shared<Schema>(std::vector<SchemaField>{
const std::shared_ptr<StructType>& ManifestFile::Type() {
static const auto kInstance = std::make_shared<StructType>(std::vector<SchemaField>{
kManifestPath,
kManifestLength,
kPartitionSpecId,
Expand Down
85 changes: 60 additions & 25 deletions src/iceberg/manifest/manifest_list.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ struct ICEBERG_EXPORT PartitionFieldSummary {

bool operator==(const PartitionFieldSummary& other) const = default;

static const StructType& Type();
static const std::shared_ptr<StructType>& Type();
};

/// \brief The type of files tracked by the manifest, either data or delete files; 0 for
Expand Down Expand Up @@ -153,51 +153,86 @@ struct ICEBERG_EXPORT ManifestFile {
/// \brief Checks if this manifest file contains entries with DELETED status
bool has_deleted_files() const { return deleted_files_count.value_or(1) > 0; }

inline static const int32_t kManifestPathFieldId = 500;
inline static const SchemaField kManifestPath = SchemaField::MakeRequired(
500, "manifest_path", iceberg::string(), "Location URI with FS scheme");
kManifestPathFieldId, "manifest_path", string(), "Location URI with FS scheme");

inline static const int32_t kManifestLengthFieldId = 501;
inline static const SchemaField kManifestLength = SchemaField::MakeRequired(
501, "manifest_length", iceberg::int64(), "Total file size in bytes");
kManifestLengthFieldId, "manifest_length", int64(), "Total file size in bytes");

inline static const int32_t kPartitionSpecIdFieldId = 502;
inline static const SchemaField kPartitionSpecId = SchemaField::MakeRequired(
502, "partition_spec_id", iceberg::int32(), "Spec ID used to write");
kPartitionSpecIdFieldId, "partition_spec_id", int32(), "Spec ID used to write");

inline static const int32_t kContentFieldId = 517;
inline static const SchemaField kContent = SchemaField::MakeOptional(
517, "content", iceberg::int32(), "Contents of the manifest: 0=data, 1=deletes");
kContentFieldId, "content", int32(), "Contents of the manifest: 0=data, 1=deletes");

inline static const int32_t kSequenceNumberFieldId = 515;
inline static const SchemaField kSequenceNumber =
SchemaField::MakeOptional(515, "sequence_number", iceberg::int64(),
SchemaField::MakeOptional(kSequenceNumberFieldId, "sequence_number", int64(),
"Sequence number when the manifest was added");

inline static const int32_t kMinSequenceNumberFieldId = 516;
inline static const SchemaField kMinSequenceNumber =
SchemaField::MakeOptional(516, "min_sequence_number", iceberg::int64(),
SchemaField::MakeOptional(kMinSequenceNumberFieldId, "min_sequence_number", int64(),
"Lowest sequence number in the manifest");
inline static const SchemaField kAddedSnapshotId = SchemaField::MakeRequired(
503, "added_snapshot_id", iceberg::int64(), "Snapshot ID that added the manifest");

inline static const int32_t kAddedSnapshotIdFieldId = 503;
inline static const SchemaField kAddedSnapshotId =
SchemaField::MakeRequired(kAddedSnapshotIdFieldId, "added_snapshot_id", int64(),
"Snapshot ID that added the manifest");

inline static const int32_t kAddedFilesCountFieldId = 504;
inline static const SchemaField kAddedFilesCount = SchemaField::MakeOptional(
504, "added_files_count", iceberg::int32(), "Added entry count");
inline static const SchemaField kExistingFilesCount = SchemaField::MakeOptional(
505, "existing_files_count", iceberg::int32(), "Existing entry count");
kAddedFilesCountFieldId, "added_files_count", int32(), "Added entry count");

inline static const int32_t kExistingFilesCountFieldId = 505;
inline static const SchemaField kExistingFilesCount =
SchemaField::MakeOptional(kExistingFilesCountFieldId, "existing_files_count",
int32(), "Existing entry count");

inline static const int32_t kDeletedFilesCountFieldId = 506;
inline static const SchemaField kDeletedFilesCount = SchemaField::MakeOptional(
506, "deleted_files_count", iceberg::int32(), "Deleted entry count");
kDeletedFilesCountFieldId, "deleted_files_count", int32(), "Deleted entry count");

inline static const int32_t kAddedRowsCountFieldId = 512;
inline static const SchemaField kAddedRowsCount = SchemaField::MakeOptional(
512, "added_rows_count", iceberg::int64(), "Added rows count");
kAddedRowsCountFieldId, "added_rows_count", int64(), "Added rows count");

inline static const int32_t kExistingRowsCountFieldId = 513;
inline static const SchemaField kExistingRowsCount = SchemaField::MakeOptional(
513, "existing_rows_count", iceberg::int64(), "Existing rows count");
kExistingRowsCountFieldId, "existing_rows_count", int64(), "Existing rows count");

inline static const int32_t kDeletedRowsCountFieldId = 514;
inline static const SchemaField kDeletedRowsCount = SchemaField::MakeOptional(
514, "deleted_rows_count", iceberg::int64(), "Deleted rows count");
kDeletedRowsCountFieldId, "deleted_rows_count", int64(), "Deleted rows count");

inline static const int32_t kPartitionSummaryFieldId = 507;
inline static const SchemaField kPartitions = SchemaField::MakeOptional(
507, "partitions",
std::make_shared<ListType>(SchemaField::MakeRequired(
508, std::string(ListType::kElementName),
struct_(
{PartitionFieldSummary::kContainsNull, PartitionFieldSummary::kContainsNaN,
PartitionFieldSummary::kLowerBound, PartitionFieldSummary::kUpperBound}))),
kPartitionSummaryFieldId, "partitions",
list(SchemaField::MakeRequired(508, std::string(ListType::kElementName),
struct_({
PartitionFieldSummary::kContainsNull,
PartitionFieldSummary::kContainsNaN,
PartitionFieldSummary::kLowerBound,
PartitionFieldSummary::kUpperBound,
}))),
"Summary for each partition");

inline static const int32_t kKeyMetadataFieldId = 519;
inline static const SchemaField kKeyMetadata = SchemaField::MakeOptional(
519, "key_metadata", iceberg::binary(), "Encryption key metadata blob");
kKeyMetadataFieldId, "key_metadata", binary(), "Encryption key metadata blob");

inline static const int32_t kFirstRowIdFieldId = 520;
inline static const SchemaField kFirstRowId = SchemaField::MakeOptional(
520, "first_row_id", iceberg::int64(),
kFirstRowIdFieldId, "first_row_id", int64(),
"Starting row ID to assign to new rows in ADDED data files");

bool operator==(const ManifestFile& other) const = default;

static const std::shared_ptr<Schema>& Type();
static const std::shared_ptr<StructType>& Type();
};

/// Snapshots are embedded in table metadata, but the list of manifests for a snapshot is
Expand Down
Loading
Loading