diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 7286616c4fb..6ca1d058c79 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -724,6 +724,12 @@ install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/../LICENSE.txt
 install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/gdb_arrow.py
         DESTINATION "${ARROW_GDB_DIR}")
 
+# Install generated Thrift and FlatBuffers headers
+install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/src/generated/
+        DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/generated"
+        FILES_MATCHING
+        PATTERN "*.h")
+
 #
 # Validate and print out Arrow configuration options
 #
diff --git a/cpp/build-support/update-flatbuffers.sh b/cpp/build-support/update-flatbuffers.sh
index 6738f81a560..9804868da41 100755
--- a/cpp/build-support/update-flatbuffers.sh
+++ b/cpp/build-support/update-flatbuffers.sh
@@ -32,5 +32,6 @@ FLATC="flatc --cpp --cpp-std c++11 --scoped-enums"
 OUT_DIR="$SOURCE_DIR/generated"
 FILES=($(find $FORMAT_DIR -name '*.fbs'))
 FILES+=("$SOURCE_DIR/arrow/ipc/feather.fbs")
+FILES+=("$SOURCE_DIR/parquet/parquet3.fbs")
 
 $FLATC -o "$OUT_DIR" "${FILES[@]}"
diff --git a/cpp/src/generated/parquet3_generated.h b/cpp/src/generated/parquet3_generated.h
new file mode 100644
index 00000000000..0e36b055b9d
--- /dev/null
+++ b/cpp/src/generated/parquet3_generated.h
@@ -0,0 +1,1918 @@
+// automatically generated by the FlatBuffers compiler, do not modify
+
+
+#ifndef FLATBUFFERS_GENERATED_PARQUET3_PARQUET_FORMAT3_H_
+#define FLATBUFFERS_GENERATED_PARQUET3_PARQUET_FORMAT3_H_
+
+#include "flatbuffers/flatbuffers.h"
+
+// Ensure the included flatbuffers.h is the same version as when this file was
+// generated, otherwise it may not be compatible.
+static_assert(FLATBUFFERS_VERSION_MAJOR == 24 &&
+              FLATBUFFERS_VERSION_MINOR == 3 &&
+              FLATBUFFERS_VERSION_REVISION == 6,
+              "Non-compatible flatbuffers version included");
+
+namespace parquet {
+namespace format3 {
+
+struct Empty;
+struct EmptyBuilder;
+
+struct DecimalOpts;
+struct DecimalOptsBuilder;
+
+struct TimeOpts;
+struct TimeOptsBuilder;
+
+struct IntOpts;
+struct IntOptsBuilder;
+
+struct GeometryType;
+struct GeometryTypeBuilder;
+
+struct GeographyType;
+struct GeographyTypeBuilder;
+
+struct Statistics;
+struct StatisticsBuilder;
+
+struct SchemaElement;
+struct SchemaElementBuilder;
+
+struct KV;
+struct KVBuilder;
+
+struct ColumnMetadata;
+struct ColumnMetadataBuilder;
+
+struct ColumnChunk;
+struct ColumnChunkBuilder;
+
+struct SortingColumn;
+struct SortingColumnBuilder;
+
+struct RowGroup;
+struct RowGroupBuilder;
+
+struct FileMetaData;
+struct FileMetaDataBuilder;
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////////////////////////
+enum class Type : int8_t {
+  BOOLEAN = 0,
+  INT32 = 1,
+  INT64 = 2,
+  INT96 = 3,
+  FLOAT = 4,
+  DOUBLE = 5,
+  BYTE_ARRAY = 6,
+  FIXED_LEN_BYTE_ARRAY = 7,
+  MIN = BOOLEAN,
+  MAX = FIXED_LEN_BYTE_ARRAY
+};
+
+inline const Type (&EnumValuesType())[8] {
+  static const Type values[] = {
+    Type::BOOLEAN,
+    Type::INT32,
+    Type::INT64,
+    Type::INT96,
+    Type::FLOAT,
+    Type::DOUBLE,
+    Type::BYTE_ARRAY,
+    Type::FIXED_LEN_BYTE_ARRAY
+  };
+  return values;
+}
+
+inline const char * const *EnumNamesType() {
+  static const char * const names[9] = {
+    "BOOLEAN",
+    "INT32",
+    "INT64",
+    "INT96",
+    "FLOAT",
+    "DOUBLE",
+    "BYTE_ARRAY",
+    "FIXED_LEN_BYTE_ARRAY",
+    nullptr
+  };
+  return names;
+}
+
+inline const char *EnumNameType(Type e) {
+  if (::flatbuffers::IsOutRange(e, Type::BOOLEAN,
Type::FIXED_LEN_BYTE_ARRAY)) return ""; + const size_t index = static_cast(e); + return EnumNamesType()[index]; +} + +enum class FieldRepetitionType : int8_t { + REQUIRED = 0, + OPTIONAL = 1, + REPEATED = 2, + MIN = REQUIRED, + MAX = REPEATED +}; + +inline const FieldRepetitionType (&EnumValuesFieldRepetitionType())[3] { + static const FieldRepetitionType values[] = { + FieldRepetitionType::REQUIRED, + FieldRepetitionType::OPTIONAL, + FieldRepetitionType::REPEATED + }; + return values; +} + +inline const char * const *EnumNamesFieldRepetitionType() { + static const char * const names[4] = { + "REQUIRED", + "OPTIONAL", + "REPEATED", + nullptr + }; + return names; +} + +inline const char *EnumNameFieldRepetitionType(FieldRepetitionType e) { + if (::flatbuffers::IsOutRange(e, FieldRepetitionType::REQUIRED, FieldRepetitionType::REPEATED)) return ""; + const size_t index = static_cast(e); + return EnumNamesFieldRepetitionType()[index]; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// +enum class Encoding : int8_t { + PLAIN = 0, + PLAIN_DICTIONARY = 2, + RLE = 3, + DELTA_BINARY_PACKED = 5, + DELTA_LENGTH_BYTE_ARRAY = 6, + DELTA_BYTE_ARRAY = 7, + RLE_DICTIONARY = 8, + BYTE_STREAM_SPLIT = 9, + MIN = PLAIN, + MAX = BYTE_STREAM_SPLIT +}; + +inline const Encoding (&EnumValuesEncoding())[8] { + static const Encoding values[] = { + Encoding::PLAIN, + Encoding::PLAIN_DICTIONARY, + Encoding::RLE, + Encoding::DELTA_BINARY_PACKED, + Encoding::DELTA_LENGTH_BYTE_ARRAY, + Encoding::DELTA_BYTE_ARRAY, + Encoding::RLE_DICTIONARY, + Encoding::BYTE_STREAM_SPLIT + }; + return values; +} + +inline const char * const *EnumNamesEncoding() { + static const char * const names[11] = { + "PLAIN", + "", + "PLAIN_DICTIONARY", + "RLE", + "", + "DELTA_BINARY_PACKED", + "DELTA_LENGTH_BYTE_ARRAY", + "DELTA_BYTE_ARRAY", + "RLE_DICTIONARY", + "BYTE_STREAM_SPLIT", + nullptr + }; + return names; +} + +inline const char *EnumNameEncoding(Encoding e) { + if (::flatbuffers::IsOutRange(e, Encoding::PLAIN, Encoding::BYTE_STREAM_SPLIT)) return ""; + const size_t index = static_cast(e); + return EnumNamesEncoding()[index]; +} + +enum class CompressionCodec : int8_t { + UNCOMPRESSED = 0, + SNAPPY = 1, + GZIP = 2, + LZO = 3, + BROTLI = 4, + ZSTD = 6, + LZ4_RAW = 7, + MIN = UNCOMPRESSED, + MAX = LZ4_RAW +}; + +inline const CompressionCodec (&EnumValuesCompressionCodec())[7] { + static const CompressionCodec values[] = { + CompressionCodec::UNCOMPRESSED, + CompressionCodec::SNAPPY, + CompressionCodec::GZIP, + CompressionCodec::LZO, + CompressionCodec::BROTLI, + CompressionCodec::ZSTD, + CompressionCodec::LZ4_RAW + }; + return values; +} + +inline const char * const *EnumNamesCompressionCodec() { + static const char * const names[9] = { + "UNCOMPRESSED", + "SNAPPY", + "GZIP", + "LZO", + "BROTLI", + "", + "ZSTD", + "LZ4_RAW", + nullptr + }; + return names; +} + +inline const char *EnumNameCompressionCodec(CompressionCodec e) { + if (::flatbuffers::IsOutRange(e, CompressionCodec::UNCOMPRESSED, CompressionCodec::LZ4_RAW)) return ""; + const size_t index = static_cast(e); + return EnumNamesCompressionCodec()[index]; +} + +enum class TimeUnit : int8_t { + MS = 0, + US = 1, + NS = 2, + MIN = MS, + MAX = NS +}; + +inline const TimeUnit (&EnumValuesTimeUnit())[3] { + static const TimeUnit values[] = { + TimeUnit::MS, + TimeUnit::US, + TimeUnit::NS + }; + return values; +} + +inline const char * 
const *EnumNamesTimeUnit() { + static const char * const names[4] = { + "MS", + "US", + "NS", + nullptr + }; + return names; +} + +inline const char *EnumNameTimeUnit(TimeUnit e) { + if (::flatbuffers::IsOutRange(e, TimeUnit::MS, TimeUnit::NS)) return ""; + const size_t index = static_cast(e); + return EnumNamesTimeUnit()[index]; +} + +enum class EdgeInterpolationAlgorithm : int8_t { + SPHERICAL = 0, + VINCENTY = 1, + THOMAS = 2, + ANDOYER = 3, + KARNEY = 4, + MIN = SPHERICAL, + MAX = KARNEY +}; + +inline const EdgeInterpolationAlgorithm (&EnumValuesEdgeInterpolationAlgorithm())[5] { + static const EdgeInterpolationAlgorithm values[] = { + EdgeInterpolationAlgorithm::SPHERICAL, + EdgeInterpolationAlgorithm::VINCENTY, + EdgeInterpolationAlgorithm::THOMAS, + EdgeInterpolationAlgorithm::ANDOYER, + EdgeInterpolationAlgorithm::KARNEY + }; + return values; +} + +inline const char * const *EnumNamesEdgeInterpolationAlgorithm() { + static const char * const names[6] = { + "SPHERICAL", + "VINCENTY", + "THOMAS", + "ANDOYER", + "KARNEY", + nullptr + }; + return names; +} + +inline const char *EnumNameEdgeInterpolationAlgorithm(EdgeInterpolationAlgorithm e) { + if (::flatbuffers::IsOutRange(e, EdgeInterpolationAlgorithm::SPHERICAL, EdgeInterpolationAlgorithm::KARNEY)) return ""; + const size_t index = static_cast(e); + return EnumNamesEdgeInterpolationAlgorithm()[index]; +} + +enum class LogicalType : uint8_t { + NONE = 0, + StringType = 1, + MapType = 2, + ListType = 3, + EnumType = 4, + DecimalType = 5, + DateType = 6, + TimeType = 7, + TimestampType = 8, + IntType = 9, + NullType = 10, + JsonType = 11, + BsonType = 12, + UUIDType = 13, + Float16Type = 14, + VariantType = 15, + GeometryType = 16, + GeographyType = 17, + MIN = NONE, + MAX = GeographyType +}; + +inline const LogicalType (&EnumValuesLogicalType())[18] { + static const LogicalType values[] = { + LogicalType::NONE, + LogicalType::StringType, + LogicalType::MapType, + LogicalType::ListType, + LogicalType::EnumType, + LogicalType::DecimalType, + LogicalType::DateType, + LogicalType::TimeType, + LogicalType::TimestampType, + LogicalType::IntType, + LogicalType::NullType, + LogicalType::JsonType, + LogicalType::BsonType, + LogicalType::UUIDType, + LogicalType::Float16Type, + LogicalType::VariantType, + LogicalType::GeometryType, + LogicalType::GeographyType + }; + return values; +} + +inline const char * const *EnumNamesLogicalType() { + static const char * const names[19] = { + "NONE", + "StringType", + "MapType", + "ListType", + "EnumType", + "DecimalType", + "DateType", + "TimeType", + "TimestampType", + "IntType", + "NullType", + "JsonType", + "BsonType", + "UUIDType", + "Float16Type", + "VariantType", + "GeometryType", + "GeographyType", + nullptr + }; + return names; +} + +inline const char *EnumNameLogicalType(LogicalType e) { + if (::flatbuffers::IsOutRange(e, LogicalType::NONE, LogicalType::GeographyType)) return ""; + const size_t index = static_cast(e); + return EnumNamesLogicalType()[index]; +} + +bool VerifyLogicalType(::flatbuffers::Verifier &verifier, const void *obj, LogicalType type); +bool VerifyLogicalTypeVector(::flatbuffers::Verifier &verifier, const ::flatbuffers::Vector<::flatbuffers::Offset> *values, const ::flatbuffers::Vector *types); + +enum class ColumnOrder : uint8_t { + NONE = 0, + TypeDefinedOrder = 1, + MIN = NONE, + MAX = TypeDefinedOrder +}; + +inline const ColumnOrder (&EnumValuesColumnOrder())[2] { + static const ColumnOrder values[] = { + ColumnOrder::NONE, + ColumnOrder::TypeDefinedOrder + }; + return 
values; +} + +inline const char * const *EnumNamesColumnOrder() { + static const char * const names[3] = { + "NONE", + "TypeDefinedOrder", + nullptr + }; + return names; +} + +inline const char *EnumNameColumnOrder(ColumnOrder e) { + if (::flatbuffers::IsOutRange(e, ColumnOrder::NONE, ColumnOrder::TypeDefinedOrder)) return ""; + const size_t index = static_cast(e); + return EnumNamesColumnOrder()[index]; +} + +template struct ColumnOrderTraits { + static const ColumnOrder enum_value = ColumnOrder::NONE; +}; + +template<> struct ColumnOrderTraits { + static const ColumnOrder enum_value = ColumnOrder::TypeDefinedOrder; +}; + +bool VerifyColumnOrder(::flatbuffers::Verifier &verifier, const void *obj, ColumnOrder type); +bool VerifyColumnOrderVector(::flatbuffers::Verifier &verifier, const ::flatbuffers::Vector<::flatbuffers::Offset> *values, const ::flatbuffers::Vector *types); + +enum class PageType : int8_t { + DATA_PAGE = 0, + INDEX_PAGE = 1, + DICTIONARY_PAGE = 2, + DATA_PAGE_V2 = 3, + MIN = DATA_PAGE, + MAX = DATA_PAGE_V2 +}; + +inline const PageType (&EnumValuesPageType())[4] { + static const PageType values[] = { + PageType::DATA_PAGE, + PageType::INDEX_PAGE, + PageType::DICTIONARY_PAGE, + PageType::DATA_PAGE_V2 + }; + return values; +} + +inline const char * const *EnumNamesPageType() { + static const char * const names[5] = { + "DATA_PAGE", + "INDEX_PAGE", + "DICTIONARY_PAGE", + "DATA_PAGE_V2", + nullptr + }; + return names; +} + +inline const char *EnumNamePageType(PageType e) { + if (::flatbuffers::IsOutRange(e, PageType::DATA_PAGE, PageType::DATA_PAGE_V2)) return ""; + const size_t index = static_cast(e); + return EnumNamesPageType()[index]; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// +struct Empty FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef EmptyBuilder Builder; + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } +}; + +struct EmptyBuilder { + typedef Empty Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + explicit EmptyBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateEmpty( + ::flatbuffers::FlatBufferBuilder &_fbb) { + EmptyBuilder builder_(_fbb); + return builder_.Finish(); +} + +struct DecimalOpts FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef DecimalOptsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_PRECISION = 4, + VT_SCALE = 6 + }; + int32_t precision() const { + return GetField(VT_PRECISION, 0); + } + int32_t scale() const { + return GetField(VT_SCALE, 0); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_PRECISION, 4) && + VerifyField(verifier, VT_SCALE, 4) && + verifier.EndTable(); + } +}; + +struct DecimalOptsBuilder { + typedef DecimalOpts Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_precision(int32_t precision) { + fbb_.AddElement(DecimalOpts::VT_PRECISION, precision, 0); + } + void add_scale(int32_t scale) { + fbb_.AddElement(DecimalOpts::VT_SCALE, scale, 0); + } + explicit 
DecimalOptsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateDecimalOpts( + ::flatbuffers::FlatBufferBuilder &_fbb, + int32_t precision = 0, + int32_t scale = 0) { + DecimalOptsBuilder builder_(_fbb); + builder_.add_scale(scale); + builder_.add_precision(precision); + return builder_.Finish(); +} + +struct TimeOpts FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef TimeOptsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_IS_ADJUSTED_TO_UTC = 4, + VT_UNIT = 6 + }; + bool is_adjusted_to_utc() const { + return GetField(VT_IS_ADJUSTED_TO_UTC, 0) != 0; + } + parquet::format3::TimeUnit unit() const { + return static_cast(GetField(VT_UNIT, 0)); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_IS_ADJUSTED_TO_UTC, 1) && + VerifyField(verifier, VT_UNIT, 1) && + verifier.EndTable(); + } +}; + +struct TimeOptsBuilder { + typedef TimeOpts Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_is_adjusted_to_utc(bool is_adjusted_to_utc) { + fbb_.AddElement(TimeOpts::VT_IS_ADJUSTED_TO_UTC, static_cast(is_adjusted_to_utc), 0); + } + void add_unit(parquet::format3::TimeUnit unit) { + fbb_.AddElement(TimeOpts::VT_UNIT, static_cast(unit), 0); + } + explicit TimeOptsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateTimeOpts( + ::flatbuffers::FlatBufferBuilder &_fbb, + bool is_adjusted_to_utc = false, + parquet::format3::TimeUnit unit = parquet::format3::TimeUnit::MS) { + TimeOptsBuilder builder_(_fbb); + builder_.add_unit(unit); + builder_.add_is_adjusted_to_utc(is_adjusted_to_utc); + return builder_.Finish(); +} + +struct IntOpts FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef IntOptsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_BIT_WIDTH = 4, + VT_IS_SIGNED = 6 + }; + int8_t bit_width() const { + return GetField(VT_BIT_WIDTH, 8); + } + bool is_signed() const { + return GetField(VT_IS_SIGNED, 0) != 0; + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_BIT_WIDTH, 1) && + VerifyField(verifier, VT_IS_SIGNED, 1) && + verifier.EndTable(); + } +}; + +struct IntOptsBuilder { + typedef IntOpts Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_bit_width(int8_t bit_width) { + fbb_.AddElement(IntOpts::VT_BIT_WIDTH, bit_width, 8); + } + void add_is_signed(bool is_signed) { + fbb_.AddElement(IntOpts::VT_IS_SIGNED, static_cast(is_signed), 0); + } + explicit IntOptsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateIntOpts( + ::flatbuffers::FlatBufferBuilder &_fbb, + int8_t bit_width = 8, + bool is_signed = false) { + IntOptsBuilder builder_(_fbb); + builder_.add_is_signed(is_signed); + 
builder_.add_bit_width(bit_width); + return builder_.Finish(); +} + +struct GeometryType FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef GeometryTypeBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_CRS = 4 + }; + const ::flatbuffers::String *crs() const { + return GetPointer(VT_CRS); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_CRS) && + verifier.VerifyString(crs()) && + verifier.EndTable(); + } +}; + +struct GeometryTypeBuilder { + typedef GeometryType Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_crs(::flatbuffers::Offset<::flatbuffers::String> crs) { + fbb_.AddOffset(GeometryType::VT_CRS, crs); + } + explicit GeometryTypeBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateGeometryType( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::String> crs = 0) { + GeometryTypeBuilder builder_(_fbb); + builder_.add_crs(crs); + return builder_.Finish(); +} + +inline ::flatbuffers::Offset CreateGeometryTypeDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + const char *crs = nullptr) { + auto crs__ = crs ? _fbb.CreateString(crs) : 0; + return parquet::format3::CreateGeometryType( + _fbb, + crs__); +} + +struct GeographyType FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef GeographyTypeBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_CRS = 4, + VT_ALGORITHM = 6 + }; + const ::flatbuffers::String *crs() const { + return GetPointer(VT_CRS); + } + parquet::format3::EdgeInterpolationAlgorithm algorithm() const { + return static_cast(GetField(VT_ALGORITHM, 0)); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_CRS) && + verifier.VerifyString(crs()) && + VerifyField(verifier, VT_ALGORITHM, 1) && + verifier.EndTable(); + } +}; + +struct GeographyTypeBuilder { + typedef GeographyType Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_crs(::flatbuffers::Offset<::flatbuffers::String> crs) { + fbb_.AddOffset(GeographyType::VT_CRS, crs); + } + void add_algorithm(parquet::format3::EdgeInterpolationAlgorithm algorithm) { + fbb_.AddElement(GeographyType::VT_ALGORITHM, static_cast(algorithm), 0); + } + explicit GeographyTypeBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateGeographyType( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::String> crs = 0, + parquet::format3::EdgeInterpolationAlgorithm algorithm = parquet::format3::EdgeInterpolationAlgorithm::SPHERICAL) { + GeographyTypeBuilder builder_(_fbb); + builder_.add_crs(crs); + builder_.add_algorithm(algorithm); + return builder_.Finish(); +} + +inline ::flatbuffers::Offset CreateGeographyTypeDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + const char *crs = nullptr, + parquet::format3::EdgeInterpolationAlgorithm algorithm = parquet::format3::EdgeInterpolationAlgorithm::SPHERICAL) { + auto 
crs__ = crs ? _fbb.CreateString(crs) : 0; + return parquet::format3::CreateGeographyType( + _fbb, + crs__, + algorithm); +} + +struct Statistics FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef StatisticsBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_NULL_COUNT = 4, + VT_MIN_LO4 = 6, + VT_MIN_LO8 = 8, + VT_MIN_HI8 = 10, + VT_MIN_LEN = 12, + VT_MAX_LO4 = 14, + VT_MAX_LO8 = 16, + VT_MAX_HI8 = 18, + VT_MAX_LEN = 20, + VT_PREFIX = 22 + }; + ::flatbuffers::Optional null_count() const { + return GetOptional(VT_NULL_COUNT); + } + uint32_t min_lo4() const { + return GetField(VT_MIN_LO4, 0); + } + uint64_t min_lo8() const { + return GetField(VT_MIN_LO8, 0); + } + uint64_t min_hi8() const { + return GetField(VT_MIN_HI8, 0); + } + ::flatbuffers::Optional min_len() const { + return GetOptional(VT_MIN_LEN); + } + uint32_t max_lo4() const { + return GetField(VT_MAX_LO4, 0); + } + uint64_t max_lo8() const { + return GetField(VT_MAX_LO8, 0); + } + uint64_t max_hi8() const { + return GetField(VT_MAX_HI8, 0); + } + ::flatbuffers::Optional max_len() const { + return GetOptional(VT_MAX_LEN); + } + const ::flatbuffers::String *prefix() const { + return GetPointer(VT_PREFIX); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_NULL_COUNT, 4) && + VerifyField(verifier, VT_MIN_LO4, 4) && + VerifyField(verifier, VT_MIN_LO8, 8) && + VerifyField(verifier, VT_MIN_HI8, 8) && + VerifyField(verifier, VT_MIN_LEN, 1) && + VerifyField(verifier, VT_MAX_LO4, 4) && + VerifyField(verifier, VT_MAX_LO8, 8) && + VerifyField(verifier, VT_MAX_HI8, 8) && + VerifyField(verifier, VT_MAX_LEN, 1) && + VerifyOffset(verifier, VT_PREFIX) && + verifier.VerifyString(prefix()) && + verifier.EndTable(); + } +}; + +struct StatisticsBuilder { + typedef Statistics Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_null_count(int32_t null_count) { + fbb_.AddElement(Statistics::VT_NULL_COUNT, null_count); + } + void add_min_lo4(uint32_t min_lo4) { + fbb_.AddElement(Statistics::VT_MIN_LO4, min_lo4, 0); + } + void add_min_lo8(uint64_t min_lo8) { + fbb_.AddElement(Statistics::VT_MIN_LO8, min_lo8, 0); + } + void add_min_hi8(uint64_t min_hi8) { + fbb_.AddElement(Statistics::VT_MIN_HI8, min_hi8, 0); + } + void add_min_len(int8_t min_len) { + fbb_.AddElement(Statistics::VT_MIN_LEN, min_len); + } + void add_max_lo4(uint32_t max_lo4) { + fbb_.AddElement(Statistics::VT_MAX_LO4, max_lo4, 0); + } + void add_max_lo8(uint64_t max_lo8) { + fbb_.AddElement(Statistics::VT_MAX_LO8, max_lo8, 0); + } + void add_max_hi8(uint64_t max_hi8) { + fbb_.AddElement(Statistics::VT_MAX_HI8, max_hi8, 0); + } + void add_max_len(int8_t max_len) { + fbb_.AddElement(Statistics::VT_MAX_LEN, max_len); + } + void add_prefix(::flatbuffers::Offset<::flatbuffers::String> prefix) { + fbb_.AddOffset(Statistics::VT_PREFIX, prefix); + } + explicit StatisticsBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateStatistics( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Optional null_count = ::flatbuffers::nullopt, + uint32_t min_lo4 = 0, + uint64_t min_lo8 = 0, + uint64_t min_hi8 = 0, + ::flatbuffers::Optional min_len = ::flatbuffers::nullopt, + uint32_t max_lo4 = 0, + uint64_t max_lo8 = 0, 
+ uint64_t max_hi8 = 0, + ::flatbuffers::Optional max_len = ::flatbuffers::nullopt, + ::flatbuffers::Offset<::flatbuffers::String> prefix = 0) { + StatisticsBuilder builder_(_fbb); + builder_.add_max_hi8(max_hi8); + builder_.add_max_lo8(max_lo8); + builder_.add_min_hi8(min_hi8); + builder_.add_min_lo8(min_lo8); + builder_.add_prefix(prefix); + builder_.add_max_lo4(max_lo4); + builder_.add_min_lo4(min_lo4); + if(null_count) { builder_.add_null_count(*null_count); } + if(max_len) { builder_.add_max_len(*max_len); } + if(min_len) { builder_.add_min_len(*min_len); } + return builder_.Finish(); +} + +inline ::flatbuffers::Offset CreateStatisticsDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Optional null_count = ::flatbuffers::nullopt, + uint32_t min_lo4 = 0, + uint64_t min_lo8 = 0, + uint64_t min_hi8 = 0, + ::flatbuffers::Optional min_len = ::flatbuffers::nullopt, + uint32_t max_lo4 = 0, + uint64_t max_lo8 = 0, + uint64_t max_hi8 = 0, + ::flatbuffers::Optional max_len = ::flatbuffers::nullopt, + const char *prefix = nullptr) { + auto prefix__ = prefix ? _fbb.CreateString(prefix) : 0; + return parquet::format3::CreateStatistics( + _fbb, + null_count, + min_lo4, + min_lo8, + min_hi8, + min_len, + max_lo4, + max_lo8, + max_hi8, + max_len, + prefix__); +} + +struct SchemaElement FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef SchemaElementBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_NAME = 4, + VT_TYPE = 6, + VT_REPETITION_TYPE = 8, + VT_LOGICAL_TYPE_TYPE = 10, + VT_LOGICAL_TYPE = 12, + VT_TYPE_LENGTH = 14, + VT_NUM_CHILDREN = 16, + VT_FIELD_ID = 18, + VT_COLUMN_ORDER_TYPE = 20, + VT_COLUMN_ORDER = 22 + }; + const ::flatbuffers::String *name() const { + return GetPointer(VT_NAME); + } + ::flatbuffers::Optional type() const { + return GetOptional(VT_TYPE); + } + parquet::format3::FieldRepetitionType repetition_type() const { + return static_cast(GetField(VT_REPETITION_TYPE, 0)); + } + parquet::format3::LogicalType logical_type_type() const { + return static_cast(GetField(VT_LOGICAL_TYPE_TYPE, 0)); + } + const void *logical_type() const { + return GetPointer(VT_LOGICAL_TYPE); + } + const parquet::format3::Empty *logical_type_as_StringType() const { + return logical_type_type() == parquet::format3::LogicalType::StringType ? static_cast(logical_type()) : nullptr; + } + const parquet::format3::Empty *logical_type_as_MapType() const { + return logical_type_type() == parquet::format3::LogicalType::MapType ? static_cast(logical_type()) : nullptr; + } + const parquet::format3::Empty *logical_type_as_ListType() const { + return logical_type_type() == parquet::format3::LogicalType::ListType ? static_cast(logical_type()) : nullptr; + } + const parquet::format3::Empty *logical_type_as_EnumType() const { + return logical_type_type() == parquet::format3::LogicalType::EnumType ? static_cast(logical_type()) : nullptr; + } + const parquet::format3::DecimalOpts *logical_type_as_DecimalType() const { + return logical_type_type() == parquet::format3::LogicalType::DecimalType ? static_cast(logical_type()) : nullptr; + } + const parquet::format3::Empty *logical_type_as_DateType() const { + return logical_type_type() == parquet::format3::LogicalType::DateType ? static_cast(logical_type()) : nullptr; + } + const parquet::format3::TimeOpts *logical_type_as_TimeType() const { + return logical_type_type() == parquet::format3::LogicalType::TimeType ? 
static_cast(logical_type()) : nullptr; + } + const parquet::format3::TimeOpts *logical_type_as_TimestampType() const { + return logical_type_type() == parquet::format3::LogicalType::TimestampType ? static_cast(logical_type()) : nullptr; + } + const parquet::format3::IntOpts *logical_type_as_IntType() const { + return logical_type_type() == parquet::format3::LogicalType::IntType ? static_cast(logical_type()) : nullptr; + } + const parquet::format3::Empty *logical_type_as_NullType() const { + return logical_type_type() == parquet::format3::LogicalType::NullType ? static_cast(logical_type()) : nullptr; + } + const parquet::format3::Empty *logical_type_as_JsonType() const { + return logical_type_type() == parquet::format3::LogicalType::JsonType ? static_cast(logical_type()) : nullptr; + } + const parquet::format3::Empty *logical_type_as_BsonType() const { + return logical_type_type() == parquet::format3::LogicalType::BsonType ? static_cast(logical_type()) : nullptr; + } + const parquet::format3::Empty *logical_type_as_UUIDType() const { + return logical_type_type() == parquet::format3::LogicalType::UUIDType ? static_cast(logical_type()) : nullptr; + } + const parquet::format3::Empty *logical_type_as_Float16Type() const { + return logical_type_type() == parquet::format3::LogicalType::Float16Type ? static_cast(logical_type()) : nullptr; + } + const parquet::format3::Empty *logical_type_as_VariantType() const { + return logical_type_type() == parquet::format3::LogicalType::VariantType ? static_cast(logical_type()) : nullptr; + } + const parquet::format3::GeometryType *logical_type_as_GeometryType() const { + return logical_type_type() == parquet::format3::LogicalType::GeometryType ? static_cast(logical_type()) : nullptr; + } + const parquet::format3::GeographyType *logical_type_as_GeographyType() const { + return logical_type_type() == parquet::format3::LogicalType::GeographyType ? static_cast(logical_type()) : nullptr; + } + ::flatbuffers::Optional type_length() const { + return GetOptional(VT_TYPE_LENGTH); + } + int32_t num_children() const { + return GetField(VT_NUM_CHILDREN, 0); + } + ::flatbuffers::Optional field_id() const { + return GetOptional(VT_FIELD_ID); + } + parquet::format3::ColumnOrder column_order_type() const { + return static_cast(GetField(VT_COLUMN_ORDER_TYPE, 0)); + } + const void *column_order() const { + return GetPointer(VT_COLUMN_ORDER); + } + template const T *column_order_as() const; + const parquet::format3::Empty *column_order_as_TypeDefinedOrder() const { + return column_order_type() == parquet::format3::ColumnOrder::TypeDefinedOrder ? 
static_cast(column_order()) : nullptr; + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_NAME) && + verifier.VerifyString(name()) && + VerifyField(verifier, VT_TYPE, 1) && + VerifyField(verifier, VT_REPETITION_TYPE, 1) && + VerifyField(verifier, VT_LOGICAL_TYPE_TYPE, 1) && + VerifyOffset(verifier, VT_LOGICAL_TYPE) && + VerifyLogicalType(verifier, logical_type(), logical_type_type()) && + VerifyField(verifier, VT_TYPE_LENGTH, 4) && + VerifyField(verifier, VT_NUM_CHILDREN, 4) && + VerifyField(verifier, VT_FIELD_ID, 4) && + VerifyField(verifier, VT_COLUMN_ORDER_TYPE, 1) && + VerifyOffset(verifier, VT_COLUMN_ORDER) && + VerifyColumnOrder(verifier, column_order(), column_order_type()) && + verifier.EndTable(); + } +}; + +template<> inline const parquet::format3::Empty *SchemaElement::column_order_as() const { + return column_order_as_TypeDefinedOrder(); +} + +struct SchemaElementBuilder { + typedef SchemaElement Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_name(::flatbuffers::Offset<::flatbuffers::String> name) { + fbb_.AddOffset(SchemaElement::VT_NAME, name); + } + void add_type(parquet::format3::Type type) { + fbb_.AddElement(SchemaElement::VT_TYPE, static_cast(type)); + } + void add_repetition_type(parquet::format3::FieldRepetitionType repetition_type) { + fbb_.AddElement(SchemaElement::VT_REPETITION_TYPE, static_cast(repetition_type), 0); + } + void add_logical_type_type(parquet::format3::LogicalType logical_type_type) { + fbb_.AddElement(SchemaElement::VT_LOGICAL_TYPE_TYPE, static_cast(logical_type_type), 0); + } + void add_logical_type(::flatbuffers::Offset logical_type) { + fbb_.AddOffset(SchemaElement::VT_LOGICAL_TYPE, logical_type); + } + void add_type_length(int32_t type_length) { + fbb_.AddElement(SchemaElement::VT_TYPE_LENGTH, type_length); + } + void add_num_children(int32_t num_children) { + fbb_.AddElement(SchemaElement::VT_NUM_CHILDREN, num_children, 0); + } + void add_field_id(int32_t field_id) { + fbb_.AddElement(SchemaElement::VT_FIELD_ID, field_id); + } + void add_column_order_type(parquet::format3::ColumnOrder column_order_type) { + fbb_.AddElement(SchemaElement::VT_COLUMN_ORDER_TYPE, static_cast(column_order_type), 0); + } + void add_column_order(::flatbuffers::Offset column_order) { + fbb_.AddOffset(SchemaElement::VT_COLUMN_ORDER, column_order); + } + explicit SchemaElementBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateSchemaElement( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::String> name = 0, + ::flatbuffers::Optional type = ::flatbuffers::nullopt, + parquet::format3::FieldRepetitionType repetition_type = parquet::format3::FieldRepetitionType::REQUIRED, + parquet::format3::LogicalType logical_type_type = parquet::format3::LogicalType::NONE, + ::flatbuffers::Offset logical_type = 0, + ::flatbuffers::Optional type_length = ::flatbuffers::nullopt, + int32_t num_children = 0, + ::flatbuffers::Optional field_id = ::flatbuffers::nullopt, + parquet::format3::ColumnOrder column_order_type = parquet::format3::ColumnOrder::NONE, + ::flatbuffers::Offset column_order = 0) { + SchemaElementBuilder builder_(_fbb); + builder_.add_column_order(column_order); + if(field_id) { 
builder_.add_field_id(*field_id); } + builder_.add_num_children(num_children); + if(type_length) { builder_.add_type_length(*type_length); } + builder_.add_logical_type(logical_type); + builder_.add_name(name); + builder_.add_column_order_type(column_order_type); + builder_.add_logical_type_type(logical_type_type); + builder_.add_repetition_type(repetition_type); + if(type) { builder_.add_type(*type); } + return builder_.Finish(); +} + +inline ::flatbuffers::Offset CreateSchemaElementDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + const char *name = nullptr, + ::flatbuffers::Optional type = ::flatbuffers::nullopt, + parquet::format3::FieldRepetitionType repetition_type = parquet::format3::FieldRepetitionType::REQUIRED, + parquet::format3::LogicalType logical_type_type = parquet::format3::LogicalType::NONE, + ::flatbuffers::Offset logical_type = 0, + ::flatbuffers::Optional type_length = ::flatbuffers::nullopt, + int32_t num_children = 0, + ::flatbuffers::Optional field_id = ::flatbuffers::nullopt, + parquet::format3::ColumnOrder column_order_type = parquet::format3::ColumnOrder::NONE, + ::flatbuffers::Offset column_order = 0) { + auto name__ = name ? _fbb.CreateString(name) : 0; + return parquet::format3::CreateSchemaElement( + _fbb, + name__, + type, + repetition_type, + logical_type_type, + logical_type, + type_length, + num_children, + field_id, + column_order_type, + column_order); +} + +struct KV FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef KVBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_KEY = 4, + VT_VAL = 6 + }; + const ::flatbuffers::String *key() const { + return GetPointer(VT_KEY); + } + const ::flatbuffers::String *val() const { + return GetPointer(VT_VAL); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_KEY) && + verifier.VerifyString(key()) && + VerifyOffset(verifier, VT_VAL) && + verifier.VerifyString(val()) && + verifier.EndTable(); + } +}; + +struct KVBuilder { + typedef KV Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_key(::flatbuffers::Offset<::flatbuffers::String> key) { + fbb_.AddOffset(KV::VT_KEY, key); + } + void add_val(::flatbuffers::Offset<::flatbuffers::String> val) { + fbb_.AddOffset(KV::VT_VAL, val); + } + explicit KVBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateKV( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::String> key = 0, + ::flatbuffers::Offset<::flatbuffers::String> val = 0) { + KVBuilder builder_(_fbb); + builder_.add_val(val); + builder_.add_key(key); + return builder_.Finish(); +} + +inline ::flatbuffers::Offset CreateKVDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + const char *key = nullptr, + const char *val = nullptr) { + auto key__ = key ? _fbb.CreateString(key) : 0; + auto val__ = val ? 
_fbb.CreateString(val) : 0; + return parquet::format3::CreateKV( + _fbb, + key__, + val__); +} + +struct ColumnMetadata FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef ColumnMetadataBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_CODEC = 4, + VT_NUM_VALUES = 6, + VT_TOTAL_UNCOMPRESSED_SIZE = 8, + VT_TOTAL_COMPRESSED_SIZE = 10, + VT_KEY_VALUE_METADATA = 12, + VT_DATA_PAGE_OFFSET = 14, + VT_INDEX_PAGE_OFFSET = 16, + VT_DICTIONARY_PAGE_OFFSET = 18, + VT_STATISTICS = 20, + VT_IS_FULLY_DICT_ENCODED = 22, + VT_BLOOM_FILTER_OFFSET = 24, + VT_BLOOM_FILTER_LENGTH = 26 + }; + parquet::format3::CompressionCodec codec() const { + return static_cast(GetField(VT_CODEC, 0)); + } + ::flatbuffers::Optional num_values() const { + return GetOptional(VT_NUM_VALUES); + } + int64_t total_uncompressed_size() const { + return GetField(VT_TOTAL_UNCOMPRESSED_SIZE, 0); + } + int64_t total_compressed_size() const { + return GetField(VT_TOTAL_COMPRESSED_SIZE, 0); + } + const ::flatbuffers::Vector<::flatbuffers::Offset> *key_value_metadata() const { + return GetPointer> *>(VT_KEY_VALUE_METADATA); + } + int64_t data_page_offset() const { + return GetField(VT_DATA_PAGE_OFFSET, 0); + } + ::flatbuffers::Optional index_page_offset() const { + return GetOptional(VT_INDEX_PAGE_OFFSET); + } + ::flatbuffers::Optional dictionary_page_offset() const { + return GetOptional(VT_DICTIONARY_PAGE_OFFSET); + } + const parquet::format3::Statistics *statistics() const { + return GetPointer(VT_STATISTICS); + } + bool is_fully_dict_encoded() const { + return GetField(VT_IS_FULLY_DICT_ENCODED, 0) != 0; + } + ::flatbuffers::Optional bloom_filter_offset() const { + return GetOptional(VT_BLOOM_FILTER_OFFSET); + } + ::flatbuffers::Optional bloom_filter_length() const { + return GetOptional(VT_BLOOM_FILTER_LENGTH); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_CODEC, 1) && + VerifyField(verifier, VT_NUM_VALUES, 8) && + VerifyField(verifier, VT_TOTAL_UNCOMPRESSED_SIZE, 8) && + VerifyField(verifier, VT_TOTAL_COMPRESSED_SIZE, 8) && + VerifyOffset(verifier, VT_KEY_VALUE_METADATA) && + verifier.VerifyVector(key_value_metadata()) && + verifier.VerifyVectorOfTables(key_value_metadata()) && + VerifyField(verifier, VT_DATA_PAGE_OFFSET, 8) && + VerifyField(verifier, VT_INDEX_PAGE_OFFSET, 8) && + VerifyField(verifier, VT_DICTIONARY_PAGE_OFFSET, 8) && + VerifyOffset(verifier, VT_STATISTICS) && + verifier.VerifyTable(statistics()) && + VerifyField(verifier, VT_IS_FULLY_DICT_ENCODED, 1) && + VerifyField(verifier, VT_BLOOM_FILTER_OFFSET, 8) && + VerifyField(verifier, VT_BLOOM_FILTER_LENGTH, 4) && + verifier.EndTable(); + } +}; + +struct ColumnMetadataBuilder { + typedef ColumnMetadata Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_codec(parquet::format3::CompressionCodec codec) { + fbb_.AddElement(ColumnMetadata::VT_CODEC, static_cast(codec), 0); + } + void add_num_values(int64_t num_values) { + fbb_.AddElement(ColumnMetadata::VT_NUM_VALUES, num_values); + } + void add_total_uncompressed_size(int64_t total_uncompressed_size) { + fbb_.AddElement(ColumnMetadata::VT_TOTAL_UNCOMPRESSED_SIZE, total_uncompressed_size, 0); + } + void add_total_compressed_size(int64_t total_compressed_size) { + fbb_.AddElement(ColumnMetadata::VT_TOTAL_COMPRESSED_SIZE, total_compressed_size, 0); + } + void add_key_value_metadata(::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> 
key_value_metadata) { + fbb_.AddOffset(ColumnMetadata::VT_KEY_VALUE_METADATA, key_value_metadata); + } + void add_data_page_offset(int64_t data_page_offset) { + fbb_.AddElement(ColumnMetadata::VT_DATA_PAGE_OFFSET, data_page_offset, 0); + } + void add_index_page_offset(int64_t index_page_offset) { + fbb_.AddElement(ColumnMetadata::VT_INDEX_PAGE_OFFSET, index_page_offset); + } + void add_dictionary_page_offset(int64_t dictionary_page_offset) { + fbb_.AddElement(ColumnMetadata::VT_DICTIONARY_PAGE_OFFSET, dictionary_page_offset); + } + void add_statistics(::flatbuffers::Offset statistics) { + fbb_.AddOffset(ColumnMetadata::VT_STATISTICS, statistics); + } + void add_is_fully_dict_encoded(bool is_fully_dict_encoded) { + fbb_.AddElement(ColumnMetadata::VT_IS_FULLY_DICT_ENCODED, static_cast(is_fully_dict_encoded), 0); + } + void add_bloom_filter_offset(int64_t bloom_filter_offset) { + fbb_.AddElement(ColumnMetadata::VT_BLOOM_FILTER_OFFSET, bloom_filter_offset); + } + void add_bloom_filter_length(int32_t bloom_filter_length) { + fbb_.AddElement(ColumnMetadata::VT_BLOOM_FILTER_LENGTH, bloom_filter_length); + } + explicit ColumnMetadataBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateColumnMetadata( + ::flatbuffers::FlatBufferBuilder &_fbb, + parquet::format3::CompressionCodec codec = parquet::format3::CompressionCodec::UNCOMPRESSED, + ::flatbuffers::Optional num_values = ::flatbuffers::nullopt, + int64_t total_uncompressed_size = 0, + int64_t total_compressed_size = 0, + ::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> key_value_metadata = 0, + int64_t data_page_offset = 0, + ::flatbuffers::Optional index_page_offset = ::flatbuffers::nullopt, + ::flatbuffers::Optional dictionary_page_offset = ::flatbuffers::nullopt, + ::flatbuffers::Offset statistics = 0, + bool is_fully_dict_encoded = false, + ::flatbuffers::Optional bloom_filter_offset = ::flatbuffers::nullopt, + ::flatbuffers::Optional bloom_filter_length = ::flatbuffers::nullopt) { + ColumnMetadataBuilder builder_(_fbb); + if(bloom_filter_offset) { builder_.add_bloom_filter_offset(*bloom_filter_offset); } + if(dictionary_page_offset) { builder_.add_dictionary_page_offset(*dictionary_page_offset); } + if(index_page_offset) { builder_.add_index_page_offset(*index_page_offset); } + builder_.add_data_page_offset(data_page_offset); + builder_.add_total_compressed_size(total_compressed_size); + builder_.add_total_uncompressed_size(total_uncompressed_size); + if(num_values) { builder_.add_num_values(*num_values); } + if(bloom_filter_length) { builder_.add_bloom_filter_length(*bloom_filter_length); } + builder_.add_statistics(statistics); + builder_.add_key_value_metadata(key_value_metadata); + builder_.add_is_fully_dict_encoded(is_fully_dict_encoded); + builder_.add_codec(codec); + return builder_.Finish(); +} + +inline ::flatbuffers::Offset CreateColumnMetadataDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + parquet::format3::CompressionCodec codec = parquet::format3::CompressionCodec::UNCOMPRESSED, + ::flatbuffers::Optional num_values = ::flatbuffers::nullopt, + int64_t total_uncompressed_size = 0, + int64_t total_compressed_size = 0, + const std::vector<::flatbuffers::Offset> *key_value_metadata = nullptr, + int64_t data_page_offset = 0, + ::flatbuffers::Optional index_page_offset = ::flatbuffers::nullopt, + 
::flatbuffers::Optional dictionary_page_offset = ::flatbuffers::nullopt, + ::flatbuffers::Offset statistics = 0, + bool is_fully_dict_encoded = false, + ::flatbuffers::Optional bloom_filter_offset = ::flatbuffers::nullopt, + ::flatbuffers::Optional bloom_filter_length = ::flatbuffers::nullopt) { + auto key_value_metadata__ = key_value_metadata ? _fbb.CreateVector<::flatbuffers::Offset>(*key_value_metadata) : 0; + return parquet::format3::CreateColumnMetadata( + _fbb, + codec, + num_values, + total_uncompressed_size, + total_compressed_size, + key_value_metadata__, + data_page_offset, + index_page_offset, + dictionary_page_offset, + statistics, + is_fully_dict_encoded, + bloom_filter_offset, + bloom_filter_length); +} + +struct ColumnChunk FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef ColumnChunkBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_FILE_PATH = 4, + VT_META_DATA = 6 + }; + const ::flatbuffers::String *file_path() const { + return GetPointer(VT_FILE_PATH); + } + const parquet::format3::ColumnMetadata *meta_data() const { + return GetPointer(VT_META_DATA); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_FILE_PATH) && + verifier.VerifyString(file_path()) && + VerifyOffset(verifier, VT_META_DATA) && + verifier.VerifyTable(meta_data()) && + verifier.EndTable(); + } +}; + +struct ColumnChunkBuilder { + typedef ColumnChunk Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_file_path(::flatbuffers::Offset<::flatbuffers::String> file_path) { + fbb_.AddOffset(ColumnChunk::VT_FILE_PATH, file_path); + } + void add_meta_data(::flatbuffers::Offset meta_data) { + fbb_.AddOffset(ColumnChunk::VT_META_DATA, meta_data); + } + explicit ColumnChunkBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateColumnChunk( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::String> file_path = 0, + ::flatbuffers::Offset meta_data = 0) { + ColumnChunkBuilder builder_(_fbb); + builder_.add_meta_data(meta_data); + builder_.add_file_path(file_path); + return builder_.Finish(); +} + +inline ::flatbuffers::Offset CreateColumnChunkDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + const char *file_path = nullptr, + ::flatbuffers::Offset meta_data = 0) { + auto file_path__ = file_path ? 
_fbb.CreateString(file_path) : 0; + return parquet::format3::CreateColumnChunk( + _fbb, + file_path__, + meta_data); +} + +struct SortingColumn FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef SortingColumnBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_COLUMN_IDX = 4, + VT_DESCENDING = 6, + VT_NULLS_FIRST = 8 + }; + int32_t column_idx() const { + return GetField(VT_COLUMN_IDX, 0); + } + bool descending() const { + return GetField(VT_DESCENDING, 0) != 0; + } + bool nulls_first() const { + return GetField(VT_NULLS_FIRST, 0) != 0; + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_COLUMN_IDX, 4) && + VerifyField(verifier, VT_DESCENDING, 1) && + VerifyField(verifier, VT_NULLS_FIRST, 1) && + verifier.EndTable(); + } +}; + +struct SortingColumnBuilder { + typedef SortingColumn Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_column_idx(int32_t column_idx) { + fbb_.AddElement(SortingColumn::VT_COLUMN_IDX, column_idx, 0); + } + void add_descending(bool descending) { + fbb_.AddElement(SortingColumn::VT_DESCENDING, static_cast(descending), 0); + } + void add_nulls_first(bool nulls_first) { + fbb_.AddElement(SortingColumn::VT_NULLS_FIRST, static_cast(nulls_first), 0); + } + explicit SortingColumnBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateSortingColumn( + ::flatbuffers::FlatBufferBuilder &_fbb, + int32_t column_idx = 0, + bool descending = false, + bool nulls_first = false) { + SortingColumnBuilder builder_(_fbb); + builder_.add_column_idx(column_idx); + builder_.add_nulls_first(nulls_first); + builder_.add_descending(descending); + return builder_.Finish(); +} + +struct RowGroup FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef RowGroupBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_COLUMNS = 4, + VT_TOTAL_BYTE_SIZE = 6, + VT_NUM_ROWS = 8, + VT_SORTING_COLUMNS = 10, + VT_FILE_OFFSET = 12, + VT_TOTAL_COMPRESSED_SIZE = 14, + VT_ORDINAL = 16 + }; + const ::flatbuffers::Vector<::flatbuffers::Offset> *columns() const { + return GetPointer> *>(VT_COLUMNS); + } + int64_t total_byte_size() const { + return GetField(VT_TOTAL_BYTE_SIZE, 0); + } + int64_t num_rows() const { + return GetField(VT_NUM_ROWS, 0); + } + const ::flatbuffers::Vector<::flatbuffers::Offset> *sorting_columns() const { + return GetPointer> *>(VT_SORTING_COLUMNS); + } + int64_t file_offset() const { + return GetField(VT_FILE_OFFSET, 0); + } + int64_t total_compressed_size() const { + return GetField(VT_TOTAL_COMPRESSED_SIZE, 0); + } + ::flatbuffers::Optional ordinal() const { + return GetOptional(VT_ORDINAL); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_COLUMNS) && + verifier.VerifyVector(columns()) && + verifier.VerifyVectorOfTables(columns()) && + VerifyField(verifier, VT_TOTAL_BYTE_SIZE, 8) && + VerifyField(verifier, VT_NUM_ROWS, 8) && + VerifyOffset(verifier, VT_SORTING_COLUMNS) && + verifier.VerifyVector(sorting_columns()) && + verifier.VerifyVectorOfTables(sorting_columns()) && + VerifyField(verifier, VT_FILE_OFFSET, 8) && + VerifyField(verifier, VT_TOTAL_COMPRESSED_SIZE, 8) && + 
VerifyField(verifier, VT_ORDINAL, 2) && + verifier.EndTable(); + } +}; + +struct RowGroupBuilder { + typedef RowGroup Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_columns(::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> columns) { + fbb_.AddOffset(RowGroup::VT_COLUMNS, columns); + } + void add_total_byte_size(int64_t total_byte_size) { + fbb_.AddElement(RowGroup::VT_TOTAL_BYTE_SIZE, total_byte_size, 0); + } + void add_num_rows(int64_t num_rows) { + fbb_.AddElement(RowGroup::VT_NUM_ROWS, num_rows, 0); + } + void add_sorting_columns(::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> sorting_columns) { + fbb_.AddOffset(RowGroup::VT_SORTING_COLUMNS, sorting_columns); + } + void add_file_offset(int64_t file_offset) { + fbb_.AddElement(RowGroup::VT_FILE_OFFSET, file_offset, 0); + } + void add_total_compressed_size(int64_t total_compressed_size) { + fbb_.AddElement(RowGroup::VT_TOTAL_COMPRESSED_SIZE, total_compressed_size, 0); + } + void add_ordinal(int16_t ordinal) { + fbb_.AddElement(RowGroup::VT_ORDINAL, ordinal); + } + explicit RowGroupBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateRowGroup( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> columns = 0, + int64_t total_byte_size = 0, + int64_t num_rows = 0, + ::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> sorting_columns = 0, + int64_t file_offset = 0, + int64_t total_compressed_size = 0, + ::flatbuffers::Optional ordinal = ::flatbuffers::nullopt) { + RowGroupBuilder builder_(_fbb); + builder_.add_total_compressed_size(total_compressed_size); + builder_.add_file_offset(file_offset); + builder_.add_num_rows(num_rows); + builder_.add_total_byte_size(total_byte_size); + builder_.add_sorting_columns(sorting_columns); + builder_.add_columns(columns); + if(ordinal) { builder_.add_ordinal(*ordinal); } + return builder_.Finish(); +} + +inline ::flatbuffers::Offset CreateRowGroupDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + const std::vector<::flatbuffers::Offset> *columns = nullptr, + int64_t total_byte_size = 0, + int64_t num_rows = 0, + const std::vector<::flatbuffers::Offset> *sorting_columns = nullptr, + int64_t file_offset = 0, + int64_t total_compressed_size = 0, + ::flatbuffers::Optional ordinal = ::flatbuffers::nullopt) { + auto columns__ = columns ? _fbb.CreateVector<::flatbuffers::Offset>(*columns) : 0; + auto sorting_columns__ = sorting_columns ? 
_fbb.CreateVector<::flatbuffers::Offset>(*sorting_columns) : 0; + return parquet::format3::CreateRowGroup( + _fbb, + columns__, + total_byte_size, + num_rows, + sorting_columns__, + file_offset, + total_compressed_size, + ordinal); +} + +struct FileMetaData FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { + typedef FileMetaDataBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_VERSION = 4, + VT_SCHEMA = 6, + VT_NUM_ROWS = 8, + VT_ROW_GROUPS = 10, + VT_KV = 12, + VT_CREATED_BY = 14 + }; + int32_t version() const { + return GetField(VT_VERSION, 0); + } + const ::flatbuffers::Vector<::flatbuffers::Offset> *schema() const { + return GetPointer> *>(VT_SCHEMA); + } + int64_t num_rows() const { + return GetField(VT_NUM_ROWS, 0); + } + const ::flatbuffers::Vector<::flatbuffers::Offset> *row_groups() const { + return GetPointer> *>(VT_ROW_GROUPS); + } + const ::flatbuffers::Vector<::flatbuffers::Offset> *kv() const { + return GetPointer> *>(VT_KV); + } + const ::flatbuffers::String *created_by() const { + return GetPointer(VT_CREATED_BY); + } + bool Verify(::flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_VERSION, 4) && + VerifyOffset(verifier, VT_SCHEMA) && + verifier.VerifyVector(schema()) && + verifier.VerifyVectorOfTables(schema()) && + VerifyField(verifier, VT_NUM_ROWS, 8) && + VerifyOffset(verifier, VT_ROW_GROUPS) && + verifier.VerifyVector(row_groups()) && + verifier.VerifyVectorOfTables(row_groups()) && + VerifyOffset(verifier, VT_KV) && + verifier.VerifyVector(kv()) && + verifier.VerifyVectorOfTables(kv()) && + VerifyOffset(verifier, VT_CREATED_BY) && + verifier.VerifyString(created_by()) && + verifier.EndTable(); + } +}; + +struct FileMetaDataBuilder { + typedef FileMetaData Table; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_version(int32_t version) { + fbb_.AddElement(FileMetaData::VT_VERSION, version, 0); + } + void add_schema(::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> schema) { + fbb_.AddOffset(FileMetaData::VT_SCHEMA, schema); + } + void add_num_rows(int64_t num_rows) { + fbb_.AddElement(FileMetaData::VT_NUM_ROWS, num_rows, 0); + } + void add_row_groups(::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> row_groups) { + fbb_.AddOffset(FileMetaData::VT_ROW_GROUPS, row_groups); + } + void add_kv(::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> kv) { + fbb_.AddOffset(FileMetaData::VT_KV, kv); + } + void add_created_by(::flatbuffers::Offset<::flatbuffers::String> created_by) { + fbb_.AddOffset(FileMetaData::VT_CREATED_BY, created_by); + } + explicit FileMetaDataBuilder(::flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ::flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = ::flatbuffers::Offset(end); + return o; + } +}; + +inline ::flatbuffers::Offset CreateFileMetaData( + ::flatbuffers::FlatBufferBuilder &_fbb, + int32_t version = 0, + ::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> schema = 0, + int64_t num_rows = 0, + ::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> row_groups = 0, + ::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> kv = 0, + ::flatbuffers::Offset<::flatbuffers::String> created_by = 0) { + FileMetaDataBuilder builder_(_fbb); + builder_.add_num_rows(num_rows); + builder_.add_created_by(created_by); + builder_.add_kv(kv); + 
builder_.add_row_groups(row_groups); + builder_.add_schema(schema); + builder_.add_version(version); + return builder_.Finish(); +} + +inline ::flatbuffers::Offset CreateFileMetaDataDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + int32_t version = 0, + const std::vector<::flatbuffers::Offset> *schema = nullptr, + int64_t num_rows = 0, + const std::vector<::flatbuffers::Offset> *row_groups = nullptr, + const std::vector<::flatbuffers::Offset> *kv = nullptr, + const char *created_by = nullptr) { + auto schema__ = schema ? _fbb.CreateVector<::flatbuffers::Offset>(*schema) : 0; + auto row_groups__ = row_groups ? _fbb.CreateVector<::flatbuffers::Offset>(*row_groups) : 0; + auto kv__ = kv ? _fbb.CreateVector<::flatbuffers::Offset>(*kv) : 0; + auto created_by__ = created_by ? _fbb.CreateString(created_by) : 0; + return parquet::format3::CreateFileMetaData( + _fbb, + version, + schema__, + num_rows, + row_groups__, + kv__, + created_by__); +} + +inline bool VerifyLogicalType(::flatbuffers::Verifier &verifier, const void *obj, LogicalType type) { + switch (type) { + case LogicalType::NONE: { + return true; + } + case LogicalType::StringType: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case LogicalType::MapType: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case LogicalType::ListType: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case LogicalType::EnumType: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case LogicalType::DecimalType: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case LogicalType::DateType: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case LogicalType::TimeType: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case LogicalType::TimestampType: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case LogicalType::IntType: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case LogicalType::NullType: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case LogicalType::JsonType: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case LogicalType::BsonType: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case LogicalType::UUIDType: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case LogicalType::Float16Type: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case LogicalType::VariantType: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case LogicalType::GeometryType: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case LogicalType::GeographyType: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + default: return true; + } +} + +inline bool VerifyLogicalTypeVector(::flatbuffers::Verifier &verifier, const ::flatbuffers::Vector<::flatbuffers::Offset> *values, const ::flatbuffers::Vector *types) { + if (!values || !types) return !values && !types; + if (values->size() != types->size()) return false; + for (::flatbuffers::uoffset_t i = 0; i < values->size(); ++i) { + if (!VerifyLogicalType( + verifier, values->Get(i), types->GetEnum(i))) { + return false; + } + } + return true; +} + +inline bool VerifyColumnOrder(::flatbuffers::Verifier &verifier, const void 
*obj, ColumnOrder type) { + switch (type) { + case ColumnOrder::NONE: { + return true; + } + case ColumnOrder::TypeDefinedOrder: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + default: return true; + } +} + +inline bool VerifyColumnOrderVector(::flatbuffers::Verifier &verifier, const ::flatbuffers::Vector<::flatbuffers::Offset> *values, const ::flatbuffers::Vector *types) { + if (!values || !types) return !values && !types; + if (values->size() != types->size()) return false; + for (::flatbuffers::uoffset_t i = 0; i < values->size(); ++i) { + if (!VerifyColumnOrder( + verifier, values->Get(i), types->GetEnum(i))) { + return false; + } + } + return true; +} + +inline const parquet::format3::FileMetaData *GetFileMetaData(const void *buf) { + return ::flatbuffers::GetRoot(buf); +} + +inline const parquet::format3::FileMetaData *GetSizePrefixedFileMetaData(const void *buf) { + return ::flatbuffers::GetSizePrefixedRoot(buf); +} + +inline bool VerifyFileMetaDataBuffer( + ::flatbuffers::Verifier &verifier) { + return verifier.VerifyBuffer(nullptr); +} + +inline bool VerifySizePrefixedFileMetaDataBuffer( + ::flatbuffers::Verifier &verifier) { + return verifier.VerifySizePrefixedBuffer(nullptr); +} + +inline void FinishFileMetaDataBuffer( + ::flatbuffers::FlatBufferBuilder &fbb, + ::flatbuffers::Offset root) { + fbb.Finish(root); +} + +inline void FinishSizePrefixedFileMetaDataBuffer( + ::flatbuffers::FlatBufferBuilder &fbb, + ::flatbuffers::Offset root) { + fbb.FinishSizePrefixed(root); +} + +} // namespace format3 +} // namespace parquet + +#endif // FLATBUFFERS_GENERATED_PARQUET3_PARQUET_FORMAT3_H_ diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index dc7d40d2a38..a976b156a18 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -147,6 +147,13 @@ if(NOT MSVC) PROPERTIES COMPILE_FLAGS -Wno-unused-variable) endif() +# +# Generated Flatbuffer sources +set(PARQUET_FLATBUFFER_SOURCE_DIR "${ARROW_SOURCE_DIR}/src/generated/") + +set_source_files_properties("${PARQUET_FLATBUFFER_SOURCE_DIR}/parquet3_generated.h" + PROPERTIES SKIP_UNITY_BUILD_INCLUSION ON) + # # Library config @@ -178,6 +185,7 @@ set(PARQUET_SRCS level_comparison.cc level_conversion.cc metadata.cc + metadata3.cc xxhasher.cc page_index.cc "${PARQUET_THRIFT_SOURCE_DIR}/parquet_types.cpp" @@ -306,6 +314,8 @@ add_arrow_lib(parquet STATIC_INSTALL_INTERFACE_LIBS ${PARQUET_STATIC_INSTALL_INTERFACE_LIBS}) +target_include_directories(parquet_objlib SYSTEM PRIVATE ${ARROW_SOURCE_DIR}/thirdparty/flatbuffers/include) + if(WIN32 AND NOT (ARROW_TEST_LINKAGE STREQUAL "static")) add_library(parquet_test_support STATIC "${PARQUET_THRIFT_SOURCE_DIR}/parquet_types.cpp") @@ -319,7 +329,7 @@ if(ARROW_TESTING) # Even though this is still just an object library we still need to # "link" our dependencies so that include paths are configured # correctly - target_link_libraries(parquet_testing PUBLIC ${ARROW_GTEST_GMOCK}) + target_link_libraries(parquet_testing PUBLIC ${ARROW_GTEST_GMOCK} thrift::thrift) list(APPEND PARQUET_TEST_LINK_LIBS parquet_testing RapidJSON) endif() @@ -338,6 +348,8 @@ endif() add_definitions(-DPARQUET_THRIFT_VERSION_MAJOR=${Thrift_VERSION_MAJOR}) add_definitions(-DPARQUET_THRIFT_VERSION_MINOR=${Thrift_VERSION_MINOR}) +add_definitions(-DPARQUET_FLATBUFFERS_VERSION_MAJOR=${flatbuffers_VERSION_MAJOR}) +add_definitions(-DPARQUET_FLATBUFFERS_VERSION_MINOR=${flatbuffers_VERSION_MINOR}) # Thrift requires these definitions for some types that we use 
foreach(LIB_TARGET ${PARQUET_LIBRARIES}) @@ -377,6 +389,7 @@ add_parquet_test(internals-test geospatial/statistics_test.cc geospatial/util_internal_test.cc metadata_test.cc + metadata3_test.cc page_index_test.cc properties_test.cc public_api_test.cc @@ -444,6 +457,7 @@ add_parquet_benchmark(column_io_benchmark) add_parquet_benchmark(encoding_benchmark) add_parquet_benchmark(level_conversion_benchmark) add_parquet_benchmark(metadata_benchmark) +add_parquet_benchmark(metadata3_benchmark) add_parquet_benchmark(page_index_benchmark SOURCES page_index_benchmark.cc benchmark_util.cc) add_parquet_benchmark(arrow/reader_writer_benchmark PREFIX "parquet-arrow") diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index b246feaf732..dbba69beb2a 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -45,6 +45,7 @@ #include "parquet/exception.h" #include "parquet/file_writer.h" #include "parquet/metadata.h" +#include "parquet/metadata3.h" #include "parquet/page_index.h" #include "parquet/platform.h" #include "parquet/properties.h" @@ -436,6 +437,34 @@ class SerializedFile : public ParquetFileReader::Contents { PARQUET_ASSIGN_OR_THROW( auto footer_buffer, source_->ReadAt(source_size_ - footer_read_size, footer_read_size)); + if (properties_.read_metadata3()) { + // Try to extract flatbuffer metadata from footer + std::string flatbuffer_data; + auto result = ExtractFlatbuffer(footer_buffer, &flatbuffer_data); + if (result.ok()) { + int32_t required_or_consumed = *result; + if (required_or_consumed > static_cast(footer_buffer->size())) { + PARQUET_ASSIGN_OR_THROW( + footer_buffer, + source_->ReadAt(source_size_ - required_or_consumed, required_or_consumed)); + footer_read_size = required_or_consumed; + result = ExtractFlatbuffer(footer_buffer, &flatbuffer_data); + } + // If successfully extracted flatbuffer data, parse it and return + if (result.ok() && *result > 0 && !flatbuffer_data.empty()) { + // Get flatbuffer metadata and convert to thrift + const format3::FileMetaData* fb_metadata = + format3::GetFileMetaData(flatbuffer_data.data()); + auto thrift_metadata = + std::make_unique(FromFlatbuffer(fb_metadata)); + file_metadata_ = FileMetaData::Make( + std::move(thrift_metadata), static_cast(*result), properties_); + return; + } + } + // If extraction failed or returned 0 (no flatbuffer), fall through to standard + // parsing + } uint32_t metadata_len = ParseFooterLength(footer_buffer, footer_read_size); int64_t metadata_start = source_size_ - kFooterSize - metadata_len; diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index ddec2c0a560..675a8adf69a 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -339,7 +339,7 @@ class FileSerializer : public ParquetFileWriter::Contents { if (file_encryption_properties == nullptr) { // Non encrypted file. 
file_metadata_ = metadata_->Finish(key_value_metadata_); - WriteFileMetaData(*file_metadata_, sink_.get()); + WriteFileMetaData(*file_metadata_, sink_.get(), properties_->write_metadata3()); } else { // Encrypted file CloseEncryptedFile(file_encryption_properties); } @@ -541,12 +541,17 @@ std::unique_ptr ParquetFileWriter::Open( return result; } -void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sink) { +void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sink, + bool use_metadata3) { // Write MetaData PARQUET_ASSIGN_OR_THROW(int64_t position, sink->Tell()); uint32_t metadata_len = static_cast(position); - file_metadata.WriteTo(sink); + if (use_metadata3) { + file_metadata.WriteToWithMetadata3(sink); + } else { + file_metadata.WriteTo(sink); + } PARQUET_ASSIGN_OR_THROW(position, sink->Tell()); metadata_len = static_cast(position) - metadata_len; diff --git a/cpp/src/parquet/file_writer.h b/cpp/src/parquet/file_writer.h index d5ea1d7c98a..10270cdec45 100644 --- a/cpp/src/parquet/file_writer.h +++ b/cpp/src/parquet/file_writer.h @@ -111,7 +111,8 @@ class PARQUET_EXPORT RowGroupWriter { PARQUET_EXPORT void WriteFileMetaData(const FileMetaData& file_metadata, - ::arrow::io::OutputStream* sink); + ::arrow::io::OutputStream* sink, + bool use_metadata3 = false); PARQUET_EXPORT void WriteMetaDataFile(const FileMetaData& file_metadata, diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 42dd8e52ee9..f3f1ebbe867 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -35,6 +35,7 @@ #include "parquet/encryption/encryption_internal.h" #include "parquet/encryption/internal_file_decryptor.h" #include "parquet/exception.h" +#include "parquet/metadata3.h" #include "parquet/schema.h" #include "parquet/schema_internal.h" #include "parquet/size_statistics.h" @@ -799,6 +800,23 @@ class FileMetaData::FileMetaDataImpl { InitKeyValueMetadata(); } + // Constructor that accepts an already-deserialized thrift object + explicit FileMetaDataImpl(std::unique_ptr metadata, + uint32_t metadata_len, ReaderProperties properties) + : metadata_len_(metadata_len), + metadata_(std::move(metadata)), + properties_(std::move(properties)) { + if (metadata_->__isset.created_by) { + writer_version_ = ApplicationVersion(metadata_->created_by); + } else { + writer_version_ = ApplicationVersion("unknown 0.0.0"); + } + + InitSchema(); + InitColumnOrders(); + InitKeyValueMetadata(); + } + bool VerifySignature(const void* signature) { // verify decryption properties are set if (file_decryptor_ == nullptr) { @@ -858,6 +876,20 @@ class FileMetaData::FileMetaDataImpl { const ApplicationVersion& writer_version() const { return writer_version_; } + void WriteToWithMetadata3(::arrow::io::OutputStream* dst) const { + std::string flatbuffer; + if (ToFlatbuffer(metadata_.get(), &flatbuffer)) { + ThriftSerializer serializer; + std::string thrift; + serializer.SerializeToString(metadata_.get(), &thrift); + AppendFlatbuffer(flatbuffer, &thrift); + PARQUET_THROW_NOT_OK(dst->Write(thrift)); + } else { + WriteTo(dst, nullptr); + } + return; + } + void WriteTo(::arrow::io::OutputStream* dst, const std::shared_ptr& encryptor) const { ThriftSerializer serializer; @@ -1061,12 +1093,23 @@ std::shared_ptr FileMetaData::Make( new FileMetaData(metadata, metadata_len, properties, std::move(file_decryptor))); } +std::shared_ptr FileMetaData::Make( + std::unique_ptr metadata, uint32_t metadata_len, + const ReaderProperties& properties) { + return std::shared_ptr( + 
new FileMetaData(std::move(metadata), metadata_len, properties)); +} + FileMetaData::FileMetaData(const void* metadata, uint32_t* metadata_len, const ReaderProperties& properties, std::shared_ptr file_decryptor) : impl_(new FileMetaDataImpl(metadata, metadata_len, properties, std::move(file_decryptor))) {} +FileMetaData::FileMetaData(std::unique_ptr metadata, + uint32_t metadata_len, const ReaderProperties& properties) + : impl_(new FileMetaDataImpl(std::move(metadata), metadata_len, properties)) {} + FileMetaData::FileMetaData() : impl_(new FileMetaDataImpl()) {} FileMetaData::~FileMetaData() = default; @@ -1169,6 +1212,10 @@ void FileMetaData::WriteTo(::arrow::io::OutputStream* dst, return impl_->WriteTo(dst, encryptor); } +void FileMetaData::WriteToWithMetadata3(::arrow::io::OutputStream* dst) const { + return impl_->WriteToWithMetadata3(dst); +} + class FileCryptoMetaData::FileCryptoMetaDataImpl { public: FileCryptoMetaDataImpl() = default; diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index 3380adbf56a..16c8179b398 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -28,6 +28,7 @@ #include "parquet/platform.h" #include "parquet/properties.h" #include "parquet/type_fwd.h" +#include "generated/parquet_types.h" namespace parquet { @@ -255,6 +256,11 @@ class PARQUET_EXPORT FileMetaData { const ReaderProperties& properties = default_reader_properties(), std::shared_ptr file_decryptor = NULLPTR); + /// \brief Create a FileMetaData from an already-deserialized thrift object. + static std::shared_ptr Make( + std::unique_ptr metadata, uint32_t metadata_len, + const ReaderProperties& properties = default_reader_properties()); + ~FileMetaData(); bool Equals(const FileMetaData& other) const; @@ -337,6 +343,7 @@ class PARQUET_EXPORT FileMetaData { void WriteTo(::arrow::io::OutputStream* dst, const std::shared_ptr& encryptor = NULLPTR) const; + void WriteToWithMetadata3(::arrow::io::OutputStream* dst) const; /// \brief Return Thrift-serialized representation of the metadata as a /// string @@ -389,6 +396,9 @@ class PARQUET_EXPORT FileMetaData { const ReaderProperties& properties, std::shared_ptr file_decryptor = NULLPTR); + explicit FileMetaData(std::unique_ptr metadata, uint32_t metadata_len, + const ReaderProperties& properties); + void set_file_decryptor(std::shared_ptr file_decryptor); const std::shared_ptr& file_decryptor() const; diff --git a/cpp/src/parquet/metadata3.cc b/cpp/src/parquet/metadata3.cc new file mode 100644 index 00000000000..f7b0cda74f9 --- /dev/null +++ b/cpp/src/parquet/metadata3.cc @@ -0,0 +1,1284 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#include "parquet/metadata3.h" + +#include +#include +#include +#include + +#include "arrow/util/compression.h" +#include "arrow/util/crc32.h" +#include "arrow/util/endian.h" +#include "arrow/util/ubsan.h" +#include "arrow/util/unreachable.h" +#include "parquet/file_writer.h" +#include "generated/parquet_types.h" +#include "parquet/thrift_internal.h" + +namespace parquet { + +namespace { + +template +constexpr bool IsEnumEq(T t, U u) { + return static_cast(t) == static_cast(u); +} + +static_assert(IsEnumEq(format::Type::BOOLEAN, format3::Type::BOOLEAN)); +static_assert(IsEnumEq(format::Type::INT32, format3::Type::INT32)); +static_assert(IsEnumEq(format::Type::INT64, format3::Type::INT64)); +static_assert(IsEnumEq(format::Type::INT96, format3::Type::INT96)); +static_assert(IsEnumEq(format::Type::FLOAT, format3::Type::FLOAT)); +static_assert(IsEnumEq(format::Type::DOUBLE, format3::Type::DOUBLE)); +static_assert(IsEnumEq(format::Type::BYTE_ARRAY, format3::Type::BYTE_ARRAY)); +static_assert(IsEnumEq(format::Type::FIXED_LEN_BYTE_ARRAY, + format3::Type::FIXED_LEN_BYTE_ARRAY)); + +static_assert(IsEnumEq(format::FieldRepetitionType::REQUIRED, + format3::FieldRepetitionType::REQUIRED)); +static_assert(IsEnumEq(format::FieldRepetitionType::OPTIONAL, + format3::FieldRepetitionType::OPTIONAL)); +static_assert(IsEnumEq(format::FieldRepetitionType::REPEATED, + format3::FieldRepetitionType::REPEATED)); + +static_assert(IsEnumEq(format::Encoding::PLAIN, format3::Encoding::PLAIN)); +static_assert(IsEnumEq(format::Encoding::PLAIN_DICTIONARY, + format3::Encoding::PLAIN_DICTIONARY)); +static_assert(IsEnumEq(format::Encoding::RLE, format3::Encoding::RLE)); +static_assert(IsEnumEq(format::Encoding::DELTA_BINARY_PACKED, + format3::Encoding::DELTA_BINARY_PACKED)); +static_assert(IsEnumEq(format::Encoding::DELTA_LENGTH_BYTE_ARRAY, + format3::Encoding::DELTA_LENGTH_BYTE_ARRAY)); +static_assert(IsEnumEq(format::Encoding::DELTA_BYTE_ARRAY, + format3::Encoding::DELTA_BYTE_ARRAY)); +static_assert(IsEnumEq(format::Encoding::RLE_DICTIONARY, + format3::Encoding::RLE_DICTIONARY)); +static_assert(IsEnumEq(format::Encoding::BYTE_STREAM_SPLIT, + format3::Encoding::BYTE_STREAM_SPLIT)); + +static_assert(IsEnumEq(format::CompressionCodec::UNCOMPRESSED, + format3::CompressionCodec::UNCOMPRESSED)); +static_assert(IsEnumEq(format::CompressionCodec::SNAPPY, + format3::CompressionCodec::SNAPPY)); +static_assert(IsEnumEq(format::CompressionCodec::GZIP, format3::CompressionCodec::GZIP)); +static_assert(IsEnumEq(format::CompressionCodec::LZO, format3::CompressionCodec::LZO)); +static_assert(IsEnumEq(format::CompressionCodec::BROTLI, + format3::CompressionCodec::BROTLI)); +static_assert(IsEnumEq(format::CompressionCodec::ZSTD, format3::CompressionCodec::ZSTD)); +static_assert(IsEnumEq(format::CompressionCodec::LZ4_RAW, + format3::CompressionCodec::LZ4_RAW)); + +static_assert(IsEnumEq(format::PageType::DATA_PAGE, format3::PageType::DATA_PAGE)); +static_assert(IsEnumEq(format::PageType::DATA_PAGE_V2, format3::PageType::DATA_PAGE_V2)); +static_assert(IsEnumEq(format::PageType::INDEX_PAGE, format3::PageType::INDEX_PAGE)); +static_assert(IsEnumEq(format::PageType::DICTIONARY_PAGE, + format3::PageType::DICTIONARY_PAGE)); + +constexpr double kMinCompressionRatio = 1.2; + +constexpr uint8_t kExtUUID[16] = {0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, + 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef}; + +// Extended format compression codec (using same values as format3::CompressionCodec) +enum class CompressionCodec : uint8_t { + 
UNCOMPRESSED = 0, + LZ4_RAW = 7, +}; + +auto GetNumChildren( + const flatbuffers::Vector>& s, size_t i) { + return s.Get(i)->num_children(); +} + +auto GetNumChildren(const std::vector& s, size_t i) { + return s[i].num_children; +} + +auto GetName(const flatbuffers::Vector>& s, + size_t i) { + return s.Get(i)->name()->str(); +} + +auto GetName(const std::vector& s, size_t i) { return s[i].name; } + +class ColumnMap { + public: + template + explicit ColumnMap(const Schema& s) { + for (size_t i = 0; i < s.size(); ++i) { + if (GetNumChildren(s, i) == 0) colchunk2schema_.push_back(i); + } + BuildParents(s); + } + + size_t ToSchema(size_t cc_idx) const { return colchunk2schema_[cc_idx]; } + std::optional ToCc(size_t schema_idx) const { + auto it = + std::lower_bound(colchunk2schema_.begin(), colchunk2schema_.end(), schema_idx); + if (it == colchunk2schema_.end() || *it != schema_idx) return std::nullopt; + return it - colchunk2schema_.begin(); + } + + template + auto ToPath(const Schema& s, size_t col_idx) { + std::vector path; + size_t len = 0; + for (size_t idx = ToSchema(col_idx); idx != 0; idx = parents_[idx]) ++len; + path.reserve(len); + + for (size_t idx = ToSchema(col_idx); idx != 0; idx = parents_[idx]) { + path.push_back(GetName(s, idx)); + } + std::reverse(path.begin(), path.end()); + + ARROW_DCHECK_EQ(path.size(), len); + return path; + } + + private: + template + void BuildParents(const Schema& s) { + if (s.size() <= 0) return; + parents_.resize(s.size()); + struct Info { + uint32_t parent_idx; + uint32_t remaining_children; + }; + std::stack stack; + parents_[0] = 0; + + stack.push({0, static_cast(GetNumChildren(s, 0))}); + for (size_t idx = 1; idx < s.size(); ++idx) { + parents_[idx] = stack.top().parent_idx; + stack.top().remaining_children--; + if (auto num_children = GetNumChildren(s, idx); num_children > 0) { + stack.push({static_cast(idx), static_cast(num_children)}); + } + while (!stack.empty() && stack.top().remaining_children == 0) stack.pop(); + } + } + + std::vector colchunk2schema_; + std::vector parents_; +}; + +struct MinMax { + struct Packed { + uint32_t lo4 = 0; + uint64_t lo8 = 0; + uint64_t hi8 = 0; + int8_t len = 0; + }; + Packed min; + Packed max; + std::string_view prefix; +}; + +uint32_t LoadLE32(const void* p) { + return ::arrow::bit_util::FromLittleEndian( + ::arrow::util::SafeLoadAs(static_cast(p))); +} +void StoreLE32(uint32_t v, void* p) { + v = ::arrow::bit_util::ToLittleEndian(v); + std::memcpy(p, &v, sizeof(v)); +} + +uint64_t LoadLE64(const void* p) { + return ::arrow::bit_util::FromLittleEndian( + ::arrow::util::SafeLoadAs(static_cast(p))); +} +void StoreLE64(uint64_t v, void* p) { + v = ::arrow::bit_util::ToLittleEndian(v); + std::memcpy(p, &v, sizeof(v)); +} + +uint32_t LoadBE32(const void* p) { + return ::arrow::bit_util::FromBigEndian( + ::arrow::util::SafeLoadAs(static_cast(p))); +} +void StoreBE32(uint32_t v, void* p) { + v = ::arrow::bit_util::ToBigEndian(v); + std::memcpy(p, &v, sizeof(v)); +} + +uint64_t LoadBE64(const void* p) { + return ::arrow::bit_util::FromBigEndian( + ::arrow::util::SafeLoadAs(static_cast(p))); +} +void StoreBE64(uint64_t v, void* p) { + v = ::arrow::bit_util::ToBigEndian(v); + std::memcpy(p, &v, sizeof(v)); +} + +MinMax Pack(format::Type::type type, const std::string& min, bool is_min_exact, + const std::string& max, bool is_max_exact) { + switch (type) { + case format::Type::BOOLEAN: + return {}; + case format::Type::INT32: + case format::Type::FLOAT: { + auto load = [](std::string_view v, bool is_exact) -> 
MinMax::Packed { + return {.lo4 = LoadLE32(v.data()), .len = static_cast(is_exact ? 4 : -4)}; + }; + return {load(min, is_min_exact), load(max, is_max_exact), ""}; + } + case format::Type::INT64: + case format::Type::DOUBLE: { + auto load = [](std::string_view v, bool is_exact) -> MinMax::Packed { + return {.lo8 = LoadLE64(v.data()), .len = static_cast(is_exact ? 8 : -8)}; + }; + return {load(min, is_min_exact), load(max, is_max_exact), ""}; + } + case format::Type::INT96: { + auto load = [](std::string_view v, bool is_exact) -> MinMax::Packed { + return {.lo4 = LoadLE32(v.data() + 0), + .lo8 = LoadLE64(v.data() + 4), + .len = static_cast(is_exact ? 12 : -12)}; + }; + return {load(min, is_min_exact), load(max, is_max_exact), ""}; + } + case format::Type::FIXED_LEN_BYTE_ARRAY: { + // Special case for decimal16. + if (min.size() == 16 && max.size() == 16 && is_min_exact && is_max_exact) { + auto load = [](std::string_view v) -> MinMax::Packed { + return { + .lo8 = LoadBE64(v.data() + 8), .hi8 = LoadBE64(v.data() + 0), .len = 16}; + }; + return {load(min), load(max), ""}; + } + [[fallthrough]]; + } + case format::Type::BYTE_ARRAY: { + auto load = [](std::string_view v, bool is_exact, bool is_max) -> MinMax::Packed { + if (v.size() <= 4) { + char buf[4] = {}; + memcpy(buf, v.data(), v.size()); + return {.lo4 = LoadBE32(buf), + .len = static_cast(is_exact ? v.size() : -v.size())}; + } + return {.lo4 = LoadBE32(v.data()) + (is_max ? 1 : -1), .len = -4}; + }; + auto [e1, e2] = std::mismatch(max.begin(), max.end(), min.begin(), min.end()); + size_t prefix_len = e1 - max.begin(); + return { + load(std::string_view(min).substr(prefix_len), is_min_exact, false), + load(std::string_view(max).substr(prefix_len), is_max_exact, true), + std::string_view(max).substr(0, prefix_len), + }; + } + } + ::arrow::Unreachable(); +} + +bool Unpack(format::Type::type type, const MinMax& packed, std::string* min, + bool* is_min_exact, std::string* max, bool* is_max_exact) { + switch (type) { + case format::Type::BOOLEAN: + return false; + case format::Type::INT32: + case format::Type::FLOAT: { + auto load = [](const MinMax::Packed& p, std::string* x, bool* is_exact) { + x->resize(4); + StoreLE32(p.lo4, x->data()); + *is_exact = p.len > 0; + }; + load(packed.min, min, is_min_exact); + load(packed.max, max, is_max_exact); + return true; + } + case format::Type::INT64: + case format::Type::DOUBLE: { + auto load = [](const MinMax::Packed& p, std::string* x, bool* is_exact) { + x->resize(8); + StoreLE64(p.lo8, x->data()); + *is_exact = p.len > 0; + }; + load(packed.min, min, is_min_exact); + load(packed.max, max, is_max_exact); + return true; + } + case format::Type::INT96: { + auto load = [](const MinMax::Packed& p, std::string* x, bool* is_exact) { + x->resize(12); + StoreLE32(p.lo4, x->data() + 0); + StoreLE64(p.lo8, x->data() + 4); + *is_exact = p.len > 0; + }; + load(packed.min, min, is_min_exact); + load(packed.max, max, is_max_exact); + return true; + } + case format::Type::BYTE_ARRAY: + case format::Type::FIXED_LEN_BYTE_ARRAY: { + auto load = [](const MinMax::Packed& p, std::string_view prefix, std::string* x, + bool* is_exact) { + x->resize(prefix.size() + 16); + if (!prefix.empty()) std::memcpy(x->data(), prefix.data(), prefix.size()); + if (p.len == 16) { + StoreBE64(p.hi8, x->data() + prefix.size() + 0); + StoreBE64(p.lo8, x->data() + prefix.size() + 8); + *is_exact = true; + } else { + StoreBE32(p.lo4, x->data() + prefix.size()); + x->resize(prefix.size() + std::abs(p.len)); + *is_exact = p.len >= 0; + } + }; 
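+  // Reassemble the value: reattach the shared prefix, then expand the packed tail.
+  // A 16-byte tail keeps both big-endian halves (the decimal16 case); shorter tails
+  // keep at most 4 truncated bytes, with the sign of `len` recording exactness.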
+ load(packed.min, packed.prefix, min, is_min_exact); + load(packed.max, packed.prefix, max, is_max_exact); + return true; + } + } + ::arrow::Unreachable(); +} + +struct ThriftConverter { + explicit ThriftConverter(const format3::FileMetaData* md, format::FileMetaData* to) + : md(md), colmap(*md->schema()), to(to) {} + + const format3::FileMetaData* md; + ColumnMap colmap; + format::FileMetaData* to; + + template + auto To(const flatbuffers::Vector* in, Args&&... args) { + std::vectorGet(0), std::forward(args)..., 0))> out; + if (!in) return out; + out.reserve(in->size()); + int idx = 0; + for (auto&& e : *in) out.push_back(To(e, std::forward(args)..., idx++)); + return out; + } + + auto To(format3::TimeUnit t) { + format::TimeUnit out; + if (t == format3::TimeUnit::NS) + out.__set_NANOS({}); + else if (t == format3::TimeUnit::US) + out.__set_MICROS({}); + else + out.__set_MILLIS({}); + return out; + } + + auto To(format3::Type t) { return static_cast(t); } + auto To(format3::FieldRepetitionType t) { + return static_cast(t); + } + auto To(format3::CompressionCodec c) { + return static_cast(c); + } + auto To(format3::Encoding e, size_t) { return static_cast(e); } + auto To(format3::PageType t) { return static_cast(t); } + auto To(format3::EdgeInterpolationAlgorithm a) { + return static_cast(a); + } + + auto To(format3::LogicalType t, size_t col_idx) { + const format3::SchemaElement* e = md->schema()->Get(col_idx); + format::LogicalType out; + switch (t) { + case format3::LogicalType::StringType: + out.__set_STRING({}); + break; + case format3::LogicalType::MapType: + out.__set_MAP({}); + break; + case format3::LogicalType::ListType: + out.__set_LIST({}); + break; + case format3::LogicalType::EnumType: + out.__set_ENUM({}); + break; + case format3::LogicalType::DecimalType: { + format::DecimalType dt; + dt.__set_precision(e->logical_type_as_DecimalType()->precision()); + dt.__set_scale(e->logical_type_as_DecimalType()->scale()); + out.__set_DECIMAL(dt); + break; + } + case format3::LogicalType::DateType: + out.__set_DATE({}); + break; + case format3::LogicalType::TimeType: { + format::TimeType tt; + tt.__set_unit(To(e->logical_type_as_TimeType()->unit())); + tt.__set_isAdjustedToUTC(e->logical_type_as_TimeType()->is_adjusted_to_utc()); + out.__set_TIME(tt); + break; + } + case format3::LogicalType::TimestampType: { + format::TimestampType tt; + tt.__set_unit(To(e->logical_type_as_TimestampType()->unit())); + tt.__set_isAdjustedToUTC( + e->logical_type_as_TimestampType()->is_adjusted_to_utc()); + out.__set_TIMESTAMP(tt); + break; + } + case format3::LogicalType::IntType: { + format::IntType it; + it.__set_bitWidth(e->logical_type_as_IntType()->bit_width()); + it.__set_isSigned(e->logical_type_as_IntType()->is_signed()); + out.__set_INTEGER(it); + break; + } + case format3::LogicalType::NullType: + out.__set_UNKNOWN({}); + break; + case format3::LogicalType::JsonType: + out.__set_JSON({}); + break; + case format3::LogicalType::BsonType: + out.__set_BSON({}); + break; + case format3::LogicalType::UUIDType: + out.__set_UUID({}); + break; + case format3::LogicalType::Float16Type: + out.__set_FLOAT16({}); + break; + case format3::LogicalType::VariantType: + out.__set_VARIANT({}); + break; + case format3::LogicalType::GeometryType: { + format::GeometryType gt; + gt.__set_crs(e->logical_type_as_GeometryType()->crs()->str()); + out.__set_GEOMETRY(gt); + break; + } + case format3::LogicalType::GeographyType: { + format::GeographyType gt; + 
gt.__set_crs(e->logical_type_as_GeographyType()->crs()->str()); + gt.__set_algorithm(To(e->logical_type_as_GeographyType()->algorithm())); + out.__set_GEOGRAPHY(gt); + break; + } + default: + ::arrow::Unreachable(); + } + return out; + } + + auto To(format3::ColumnOrder) { + format::ColumnOrder out; + out.__set_TYPE_ORDER({}); + return out; + } + + auto To(const format3::SchemaElement* e, size_t col_idx) { + format::SchemaElement out; + out.name = e->name()->str(); + if (e->type()) { + out.__isset.type = true; + out.type = To(*e->type()); + } + out.__isset.repetition_type = true; + out.repetition_type = To(e->repetition_type()); + if (e->logical_type_type() != format3::LogicalType::NONE) { + out.__isset.logicalType = true; + out.logicalType = To(e->logical_type_type(), col_idx); + } + if (e->type_length()) { + out.__isset.type_length = true; + out.type_length = *e->type_length(); + } + if (e->num_children() != 0) { + out.__isset.num_children = true; + out.num_children = e->num_children(); + } + + if (e->field_id()) { + out.__isset.field_id = true; + out.field_id = *e->field_id(); + } + return out; + } + + auto To(const format3::KV* kv, size_t) { + format::KeyValue out; + if (kv->key()->size() != 0) out.key = kv->key()->str(); + if (flatbuffers::IsFieldPresent(kv, format3::KV::VT_VAL)) { + out.__isset.value = true; + out.value = kv->val()->str(); + } + return out; + } + + auto To(const format3::Statistics* s, size_t, size_t col_idx) { + format::Statistics out; + if (s->null_count()) { + out.__isset.null_count = true; + out.null_count = *s->null_count(); + } + + bool set = false; + if (s->min_len() && s->max_len()) { + MinMax mm{ + .min = {.lo4 = s->min_lo4(), + .lo8 = s->min_lo8(), + .hi8 = s->min_hi8(), + .len = *s->min_len()}, + .max = {.lo4 = s->max_lo4(), + .lo8 = s->max_lo8(), + .hi8 = s->max_hi8(), + .len = *s->max_len()}, + .prefix = flatbuffers::GetStringView(s->prefix()), + }; + set = Unpack(To(*md->schema()->Get(colmap.ToSchema(col_idx))->type()), mm, + &out.min_value, &out.is_min_value_exact, &out.max_value, + &out.is_max_value_exact); + } + out.__isset.min_value = set; + out.__isset.max_value = set; + out.__isset.is_min_value_exact = set; + out.__isset.is_max_value_exact = set; + return out; + } + + auto To(const format3::ColumnMetadata* cm, size_t rg_idx, size_t col_idx) { + format::ColumnMetaData out; + out.type = To(*md->schema()->Get(colmap.ToSchema(col_idx))->type()); + out.codec = To(cm->codec()); + out.num_values = + cm->num_values() ? 
*cm->num_values() : md->row_groups()->Get(rg_idx)->num_rows(); + out.total_uncompressed_size = cm->total_uncompressed_size(); + out.total_compressed_size = cm->total_compressed_size(); + out.key_value_metadata = To(cm->key_value_metadata()); + if (cm->data_page_offset() == 0) { + out.data_page_offset = md->row_groups()->Get(rg_idx)->file_offset(); + } else { + out.data_page_offset = cm->data_page_offset(); + } + if (cm->index_page_offset()) { + out.__isset.index_page_offset = true; + out.index_page_offset = *cm->index_page_offset(); + } + if (cm->dictionary_page_offset()) { + out.__isset.dictionary_page_offset = true; + out.dictionary_page_offset = *cm->dictionary_page_offset(); + } + if (cm->statistics()) { + out.__isset.statistics = true; + out.statistics = To(cm->statistics(), rg_idx, col_idx); + } + if (cm->bloom_filter_offset()) { + out.__isset.bloom_filter_offset = true; + out.bloom_filter_offset = *cm->bloom_filter_offset(); + } + if (cm->bloom_filter_length()) { + out.__isset.bloom_filter_length = true; + out.bloom_filter_length = *cm->bloom_filter_length(); + } + if (cm->is_fully_dict_encoded()) { + // Adding a fake encoding_stats with one dictionary page and one data page with + // dictionary encoding to trick the reader function + // parquet::IsColumnChunkFullyDictionaryEncoded into treating this as + // fully_dict_encoded. + out.__isset.encoding_stats = true; + auto& dict = out.encoding_stats.emplace_back(); + dict.__set_page_type(format::PageType::DICTIONARY_PAGE); + dict.__set_encoding(format::Encoding::PLAIN); + auto& pes = out.encoding_stats.emplace_back(); + pes.__set_page_type(format::PageType::DATA_PAGE); + pes.__set_encoding(format::Encoding::RLE_DICTIONARY); + } + return out; + } + + auto To(const format3::ColumnChunk* cc, size_t rg_idx, size_t col_idx) { + format::ColumnChunk out; + out.__isset.meta_data = true; + out.meta_data = To(cc->meta_data(), rg_idx, col_idx); + out.meta_data.path_in_schema = colmap.ToPath(*md->schema(), col_idx); + return out; + } + + auto To(const format3::SortingColumn* sc, size_t, size_t) { + format::SortingColumn out; + out.column_idx = sc->column_idx(); + out.descending = sc->descending(); + out.nulls_first = sc->nulls_first(); + return out; + } + + auto To(const format3::RowGroup* rg, size_t rg_idx) { + format::RowGroup out; + out.columns = To(rg->columns(), rg_idx); + if (rg->sorting_columns()) { + out.__isset.sorting_columns = true; + out.sorting_columns = To(rg->sorting_columns(), rg_idx); + } + out.__set_total_byte_size(rg->total_byte_size()); + out.__set_num_rows(rg->num_rows()); + if (rg->ordinal()) out.__set_ordinal(*rg->ordinal()); + out.__set_file_offset(rg->file_offset()); + out.__set_total_compressed_size(rg->total_compressed_size()); + return out; + } + + void To() { + to->__set_version(md->version()); + to->__set_num_rows(md->num_rows()); + to->row_groups = To(md->row_groups()); + if (md->kv()) { + to->__isset.key_value_metadata = true; + to->key_value_metadata = To(md->kv()); + } + if (md->created_by()->size() > 0) { + to->__isset.created_by = true; + to->created_by = md->created_by()->str(); + } + to->schema = To(md->schema()); + for (auto* e : *md->schema()) { + if (e->num_children() == 0) { + to->column_orders.push_back(To(e->column_order_type())); + to->__isset.column_orders = true; + } + } + } +}; +struct FlatbufferConverter { + explicit FlatbufferConverter(const format::FileMetaData& md) + : md(md), colmap(md.schema) {} + const format::FileMetaData& md; + ColumnMap colmap; + ::flatbuffers::FlatBufferBuilder root; + + 
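+  // The To() overloads below each map one Thrift metadata type to its format3
+  // (FlatBuffers) counterpart; the templated vector overload applies the element
+  // conversion and materializes a FlatBuffers vector inside `root`.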
template + auto To(const std::vector& in, Args&&... args) { + std::vector(args)..., 0))> out; + out.reserve(in.size()); + size_t idx = 0; + for (auto&& e : in) out.push_back(To(e, std::forward(args)..., idx++)); + return root.CreateVector(out); + } + + template + auto OptTo(const std::vector& in, Args&&... args) + -> std::optional(args)...))> { + if (in.empty()) return std::nullopt; + return To(in, std::forward(args)...); + } + + auto To(format::TimeUnit t) { + if (t.__isset.NANOS) return format3::TimeUnit::NS; + if (t.__isset.MICROS) return format3::TimeUnit::US; + return format3::TimeUnit::MS; + } + auto To(format::Type::type t) { return static_cast(t); } + auto To(format::FieldRepetitionType::type t) { + return static_cast(t); + } + auto To(format::CompressionCodec::type c) { + return static_cast(c); + } + auto To(format::Encoding::type e) { return static_cast(e); } + auto To(format::PageType::type p) { return static_cast(p); } + auto To(format::EdgeInterpolationAlgorithm::type a) { + return static_cast(a); + } + + std::pair> To( + const format::LogicalType& t) { + if (t.__isset.STRING) { + return {format3::LogicalType::StringType, format3::CreateEmpty(root).Union()}; + } else if (t.__isset.MAP) { + return {format3::LogicalType::MapType, format3::CreateEmpty(root).Union()}; + } else if (t.__isset.LIST) { + return {format3::LogicalType::ListType, format3::CreateEmpty(root).Union()}; + } else if (t.__isset.ENUM) { + return {format3::LogicalType::EnumType, format3::CreateEmpty(root).Union()}; + } else if (t.__isset.DECIMAL) { + return { + format3::LogicalType::DecimalType, + format3::CreateDecimalOpts(root, t.DECIMAL.precision, t.DECIMAL.scale).Union()}; + } else if (t.__isset.DATE) { + return {format3::LogicalType::DateType, format3::CreateEmpty(root).Union()}; + } else if (t.__isset.TIME) { + auto tu = To(t.TIME.unit); + return {format3::LogicalType::TimeType, + format3::CreateTimeOpts(root, t.TIME.isAdjustedToUTC, tu).Union()}; + } else if (t.__isset.TIMESTAMP) { + auto tu = To(t.TIMESTAMP.unit); + return {format3::LogicalType::TimestampType, + format3::CreateTimeOpts(root, t.TIMESTAMP.isAdjustedToUTC, tu).Union()}; + } else if (t.__isset.INTEGER) { + return { + format3::LogicalType::IntType, + format3::CreateIntOpts(root, t.INTEGER.bitWidth, t.INTEGER.isSigned).Union()}; + } else if (t.__isset.UNKNOWN) { + return {format3::LogicalType::NullType, format3::CreateEmpty(root).Union()}; + } else if (t.__isset.JSON) { + return {format3::LogicalType::JsonType, format3::CreateEmpty(root).Union()}; + } else if (t.__isset.BSON) { + return {format3::LogicalType::BsonType, format3::CreateEmpty(root).Union()}; + } else if (t.__isset.UUID) { + return {format3::LogicalType::UUIDType, format3::CreateEmpty(root).Union()}; + } else if (t.__isset.FLOAT16) { + return {format3::LogicalType::Float16Type, format3::CreateEmpty(root).Union()}; + } else if (t.__isset.VARIANT) { + return {format3::LogicalType::VariantType, format3::CreateEmpty(root).Union()}; + } else if (t.__isset.GEOMETRY) { + auto crs = root.CreateString(t.GEOMETRY.crs); + return {format3::LogicalType::GeometryType, + format3::CreateGeometryType(root, crs).Union()}; + } else if (t.__isset.GEOGRAPHY) { + auto crs = t.GEOGRAPHY.__isset.crs ? 
root.CreateString(t.GEOGRAPHY.crs) : 0; + return {format3::LogicalType::GeographyType, + format3::CreateGeographyType(root, crs, To(t.GEOGRAPHY.algorithm)).Union()}; + } + ::arrow::Unreachable(); + } + + std::pair> To( + const format::ConvertedType::type& t, const format::SchemaElement& e) { + if (t == format::ConvertedType::UTF8) { + return {format3::LogicalType::StringType, format3::CreateEmpty(root).Union()}; + } else if (t == format::ConvertedType::MAP) { + return {format3::LogicalType::MapType, format3::CreateEmpty(root).Union()}; + } else if (t == format::ConvertedType::LIST) { + return {format3::LogicalType::ListType, format3::CreateEmpty(root).Union()}; + } else if (t == format::ConvertedType::ENUM) { + return {format3::LogicalType::EnumType, format3::CreateEmpty(root).Union()}; + } else if (t == format::ConvertedType::DECIMAL) { + return {format3::LogicalType::DecimalType, + format3::CreateDecimalOpts(root, e.precision, e.scale).Union()}; + } else if (t == format::ConvertedType::DATE) { + return {format3::LogicalType::DateType, format3::CreateEmpty(root).Union()}; + } else if (t == format::ConvertedType::TIME_MILLIS) { + return {format3::LogicalType::TimeType, + format3::CreateTimeOpts(root, false, format3::TimeUnit::MS).Union()}; + } else if (t == format::ConvertedType::TIME_MICROS) { + return {format3::LogicalType::TimeType, + format3::CreateTimeOpts(root, false, format3::TimeUnit::US).Union()}; + } else if (t == format::ConvertedType::TIMESTAMP_MILLIS) { + return {format3::LogicalType::TimestampType, + format3::CreateTimeOpts(root, false, format3::TimeUnit::MS).Union()}; + } else if (t == format::ConvertedType::TIMESTAMP_MICROS) { + return {format3::LogicalType::TimestampType, + format3::CreateTimeOpts(root, false, format3::TimeUnit::US).Union()}; + } else if (t == format::ConvertedType::INT_8) { + return {format3::LogicalType::IntType, + format3::CreateIntOpts(root, 8, true).Union()}; + } else if (t == format::ConvertedType::INT_16) { + return {format3::LogicalType::IntType, + format3::CreateIntOpts(root, 16, true).Union()}; + } else if (t == format::ConvertedType::INT_32) { + return {format3::LogicalType::IntType, + format3::CreateIntOpts(root, 32, true).Union()}; + } else if (t == format::ConvertedType::INT_64) { + return {format3::LogicalType::IntType, + format3::CreateIntOpts(root, 64, true).Union()}; + } else if (t == format::ConvertedType::UINT_8) { + return {format3::LogicalType::IntType, + format3::CreateIntOpts(root, 8, false).Union()}; + } else if (t == format::ConvertedType::UINT_16) { + return {format3::LogicalType::IntType, + format3::CreateIntOpts(root, 16, false).Union()}; + } else if (t == format::ConvertedType::UINT_32) { + return {format3::LogicalType::IntType, + format3::CreateIntOpts(root, 32, false).Union()}; + } else if (t == format::ConvertedType::UINT_64) { + return {format3::LogicalType::IntType, + format3::CreateIntOpts(root, 64, false).Union()}; + } else if (t == format::ConvertedType::JSON) { + return {format3::LogicalType::JsonType, format3::CreateEmpty(root).Union()}; + } else if (t == format::ConvertedType::BSON) { + return {format3::LogicalType::BsonType, format3::CreateEmpty(root).Union()}; + } else if (t == format::ConvertedType::INTERVAL) { + // todo: no logical type? 
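+      // INTERVAL has no LogicalType counterpart (hence the todo above), so it is
+      // mapped to NONE here.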
+ return {format3::LogicalType::NONE, format3::CreateEmpty(root).Union()}; + } + ::arrow::Unreachable(); + } + + std::pair> To( + const format::ColumnOrder& co) { + (void)co; + assert(co.__isset.TYPE_ORDER); + return {format3::ColumnOrder::TypeDefinedOrder, format3::CreateEmpty(root).Union()}; + } + + auto To(const format::SchemaElement& e, size_t schema_idx) { + auto name = root.CreateSharedString(e.name); + std::optional>> + logical_type; + if (e.__isset.logicalType) logical_type = To(e.logicalType); + if (!logical_type && e.__isset.converted_type) logical_type = To(e.converted_type, e); + std::optional>> + column_order; + if (md.__isset.column_orders && !md.column_orders.empty()) { + if (auto cc_idx = colmap.ToCc(schema_idx)) { + column_order = To(md.column_orders.at(*cc_idx)); + } + } + + format3::SchemaElementBuilder b(root); + if (e.__isset.type) b.add_type(To(e.type)); + if (e.__isset.repetition_type) b.add_repetition_type(To(e.repetition_type)); + if (logical_type) { + b.add_logical_type_type(logical_type->first); + b.add_logical_type(logical_type->second); + } + if (e.__isset.type_length) b.add_type_length(e.type_length); + b.add_name(name); + if (e.__isset.num_children) b.add_num_children(e.num_children); + if (e.__isset.field_id) b.add_field_id(e.field_id); + if (column_order) { + b.add_column_order_type(column_order->first); + b.add_column_order(column_order->second); + } + return b.Finish(); + } + + auto To(const format::KeyValue& kv, size_t) { + auto key = root.CreateSharedString(kv.key); + std::optional<::flatbuffers::Offset<::flatbuffers::String>> val; + if (kv.__isset.value) val = root.CreateSharedString(kv.value); + + format3::KVBuilder b(root); + b.add_key(key); + if (val) b.add_val(*val); + return b.Finish(); + } + + using Binary = ::flatbuffers::Offset>; + std::optional ToBinary(const std::string& v) { + if (v.empty()) return std::nullopt; + return std::make_optional( + root.CreateVector(reinterpret_cast(v.data()), v.size())); + } + + struct FixedVal { + uint32_t lo4 = 0; + uint64_t lo8 = 0; + uint64_t hi8 = 0; + uint8_t len = 0; + }; + + auto To(const format::ColumnMetaData& cm) { + if (!cm.encoding_stats.empty()) { + for (auto&& e : cm.encoding_stats) { + if (e.page_type != format::PageType::DATA_PAGE && + e.page_type != format::PageType::DATA_PAGE_V2) + continue; + if (e.encoding != format::Encoding::PLAIN_DICTIONARY && + e.encoding != format::Encoding::RLE_DICTIONARY) { + return false; + } + } + return true; + } + bool has_plain_dictionary_encoding = false; + bool has_non_dictionary_encoding = false; + for (auto encoding : cm.encodings) { + switch (encoding) { + case format::Encoding::PLAIN_DICTIONARY: + // PLAIN_DICTIONARY encoding was present, which means at + // least one page was dictionary encoded and v1.0 encodings are used. + has_plain_dictionary_encoding = true; + break; + case format::Encoding::RLE: + case format::Encoding::BIT_PACKED: + // Other than for boolean values, RLE and BIT_PACKED are only used for + // repetition or definition levels. Additionally booleans are not dictionary + // encoded hence it is safe to disregard the case where some boolean data pages + // are dictionary encoded and some boolean pages are RLE/BIT_PACKED encoded. + break; + default: + has_non_dictionary_encoding = true; + break; + } + } + if (has_plain_dictionary_encoding) { + // Return true, if there are no encodings other than dictionary or + // repetition/definition levels. 
+ return !has_non_dictionary_encoding; + } + + // If PLAIN_DICTIONARY wasn't present, then either the column is not + // dictionary-encoded, or the 2.0 encoding, RLE_DICTIONARY, was used. + // For 2.0, this cannot determine whether a page fell back to non-dictionary encoding + // without page encoding stats. + return false; + } + + auto To(const format::Statistics& s, size_t, size_t col_idx) { + std::optional mm; + if (s.__isset.min_value && s.__isset.max_value) { + const auto& col = md.schema[colmap.ToSchema(col_idx)]; + mm = Pack(col.type, s.min_value, s.is_min_value_exact, s.max_value, + s.is_max_value_exact); + } + + std::optional> prefix; + if (mm && !mm->prefix.empty()) prefix = root.CreateSharedString(mm->prefix); + + format3::StatisticsBuilder b(root); + if (s.__isset.null_count) b.add_null_count(s.null_count); + if (prefix) b.add_prefix(*prefix); + if (mm) { + b.add_min_lo4(mm->min.lo4); + b.add_min_lo8(mm->min.lo8); + b.add_min_hi8(mm->min.hi8); + b.add_min_len(mm->min.len); + b.add_max_lo4(mm->max.lo4); + b.add_max_lo8(mm->max.lo8); + b.add_max_hi8(mm->max.hi8); + b.add_max_len(mm->max.len); + } + return b.Finish(); + } + + auto To(const format::ColumnMetaData& cm, size_t rg_idx, size_t col_idx) { + auto codec = To(cm.codec); + auto kv = OptTo(cm.key_value_metadata); + std::optional statistics; + if (cm.__isset.statistics) statistics = To(cm.statistics, rg_idx, col_idx); + + // All offsets are relative to the row group. + const auto& rg = md.row_groups[rg_idx]; + + format3::ColumnMetadataBuilder b(root); + b.add_codec(codec); + if (rg.num_rows != cm.num_values) b.add_num_values(cm.num_values); + b.add_total_uncompressed_size(cm.total_uncompressed_size); + b.add_total_compressed_size(cm.total_compressed_size); + if (kv) b.add_key_value_metadata(*kv); + if (cm.data_page_offset != rg.file_offset) { + b.add_data_page_offset(cm.data_page_offset); + } + if (cm.__isset.index_page_offset) b.add_index_page_offset(cm.index_page_offset); + if (cm.__isset.dictionary_page_offset) { + b.add_dictionary_page_offset(cm.dictionary_page_offset); + } + if (statistics) b.add_statistics(*statistics); + b.add_is_fully_dict_encoded(To(cm)); + if (cm.__isset.bloom_filter_offset) b.add_bloom_filter_offset(cm.bloom_filter_offset); + if (cm.__isset.bloom_filter_length) b.add_bloom_filter_length(cm.bloom_filter_length); + + ARROW_DCHECK_EQ(cm.path_in_schema, colmap.ToPath(md.schema, col_idx)); + return b.Finish(); + } + + auto To(const format::ColumnChunk& cc, size_t rg_idx, size_t col_idx) { + auto meta_data = To(cc.meta_data, rg_idx, col_idx); + + format3::ColumnChunkBuilder b(root); + b.add_meta_data(meta_data); + // TODO + // - crypto_metadata + // - encrypted_column_metadata + return b.Finish(); + } + + auto To(const format::SortingColumn& sc, size_t, size_t) { + return format3::CreateSortingColumn(root, sc.column_idx, sc.descending, + sc.nulls_first); + } + + auto To(const format::RowGroup& rg, size_t rg_idx) { + auto columns = To(rg.columns, rg_idx); + auto sorting_columns = OptTo(rg.sorting_columns, rg_idx); + + format3::RowGroupBuilder b(root); + b.add_columns(columns); + b.add_total_byte_size(rg.total_byte_size); + b.add_num_rows(rg.num_rows); + if (sorting_columns) b.add_sorting_columns(*sorting_columns); + if (rg.__isset.file_offset) b.add_file_offset(rg.file_offset); + if (rg.__isset.total_compressed_size) + b.add_total_compressed_size(rg.total_compressed_size); + if (rg.__isset.ordinal) b.add_ordinal(rg.ordinal); + return b.Finish(); + } + + void To() { + auto schema = To(md.schema); + auto 
row_groups = To(md.row_groups); + auto kv = OptTo(md.key_value_metadata); + auto created_by = root.CreateString(md.created_by); + + format3::FileMetaDataBuilder b(root); + b.add_version(md.version); + b.add_schema(schema); + b.add_num_rows(md.num_rows); + b.add_row_groups(row_groups); + if (kv) b.add_kv(*kv); + b.add_created_by(created_by); // check empty + // TODO + // - encryption_algorithm + // - footer_signing_key_metadata + root.Finish(b.Finish()); + } +}; + +static const std::unordered_set kSupportedConvertedTypes({ + format::ConvertedType::UTF8, + format::ConvertedType::MAP, + format::ConvertedType::LIST, + format::ConvertedType::ENUM, + format::ConvertedType::DECIMAL, + format::ConvertedType::DATE, + format::ConvertedType::TIME_MILLIS, + format::ConvertedType::TIME_MICROS, + format::ConvertedType::TIMESTAMP_MILLIS, + format::ConvertedType::TIMESTAMP_MICROS, + format::ConvertedType::UINT_8, + format::ConvertedType::UINT_16, + format::ConvertedType::UINT_32, + format::ConvertedType::UINT_64, + format::ConvertedType::INT_8, + format::ConvertedType::INT_16, + format::ConvertedType::INT_32, + format::ConvertedType::INT_64, + format::ConvertedType::JSON, + format::ConvertedType::BSON, + format::ConvertedType::INTERVAL, +}); + +bool CanConvertToFlatbuffer(const format::FileMetaData& md) { + if (md.__isset.encryption_algorithm) return false; + if (md.__isset.footer_signing_key_metadata) return false; + + for (auto&& se : md.schema) { + if (se.__isset.logicalType) { + auto&& lt = se.logicalType; + if (!lt.__isset.STRING && !lt.__isset.MAP && !lt.__isset.LIST && !lt.__isset.ENUM && + !lt.__isset.DECIMAL && !lt.__isset.DATE && !lt.__isset.TIME && + !lt.__isset.TIMESTAMP && !lt.__isset.INTEGER && !lt.__isset.UNKNOWN && + !lt.__isset.JSON && !lt.__isset.BSON && !lt.__isset.UUID && + !lt.__isset.FLOAT16 && !lt.__isset.VARIANT && !lt.__isset.GEOMETRY && + !lt.__isset.GEOGRAPHY) { + return false; + } + } + if (se.__isset.converted_type) { + if (kSupportedConvertedTypes.count(se.converted_type) == 0) { + return false; + } + } + } + for (auto&& rg : md.row_groups) { + for (auto&& cc : rg.columns) { + if (cc.__isset.crypto_metadata) return false; + if (cc.__isset.encrypted_column_metadata) return false; + } + } + return true; +} + +::arrow::Status CheckMagicNumber(const uint8_t* p) { + if (std::memcmp(p, kParquetMagic, 4) == 0) return ::arrow::Status::OK(); + if (std::memcmp(p, kParquetEMagic, 4) == 0) + return ::arrow::Status::NotImplemented("metdata3 doesn't support encrypted footer"); + return ::arrow::Status::Invalid("invalid magic number"); +} + +} // namespace + +bool ToFlatbuffer(format::FileMetaData* md, std::string* flatbuffer) { + if (!CanConvertToFlatbuffer(*md)) return false; + FlatbufferConverter conv(*md); + conv.To(); + *flatbuffer = std::string(reinterpret_cast(conv.root.GetBufferPointer()), + conv.root.GetSize()); + return true; +} + +format::FileMetaData FromFlatbuffer(const format3::FileMetaData* md) { + format::FileMetaData result; + ThriftConverter conv(md, &result); + conv.To(); + return result; +} + +static std::string PackFlatbuffer(const std::string& in) { + std::string out; + int n = 0; + // Create LZ4 codec + auto maybe_codec = ::arrow::util::Codec::Create(::arrow::Compression::LZ4); + if (maybe_codec.ok()) { + auto codec = std::move(*maybe_codec); + int64_t max_compressed_len = + codec->MaxCompressedLen(in.size(), reinterpret_cast(in.data())); + out.resize(max_compressed_len + 33); + PARQUET_ASSIGN_OR_THROW( + n, codec->Compress(in.size(), reinterpret_cast(in.data()), + 
max_compressed_len, reinterpret_cast(out.data()))); + } + if (!maybe_codec.ok() || 1.0 * in.size() / n < kMinCompressionRatio) { + // Compression not worth it, store uncompressed + out.resize(in.size() + 33); // Ensure buffer is sized before writing + std::memcpy(out.data(), in.data(), in.size()); + n = in.size(); + out[n] = static_cast(CompressionCodec::UNCOMPRESSED); + } else { + // Use compressed data + out[n] = static_cast(CompressionCodec::LZ4_RAW); + } + + // Pointer to metadata section (after data and compressor byte) + uint8_t* const p = reinterpret_cast(out.data()) + n + 1; + + // Compute and store checksums and lengths + uint32_t crc32 = ::arrow::internal::crc32(0, reinterpret_cast(out.data()), n + 1); + StoreLE32(crc32, p + 0); // crc32(data .. compressor) + StoreLE32(n, p + 4); // compressed_len + StoreLE32(in.size(), p + 8); // raw_len + uint32_t len_crc32 = ::arrow::internal::crc32(0, p + 4, 8); + StoreLE32(len_crc32, p + 12); // crc32(compressed_len .. raw_len) + + // Store UUID identifier + std::memcpy(p + 16, kExtUUID, 16); + out.resize(n + 33); + return out; +} + +inline uint8_t* WriteULEB64(uint64_t v, uint8_t* out) { + uint8_t* p = out; + do { + uint8_t b = v & 0x7F; + if (v < 0x80) { + *p++ = b; + return p; + } + *p++ = b | 0x80; + v >>= 7; + } while (true); +} + +inline uint32_t CountLeadingZeros32(uint32_t v) { + if (v == 0) return 32; + uint32_t count = 0; + uint32_t mask = 0x80000000; + while ((v & mask) == 0) { + ++count; + mask >>= 1; + } + return count; +} + +inline int32_t ULEB32Len(uint32_t v) { + return 1 + ((32 - CountLeadingZeros32(v | 0x1)) * 9) / 64; +} + +void AppendFlatbuffer(std::string flatbuffer, std::string* thrift) { + // Pack the flatbuffer with LZ4 compression and checksums + std::string packed = PackFlatbuffer(flatbuffer); + + const uint32_t kFieldId = 32767; + int header_size = 1 + ULEB32Len(kFieldId) + ULEB32Len(packed.length()); + + const size_t old_size = thrift->size(); + thrift->resize(old_size + header_size + packed.size() + 1); // +1 for stop field + + // Pointer to the new write position + uint8_t* p = reinterpret_cast(&(*thrift)[old_size]); + + // Write the binary type indicator + *p++ = 0x08; + + // Write field id and size using ULEB64 + p = WriteULEB64(kFieldId, p); + p = WriteULEB64(static_cast(packed.size()), p); + + // Copy the packed payload + std::memcpy(p, packed.data(), packed.size()); + p += packed.size(); + + // Add stop field + *p = 0x00; + return; +} + +::arrow::Result ExtractFlatbuffer(std::shared_ptr buf, std::string* out_flatbuffer) { + if (buf->size() < 8) return 8; + PARQUET_THROW_NOT_OK(CheckMagicNumber(buf->data() + buf->size() - 4)); + uint32_t md_len = LoadLE32(buf->data() + buf->size() -8); + if (md_len < 34) return 0; + if (buf->size() < 42) return 42; // 34 (metadata3 trailer) + 8 (len + PAR1) + + // Check for extended metadata marker (UUID) at the end of the metadata + const uint8_t* p = buf->data() + buf->size() - 42; + if (memcmp(p + 17, kExtUUID, 16) != 0) { + // No extended metadata, return 0 to indicate no flatbuffer found + return 0; + } + + // Extended metadata is present - extract and parse it + auto compressor = static_cast(*p); + uint32_t crc32_val = LoadLE32(p + 1); + uint32_t compressed_len = LoadLE32(p + 5); + uint32_t raw_len = LoadLE32(p + 9); + uint32_t len_crc32 = LoadLE32(p + 13); + + // Verify length CRC + uint32_t expected_len_crc = ::arrow::internal::crc32(0, p + 5, 8); + if (len_crc32 != expected_len_crc) { + return ::arrow::Status::Invalid("Extended metadata length CRC mismatch"); + } + 
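+  // Note: the data CRC written by PackFlatbuffer covers the compressed payload plus
+  // the compressor byte, i.e. the compressed_len bytes preceding p and *p itself.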
+ // Verify data CRC + uint32_t expected_crc = ::arrow::internal::crc32(0, p - compressed_len, compressed_len + 1); + if (crc32_val != expected_crc) { + return ::arrow::Status::Invalid("Extended metadata data CRC mismatch"); + } + + // Decompress if needed + std::vector decompressed_data(raw_len); + switch (compressor) { + case CompressionCodec::UNCOMPRESSED: + if (compressed_len != raw_len) { + return ::arrow::Status::Invalid("UNCOMPRESSED length mismatch"); + } + std::memcpy(decompressed_data.data(), p - compressed_len, raw_len); + break; + case CompressionCodec::LZ4_RAW: { + if (raw_len < compressed_len) { + return ::arrow::Status::Invalid("LZ4 length error: raw_len < compressed_len"); + } + // Use Arrow's LZ4 codec for decompression + ARROW_ASSIGN_OR_RAISE(auto codec, ::arrow::util::Codec::Create(::arrow::Compression::LZ4)); + ARROW_ASSIGN_OR_RAISE( + int64_t actual_size, + codec->Decompress(compressed_len, p - compressed_len, raw_len, + decompressed_data.data())); + if (static_cast(actual_size) != raw_len) { + return ::arrow::Status::Invalid("LZ4 decompression failed: expected ", raw_len, + " bytes but got ", actual_size, " bytes"); + } + break; + } + default: + return ::arrow::Status::NotImplemented("Unsupported compression codec"); + } + + // Verify flatbuffer + auto verifier = flatbuffers::Verifier(decompressed_data.data(), raw_len); + if (!format3::VerifyFileMetaDataBuffer(verifier)) { + return ::arrow::Status::Invalid("Flatbuffer verification failed"); + } + + ARROW_CHECK_NE(out_flatbuffer, nullptr); + out_flatbuffer->assign(reinterpret_cast(decompressed_data.data()), raw_len); + + return compressed_len + 42; +} + +} // namespace parquet diff --git a/cpp/src/parquet/metadata3.h b/cpp/src/parquet/metadata3.h new file mode 100644 index 00000000000..3458eac70a1 --- /dev/null +++ b/cpp/src/parquet/metadata3.h @@ -0,0 +1,71 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/result.h" +#include "flatbuffers/flatbuffers.h" +#include "generated/parquet3_generated.h" +#include "generated/parquet_types.h" +#include "parquet/thrift_internal.h" + +namespace parquet { + +// Convert to flatbuffer representation of the footer metadata. +// Return false if the schema is not supported by the FlatBuffer format. +bool ToFlatbuffer(format::FileMetaData* md, std::string* flatbuf); + +// The flatbuffer in `from` must be valid (such as one retured by `ToFlatbuffer`). +format::FileMetaData FromFlatbuffer(const format3::FileMetaData* md); + + +// Append/extract the flatbuffer from the footer as a thrift extension: +// https://github.com/apache/parquet-format/blob/master/BinaryProtocolExtensions.md. +// +// `flatbuf` is the flatbuffer representation of the footer metadata. 
+// `thrift` is the buffer containing the thrift representation of the footer metadata as its suffix. +// +// Returns the number of bytes added. +// +// The extension itself is as follows: +// +// +-------------------+------------+--------------------------------------+----------------+---------+--------------------------------+------+ +// | compress(flatbuf) | compressor | crc(compress(flatbuf) .. compressor) | compressed_len | raw_len | crc(compressed_len .. raw_len) | UUID | +// +-------------------+------------+--------------------------------------+----------------+---------+--------------------------------+------+ +// +// flatbuf: the flatbuffer representation of the footer metadata. +// compressor: the compression scheme applied to the flatbuf. +// compress(x): x compressed with the specified compressor. +// crc(x): the crc32 checksum of x. +// y .. x: concatenation of the bytes of y and x. +// UUID: a 16-byte unique identifier. +// +// All integers (lengths, crc) are stored in little-endian. + +// Append a flatbuffer as an extended field to Thrift-serialized metadata. +// The flatbuffer is compressed with LZ4, packed with checksums and metadata, +// then appended as a Thrift binary field (ID 32767) followed by a stop field. +void AppendFlatbuffer(std::string flatbuffer, std::string* thrift); + +// Extract flatbuffer from a Parquet file buffer. +// Returns the size of the flatbuffer if found (and writes to out_flatbuffer), +// returns 0 if no flatbuffer extension is present, or returns the required +// buffer size if the input buffer is too small. +::arrow::Result ExtractFlatbuffer(std::shared_ptr buf, std::string* out_flatbuffer); + +} // using namespace parquet + diff --git a/cpp/src/parquet/metadata3_benchmark.cc b/cpp/src/parquet/metadata3_benchmark.cc new file mode 100644 index 00000000000..98d37206b62 --- /dev/null +++ b/cpp/src/parquet/metadata3_benchmark.cc @@ -0,0 +1,331 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/filesystem/path_util.h" +#include "arrow/util/endian.h" +#include "arrow/util/logging.h" +#include "arrow/util/ubsan.h" +#include "arrow/util/unreachable.h" +#include "benchmark/benchmark.h" +#include "flatbuffers/flatbuffers.h" +#include "generated/parquet3_generated.h" +#include "generated/parquet_types.h" +#include "parquet/metadata3.h" +#include "parquet/thrift_internal.h" + +static inline std::string GetBasename(const std::string& path) { + auto pos = path.find_last_of("/\\"); + return (pos == std::string::npos) ? 
path : path.substr(pos + 1); +} + +// Baseline +// +// 0/amazon_apparel.footer: num-rgs=1182 num-cols=16 thrift=2158995 flatbuf=4040696 +// 1/amazon_movie_tv.footer: num-rgs=3 num-cols=18 thrift=22578 flatbuf=25680 +// 2/amazon_polarity.footer: num-rgs=900 num-cols=4 thrift=1074313 flatbuf=1379944 +// 3/amazon_reviews_books.footer: num-rgs=159 num-cols=44 thrift=767840 flatbuf=1174696 +// 4/large-footer1: num-rgs=23 num-cols=2001 thrift=3253741 flatbuf=5906552 +// 5/large-footer2: num-rgs=4 num-cols=2930 thrift=2248476 flatbuf=2801976 +// +// +// Remove deprecated ColumnChunk.file_offset +// +// +// 0/amazon_apparel.footer: num-rgs=1182 num-cols=16 thrift=2158995 flatbuf=1292376 +// 1/amazon_movie_tv.footer: num-rgs=3 num-cols=18 thrift=22578 flatbuf=5056 +// 2/amazon_polarity.footer: num-rgs=900 num-cols=4 thrift=1074313 flatbuf=214192 +// 3/amazon_reviews_books.footer: num-rgs=159 num-cols=44 thrift=767840 flatbuf=226112 +// 4/large-footer1: num-rgs=23 num-cols=2001 thrift=3253741 flatbuf=2961808 +// 5/large-footer2: num-rgs=4 num-cols=2930 thrift=2248476 flatbuf=1120360 +// +// +// Optimized statistics +// +// 0/amazon_apparel.footer: num-rgs=1182 num-cols=16 thrift=2158995 flatbuf=3874720 +// 1/amazon_movie_tv.footer: num-rgs=3 num-cols=18 thrift=22578 flatbuf=8208 +// 2/amazon_polarity.footer: num-rgs=900 num-cols=4 thrift=1074313 flatbuf=1304568 +// 3/amazon_reviews_books.footer: num-rgs=159 num-cols=44 thrift=767840 flatbuf=721728 +// 4/large-footer1: num-rgs=23 num-cols=2001 thrift=3253741 flatbuf=5538032 +// 5/large-footer2: num-rgs=4 num-cols=2930 thrift=2248476 flatbuf=2599152 +// +// +// Optimized offsets/num_vals +// +// RowGroup size limited to 2^31 and num values to 2^31. ColumnChunk offsets are relative +// to RowGroup starts which makes then all int32s too. 
+// +// 0/amazon_apparel.footer: num-rgs=1182 num-cols=16 thrift=2158995 flatbuf=3331720 +// 1/amazon_movie_tv.footer: num-rgs=3 num-cols=18 thrift=22578 flatbuf=7560 +// 2/amazon_polarity.footer: num-rgs=900 num-cols=4 thrift=1074313 flatbuf=1214640 +// 3/amazon_reviews_books.footer: num-rgs=159 num-cols=44 thrift=767840 flatbuf=620344 +// 4/large-footer1: num-rgs=23 num-cols=2001 thrift=3253741 flatbuf=4801656 +// 5/large-footer2: num-rgs=4 num-cols=2930 thrift=2248476 flatbuf=2390080 +// +// +// Optimized num_values when ColumnChunk is dense +// +// +// 0/amazon_apparel.footer: num-rgs=1182 num-cols=16 thrift=2158995 flatbuf=3265192 +// 1/amazon_movie_tv.footer: num-rgs=3 num-cols=18 thrift=22578 flatbuf=7568 +// 2/amazon_polarity.footer: num-rgs=900 num-cols=4 thrift=1074313 flatbuf=1207416 +// 3/amazon_reviews_books.footer: num-rgs=159 num-cols=44 thrift=767840 flatbuf=611720 +// 4/large-footer1: num-rgs=23 num-cols=2001 thrift=3253741 flatbuf=4433832 +// 5/large-footer2: num-rgs=4 num-cols=2930 thrift=2248476 flatbuf=2343608 +// +// +// Replace encoding stats with is_fully_dict_encoded +// +// 0/amazon_apparel.footer: num-rgs=1182 num-cols=16 thrift=2158995 flatbuf=2622520 +// 1/amazon_movie_tv.footer: num-rgs=3 num-cols=18 thrift=22578 flatbuf=6792 +// 2/amazon_polarity.footer: num-rgs=900 num-cols=4 thrift=1074313 flatbuf=1106640 +// 3/amazon_reviews_books.footer: num-rgs=159 num-cols=44 thrift=767840 flatbuf=489016 +// 4/large-footer1: num-rgs=23 num-cols=2001 thrift=3253741 flatbuf=4433656 +// 5/large-footer2: num-rgs=4 num-cols=2930 thrift=2248476 flatbuf=2062584 +// +// +// Remove path_in_schema in ColumnMetadata +// +// 0/amazon_apparel.footer: num-rgs=1182 num-cols=16 thrift=2158995 flatbuf=2092640 +// 1/amazon_movie_tv.footer: num-rgs=3 num-cols=18 thrift=22578 flatbuf=5544 +// 2/amazon_polarity.footer: num-rgs=900 num-cols=4 thrift=1074313 flatbuf=1045304 +// 3/amazon_reviews_books.footer: num-rgs=159 num-cols=44 thrift=767840 flatbuf=333176 +// 4/large-footer1: num-rgs=23 num-cols=2001 thrift=3253741 flatbuf=3697824 +// 5/large-footer2: num-rgs=4 num-cols=2930 thrift=2248476 flatbuf=1922080 +// +// +// Remove encodings in ColumnMetadata +// +// 0/amazon_apparel.footer: num-rgs=1182 num-cols=16 thrift=2158995 flatbuf=1884920 +// 1/amazon_movie_tv.footer: num-rgs=3 num-cols=18 thrift=22578 flatbuf=5296 +// 2/amazon_polarity.footer: num-rgs=900 num-cols=4 thrift=1074313 flatbuf=1009328 +// 3/amazon_reviews_books.footer: num-rgs=159 num-cols=44 thrift=767840 flatbuf=292560 +// 4/large-footer1: num-rgs=23 num-cols=2001 thrift=3253741 flatbuf=3329648 +// 5/large-footer2: num-rgs=4 num-cols=2930 thrift=2248476 flatbuf=1781344 +// +// +// Optimize statistics further. Only allow 4/8 byte values. Add common prefix. +// Remove distinct_count. 
+// +// 0/amazon_apparel.footer: num-rgs=1182 num-cols=16 thrift=2158995 flatbuf=1350760 +// 1/amazon_movie_tv.footer: num-rgs=3 num-cols=18 thrift=22578 flatbuf=5192 +// 2/amazon_polarity.footer: num-rgs=900 num-cols=4 thrift=1074313 flatbuf=235368 +// 3/amazon_reviews_books.footer: num-rgs=159 num-cols=44 thrift=767840 flatbuf=238656 +// 4/large-footer1: num-rgs=23 num-cols=2001 thrift=3253741 flatbuf=3329632 +// 5/large-footer2: num-rgs=4 num-cols=2930 thrift=2248476 flatbuf=1165112 +// +// + +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wunused-function" + +namespace parquet { +namespace { + +std::string ReadFile(const std::string& name) { + std::stringstream buffer; + std::ifstream t(name); + buffer << t.rdbuf(); + return buffer.str(); +} + +std::string Serialize(const format::FileMetaData& md) { + ThriftSerializer ser; + std::string out; + ser.SerializeToString(&md, &out); + return out; +} + +format::FileMetaData DeserializeThrift(const std::string& bytes) { + ThriftDeserializer des(100 << 20, 100 << 20); + format::FileMetaData md; + uint32_t n = bytes.size(); + des.DeserializeMessage(reinterpret_cast(bytes.data()), &n, &md); + return md; +} + +std::string SerializeFlatbuffer(format::FileMetaData* md) { + std::string flatbuffer; + parquet::ToFlatbuffer(md, &flatbuffer); + return flatbuffer; +} + +format::FileMetaData DeserializeFlatbuffer(const format3::FileMetaData* md) { + return parquet::FromFlatbuffer(md); +} + +struct Footer { + std::string name; + std::string thrift; + std::string flatbuf; + format::FileMetaData md; + + static Footer Make(const char* filename) { + std::string bytes = ReadFile(filename); + auto md = DeserializeThrift(bytes); + std::string flatbuf; + // removes unsupported fields for fair comparison + parquet::ToFlatbuffer(&md, &flatbuf); + return {GetBasename(filename), Serialize(md), std::move(flatbuf), std::move(md)}; + } +}; + +void Parse(benchmark::State& state, const Footer& footer) { + for (auto _ : state) { + auto md = DeserializeThrift(footer.thrift); + } +} + +void AppendUleb(uint32_t x, std::string* out) { + while (true) { + uint8_t c = x & 0x7F; + if (x < 0x80) return out->push_back(c); + out->push_back(c + 0x80); + x >>= 7; + } +} + +std::string AppendExtension(std::string thrift, const std::string& ext) { + thrift.back() = '\x08'; // replace stop field with binary type + AppendUleb(32767, &thrift); // field-id + AppendUleb(ext.size(), &thrift); + thrift += ext; + thrift += '\x00'; // add the stop field + return thrift; +} + +void EncodeFlatbuf(benchmark::State& state, const Footer& footer) { + auto md = footer.md; + for (auto _ : state) { + auto ser = SerializeFlatbuffer(&md); + benchmark::DoNotOptimize(ser); + } +} + +void ThriftFromFlatbuf(benchmark::State& state, const Footer& footer) { + for (auto _ : state) { + auto md = DeserializeFlatbuffer(format3::GetFileMetaData(footer.flatbuf.data())); + benchmark::DoNotOptimize(md); + } +} + +void ParseAndVerifyFlatbuf(benchmark::State& state, const Footer& footer) { + for (auto _ : state) { + flatbuffers::Verifier v(reinterpret_cast(footer.flatbuf.data()), + footer.flatbuf.size()); + auto fmd = format3::GetFileMetaData(footer.flatbuf.data()); + ARROW_DCHECK_EQ(fmd->num_rows(), footer.md.num_rows); + bool ok = fmd->Verify(v); + ARROW_DCHECK(ok); + } +} + +void ParseWithExtension(benchmark::State& state, const Footer& footer) { + auto with_ext = AppendExtension(footer.thrift, footer.flatbuf); + + for (auto _ : state) { + auto md = DeserializeThrift(with_ext); + } 
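  // Worked example of the framing produced by AppendExtension above: the
  // trailing Thrift stop byte is overwritten with 0x08 (binary field type),
  // then the field id 32767 (0x7FFF) is appended as ULEB128:
  //
  //   32767 & 0x7F = 0x7F, 32767 >= 0x80 -> emit 0xFF, shift -> 255
  //     255 & 0x7F = 0x7F,   255 >= 0x80 -> emit 0xFF, shift -> 1
  //       1 < 0x80                       -> emit 0x01
  //
  // i.e. the bytes FF FF 01. The payload length follows as another ULEB128,
  // then the raw bytes of the packed flatbuffer, then a fresh stop byte 0x00.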
+} + +void Analyze(std::string_view name, const format::FileMetaData& md) { + std::cerr << "Analyzing: " << name << "\n"; + std::vector sizes; + int num_cols = md.schema.size() - 1; + size_t stats_bytes = 0, kv_bytes = 0; + for (auto& kv : md.key_value_metadata) kv_bytes += kv.key.size() + kv.value.size(); + for (int i = 0; i < num_cols; ++i) { + for (auto& rg : md.row_groups) { + auto& cc = rg.columns[i]; + if (!cc.__isset.meta_data) continue; + auto& cmd = cc.meta_data; + auto& s = cmd.statistics; + stats_bytes += s.max_value.size() + s.min_value.size(); + for (auto& kv : cmd.key_value_metadata) kv_bytes += kv.key.size() + kv.value.size(); + } + } + std::cerr << "num-rgs=" << md.row_groups.size() << " num-cols=" << num_cols + << " stats_bytes=" << stats_bytes << " kv_bytes=" << kv_bytes << "\n"; +} + +struct SiBytes { + double v; + int p; + char u; +}; + +SiBytes ToSiBytes(size_t v) { + auto kb = [](size_t n) { return n << 10; }; + auto mb = [](size_t n) { return n << 20; }; + auto gb = [](size_t n) { return n << 30; }; + if (v < kb(2)) return {v * 1., 0, ' '}; + if (v < mb(2)) return {v / 1024., v < kb(10), 'k'}; + if (v < gb(2)) return {v / 1024. / 1024, v < mb(10), 'M'}; + return {v / 1024. / 1024 / 1024, v < gb(10), 'G'}; +} + +} // namespace +} // namespace parquet + +int main(int argc, char** argv) { + ::benchmark::Initialize(&argc, argv); + std::vector footers; + for (int i = 1; i < argc; ++i) footers.push_back(parquet::Footer::Make(argv[i])); + struct { + std::string name; + void (*fn)(benchmark::State&, const parquet::Footer&); + } benchmarks[] = { + {"Parse", parquet::Parse}, + {"ParseWithExtension", parquet::ParseWithExtension}, + {"EncodeFlatbuf", parquet::EncodeFlatbuf}, + {"ThriftFromFlatbuf", parquet::ThriftFromFlatbuf}, + {"ParseAndVerifyFlatbuf", parquet::ParseAndVerifyFlatbuf}, + }; + for (auto&& footer : footers) { + for (auto&& [n, fn] : benchmarks) { + char buf[1024]; + snprintf(buf, sizeof(buf), "%30s/%s", footer.name.c_str(), n.c_str()); + ::benchmark::RegisterBenchmark(buf, fn, footer)->Unit(benchmark::kMillisecond); + } + } + + char key[1024]; + char val[1024]; + for (size_t i = 0; i < footers.size(); ++i) { + auto&& f = footers[i]; + auto name = GetBasename(f.name); + snprintf(key, sizeof(key), "%lu/%s", i, name.c_str()); + auto thrift = parquet::ToSiBytes(f.thrift.size()); + auto flatbuf = parquet::ToSiBytes(f.flatbuf.size()); + snprintf(val, sizeof(val), "num-rgs=%lu num-cols=%lu thrift=%.*f%c flatbuf=%.*f%c", + f.md.row_groups.size(), f.md.schema.size() - 1, thrift.p, thrift.v, thrift.u, + flatbuf.p, flatbuf.v, flatbuf.u); + ::benchmark::AddCustomContext(key, val); + } + + ::benchmark::RunSpecifiedBenchmarks(); + ::benchmark::Shutdown(); + return 0; +} diff --git a/cpp/src/parquet/metadata3_test.cc b/cpp/src/parquet/metadata3_test.cc new file mode 100644 index 00000000000..3be6ee16f64 --- /dev/null +++ b/cpp/src/parquet/metadata3_test.cc @@ -0,0 +1,907 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "parquet/metadata3.h" + +#include +#include + +#include "arrow/io/memory.h" +#include "arrow/testing/gtest_compat.h" +#include "arrow/util/config.h" +#include "flatbuffers/flatbuffers.h" +#include "generated/parquet3_generated.h" +#include "parquet/column_writer.h" +#include "parquet/file_reader.h" +#include "parquet/file_writer.h" +#include "parquet/metadata.h" +#include "parquet/schema.h" +#include "parquet/statistics.h" +#include "parquet/test_util.h" +#include "parquet/thrift_internal.h" +#include "parquet/types.h" + +namespace parquet { + +namespace test { + +using schema::GroupNode; +using schema::NodePtr; +using schema::PrimitiveNode; + +class TestMetadata3RoundTrip : public ::testing::Test { + public: + void SetUp() override {} + + protected: + // Helper to verify flatbuffer is valid + void VerifyFlatbuffer(const std::string& flatbuf) { + // FlatBuffers require proper alignment. When copied to a string, alignment may be + // lost. Create an aligned buffer for verification + std::vector aligned_buffer(flatbuf.begin(), flatbuf.end()); + flatbuffers::Verifier verifier(aligned_buffer.data(), aligned_buffer.size()); + ASSERT_TRUE(format3::VerifyFileMetaDataBuffer(verifier)); + } + + // Helper to compare logical equivalence of Thrift FileMetaData after round-trip + void AssertFileMetadataLogicallyEqual(const format::FileMetaData& original, + const format::FileMetaData& converted) { + // Compare file-level metadata + ASSERT_EQ(original.version, converted.version); + ASSERT_EQ(original.num_rows, converted.num_rows); + ASSERT_EQ(original.schema.size(), converted.schema.size()); + ASSERT_EQ(original.row_groups.size(), converted.row_groups.size()); + + // Compare row groups + for (size_t rg = 0; rg < original.row_groups.size(); ++rg) { + const auto& orig_rg = original.row_groups[rg]; + const auto& conv_rg = converted.row_groups[rg]; + ASSERT_EQ(orig_rg.num_rows, conv_rg.num_rows); + ASSERT_EQ(orig_rg.total_byte_size, conv_rg.total_byte_size); + ASSERT_EQ(orig_rg.columns.size(), conv_rg.columns.size()); + + // Compare columns + for (size_t col = 0; col < orig_rg.columns.size(); ++col) { + const auto& orig_col = orig_rg.columns[col].meta_data; + const auto& conv_col = conv_rg.columns[col].meta_data; + + ASSERT_EQ(orig_col.type, conv_col.type); + ASSERT_EQ(orig_col.codec, conv_col.codec); + ASSERT_EQ(orig_col.num_values, conv_col.num_values); + ASSERT_EQ(orig_col.total_compressed_size, conv_col.total_compressed_size); + ASSERT_EQ(orig_col.total_uncompressed_size, conv_col.total_uncompressed_size); + ASSERT_EQ(orig_col.data_page_offset, conv_col.data_page_offset); + ASSERT_EQ(orig_col.path_in_schema, conv_col.path_in_schema); + + // Compare dictionary_page_offset + ASSERT_EQ(orig_col.__isset.dictionary_page_offset, + conv_col.__isset.dictionary_page_offset); + if (orig_col.__isset.dictionary_page_offset) { + ASSERT_EQ(orig_col.dictionary_page_offset, conv_col.dictionary_page_offset); + } + + // Compare statistics + ASSERT_EQ(orig_col.__isset.statistics, conv_col.__isset.statistics); + if (orig_col.__isset.statistics) { + const auto& orig_stats = orig_col.statistics; + 
const auto& conv_stats = conv_col.statistics; + + ASSERT_EQ(orig_stats.__isset.null_count, conv_stats.__isset.null_count); + if (orig_stats.__isset.null_count) { + ASSERT_EQ(orig_stats.null_count, conv_stats.null_count); + } + + ASSERT_EQ(orig_stats.__isset.min_value, conv_stats.__isset.min_value); + ASSERT_EQ(orig_stats.__isset.max_value, conv_stats.__isset.max_value); + + if (orig_stats.__isset.min_value && orig_stats.__isset.max_value) { + // Metadata3 is lossy for BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY types + // It only stores up to 4 bytes (after removing common prefix) to save space + bool is_byte_array = (orig_col.type == format::Type::BYTE_ARRAY || + orig_col.type == format::Type::FIXED_LEN_BYTE_ARRAY); + + if (is_byte_array) { + // For byte arrays, verify that truncated statistics form a conservative + // (wider) range converted_min <= original_min and converted_max >= + // original_max + ASSERT_LE(conv_stats.min_value, orig_stats.min_value) + << "Converted min should be <= original min for conservative filtering"; + ASSERT_GE(conv_stats.max_value, orig_stats.max_value) + << "Converted max should be >= original max for conservative filtering"; + + // The is_exact flag should be false for truncated values + if (orig_stats.min_value.size() > 4) { + ASSERT_FALSE(conv_stats.is_min_value_exact); + ASSERT_FALSE(conv_stats.is_max_value_exact); + } + } else { + // For other types, values should match exactly + ASSERT_EQ(orig_stats.min_value, conv_stats.min_value); + ASSERT_EQ(orig_stats.max_value, conv_stats.max_value); + ASSERT_EQ(orig_stats.is_min_value_exact, conv_stats.is_min_value_exact); + ASSERT_EQ(orig_stats.is_max_value_exact, conv_stats.is_max_value_exact); + } + } + } + } + } + } + + // Helper to create a simple schema with specified types + std::shared_ptr MakeSchema(const std::vector& types, + const std::vector& names) { + schema::NodeVector fields; + for (size_t i = 0; i < types.size(); ++i) { + fields.push_back(schema::PrimitiveNode::Make(names[i], Repetition::OPTIONAL, + types[i], ConvertedType::NONE)); + } + return std::static_pointer_cast( + GroupNode::Make("schema", Repetition::REQUIRED, fields)); + } + + // Helper to write a Parquet file with random data + std::shared_ptr<::arrow::Buffer> WriteParquetFile( + std::shared_ptr schema, int num_rowgroups, int rows_per_rowgroup, + const std::shared_ptr& props = nullptr) { + auto sink = CreateOutputStream(); + + auto writer_props = props; + if (!writer_props) { + // Enable metadata3 by default for these tests + writer_props = WriterProperties::Builder().enable_write_metadata3()->build(); + } + + auto file_writer = ParquetFileWriter::Open(sink, schema, writer_props); + SchemaDescriptor schema_descr; + schema_descr.Init(schema); + + for (int rg = 0; rg < num_rowgroups; ++rg) { + auto row_group_writer = file_writer->AppendRowGroup(); + + for (int col = 0; col < schema_descr.num_columns(); ++col) { + const auto* descr = schema_descr.Column(col); + Type::type type = descr->physical_type(); + + switch (type) { + case Type::INT32: { + auto column_writer = + static_cast(row_group_writer->NextColumn()); + std::vector values(rows_per_rowgroup); + std::vector def_levels(rows_per_rowgroup); + random_numbers(rows_per_rowgroup, rg * 1000 + col, -10000, 10000, + values.data()); + random_numbers(rows_per_rowgroup, rg * 2000 + col, (int16_t)0, (int16_t)1, + def_levels.data()); + column_writer->WriteBatch(rows_per_rowgroup, def_levels.data(), nullptr, + values.data()); + column_writer->Close(); + break; + } + case Type::INT64: { + auto 
column_writer = + static_cast(row_group_writer->NextColumn()); + std::vector values(rows_per_rowgroup); + std::vector def_levels(rows_per_rowgroup); + random_numbers(rows_per_rowgroup, rg * 1000 + col, (int64_t)-1000000, + (int64_t)1000000, values.data()); + random_numbers(rows_per_rowgroup, rg * 2000 + col, (int16_t)0, (int16_t)1, + def_levels.data()); + column_writer->WriteBatch(rows_per_rowgroup, def_levels.data(), nullptr, + values.data()); + column_writer->Close(); + break; + } + case Type::FLOAT: { + auto column_writer = + static_cast(row_group_writer->NextColumn()); + std::vector values(rows_per_rowgroup); + std::vector def_levels(rows_per_rowgroup); + random_numbers(rows_per_rowgroup, rg * 1000 + col, -1000.0f, 1000.0f, + values.data()); + random_numbers(rows_per_rowgroup, rg * 2000 + col, (int16_t)0, (int16_t)1, + def_levels.data()); + column_writer->WriteBatch(rows_per_rowgroup, def_levels.data(), nullptr, + values.data()); + column_writer->Close(); + break; + } + case Type::DOUBLE: { + auto column_writer = + static_cast(row_group_writer->NextColumn()); + std::vector values(rows_per_rowgroup); + std::vector def_levels(rows_per_rowgroup); + random_numbers(rows_per_rowgroup, rg * 1000 + col, -10000.0, 10000.0, + values.data()); + random_numbers(rows_per_rowgroup, rg * 2000 + col, (int16_t)0, (int16_t)1, + def_levels.data()); + column_writer->WriteBatch(rows_per_rowgroup, def_levels.data(), nullptr, + values.data()); + column_writer->Close(); + break; + } + case Type::BYTE_ARRAY: { + auto column_writer = + static_cast(row_group_writer->NextColumn()); + std::vector values(rows_per_rowgroup); + std::vector def_levels(rows_per_rowgroup); + std::vector buf(rows_per_rowgroup * 20); + random_byte_array(rows_per_rowgroup, rg * 1000 + col, buf.data(), + values.data(), 5, 15); + random_numbers(rows_per_rowgroup, rg * 2000 + col, (int16_t)0, (int16_t)1, + def_levels.data()); + column_writer->WriteBatch(rows_per_rowgroup, def_levels.data(), nullptr, + values.data()); + column_writer->Close(); + break; + } + case Type::FIXED_LEN_BYTE_ARRAY: { + auto column_writer = + static_cast(row_group_writer->NextColumn()); + std::vector values(rows_per_rowgroup); + std::vector def_levels(rows_per_rowgroup); + std::vector buf(rows_per_rowgroup * FLBA_LENGTH); + random_fixed_byte_array(rows_per_rowgroup, rg * 1000 + col, buf.data(), + FLBA_LENGTH, values.data()); + random_numbers(rows_per_rowgroup, rg * 2000 + col, (int16_t)0, (int16_t)1, + def_levels.data()); + column_writer->WriteBatch(rows_per_rowgroup, def_levels.data(), nullptr, + values.data()); + column_writer->Close(); + break; + } + default: + throw ParquetException("Unsupported type in test"); + } + } + row_group_writer->Close(); + } + file_writer->Close(); + + PARQUET_ASSIGN_OR_THROW(auto buffer, sink->Finish()); + return buffer; + } +}; + +// Test basic round-trip conversion with INT32 columns +// Debug test to check flatbuffer verification +TEST_F(TestMetadata3RoundTrip, DebugFlatbufferVerification) { + auto schema = MakeSchema({Type::INT32}, {"col1"}); + auto buffer = WriteParquetFile(schema, /*num_rowgroups=*/1, /*rows_per_rowgroup=*/100); + + // First, let's check if the metadata can be converted to flatbuffer + auto source = std::make_shared<::arrow::io::BufferReader>(buffer); + auto file_reader = ParquetFileReader::Open(source); + auto metadata = file_reader->metadata(); + + // Get the Thrift metadata + std::string thrift_serialized = metadata->SerializeToString(); + auto reader_props = default_reader_properties(); + ThriftDeserializer 
deserializer(reader_props); + format::FileMetaData thrift_md; + uint32_t len = static_cast(thrift_serialized.size()); + deserializer.DeserializeMessage( + reinterpret_cast(thrift_serialized.data()), &len, &thrift_md); + + // Check writer properties + auto writer_props = WriterProperties::Builder().enable_write_metadata3()->build(); + std::cout << "Writer properties write_metadata3(): " + << (writer_props->write_metadata3() ? "TRUE" : "FALSE") << std::endl; + + // Print schema info + std::cout << "Schema size: " << thrift_md.schema.size() << std::endl; + for (size_t i = 0; i < thrift_md.schema.size() && i < 5; ++i) { + auto& se = thrift_md.schema[i]; + std::cout << "Schema element " << i << ": name=" << se.name; + if (se.__isset.type) std::cout << ", type=" << se.type; + if (se.__isset.converted_type) std::cout << ", converted_type=" << se.converted_type; + if (se.__isset.logicalType) std::cout << ", has logicalType"; + std::cout << std::endl; + } + + // Try to convert + std::string flatbuf; + bool converted = ToFlatbuffer(&thrift_md, &flatbuf); + std::cout << "ToFlatbuffer result: " << (converted ? "SUCCESS" : "FAILED") << std::endl; + if (converted) { + std::cout << "Converted flatbuffer size: " << flatbuf.size() << std::endl; + } + + // Check file size + std::cout << "Parquet file size: " << buffer->size() << " bytes" << std::endl; + std::cout << "Thrift metadata size: " << thrift_serialized.size() << " bytes" + << std::endl; + + // Extract the flatbuffer from the file + std::string extracted_flatbuf; + auto result = ExtractFlatbuffer(buffer, &extracted_flatbuf); + std::cout << "ExtractFlatbuffer result: " << result.ok() << ", size: " << *result + << std::endl; + + // Check the last few bytes of metadata to see if flatbuffer marker is there + if (buffer->size() > 100) { + // Read the metadata length from footer + const uint8_t* footer = buffer->data() + buffer->size() - 8; + uint32_t metadata_len = *reinterpret_cast(footer); + metadata_len = ::arrow::bit_util::FromLittleEndian(metadata_len); + std::cout << "Metadata length from footer: " << metadata_len << " bytes" << std::endl; + + if (metadata_len > 50 && buffer->size() > metadata_len + 8) { + const uint8_t* md_start = buffer->data() + buffer->size() - 8 - metadata_len; + // Check last 40 bytes of metadata for UUID marker + const uint8_t* check_pos = md_start + metadata_len - 40; + std::cout << "Last 40 bytes of metadata (hex): "; + for (int i = 0; i < 40 && i < metadata_len; ++i) { + printf("%02x ", check_pos[i]); + } + std::cout << std::endl; + } + } + + if (result.ok() && *result > 0) { + std::cout << "Flatbuffer size: " << extracted_flatbuf.size() << std::endl; + + // Try to verify it + flatbuffers::Verifier verifier( + reinterpret_cast(extracted_flatbuf.data()), + extracted_flatbuf.size()); + bool valid = format3::VerifyFileMetaDataBuffer(verifier); + std::cout << "Verification result: " << (valid ? "PASSED" : "FAILED") << std::endl; + + // Also check if we can read it + auto fmd = format3::GetFileMetaData(extracted_flatbuf.data()); + std::cout << "Can read FileMetaData: " << (fmd != nullptr ? 
"YES" : "NO") + << std::endl; + if (fmd) { + std::cout << "Version: " << fmd->version() << std::endl; + std::cout << "Num rows: " << fmd->num_rows() << std::endl; + } + + ASSERT_TRUE(valid) + << "Flatbuffer verification should pass for writer-created flatbuffers"; + } else { + std::cout << "No flatbuffer found in file" << std::endl; + if (!converted) { + std::cout << "Reason: ToFlatbuffer() returned false - metadata cannot be converted" + << std::endl; + } else { + std::cout << "ToFlatbuffer succeeded but flatbuffer not found in file - writer may " + "not be using it" + << std::endl; + } + } +} + +TEST_F(TestMetadata3RoundTrip, Int32Columns) { + auto schema = MakeSchema({Type::INT32, Type::INT32}, {"col1", "col2"}); + + // Write file with metadata3 enabled (flatbuffer will be embedded) + auto buffer = WriteParquetFile(schema, /*num_rowgroups=*/2, /*rows_per_rowgroup=*/100); + + // Read back without metadata3 to get Thrift metadata + auto source1 = std::make_shared<::arrow::io::BufferReader>(buffer); + auto reader_props_thrift = default_reader_properties(); + reader_props_thrift.set_read_metadata3(false); + auto file_reader1 = ParquetFileReader::Open(source1, reader_props_thrift); + auto metadata1 = file_reader1->metadata(); + std::string thrift1 = metadata1->SerializeToString(); + + // Read back with metadata3 enabled to read from flatbuffer + auto source2 = std::make_shared<::arrow::io::BufferReader>(buffer); + auto reader_props_fb = default_reader_properties(); + reader_props_fb.set_read_metadata3(true); + auto file_reader2 = ParquetFileReader::Open(source2, reader_props_fb); + auto metadata2 = file_reader2->metadata(); + std::string thrift2 = metadata2->SerializeToString(); + + // Deserialize both to compare + ThriftDeserializer deserializer(default_reader_properties()); + format::FileMetaData md1, md2; + uint32_t len1 = static_cast(thrift1.size()); + uint32_t len2 = static_cast(thrift2.size()); + deserializer.DeserializeMessage(reinterpret_cast(thrift1.data()), &len1, + &md1); + deserializer.DeserializeMessage(reinterpret_cast(thrift2.data()), &len2, + &md2); + + // Compare: metadata read from Thrift vs metadata read from Flatbuffer should be + // equivalent + AssertFileMetadataLogicallyEqual(md1, md2); +} + +// Test round-trip with INT64 columns +TEST_F(TestMetadata3RoundTrip, Int64Columns) { + auto schema = MakeSchema({Type::INT64, Type::INT64}, {"col1", "col2"}); + auto buffer = WriteParquetFile(schema, /*num_rowgroups=*/2, /*rows_per_rowgroup=*/100); + + // Read back without metadata3 to get Thrift metadata + auto source1 = std::make_shared<::arrow::io::BufferReader>(buffer); + auto reader_props_thrift = default_reader_properties(); + reader_props_thrift.set_read_metadata3(false); + auto file_reader1 = ParquetFileReader::Open(source1, reader_props_thrift); + auto metadata1 = file_reader1->metadata(); + std::string thrift1 = metadata1->SerializeToString(); + + // Read back with metadata3 enabled to read from flatbuffer + auto source2 = std::make_shared<::arrow::io::BufferReader>(buffer); + auto reader_props_fb = default_reader_properties(); + reader_props_fb.set_read_metadata3(true); + auto file_reader2 = ParquetFileReader::Open(source2, reader_props_fb); + auto metadata2 = file_reader2->metadata(); + std::string thrift2 = metadata2->SerializeToString(); + + // Deserialize both to compare + ThriftDeserializer deserializer(default_reader_properties()); + format::FileMetaData md1, md2; + uint32_t len1 = static_cast(thrift1.size()); + uint32_t len2 = static_cast(thrift2.size()); + 
deserializer.DeserializeMessage(reinterpret_cast(thrift1.data()), &len1, + &md1); + deserializer.DeserializeMessage(reinterpret_cast(thrift2.data()), &len2, + &md2); + + // Compare: metadata read from Thrift vs metadata read from Flatbuffer should be + // equivalent + AssertFileMetadataLogicallyEqual(md1, md2); +} + +// Test round-trip with FLOAT and DOUBLE columns +TEST_F(TestMetadata3RoundTrip, FloatDoubleColumns) { + auto schema = MakeSchema({Type::FLOAT, Type::DOUBLE}, {"float_col", "double_col"}); + auto buffer = WriteParquetFile(schema, /*num_rowgroups=*/2, /*rows_per_rowgroup=*/100); + + // Read back without metadata3 to get Thrift metadata + auto source1 = std::make_shared<::arrow::io::BufferReader>(buffer); + auto reader_props_thrift = default_reader_properties(); + reader_props_thrift.set_read_metadata3(false); + auto file_reader1 = ParquetFileReader::Open(source1, reader_props_thrift); + auto metadata1 = file_reader1->metadata(); + std::string thrift1 = metadata1->SerializeToString(); + + // Read back with metadata3 enabled to read from flatbuffer + auto source2 = std::make_shared<::arrow::io::BufferReader>(buffer); + auto reader_props_fb = default_reader_properties(); + reader_props_fb.set_read_metadata3(true); + auto file_reader2 = ParquetFileReader::Open(source2, reader_props_fb); + auto metadata2 = file_reader2->metadata(); + std::string thrift2 = metadata2->SerializeToString(); + + // Deserialize both to compare + ThriftDeserializer deserializer(default_reader_properties()); + format::FileMetaData md1, md2; + uint32_t len1 = static_cast(thrift1.size()); + uint32_t len2 = static_cast(thrift2.size()); + deserializer.DeserializeMessage(reinterpret_cast(thrift1.data()), &len1, + &md1); + deserializer.DeserializeMessage(reinterpret_cast(thrift2.data()), &len2, + &md2); + + // Compare: metadata read from Thrift vs metadata read from Flatbuffer should be + // equivalent + AssertFileMetadataLogicallyEqual(md1, md2); +} + +// Test round-trip with BYTE_ARRAY columns +TEST_F(TestMetadata3RoundTrip, ByteArrayColumns) { + auto schema = MakeSchema({Type::BYTE_ARRAY}, {"byte_array_col"}); + auto buffer = WriteParquetFile(schema, /*num_rowgroups=*/2, /*rows_per_rowgroup=*/100); + + // Read back without metadata3 to get Thrift metadata + auto source1 = std::make_shared<::arrow::io::BufferReader>(buffer); + auto reader_props_thrift = default_reader_properties(); + reader_props_thrift.set_read_metadata3(false); + auto file_reader1 = ParquetFileReader::Open(source1, reader_props_thrift); + auto metadata1 = file_reader1->metadata(); + std::string thrift1 = metadata1->SerializeToString(); + + // Read back with metadata3 enabled to read from flatbuffer + auto source2 = std::make_shared<::arrow::io::BufferReader>(buffer); + auto reader_props_fb = default_reader_properties(); + reader_props_fb.set_read_metadata3(true); + auto file_reader2 = ParquetFileReader::Open(source2, reader_props_fb); + auto metadata2 = file_reader2->metadata(); + std::string thrift2 = metadata2->SerializeToString(); + + // Deserialize both to compare + ThriftDeserializer deserializer(default_reader_properties()); + format::FileMetaData md1, md2; + uint32_t len1 = static_cast(thrift1.size()); + uint32_t len2 = static_cast(thrift2.size()); + deserializer.DeserializeMessage(reinterpret_cast(thrift1.data()), &len1, + &md1); + deserializer.DeserializeMessage(reinterpret_cast(thrift2.data()), &len2, + &md2); + + // Compare: metadata read from Thrift vs metadata read from Flatbuffer should be + // equivalent + 
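  // Note that "equivalent" is logical rather than byte-for-byte here: the
  // flatbuffer keeps BYTE_ARRAY min/max statistics in truncated form (common
  // prefix plus a short suffix), so the helper only requires the round-tripped
  // values to bound the original range conservatively and to clear the
  // is_*_value_exact flags when values had to be truncated.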
AssertFileMetadataLogicallyEqual(md1, md2); +} + +// Test round-trip with FIXED_LEN_BYTE_ARRAY columns +TEST_F(TestMetadata3RoundTrip, FixedLenByteArrayColumns) { + schema::NodeVector fields; + fields.push_back(schema::PrimitiveNode::Make("flba_col", Repetition::OPTIONAL, + Type::FIXED_LEN_BYTE_ARRAY, + ConvertedType::NONE, FLBA_LENGTH)); + auto schema = std::static_pointer_cast( + GroupNode::Make("schema", Repetition::REQUIRED, fields)); + + auto buffer = WriteParquetFile(schema, /*num_rowgroups=*/2, /*rows_per_rowgroup=*/100); + + // Read back without metadata3 to get Thrift metadata + auto source1 = std::make_shared<::arrow::io::BufferReader>(buffer); + auto reader_props_thrift = default_reader_properties(); + reader_props_thrift.set_read_metadata3(false); + auto file_reader1 = ParquetFileReader::Open(source1, reader_props_thrift); + auto metadata1 = file_reader1->metadata(); + std::string thrift1 = metadata1->SerializeToString(); + + // Read back with metadata3 enabled to read from flatbuffer + auto source2 = std::make_shared<::arrow::io::BufferReader>(buffer); + auto reader_props_fb = default_reader_properties(); + reader_props_fb.set_read_metadata3(true); + auto file_reader2 = ParquetFileReader::Open(source2, reader_props_fb); + auto metadata2 = file_reader2->metadata(); + std::string thrift2 = metadata2->SerializeToString(); + + // Deserialize both to compare + ThriftDeserializer deserializer(default_reader_properties()); + format::FileMetaData md1, md2; + uint32_t len1 = static_cast(thrift1.size()); + uint32_t len2 = static_cast(thrift2.size()); + deserializer.DeserializeMessage(reinterpret_cast(thrift1.data()), &len1, + &md1); + deserializer.DeserializeMessage(reinterpret_cast(thrift2.data()), &len2, + &md2); + + // Compare: metadata read from Thrift vs metadata read from Flatbuffer should be + // equivalent + AssertFileMetadataLogicallyEqual(md1, md2); +} + +// Test round-trip with mixed column types +TEST_F(TestMetadata3RoundTrip, MixedColumnTypes) { + auto schema = + MakeSchema({Type::INT32, Type::INT64, Type::FLOAT, Type::DOUBLE, Type::BYTE_ARRAY}, + {"int32_col", "int64_col", "float_col", "double_col", "byte_array_col"}); + auto buffer = WriteParquetFile(schema, /*num_rowgroups=*/3, /*rows_per_rowgroup=*/100); + + // Read back without metadata3 to get Thrift metadata + auto source1 = std::make_shared<::arrow::io::BufferReader>(buffer); + auto reader_props_thrift = default_reader_properties(); + reader_props_thrift.set_read_metadata3(false); + auto file_reader1 = ParquetFileReader::Open(source1, reader_props_thrift); + auto metadata1 = file_reader1->metadata(); + std::string thrift1 = metadata1->SerializeToString(); + + // Read back with metadata3 enabled to read from flatbuffer + auto source2 = std::make_shared<::arrow::io::BufferReader>(buffer); + auto reader_props_fb = default_reader_properties(); + reader_props_fb.set_read_metadata3(true); + auto file_reader2 = ParquetFileReader::Open(source2, reader_props_fb); + auto metadata2 = file_reader2->metadata(); + std::string thrift2 = metadata2->SerializeToString(); + + // Deserialize both to compare + ThriftDeserializer deserializer(default_reader_properties()); + format::FileMetaData md1, md2; + uint32_t len1 = static_cast(thrift1.size()); + uint32_t len2 = static_cast(thrift2.size()); + deserializer.DeserializeMessage(reinterpret_cast(thrift1.data()), &len1, + &md1); + deserializer.DeserializeMessage(reinterpret_cast(thrift2.data()), &len2, + &md2); + + // Compare: metadata read from Thrift vs metadata read from Flatbuffer 
should be + // equivalent + AssertFileMetadataLogicallyEqual(md1, md2); +} + +// Test AppendFlatbuffer and ExtractFlatbuffer +TEST_F(TestMetadata3RoundTrip, AppendAndExtractFlatbuffer) { + auto schema = MakeSchema({Type::INT32, Type::INT64}, {"col1", "col2"}); + auto buffer = WriteParquetFile(schema, /*num_rowgroups=*/2, /*rows_per_rowgroup=*/100); + + auto source = std::make_shared<::arrow::io::BufferReader>(buffer); + auto file_reader = ParquetFileReader::Open(source); + auto metadata = file_reader->metadata(); + + std::string thrift_serialized = metadata->SerializeToString(); + auto reader_props = default_reader_properties(); + ThriftDeserializer deserializer(reader_props); + format::FileMetaData original_md; + uint32_t len = static_cast(thrift_serialized.size()); + deserializer.DeserializeMessage( + reinterpret_cast(thrift_serialized.data()), &len, &original_md); + + // Convert to flatbuffer + std::string flatbuf; + ASSERT_TRUE(ToFlatbuffer(&original_md, &flatbuf)); + + // Verify the original flatbuffer before appending + VerifyFlatbuffer(flatbuf); + + // Append flatbuffer to the Thrift serialized data + std::string parquet_format = thrift_serialized; + AppendFlatbuffer(flatbuf, &parquet_format); + ASSERT_GT(parquet_format.size(), thrift_serialized.size()); + + // Manually add footer (length + "PAR1") as WriteFileMetaData would + uint32_t metadata_len = static_cast(parquet_format.size()); + char footer[8]; + *reinterpret_cast(footer) = ::arrow::bit_util::ToLittleEndian(metadata_len); + std::memcpy(footer + 4, "PAR1", 4); + parquet_format.append(footer, 8); + + // Extract flatbuffer back + auto parquet_buf = std::make_shared( + reinterpret_cast(parquet_format.data()), parquet_format.size()); + std::string extracted_flatbuf; + auto result = ExtractFlatbuffer(parquet_buf, &extracted_flatbuf); + ASSERT_TRUE(result.ok()) << result.status(); + + // Verify extracted flatbuffer + VerifyFlatbuffer(extracted_flatbuf); + + // Do round-trip test on extracted flatbuffer + // Convert Flatbuffer → Thrift + auto fmd = format3::GetFileMetaData(extracted_flatbuf.data()); + format::FileMetaData converted_md = FromFlatbuffer(fmd); + + // Compare the original and round-tripped Thrift FileMetadata + AssertFileMetadataLogicallyEqual(original_md, converted_md); +} + +// Unit test for ExtractFlatbuffer - basic round-trip +TEST_F(TestMetadata3RoundTrip, ExtractFlatbufferRoundTrip) { + // Create a simple Thrift FileMetaData + format::FileMetaData thrift_md; + thrift_md.__set_version(1); + thrift_md.__set_num_rows(100); + thrift_md.__set_created_by("test_creator"); + + // Add a simple schema + format::SchemaElement root; + root.__set_name("test_schema"); + root.__set_repetition_type(format::FieldRepetitionType::REQUIRED); + root.__set_num_children(0); + thrift_md.schema.push_back(root); + + // Serialize the Thrift metadata + ThriftSerializer serializer; + uint32_t len; + uint8_t* thrift_buffer; + serializer.SerializeToBuffer(&thrift_md, &len, &thrift_buffer); + std::string thrift_str(reinterpret_cast(thrift_buffer), len); + + // Convert to flatbuffer + std::string flatbuf; + ASSERT_TRUE(ToFlatbuffer(&thrift_md, &flatbuf)); + + // Append flatbuffer to thrift + AppendFlatbuffer(flatbuf, &thrift_str); + + // Add Parquet footer (length + "PAR1") + uint32_t metadata_len = static_cast(thrift_str.size()); + char footer[8]; + *reinterpret_cast(footer) = ::arrow::bit_util::ToLittleEndian(metadata_len); + std::memcpy(footer + 4, "PAR1", 4); + thrift_str.append(footer, 8); + + // Now extract the flatbuffer + auto buffer = 
std::make_shared( + reinterpret_cast(thrift_str.data()), thrift_str.size()); + + std::string extracted_flatbuf; + auto result = ExtractFlatbuffer(buffer, &extracted_flatbuf); + + // Verify extraction succeeded + ASSERT_TRUE(result.ok()) << result.status(); + ASSERT_GT(*result, 0); + + // Verify the extracted flatbuffer matches the original + ASSERT_EQ(flatbuf.size(), extracted_flatbuf.size()); + ASSERT_EQ(flatbuf, extracted_flatbuf); +} + +// Unit test for ExtractFlatbuffer - no flatbuffer present +TEST_F(TestMetadata3RoundTrip, ExtractFlatbufferNotPresent) { + // Create a mock Parquet file without flatbuffer extension + std::string mock_thrift = "mock_thrift_data_without_extension"; + + // Add Parquet footer (length + "PAR1") + uint32_t metadata_len = static_cast(mock_thrift.size()); + char footer[8]; + *reinterpret_cast(footer) = ::arrow::bit_util::ToLittleEndian(metadata_len); + std::memcpy(footer + 4, "PAR1", 4); + mock_thrift.append(footer, 8); + + auto buffer = std::make_shared( + reinterpret_cast(mock_thrift.data()), mock_thrift.size()); + + std::string extracted_flatbuf; + auto result = ExtractFlatbuffer(buffer, &extracted_flatbuf); + + // Should return 0 indicating no flatbuffer found + ASSERT_TRUE(result.ok()); + ASSERT_EQ(*result, 0); + ASSERT_TRUE(extracted_flatbuf.empty()); +} + +// Unit test for ExtractFlatbuffer - buffer too small +TEST_F(TestMetadata3RoundTrip, ExtractFlatbufferBufferTooSmall) { + // Create a buffer with less than 8 bytes + std::string small_data = "PAR1"; + auto buffer = std::make_shared( + reinterpret_cast(small_data.data()), small_data.size()); + + std::string extracted_flatbuf; + auto result = ExtractFlatbuffer(buffer, &extracted_flatbuf); + + // Should return required size + ASSERT_TRUE(result.ok()); + ASSERT_EQ(*result, 8); +} + +// Unit test for ExtractFlatbuffer - large flatbuffer (will be compressed) +TEST_F(TestMetadata3RoundTrip, ExtractFlatbufferCompressed) { + // Create a large Thrift FileMetaData to test compression + format::FileMetaData thrift_md; + thrift_md.__set_version(1); + thrift_md.__set_num_rows(1000000); + thrift_md.__set_created_by("test_creator_with_long_name_to_increase_size"); + + // Add many schema elements to make it large enough to compress + format::SchemaElement root; + root.__set_name("root"); + root.__set_repetition_type(format::FieldRepetitionType::REQUIRED); + root.__set_num_children(50); + thrift_md.schema.push_back(root); + + for (int i = 0; i < 50; ++i) { + format::SchemaElement col; + col.__set_name("column_" + std::to_string(i)); + col.__set_type(format::Type::INT64); + col.__set_repetition_type(format::FieldRepetitionType::OPTIONAL); + thrift_md.schema.push_back(col); + } + + // Serialize the Thrift metadata + ThriftSerializer serializer; + uint32_t len; + uint8_t* thrift_buffer; + serializer.SerializeToBuffer(&thrift_md, &len, &thrift_buffer); + std::string thrift_str(reinterpret_cast(thrift_buffer), len); + + // Convert to flatbuffer + std::string flatbuf; + ASSERT_TRUE(ToFlatbuffer(&thrift_md, &flatbuf)); + + // Append flatbuffer to thrift + AppendFlatbuffer(flatbuf, &thrift_str); + + uint32_t metadata_len = static_cast(thrift_str.size()); + char footer[8]; + *reinterpret_cast(footer) = ::arrow::bit_util::ToLittleEndian(metadata_len); + std::memcpy(footer + 4, "PAR1", 4); + thrift_str.append(footer, 8); + + // Extract + auto buffer = std::make_shared( + reinterpret_cast(thrift_str.data()), thrift_str.size()); + + std::string extracted_flatbuf; + auto result = ExtractFlatbuffer(buffer, &extracted_flatbuf); + + 
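  // With ~50 repetitive schema elements the packed payload should be large
  // enough for PackFlatbuffer to keep the LZ4-compressed form rather than
  // falling back to UNCOMPRESSED, so this should exercise the decompression
  // path; either way, extraction must reproduce the original flatbuffer bytes.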
ASSERT_TRUE(result.ok()); + ASSERT_GT(*result, 0); + + // Verify the extracted flatbuffer matches the original + ASSERT_EQ(flatbuf.size(), extracted_flatbuf.size()); + ASSERT_EQ(flatbuf, extracted_flatbuf); +} + +// Unit test for ExtractFlatbuffer - invalid magic number +TEST_F(TestMetadata3RoundTrip, ExtractFlatbufferInvalidMagic) { + // Create a buffer with invalid magic number + std::string data; + data.resize(100); + + // Add invalid footer + uint32_t metadata_len = 50; + char footer[8]; + *reinterpret_cast(footer) = ::arrow::bit_util::ToLittleEndian(metadata_len); + std::memcpy(footer + 4, "XXXX", 4); // Invalid magic + data.append(footer, 8); + + auto buffer = std::make_shared(reinterpret_cast(data.data()), + data.size()); + + std::string extracted_flatbuf; + // ExtractFlatbuffer throws an exception for invalid magic number + EXPECT_THROW({ ExtractFlatbuffer(buffer, &extracted_flatbuf); }, ParquetException); +} + +// Test with large number of row groups +TEST_F(TestMetadata3RoundTrip, ManyRowGroups) { + auto schema = + MakeSchema({Type::INT32, Type::INT64, Type::FLOAT}, {"col1", "col2", "col3"}); + auto buffer = WriteParquetFile(schema, /*num_rowgroups=*/10, /*rows_per_rowgroup=*/50); + + // Read back without metadata3 to get Thrift metadata + auto source1 = std::make_shared<::arrow::io::BufferReader>(buffer); + auto reader_props_thrift = default_reader_properties(); + reader_props_thrift.set_read_metadata3(false); + auto file_reader1 = ParquetFileReader::Open(source1, reader_props_thrift); + auto metadata1 = file_reader1->metadata(); + std::string thrift1 = metadata1->SerializeToString(); + + // Read back with metadata3 enabled to read from flatbuffer + auto source2 = std::make_shared<::arrow::io::BufferReader>(buffer); + auto reader_props_fb = default_reader_properties(); + reader_props_fb.set_read_metadata3(true); + auto file_reader2 = ParquetFileReader::Open(source2, reader_props_fb); + auto metadata2 = file_reader2->metadata(); + std::string thrift2 = metadata2->SerializeToString(); + + // Deserialize both to compare + ThriftDeserializer deserializer(default_reader_properties()); + format::FileMetaData md1, md2; + uint32_t len1 = static_cast(thrift1.size()); + uint32_t len2 = static_cast(thrift2.size()); + deserializer.DeserializeMessage(reinterpret_cast(thrift1.data()), &len1, + &md1); + deserializer.DeserializeMessage(reinterpret_cast(thrift2.data()), &len2, + &md2); + + // Compare: metadata read from Thrift vs metadata read from Flatbuffer should be + // equivalent + AssertFileMetadataLogicallyEqual(md1, md2); +} + +// Test flatbuffer size is smaller than thrift for typical cases +TEST_F(TestMetadata3RoundTrip, FlatbufferSizeComparison) { + auto schema = MakeSchema({Type::INT32, Type::INT64, Type::FLOAT, Type::DOUBLE}, + {"col1", "col2", "col3", "col4"}); + auto buffer = WriteParquetFile(schema, /*num_rowgroups=*/5, /*rows_per_rowgroup=*/100); + + auto source = std::make_shared<::arrow::io::BufferReader>(buffer); + auto file_reader = ParquetFileReader::Open(source); + auto metadata = file_reader->metadata(); + + std::string thrift_serialized = metadata->SerializeToString(); + auto reader_props = default_reader_properties(); + ThriftDeserializer deserializer(reader_props); + format::FileMetaData original_md; + uint32_t len = static_cast(thrift_serialized.size()); + deserializer.DeserializeMessage( + reinterpret_cast(thrift_serialized.data()), &len, &original_md); + + std::string flatbuf; + ASSERT_TRUE(ToFlatbuffer(&original_md, &flatbuf)); + + // Log the sizes for comparison + 
std::cout << "Thrift size: " << thrift_serialized.size() << " bytes" << std::endl; + std::cout << "Flatbuffer size: " << flatbuf.size() << " bytes" << std::endl; + + if (flatbuf.size() < thrift_serialized.size()) { + double ratio = static_cast(flatbuf.size()) / thrift_serialized.size(); + std::cout << "Flatbuffer is " << (1.0 - ratio) * 100.0 << "% smaller" << std::endl; + } +} + +} // namespace test +} // namespace parquet diff --git a/cpp/src/parquet/parquet3.fbs b/cpp/src/parquet/parquet3.fbs new file mode 100644 index 00000000000..68d858f507a --- /dev/null +++ b/cpp/src/parquet/parquet3.fbs @@ -0,0 +1,224 @@ +namespace parquet.format3; + +// Optimization notes +// 1. Statistics are stored in integral types if their size is fixed, otherwise prefix + suffix +// 2. ColumnMetaData.encoding_stats are removed, they are replaced with +// ColumnMetaData.is_fully_dict_encoded. +// 3. RowGroups are limited to 2GB in size, so we can use int for sizes. +// 4. ColumnChunk/ColumnMetaData offsets are now relative to the start of the row group, so we can +// use int for offsets. +// 5. Remove ordinal. +// 6. Restrict RowGroups to 2^31-1 rows. +// 7. Remove offset/column indexes, they are small and just their offsets are of similar size. + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Physical types. +/////////////////////////////////////////////////////////////////////////////////////////////////// + +enum Type : byte { + BOOLEAN = 0, + INT32 = 1, + INT64 = 2, + INT96 = 3, + FLOAT = 4, + DOUBLE = 5, + BYTE_ARRAY = 6, + FIXED_LEN_BYTE_ARRAY = 7, +} + +enum FieldRepetitionType : byte { + REQUIRED = 0, + OPTIONAL = 1, + REPEATED = 2, +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Encodings. +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Note: Match the thrift enum values so that we can cast between them. +enum Encoding : byte { + PLAIN = 0, + // GROUP_VAR_INT = 1, + PLAIN_DICTIONARY = 2, + RLE = 3, + // BIT_PACKED = 4, + DELTA_BINARY_PACKED = 5, + DELTA_LENGTH_BYTE_ARRAY = 6, + DELTA_BYTE_ARRAY = 7, + RLE_DICTIONARY = 8, + BYTE_STREAM_SPLIT = 9, +} + +// Note: Match the thrift enum values so that we can cast between them. +enum CompressionCodec : byte { + UNCOMPRESSED = 0, + SNAPPY = 1, + GZIP = 2, + LZO = 3, + BROTLI = 4, + // LZ4 = 5, + ZSTD = 6, + LZ4_RAW = 7, +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Logical types. 
+/////////////////////////////////////////////////////////////////////////////////////////////////// + +table Empty {} +table DecimalOpts { + precision: int; + scale: int; +} +enum TimeUnit : byte { + MS = 0, + US = 1, + NS = 2, +} +table TimeOpts { + is_adjusted_to_utc: bool; + unit: TimeUnit; +} +table IntOpts { + bit_width: byte = 8; + is_signed: bool; +} +table GeometryType { + crs: string; +} +enum EdgeInterpolationAlgorithm : byte { + SPHERICAL = 0, + VINCENTY = 1, + THOMAS = 2, + ANDOYER = 3, + KARNEY = 4, +} +table GeographyType { + crs: string; + algorithm: EdgeInterpolationAlgorithm; +} +union LogicalType { + StringType:Empty, + MapType:Empty, + ListType:Empty, + EnumType:Empty, + DecimalType:DecimalOpts, + DateType:Empty, + TimeType:TimeOpts, + TimestampType:TimeOpts, + IntType:IntOpts, + NullType:Empty, + JsonType:Empty, + BsonType:Empty, + UUIDType:Empty, + Float16Type:Empty, + VariantType:Empty, + GeometryType:GeometryType, + GeographyType:GeographyType, +} + +table Statistics { + null_count: int = null; + // Store min/max values fixed sized entities depending on the physical type. If len is present + // then the min/max value is present. + // + // - BOOLEAN: none + // - INT32/FLOAT: lo4 (little-endian) + // - INT64/DOUBLE: lo8 (little-endian) + // - INT96: lo4+lo8 (little-endian) + // - FIXED_LEN_BYTE_ARRAY: + // - BYTE_ARRAY: + // prefix: the longest common prefix of min/max + // lo8+hi8 zero padded 16 bytes (big-endian) of the suffix + // len: the length for the suffix of the value after removing the prefix. If > 16 then the + // value is inexact + min_lo4: uint; + min_lo8: ulong; + min_hi8: ulong; + min_len: byte = null; + max_lo4: uint; + max_lo8: ulong; + max_hi8: ulong; + max_len: byte = null; + prefix: string; +} + +union ColumnOrder { + TypeDefinedOrder:Empty, +} + +table SchemaElement { + name: string; + type: Type = null; + repetition_type: FieldRepetitionType; + logical_type: LogicalType; + type_length: int = null; + num_children: int = 0; + field_id: int = null; + column_order: ColumnOrder; // only present for leaf nodes +} + +enum PageType : byte { + DATA_PAGE = 0, + INDEX_PAGE = 1, + DICTIONARY_PAGE = 2, + DATA_PAGE_V2 = 3, +} + +table KV { + key: string; + val: string; +} + +table ColumnMetadata { + codec: CompressionCodec; + num_values: long = null; // only present if not equal to rg.num_rows + total_uncompressed_size: long; + total_compressed_size: long; + key_value_metadata: [KV]; + data_page_offset: long; + index_page_offset: long = null; + dictionary_page_offset: long = null; + statistics: Statistics; + is_fully_dict_encoded: bool; + bloom_filter_offset: long = null; + bloom_filter_length: int = null; +} + +table ColumnChunk { + file_path: string; + meta_data: ColumnMetadata; + // crypto_metadata: ColumnCryptoMetadata; // TODO + // encrypted_column_metadata: [byte]; // TODO +} + +table SortingColumn { + column_idx: int; + descending: bool; + nulls_first: bool; +} + +table RowGroup { + columns: [ColumnChunk]; + total_byte_size: long; + num_rows: long; + sorting_columns: [SortingColumn]; + file_offset: long; + total_compressed_size: long; + ordinal: short = null; +} + +table FileMetaData { + version: int; + schema: [SchemaElement]; + num_rows: long; + row_groups: [RowGroup]; + kv: [KV]; + created_by: string; + // column_orders: [ColumnOrder]; // moved to SchemaElement + // encryption_algorithm: [EncryptionAlgorithm]; // TODO + // footer_signing_key_metadata: binary; // TODO +} + +root_type FileMetaData; diff --git a/cpp/src/parquet/properties.h 
diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h
index 1dcfe67c29f..5ecf5db2475 100644
--- a/cpp/src/parquet/properties.h
+++ b/cpp/src/parquet/properties.h
@@ -139,6 +139,11 @@ class PARQUET_EXPORT ReaderProperties {
   void set_footer_read_size(size_t size) { footer_read_size_ = size; }
   size_t footer_read_size() const { return footer_read_size_; }
 
+  // If enabled, try to read the metadata3 footer from the file.
+  // If it fails, fall back to Thrift footer decoding.
+  bool read_metadata3() const { return read_metadata3_; }
+  void set_read_metadata3(bool read_metadata3) { read_metadata3_ = read_metadata3; }
+
  private:
   MemoryPool* pool_;
   int64_t buffer_size_ = kDefaultBufferSize;
@@ -148,6 +153,7 @@ class PARQUET_EXPORT ReaderProperties {
   bool page_checksum_verification_ = false;
   // Used with a RecordReader.
   bool read_dense_for_nullable_ = false;
+  bool read_metadata3_ = false;
   size_t footer_read_size_ = kDefaultFooterReadSize;
   std::shared_ptr<FileDecryptionProperties> file_decryption_properties_;
 };
@@ -301,6 +307,7 @@ class PARQUET_EXPORT WriterProperties {
         store_decimal_as_integer_(false),
         page_checksum_enabled_(false),
         size_statistics_level_(DEFAULT_SIZE_STATISTICS_LEVEL),
+        write_metadata3_(false),
         content_defined_chunking_enabled_(false),
         content_defined_chunking_options_({}) {}
 
@@ -317,6 +324,7 @@ class PARQUET_EXPORT WriterProperties {
         store_decimal_as_integer_(properties.store_decimal_as_integer()),
         page_checksum_enabled_(properties.page_checksum_enabled()),
         size_statistics_level_(properties.size_statistics_level()),
+        write_metadata3_(properties.write_metadata3()),
         sorting_columns_(properties.sorting_columns()),
         default_column_properties_(properties.default_column_properties()),
         content_defined_chunking_enabled_(
@@ -461,6 +469,16 @@ class PARQUET_EXPORT WriterProperties {
     return this;
   }
 
+  Builder* enable_write_metadata3() {
+    write_metadata3_ = true;
+    return this;
+  }
+
+  Builder* disable_write_metadata3() {
+    write_metadata3_ = false;
+    return this;
+  }
+
   /// \brief Define the encoding that is used when we don't utilise dictionary encoding.
   //
   /// This is only applied if dictionary encoding is disabled. If the dictionary grows
@@ -782,7 +800,8 @@ class PARQUET_EXPORT WriterProperties {
         size_statistics_level_, std::move(file_encryption_properties_),
         default_column_properties_, column_properties, data_page_version_,
         store_decimal_as_integer_, std::move(sorting_columns_),
-        content_defined_chunking_enabled_, content_defined_chunking_options_));
+        content_defined_chunking_enabled_, content_defined_chunking_options_,
+        write_metadata3_));
   }
 
  private:
@@ -800,6 +819,7 @@ class PARQUET_EXPORT WriterProperties {
   bool store_decimal_as_integer_;
   bool page_checksum_enabled_;
   SizeStatisticsLevel size_statistics_level_;
+  bool write_metadata3_;
 
   std::shared_ptr<FileEncryptionProperties> file_encryption_properties_;
 
@@ -843,6 +863,8 @@ class PARQUET_EXPORT WriterProperties {
 
   inline bool page_checksum_enabled() const { return page_checksum_enabled_; }
 
+  inline bool write_metadata3() const { return write_metadata3_; }
+
   inline bool content_defined_chunking_enabled() const {
     return content_defined_chunking_enabled_;
   }
@@ -953,7 +975,7 @@ class PARQUET_EXPORT WriterProperties {
       const std::unordered_map<std::string, ColumnProperties>& column_properties,
       ParquetDataPageVersion data_page_version, bool store_short_decimal_as_integer,
       std::vector<SortingColumn> sorting_columns, bool content_defined_chunking_enabled,
-      CdcOptions content_defined_chunking_options)
+      CdcOptions content_defined_chunking_options, bool write_metadata3)
       : pool_(pool),
         dictionary_pagesize_limit_(dictionary_pagesize_limit),
         write_batch_size_(write_batch_size),
@@ -971,7 +993,8 @@ class PARQUET_EXPORT WriterProperties {
         default_column_properties_(default_column_properties),
         column_properties_(column_properties),
         content_defined_chunking_enabled_(content_defined_chunking_enabled),
-        content_defined_chunking_options_(content_defined_chunking_options) {}
+        content_defined_chunking_options_(content_defined_chunking_options),
+        write_metadata3_(write_metadata3) {}
 
   MemoryPool* pool_;
   int64_t dictionary_pagesize_limit_;
@@ -995,6 +1018,7 @@ class PARQUET_EXPORT WriterProperties {
 
   bool content_defined_chunking_enabled_;
   CdcOptions content_defined_chunking_options_;
+  bool write_metadata3_;
 };
 
 PARQUET_EXPORT const std::shared_ptr<WriterProperties>& default_writer_properties();
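The new toggles behave like any other reader/writer property. A minimal usage sketch, assuming the existing parquet::WriterProperties::Builder and parquet::ReaderProperties APIs; only enable_write_metadata3() and set_read_metadata3() come from this patch, and the function names wrapping them are illustrative.

    #include <memory>

    #include "parquet/properties.h"

    // Writer side: opt in to emitting the FlatBuffers (metadata3) footer.
    std::shared_ptr<parquet::WriterProperties> MakeWriterProperties() {
      return parquet::WriterProperties::Builder()
          .enable_write_metadata3()  // new in this patch
          ->build();
    }

    // Reader side: try the metadata3 footer first and fall back to Thrift
    // decoding if it is absent or unreadable.
    parquet::ReaderProperties MakeReaderProperties() {
      parquet::ReaderProperties properties;
      properties.set_read_metadata3(true);  // new in this patch
      return properties;
    }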