diff --git a/cpp/src/parquet/printer.cc b/cpp/src/parquet/printer.cc index ce194f897e4..ac75337208e 100644 --- a/cpp/src/parquet/printer.cc +++ b/cpp/src/parquet/printer.cc @@ -281,21 +281,27 @@ void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list selected << "\"StatsSet\": "; if (column_chunk->is_stats_set()) { stream << "\"True\", \"Stats\": {"; + bool comma = false; if (stats->HasNullCount()) { - stream << "\"NumNulls\": \"" << stats->null_count(); + comma = true; + stream << "\"NumNulls\": \"" << stats->null_count() << "\""; } if (stats->HasDistinctCount()) { - stream << "\", " - << "\"DistinctValues\": \"" << stats->distinct_count(); + if (comma) + stream << ", "; + comma = true; + stream << "\"DistinctValues\": \"" << stats->distinct_count() << "\""; } if (stats->HasMinMax()) { + if (comma) + stream << ", "; + comma = true; std::string min = stats->EncodeMin(), max = stats->EncodeMax(); - stream << "\", " - << "\"Max\": \"" << FormatStatValue(descr->physical_type(), max) + stream << "\"Max\": \"" << FormatStatValue(descr->physical_type(), max) << "\", " - << "\"Min\": \"" << FormatStatValue(descr->physical_type(), min); + << "\"Min\": \"" << FormatStatValue(descr->physical_type(), min) << "\""; } - stream << "\" },"; + stream << " },"; } else { stream << "\"False\","; } @@ -312,11 +318,11 @@ void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list selected } stream << "\", " << "\"UncompressedSize\": \"" << column_chunk->total_uncompressed_size() - << "\", \"CompressedSize\": \"" << column_chunk->total_compressed_size(); + << "\", \"CompressedSize\": \"" << column_chunk->total_compressed_size() << "\""; if (column_chunk->bloom_filter_offset()) { // Output BloomFilter {offset, length} - stream << "\", BloomFilter {" + stream << ", \"BloomFilter\": {" << "\"offset\": \"" << column_chunk->bloom_filter_offset().value(); if (column_chunk->bloom_filter_length()) { stream << "\", \"length\": \"" << column_chunk->bloom_filter_length().value(); @@ -327,7 +333,7 @@ void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list selected if (column_chunk->GetColumnIndexLocation()) { auto location = column_chunk->GetColumnIndexLocation().value(); // Output ColumnIndex {offset, length} - stream << "\", ColumnIndex {" + stream << ", \"ColumnIndex\": {" << "\"offset\": \"" << location.offset; stream << "\", \"length\": \"" << location.length; stream << "\"}"; @@ -336,14 +342,14 @@ void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list selected if (column_chunk->GetOffsetIndexLocation()) { auto location = column_chunk->GetOffsetIndexLocation().value(); // Output OffsetIndex {offset, length} - stream << "\", OffsetIndex {" + stream << ", \"OffsetIndex\": {" << "\"offset\": \"" << location.offset; stream << "\", \"length\": \"" << location.length; stream << "\"}"; } // end of a ColumnChunk - stream << "\" }"; + stream << " }"; c1++; if (c1 != static_cast(selected_columns.size())) { stream << ",\n"; diff --git a/cpp/src/parquet/types.cc b/cpp/src/parquet/types.cc index 7b50ed48d06..60ea4cdc560 100644 --- a/cpp/src/parquet/types.cc +++ b/cpp/src/parquet/types.cc @@ -115,12 +115,22 @@ std::string FormatStatValue(Type::type parquet_type, ::std::string_view val) { result << i32_val[0] << " " << i32_val[1] << " " << i32_val[2]; break; } - case Type::BYTE_ARRAY: { - return std::string(val); - } - case Type::FIXED_LEN_BYTE_ARRAY: { - return std::string(val); - } + case Type::BYTE_ARRAY: + case Type::FIXED_LEN_BYTE_ARRAY: + // Escape byte arrays to be usable in json strings. + for (char c : val) { + if (c == '\\' || c == '"') + result << '\\' << c; + else if (c >= 32 && c <= 126) + result << c; + else + // What to do if the byte array is not valid utf8? + // There doesn't seem to be a standard way to reversibly convert byte strings to valid utf8 while keeping simple ascii readable. + // For now we'll just output invalid utf8, which json parsers may be ok with. Idk if that's the best option. + result << c; + // result << "�"; + } + break; case Type::UNDEFINED: default: break;