From c3213a0ad1ee9512c3d2e3643668fc53779f1d27 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Sun, 19 Jan 2025 00:49:00 +0800 Subject: [PATCH] GH-3125: Add CLI for SizeStatistics --- parquet-cli/README.md | 2 + .../java/org/apache/parquet/cli/Main.java | 2 + .../commands/ShowSizeStatisticsCommand.java | 116 ++++++++++++++++++ .../ShowSizeStatisticsCommandTest.java | 37 ++++++ .../columnindex/ColumnIndexBuilder.java | 35 +++++- .../columnindex/OffsetIndexBuilder.java | 11 +- 6 files changed, 197 insertions(+), 6 deletions(-) create mode 100644 parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowSizeStatisticsCommand.java create mode 100644 parquet-cli/src/test/java/org/apache/parquet/cli/commands/ShowSizeStatisticsCommandTest.java diff --git a/parquet-cli/README.md b/parquet-cli/README.md index fb02b08173..963e4f171b 100644 --- a/parquet-cli/README.md +++ b/parquet-cli/README.md @@ -119,6 +119,8 @@ Usage: parquet [options] [command] [command options] Scan all records from a file rewrite Rewrite one or more Parquet files to a new Parquet file + size-stats + Print size statistics for a Parquet file Examples: diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java b/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java index 74ecb44086..c39e3b8e5a 100644 --- a/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java @@ -51,6 +51,7 @@ import org.apache.parquet.cli.commands.ShowDictionaryCommand; import org.apache.parquet.cli.commands.ShowFooterCommand; import org.apache.parquet.cli.commands.ShowPagesCommand; +import org.apache.parquet.cli.commands.ShowSizeStatisticsCommand; import org.apache.parquet.cli.commands.ToAvroCommand; import org.apache.parquet.cli.commands.TransCompressionCommand; import org.slf4j.Logger; @@ -105,6 +106,7 @@ public class Main extends Configured implements Tool { jc.addCommand("bloom-filter", new ShowBloomFilterCommand(console)); jc.addCommand("scan", 
new ScanCommand(console)); jc.addCommand("rewrite", new RewriteCommand(console)); + jc.addCommand("size-stats", new ShowSizeStatisticsCommand(console)); } @Override diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowSizeStatisticsCommand.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowSizeStatisticsCommand.java new file mode 100644 index 0000000000..0821d260e0 --- /dev/null +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowSizeStatisticsCommand.java @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.parquet.cli.commands; + +import static org.apache.parquet.cli.Util.humanReadable; + +import com.beust.jcommander.Parameter; +import com.beust.jcommander.Parameters; +import com.google.common.base.Preconditions; +import com.google.common.collect.Lists; +import java.io.IOException; +import java.util.List; +import org.apache.commons.text.TextStringBuilder; +import org.apache.parquet.cli.BaseCommand; +import org.apache.parquet.column.statistics.SizeStatistics; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.metadata.BlockMetaData; +import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; +import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import org.apache.parquet.schema.MessageType; +import org.slf4j.Logger; + +@Parameters(commandDescription = "Print size statistics for a Parquet file") +public class ShowSizeStatisticsCommand extends BaseCommand { + + public ShowSizeStatisticsCommand(Logger console) { + super(console); + } + + @Parameter(description = "") + List<String> targets; + + @Override + @SuppressWarnings("unchecked") + public int run() throws IOException { + Preconditions.checkArgument(targets != null && !targets.isEmpty(), "A Parquet file is required."); + Preconditions.checkArgument(targets.size() == 1, "Cannot process multiple Parquet files."); + + String source = targets.get(0); + try (ParquetFileReader reader = ParquetFileReader.open(getConf(), qualifiedPath(source))) { + ParquetMetadata footer = reader.getFooter(); + MessageType schema = footer.getFileMetaData().getSchema(); + + console.info("\nFile path: {}", source); + + List<BlockMetaData> rowGroups = footer.getBlocks(); + for (int index = 0, n = rowGroups.size(); index < n; index++) { + printRowGroupSizeStats(console, index, rowGroups.get(index), schema); + console.info(""); + } + } + + return 0; + } + + private void printRowGroupSizeStats(Logger console, int index, BlockMetaData rowGroup, MessageType schema) { + int maxColumnWidth = Math.max( 
+ "column".length(), + rowGroup.getColumns().stream() + .map(col -> col.getPath().toString().length()) + .max(Integer::compare) + .orElse(0)); + + console.info(String.format("\nRow group %d\n%s", index, new TextStringBuilder(80).appendPadding(80, '-'))); + + String formatString = String.format("%%-%ds %%-15s %%-40s %%-40s", maxColumnWidth); + console.info( + String.format(formatString, "column", "unencoded bytes", "rep level histogram", "def level histogram")); + + for (ColumnChunkMetaData column : rowGroup.getColumns()) { + printColumnSizeStats(console, column, schema, maxColumnWidth); + } + } + + private void printColumnSizeStats(Logger console, ColumnChunkMetaData column, MessageType schema, int columnWidth) { + SizeStatistics stats = column.getSizeStatistics(); + + if (stats != null && stats.isValid()) { + String unencodedBytes = stats.getUnencodedByteArrayDataBytes().isPresent() + ? humanReadable(stats.getUnencodedByteArrayDataBytes().get()) + : "-"; + List<Long> repLevels = stats.getRepetitionLevelHistogram(); + String repLevelsString = (repLevels != null && !repLevels.isEmpty()) ? repLevels.toString() : "-"; + List<Long> defLevels = stats.getDefinitionLevelHistogram(); + String defLevelsString = (defLevels != null && !defLevels.isEmpty()) ? 
defLevels.toString() : "-"; + String formatString = String.format("%%-%ds %%-15s %%-40s %%-40s", columnWidth); + console.info( + String.format(formatString, column.getPath(), unencodedBytes, repLevelsString, defLevelsString)); + } else { + String formatString = String.format("%%-%ds %%-15s %%-40s %%-40s", columnWidth); + console.info(String.format(formatString, column.getPath(), "-", "-", "-")); + } + } + + @Override + public List<String> getExamples() { + return Lists.newArrayList("# Show size statistics for a Parquet file", "sample.parquet"); + } +} diff --git a/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ShowSizeStatisticsCommandTest.java b/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ShowSizeStatisticsCommandTest.java new file mode 100644 index 0000000000..55d4f9d6e8 --- /dev/null +++ b/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ShowSizeStatisticsCommandTest.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.parquet.cli.commands; + +import java.io.File; +import java.io.IOException; +import java.util.Arrays; +import org.apache.hadoop.conf.Configuration; +import org.junit.Assert; +import org.junit.Test; + +public class ShowSizeStatisticsCommandTest extends ParquetFileTest { + @Test + public void testShowSizeStatisticsCommand() throws IOException { + File file = parquetFile(); + ShowSizeStatisticsCommand command = new ShowSizeStatisticsCommand(createLogger()); + command.targets = Arrays.asList(file.getAbsolutePath()); + command.setConf(new Configuration()); + Assert.assertEquals(0, command.run()); + } +} diff --git a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndexBuilder.java b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndexBuilder.java index ffbb82197b..e78b2ceae1 100644 --- a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndexBuilder.java +++ b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndexBuilder.java @@ -182,14 +182,39 @@ public List getDefinitionLevelHistogram() { return LongLists.unmodifiable(LongArrayList.wrap(defLevelHistogram)); } + private String formatHistogram(long[] histogram, int pageIndex) { + if (histogram != null && histogram.length > 0) { + int numLevelsPerPage = histogram.length / nullPages.length; + int offset = pageIndex * numLevelsPerPage; + StringBuilder sb = new StringBuilder(); + sb.append('['); + for (int j = 0; j < numLevelsPerPage; j++) { + if (j > 0) { + sb.append(","); + } + sb.append(histogram[offset + j]); + } + sb.append(']'); + return sb.toString(); + } + return TOSTRING_MISSING_VALUE_MARKER; + } + @Override public String toString() { try (Formatter formatter = new Formatter()) { formatter.format("Boundary order: %s\n", boundaryOrder); String minMaxPart = - " %-" + MAX_VALUE_LENGTH_FOR_TOSTRING + "s %-" + MAX_VALUE_LENGTH_FOR_TOSTRING + "s\n"; - 
formatter.format("%-10s %20s" + minMaxPart, "", "null count", "min", "max"); - String format = "page-%-5d %20s" + minMaxPart; + " %-" + MAX_VALUE_LENGTH_FOR_TOSTRING + "s %-" + MAX_VALUE_LENGTH_FOR_TOSTRING + "s"; + formatter.format( + "%-10s %20s" + minMaxPart + " %20s %20s\n", + "", + "null count", + "min", + "max", + "rep level histogram", + "def level histogram"); + String format = "page-%-5d %20s" + minMaxPart + " %20s %20s\n"; int arrayIndex = 0; for (int i = 0, n = nullPages.length; i < n; ++i) { String nullCount = @@ -201,7 +226,9 @@ public String toString() { min = truncate(getMinValueAsString(arrayIndex)); max = truncate(getMaxValueAsString(arrayIndex++)); } - formatter.format(format, i, nullCount, min, max); + String repLevelHist = formatHistogram(repLevelHistogram, i); + String defLevelHist = formatHistogram(defLevelHistogram, i); + formatter.format(format, i, nullCount, min, max, repLevelHist, defLevelHist); } return formatter.toString(); } diff --git a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/OffsetIndexBuilder.java b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/OffsetIndexBuilder.java index b56f58d6fc..bd729ad97b 100644 --- a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/OffsetIndexBuilder.java +++ b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/OffsetIndexBuilder.java @@ -39,10 +39,17 @@ private static class OffsetIndexImpl implements OffsetIndex { @Override public String toString() { try (Formatter formatter = new Formatter()) { - formatter.format("%-10s %20s %16s %20s\n", "", "offset", "compressed size", "first row index"); + formatter.format( + "%-10s %20s %20s %20s %20s\n", + "", "offset", "compressed size", "first row index", "unencoded bytes"); for (int i = 0, n = offsets.length; i < n; ++i) { + String unencodedBytes = + (unencodedByteArrayDataBytes != null && unencodedByteArrayDataBytes.length > 0) + ? 
String.valueOf(unencodedByteArrayDataBytes[i]) + : "-"; formatter.format( - "page-%-5d %20d %16d %20d\n", i, offsets[i], compressedPageSizes[i], firstRowIndexes[i]); + "page-%-5d %20d %20d %20d %20s\n", + i, offsets[i], compressedPageSizes[i], firstRowIndexes[i], unencodedBytes); } return formatter.toString(); }