Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions parquet-cli/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,8 @@ Usage: parquet [options] [command] [command options]
Scan all records from a file
rewrite
Rewrite one or more Parquet files to a new Parquet file
size-stats
Print size statistics for a Parquet file

Examples:

Expand Down
2 changes: 2 additions & 0 deletions parquet-cli/src/main/java/org/apache/parquet/cli/Main.java
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
import org.apache.parquet.cli.commands.ShowDictionaryCommand;
import org.apache.parquet.cli.commands.ShowFooterCommand;
import org.apache.parquet.cli.commands.ShowPagesCommand;
import org.apache.parquet.cli.commands.ShowSizeStatisticsCommand;
import org.apache.parquet.cli.commands.ToAvroCommand;
import org.apache.parquet.cli.commands.TransCompressionCommand;
import org.slf4j.Logger;
Expand Down Expand Up @@ -105,6 +106,7 @@ public class Main extends Configured implements Tool {
jc.addCommand("bloom-filter", new ShowBloomFilterCommand(console));
jc.addCommand("scan", new ScanCommand(console));
jc.addCommand("rewrite", new RewriteCommand(console));
jc.addCommand("size-stats", new ShowSizeStatisticsCommand(console));
}

@Override
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.parquet.cli.commands;

import static org.apache.parquet.cli.Util.humanReadable;

import com.beust.jcommander.Parameter;
import com.beust.jcommander.Parameters;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import java.io.IOException;
import java.util.List;
import org.apache.commons.text.TextStringBuilder;
import org.apache.parquet.cli.BaseCommand;
import org.apache.parquet.column.statistics.SizeStatistics;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.schema.MessageType;
import org.slf4j.Logger;

@Parameters(commandDescription = "Print size statistics for a Parquet file")
/**
 * CLI command that prints the size statistics recorded in a Parquet file footer.
 *
 * <p>For every row group a table is logged with one line per column chunk showing the
 * unencoded byte-array data bytes and the repetition/definition level histograms. A
 * {@code "-"} is printed for any value that is not available.
 */
public class ShowSizeStatisticsCommand extends BaseCommand {

  /** Placeholder printed when a statistic is absent. */
  private static final String MISSING_VALUE = "-";

  public ShowSizeStatisticsCommand(Logger console) {
    super(console);
  }

  @Parameter(description = "<parquet path>")
  List<String> targets;

  /**
   * Opens the single target file and logs the size statistics of each of its row groups.
   *
   * @return {@code 0} when the statistics were printed
   * @throws IOException if the file cannot be opened or read
   * @throws IllegalArgumentException if no target or more than one target was supplied
   */
  @Override
  public int run() throws IOException {
    Preconditions.checkArgument(targets != null && !targets.isEmpty(), "A Parquet file is required.");
    Preconditions.checkArgument(targets.size() == 1, "Cannot process multiple Parquet files.");

    String source = targets.get(0);
    try (ParquetFileReader reader = ParquetFileReader.open(getConf(), qualifiedPath(source))) {
      ParquetMetadata footer = reader.getFooter();
      MessageType schema = footer.getFileMetaData().getSchema();

      console.info("\nFile path: {}", source);

      List<BlockMetaData> rowGroups = footer.getBlocks();
      for (int index = 0, n = rowGroups.size(); index < n; index++) {
        printRowGroupSizeStats(console, index, rowGroups.get(index), schema);
        console.info("");
      }
    }

    return 0;
  }

  /**
   * Logs the banner, the table header and one line per column chunk for a row group.
   *
   * @param console logger that receives the output
   * @param index position of the row group within the file
   * @param rowGroup row group whose column chunks are listed
   * @param schema file schema; not read by this method, only forwarded
   */
  private void printRowGroupSizeStats(Logger console, int index, BlockMetaData rowGroup, MessageType schema) {
    // The first column must fit both the literal header "column" and the longest column path.
    int maxColumnWidth = Math.max(
        "column".length(),
        rowGroup.getColumns().stream()
            .mapToInt(col -> col.getPath().toString().length())
            .max()
            .orElse(0));

    console.info(String.format("\nRow group %d\n%s", index, new TextStringBuilder(80).appendPadding(80, '-')));

    // Build the row layout once so the header and every data row are guaranteed to line up.
    String rowFormat = String.format("%%-%ds %%-15s %%-40s %%-40s", maxColumnWidth);
    console.info(String.format(rowFormat, "column", "unencoded bytes", "rep level histogram", "def level histogram"));

    for (ColumnChunkMetaData column : rowGroup.getColumns()) {
      printColumnSizeStats(console, column, schema, rowFormat);
    }
  }

  /**
   * Logs one table line for a column chunk.
   *
   * @param console logger that receives the output
   * @param column column chunk to describe
   * @param schema file schema; not read by this method
   * @param rowFormat {@link String#format} pattern with four {@code %s} slots shared with the header
   */
  private void printColumnSizeStats(Logger console, ColumnChunkMetaData column, MessageType schema, String rowFormat) {
    String unencodedBytes = MISSING_VALUE;
    String repLevels = MISSING_VALUE;
    String defLevels = MISSING_VALUE;

    SizeStatistics stats = column.getSizeStatistics();
    // Missing or invalid statistics fall through and print placeholders in every slot.
    if (stats != null && stats.isValid()) {
      unencodedBytes = stats.getUnencodedByteArrayDataBytes()
          .map(bytes -> humanReadable(bytes))
          .orElse(MISSING_VALUE);
      repLevels = histogramToString(stats.getRepetitionLevelHistogram());
      defLevels = histogramToString(stats.getDefinitionLevelHistogram());
    }

    console.info(String.format(rowFormat, column.getPath(), unencodedBytes, repLevels, defLevels));
  }

  /** Returns the list's {@code toString()} form, or the placeholder when it is null or empty. */
  private static String histogramToString(List<Long> histogram) {
    return (histogram != null && !histogram.isEmpty()) ? histogram.toString() : MISSING_VALUE;
  }

  @Override
  public List<String> getExamples() {
    return Lists.newArrayList("# Show size statistics for a Parquet file", "sample.parquet");
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.parquet.cli.commands;

import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import org.apache.hadoop.conf.Configuration;
import org.junit.Assert;
import org.junit.Test;

/** Smoke test: the size-stats command runs against the fixture Parquet file and returns 0. */
public class ShowSizeStatisticsCommandTest extends ParquetFileTest {

  @Test
  public void testShowSizeStatisticsCommand() throws IOException {
    // Fixture file supplied by the ParquetFileTest base class.
    final File input = parquetFile();

    final ShowSizeStatisticsCommand sizeStats = new ShowSizeStatisticsCommand(createLogger());
    sizeStats.setConf(new Configuration());
    sizeStats.targets = Arrays.asList(input.getAbsolutePath());

    final int exitCode = sizeStats.run();
    Assert.assertEquals(0, exitCode);
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -182,14 +182,39 @@ public List<Long> getDefinitionLevelHistogram() {
return LongLists.unmodifiable(LongArrayList.wrap(defLevelHistogram));
}

/**
 * Renders one page's slice of a level histogram array as {@code [v0,v1,...]}.
 *
 * <p>The slice length is {@code histogram.length / nullPages.length} and the slice for
 * {@code pageIndex} starts at {@code pageIndex} times that length.
 *
 * @param histogram histogram values for all pages, or {@code null}
 * @param pageIndex index of the page whose slice is rendered
 * @return the bracketed, comma-separated slice, or {@code TOSTRING_MISSING_VALUE_MARKER}
 *     when {@code histogram} is {@code null} or empty
 */
private String formatHistogram(long[] histogram, int pageIndex) {
  if (histogram == null || histogram.length == 0) {
    return TOSTRING_MISSING_VALUE_MARKER;
  }

  final int levelsPerPage = histogram.length / nullPages.length;
  final int start = pageIndex * levelsPerPage;
  final int end = start + levelsPerPage;

  StringBuilder text = new StringBuilder();
  text.append('[');
  for (int pos = start; pos < end; pos++) {
    if (pos != start) {
      text.append(',');
    }
    text.append(histogram[pos]);
  }
  return text.append(']').toString();
}

@Override
public String toString() {
try (Formatter formatter = new Formatter()) {
formatter.format("Boundary order: %s\n", boundaryOrder);
String minMaxPart =
" %-" + MAX_VALUE_LENGTH_FOR_TOSTRING + "s %-" + MAX_VALUE_LENGTH_FOR_TOSTRING + "s\n";
formatter.format("%-10s %20s" + minMaxPart, "", "null count", "min", "max");
String format = "page-%-5d %20s" + minMaxPart;
" %-" + MAX_VALUE_LENGTH_FOR_TOSTRING + "s %-" + MAX_VALUE_LENGTH_FOR_TOSTRING + "s";
formatter.format(
"%-10s %20s" + minMaxPart + " %20s %20s\n",
"",
"null count",
"min",
"max",
"rep level histogram",
"def level histogram");
String format = "page-%-5d %20s" + minMaxPart + " %20s %20s\n";
int arrayIndex = 0;
for (int i = 0, n = nullPages.length; i < n; ++i) {
String nullCount =
Expand All @@ -201,7 +226,9 @@ public String toString() {
min = truncate(getMinValueAsString(arrayIndex));
max = truncate(getMaxValueAsString(arrayIndex++));
}
formatter.format(format, i, nullCount, min, max);
String repLevelHist = formatHistogram(repLevelHistogram, i);
String defLevelHist = formatHistogram(defLevelHistogram, i);
formatter.format(format, i, nullCount, min, max, repLevelHist, defLevelHist);
}
return formatter.toString();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,17 @@ private static class OffsetIndexImpl implements OffsetIndex {
@Override
public String toString() {
try (Formatter formatter = new Formatter()) {
formatter.format("%-10s %20s %16s %20s\n", "", "offset", "compressed size", "first row index");
formatter.format(
"%-10s %20s %20s %20s %20s\n",
"", "offset", "compressed size", "first row index", "unencoded bytes");
for (int i = 0, n = offsets.length; i < n; ++i) {
String unencodedBytes =
(unencodedByteArrayDataBytes != null && unencodedByteArrayDataBytes.length > 0)
? String.valueOf(unencodedByteArrayDataBytes[i])
: "-";
formatter.format(
"page-%-5d %20d %16d %20d\n", i, offsets[i], compressedPageSizes[i], firstRowIndexes[i]);
"page-%-5d %20d %20d %20d %20s\n",
i, offsets[i], compressedPageSizes[i], firstRowIndexes[i], unencodedBytes);
}
return formatter.toString();
}
Expand Down
Loading