Skip to content

Commit be5ada2

Browse files
authored
GH-3125: Add CLI for SizeStatistics (#3126)
1 parent d5f86d7 commit be5ada2

File tree

6 files changed

+197
-6
lines changed

6 files changed

+197
-6
lines changed

parquet-cli/README.md

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -119,6 +119,8 @@ Usage: parquet [options] [command] [command options]
119119
Scan all records from a file
120120
rewrite
121121
Rewrite one or more Parquet files to a new Parquet file
122+
size-stats
123+
Print size statistics for a Parquet file
122124
123125
Examples:
124126

parquet-cli/src/main/java/org/apache/parquet/cli/Main.java

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -51,6 +51,7 @@
5151
import org.apache.parquet.cli.commands.ShowDictionaryCommand;
5252
import org.apache.parquet.cli.commands.ShowFooterCommand;
5353
import org.apache.parquet.cli.commands.ShowPagesCommand;
54+
import org.apache.parquet.cli.commands.ShowSizeStatisticsCommand;
5455
import org.apache.parquet.cli.commands.ToAvroCommand;
5556
import org.apache.parquet.cli.commands.TransCompressionCommand;
5657
import org.slf4j.Logger;
@@ -105,6 +106,7 @@ public class Main extends Configured implements Tool {
105106
jc.addCommand("bloom-filter", new ShowBloomFilterCommand(console));
106107
jc.addCommand("scan", new ScanCommand(console));
107108
jc.addCommand("rewrite", new RewriteCommand(console));
109+
jc.addCommand("size-stats", new ShowSizeStatisticsCommand(console));
108110
}
109111

110112
@Override
Lines changed: 116 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,116 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.apache.parquet.cli.commands;
21+
22+
import static org.apache.parquet.cli.Util.humanReadable;
23+
24+
import com.beust.jcommander.Parameter;
25+
import com.beust.jcommander.Parameters;
26+
import com.google.common.base.Preconditions;
27+
import com.google.common.collect.Lists;
28+
import java.io.IOException;
29+
import java.util.List;
30+
import org.apache.commons.text.TextStringBuilder;
31+
import org.apache.parquet.cli.BaseCommand;
32+
import org.apache.parquet.column.statistics.SizeStatistics;
33+
import org.apache.parquet.hadoop.ParquetFileReader;
34+
import org.apache.parquet.hadoop.metadata.BlockMetaData;
35+
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
36+
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
37+
import org.apache.parquet.schema.MessageType;
38+
import org.slf4j.Logger;
39+
40+
@Parameters(commandDescription = "Print size statistics for a Parquet file")
41+
public class ShowSizeStatisticsCommand extends BaseCommand {
42+
43+
public ShowSizeStatisticsCommand(Logger console) {
44+
super(console);
45+
}
46+
47+
@Parameter(description = "<parquet path>")
48+
List<String> targets;
49+
50+
@Override
51+
@SuppressWarnings("unchecked")
52+
public int run() throws IOException {
53+
Preconditions.checkArgument(targets != null && !targets.isEmpty(), "A Parquet file is required.");
54+
Preconditions.checkArgument(targets.size() == 1, "Cannot process multiple Parquet files.");
55+
56+
String source = targets.get(0);
57+
try (ParquetFileReader reader = ParquetFileReader.open(getConf(), qualifiedPath(source))) {
58+
ParquetMetadata footer = reader.getFooter();
59+
MessageType schema = footer.getFileMetaData().getSchema();
60+
61+
console.info("\nFile path: {}", source);
62+
63+
List<BlockMetaData> rowGroups = footer.getBlocks();
64+
for (int index = 0, n = rowGroups.size(); index < n; index++) {
65+
printRowGroupSizeStats(console, index, rowGroups.get(index), schema);
66+
console.info("");
67+
}
68+
}
69+
70+
return 0;
71+
}
72+
73+
private void printRowGroupSizeStats(Logger console, int index, BlockMetaData rowGroup, MessageType schema) {
74+
int maxColumnWidth = Math.max(
75+
"column".length(),
76+
rowGroup.getColumns().stream()
77+
.map(col -> col.getPath().toString().length())
78+
.max(Integer::compare)
79+
.orElse(0));
80+
81+
console.info(String.format("\nRow group %d\n%s", index, new TextStringBuilder(80).appendPadding(80, '-')));
82+
83+
String formatString = String.format("%%-%ds %%-15s %%-40s %%-40s", maxColumnWidth);
84+
console.info(
85+
String.format(formatString, "column", "unencoded bytes", "rep level histogram", "def level histogram"));
86+
87+
for (ColumnChunkMetaData column : rowGroup.getColumns()) {
88+
printColumnSizeStats(console, column, schema, maxColumnWidth);
89+
}
90+
}
91+
92+
private void printColumnSizeStats(Logger console, ColumnChunkMetaData column, MessageType schema, int columnWidth) {
93+
SizeStatistics stats = column.getSizeStatistics();
94+
95+
if (stats != null && stats.isValid()) {
96+
String unencodedBytes = stats.getUnencodedByteArrayDataBytes().isPresent()
97+
? humanReadable(stats.getUnencodedByteArrayDataBytes().get())
98+
: "-";
99+
List<Long> repLevels = stats.getRepetitionLevelHistogram();
100+
String repLevelsString = (repLevels != null && !repLevels.isEmpty()) ? repLevels.toString() : "-";
101+
List<Long> defLevels = stats.getDefinitionLevelHistogram();
102+
String defLevelsString = (defLevels != null && !defLevels.isEmpty()) ? defLevels.toString() : "-";
103+
String formatString = String.format("%%-%ds %%-15s %%-40s %%-40s", columnWidth);
104+
console.info(
105+
String.format(formatString, column.getPath(), unencodedBytes, repLevelsString, defLevelsString));
106+
} else {
107+
String formatString = String.format("%%-%ds %%-15s %%-40s %%-40s", columnWidth);
108+
console.info(String.format(formatString, column.getPath(), "-", "-", "-"));
109+
}
110+
}
111+
112+
@Override
113+
public List<String> getExamples() {
114+
return Lists.newArrayList("# Show size statistics for a Parquet file", "sample.parquet");
115+
}
116+
}
Lines changed: 37 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,37 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
package org.apache.parquet.cli.commands;
20+
21+
import java.io.File;
22+
import java.io.IOException;
23+
import java.util.Arrays;
24+
import org.apache.hadoop.conf.Configuration;
25+
import org.junit.Assert;
26+
import org.junit.Test;
27+
28+
public class ShowSizeStatisticsCommandTest extends ParquetFileTest {
29+
@Test
30+
public void testShowSizeStatisticsCommand() throws IOException {
31+
File file = parquetFile();
32+
ShowSizeStatisticsCommand command = new ShowSizeStatisticsCommand(createLogger());
33+
command.targets = Arrays.asList(file.getAbsolutePath());
34+
command.setConf(new Configuration());
35+
Assert.assertEquals(0, command.run());
36+
}
37+
}

parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndexBuilder.java

Lines changed: 31 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -182,14 +182,39 @@ public List<Long> getDefinitionLevelHistogram() {
182182
return LongLists.unmodifiable(LongArrayList.wrap(defLevelHistogram));
183183
}
184184

185+
private String formatHistogram(long[] histogram, int pageIndex) {
186+
if (histogram != null && histogram.length > 0) {
187+
int numLevelsPerPage = histogram.length / nullPages.length;
188+
int offset = pageIndex * numLevelsPerPage;
189+
StringBuilder sb = new StringBuilder();
190+
sb.append('[');
191+
for (int j = 0; j < numLevelsPerPage; j++) {
192+
if (j > 0) {
193+
sb.append(",");
194+
}
195+
sb.append(histogram[offset + j]);
196+
}
197+
sb.append(']');
198+
return sb.toString();
199+
}
200+
return TOSTRING_MISSING_VALUE_MARKER;
201+
}
202+
185203
@Override
186204
public String toString() {
187205
try (Formatter formatter = new Formatter()) {
188206
formatter.format("Boundary order: %s\n", boundaryOrder);
189207
String minMaxPart =
190-
" %-" + MAX_VALUE_LENGTH_FOR_TOSTRING + "s %-" + MAX_VALUE_LENGTH_FOR_TOSTRING + "s\n";
191-
formatter.format("%-10s %20s" + minMaxPart, "", "null count", "min", "max");
192-
String format = "page-%-5d %20s" + minMaxPart;
208+
" %-" + MAX_VALUE_LENGTH_FOR_TOSTRING + "s %-" + MAX_VALUE_LENGTH_FOR_TOSTRING + "s";
209+
formatter.format(
210+
"%-10s %20s" + minMaxPart + " %20s %20s\n",
211+
"",
212+
"null count",
213+
"min",
214+
"max",
215+
"rep level histogram",
216+
"def level histogram");
217+
String format = "page-%-5d %20s" + minMaxPart + " %20s %20s\n";
193218
int arrayIndex = 0;
194219
for (int i = 0, n = nullPages.length; i < n; ++i) {
195220
String nullCount =
@@ -201,7 +226,9 @@ public String toString() {
201226
min = truncate(getMinValueAsString(arrayIndex));
202227
max = truncate(getMaxValueAsString(arrayIndex++));
203228
}
204-
formatter.format(format, i, nullCount, min, max);
229+
String repLevelHist = formatHistogram(repLevelHistogram, i);
230+
String defLevelHist = formatHistogram(defLevelHistogram, i);
231+
formatter.format(format, i, nullCount, min, max, repLevelHist, defLevelHist);
205232
}
206233
return formatter.toString();
207234
}

parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/OffsetIndexBuilder.java

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -39,10 +39,17 @@ private static class OffsetIndexImpl implements OffsetIndex {
3939
@Override
4040
public String toString() {
4141
try (Formatter formatter = new Formatter()) {
42-
formatter.format("%-10s %20s %16s %20s\n", "", "offset", "compressed size", "first row index");
42+
formatter.format(
43+
"%-10s %20s %20s %20s %20s\n",
44+
"", "offset", "compressed size", "first row index", "unencoded bytes");
4345
for (int i = 0, n = offsets.length; i < n; ++i) {
46+
String unencodedBytes =
47+
(unencodedByteArrayDataBytes != null && unencodedByteArrayDataBytes.length > 0)
48+
? String.valueOf(unencodedByteArrayDataBytes[i])
49+
: "-";
4450
formatter.format(
45-
"page-%-5d %20d %16d %20d\n", i, offsets[i], compressedPageSizes[i], firstRowIndexes[i]);
51+
"page-%-5d %20d %20d %20d %20s\n",
52+
i, offsets[i], compressedPageSizes[i], firstRowIndexes[i], unencodedBytes);
4653
}
4754
return formatter.toString();
4855
}

0 commit comments

Comments
 (0)