From f86a80f0c664c8d3cc838ee753cd50c2e103aecc Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Thu, 14 Nov 2024 13:54:10 +0800 Subject: [PATCH 1/2] GH-3059: Add configuration to disable size statistics --- .../parquet/column/ParquetProperties.java | 45 +++++++++++- .../column/impl/ColumnValueCollector.java | 9 ++- .../column/statistics/SizeStatistics.java | 36 +++++++++- parquet-hadoop/README.md | 17 +++++ .../parquet/hadoop/ParquetOutputFormat.java | 17 +++++ .../apache/parquet/hadoop/ParquetWriter.java | 24 +++++++ .../parquet/hadoop/TestParquetWriter.java | 70 +++++++++++++++++++ 7 files changed, 214 insertions(+), 4 deletions(-) diff --git a/parquet-column/src/main/java/org/apache/parquet/column/ParquetProperties.java b/parquet-column/src/main/java/org/apache/parquet/column/ParquetProperties.java index 1fd7ed9bc8..9aaef4b3cf 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/ParquetProperties.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/ParquetProperties.java @@ -64,6 +64,7 @@ public class ParquetProperties { public static final boolean DEFAULT_ADAPTIVE_BLOOM_FILTER_ENABLED = false; public static final int DEFAULT_BLOOM_FILTER_CANDIDATES_NUMBER = 5; public static final boolean DEFAULT_STATISTICS_ENABLED = true; + public static final boolean DEFAULT_SIZE_STATISTICS_ENABLED = true; public static final boolean DEFAULT_PAGE_WRITE_CHECKSUM_ENABLED = true; @@ -112,6 +113,7 @@ public static WriterVersion fromString(String name) { private final int columnIndexTruncateLength; private final int statisticsTruncateLength; private final boolean statisticsEnabled; + private final boolean sizeStatisticsEnabled; // The expected NDV (number of distinct values) for each columns private final ColumnProperty bloomFilterNDVs; @@ -125,6 +127,7 @@ public static WriterVersion fromString(String name) { private final ColumnProperty byteStreamSplitEnabled; private final Map extraMetaData; private final ColumnProperty statistics; + private final ColumnProperty sizeStatistics; private ParquetProperties(Builder builder) { this.pageSizeThreshold = builder.pageSize; @@ -143,6 +146,7 @@ private ParquetProperties(Builder builder) { this.columnIndexTruncateLength = builder.columnIndexTruncateLength; this.statisticsTruncateLength = builder.statisticsTruncateLength; this.statisticsEnabled = builder.statisticsEnabled; + this.sizeStatisticsEnabled = builder.sizeStatisticsEnabled; this.bloomFilterNDVs = builder.bloomFilterNDVs.build(); this.bloomFilterFPPs = builder.bloomFilterFPPs.build(); this.bloomFilterEnabled = builder.bloomFilterEnabled.build(); @@ -154,6 +158,7 @@ private ParquetProperties(Builder builder) { this.byteStreamSplitEnabled = builder.byteStreamSplitEnabled.build(); this.extraMetaData = builder.extraMetaData; this.statistics = builder.statistics.build(); + this.sizeStatistics = builder.sizeStatistics.build(); } public static Builder builder() { @@ -345,6 +350,14 @@ public boolean getStatisticsEnabled(ColumnDescriptor column) { return statisticsEnabled; } + public boolean getSizeStatisticsEnabled(ColumnDescriptor column) { + Boolean columnSetting = sizeStatistics.getValue(column); + if (columnSetting != null) { + return columnSetting; + } + return sizeStatisticsEnabled; + } + @Override public String toString() { return "Parquet page size to " + getPageSizeThreshold() + '\n' @@ -361,7 +374,9 @@ public String toString() { + "Bloom filter expected number of distinct values are: " + bloomFilterNDVs + '\n' + "Bloom filter false positive probabilities are: " + bloomFilterFPPs + '\n' + "Page row count limit to " + getPageRowCountLimit() + '\n' - + "Writing page checksums is: " + (getPageWriteChecksumEnabled() ? "on" : "off"); + + "Writing page checksums is: " + (getPageWriteChecksumEnabled() ? "on" : "off") + '\n' + + "Statistics enabled: " + statisticsEnabled + '\n' + + "Size statistics enabled: " + sizeStatisticsEnabled; } public static class Builder { @@ -378,6 +393,7 @@ public static class Builder { private int columnIndexTruncateLength = DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH; private int statisticsTruncateLength = DEFAULT_STATISTICS_TRUNCATE_LENGTH; private boolean statisticsEnabled = DEFAULT_STATISTICS_ENABLED; + private boolean sizeStatisticsEnabled = DEFAULT_SIZE_STATISTICS_ENABLED; private final ColumnProperty.Builder bloomFilterNDVs; private final ColumnProperty.Builder bloomFilterFPPs; private int maxBloomFilterBytes = DEFAULT_MAX_BLOOM_FILTER_BYTES; @@ -389,6 +405,7 @@ public static class Builder { private final ColumnProperty.Builder byteStreamSplitEnabled; private Map extraMetaData = new HashMap<>(); private final ColumnProperty.Builder statistics; + private final ColumnProperty.Builder sizeStatistics; private Builder() { enableDict = ColumnProperty.builder().withDefaultValue(DEFAULT_IS_DICTIONARY_ENABLED); @@ -405,6 +422,7 @@ private Builder() { numBloomFilterCandidates = ColumnProperty.builder().withDefaultValue(DEFAULT_BLOOM_FILTER_CANDIDATES_NUMBER); statistics = ColumnProperty.builder().withDefaultValue(DEFAULT_STATISTICS_ENABLED); + sizeStatistics = ColumnProperty.builder().withDefaultValue(DEFAULT_SIZE_STATISTICS_ENABLED); } private Builder(ParquetProperties toCopy) { @@ -428,6 +446,7 @@ private Builder(ParquetProperties toCopy) { this.byteStreamSplitEnabled = ColumnProperty.builder(toCopy.byteStreamSplitEnabled); this.extraMetaData = toCopy.extraMetaData; this.statistics = ColumnProperty.builder(toCopy.statistics); + this.sizeStatistics = ColumnProperty.builder(toCopy.sizeStatistics); } /** @@ -693,6 +712,30 @@ public Builder withStatisticsEnabled(boolean enabled) { return this; } + /** + * Sets whether size statistics are enabled globally. When disabled, size statistics will not be collected + * for any column unless explicitly enabled for specific columns. + * + * @param enabled whether to collect size statistics globally + * @return this builder for method chaining + */ + public Builder withSizeStatisticsEnabled(boolean enabled) { + this.sizeStatistics.withDefaultValue(enabled); + return this; + } + + /** + * Sets the size statistics enabled/disabled for the specified column. All column size statistics are enabled by default. + * + * @param columnPath the path of the column (dot-string) + * @param enabled whether to collect size statistics for the column + * @return this builder for method chaining + */ + public Builder withSizeStatisticsEnabled(String columnPath, boolean enabled) { + this.sizeStatistics.withValue(columnPath, enabled); + return this; + } + public ParquetProperties build() { ParquetProperties properties = new ParquetProperties(this); // we pass a constructed but uninitialized factory to ParquetProperties above as currently diff --git a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnValueCollector.java b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnValueCollector.java index 164287df2d..95f735f75f 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnValueCollector.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnValueCollector.java @@ -37,6 +37,7 @@ class ColumnValueCollector { private final ColumnDescriptor path; private final boolean statisticsEnabled; + private final boolean sizeStatisticsEnabled; private BloomFilterWriter bloomFilterWriter; private BloomFilter bloomFilter; private Statistics statistics; @@ -45,6 +46,7 @@ class ColumnValueCollector { ColumnValueCollector(ColumnDescriptor path, BloomFilterWriter bloomFilterWriter, ParquetProperties props) { this.path = path; this.statisticsEnabled = props.getStatisticsEnabled(path); + this.sizeStatisticsEnabled = props.getSizeStatisticsEnabled(path); resetPageStatistics(); initBloomFilter(bloomFilterWriter, props); } @@ -53,8 +55,11 @@ void resetPageStatistics() { this.statistics = statisticsEnabled ? Statistics.createStats(path.getPrimitiveType()) : Statistics.noopStats(path.getPrimitiveType()); - this.sizeStatisticsBuilder = SizeStatistics.newBuilder( - path.getPrimitiveType(), path.getMaxRepetitionLevel(), path.getMaxDefinitionLevel()); + this.sizeStatisticsBuilder = sizeStatisticsEnabled + ? SizeStatistics.newBuilder( + path.getPrimitiveType(), path.getMaxRepetitionLevel(), path.getMaxDefinitionLevel()) + : SizeStatistics.noopBuilder( + path.getPrimitiveType(), path.getMaxRepetitionLevel(), path.getMaxDefinitionLevel()); } void writeNull(int repetitionLevel, int definitionLevel) { diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/SizeStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/SizeStatistics.java index 4241f23c34..97a49be652 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/SizeStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/SizeStatistics.java @@ -52,7 +52,7 @@ public class SizeStatistics { * Builder to create a SizeStatistics. */ public static class Builder { - private final PrimitiveType type; + protected final PrimitiveType type; private long unencodedByteArrayDataBytes; private final long[] repetitionLevelHistogram; private final long[] definitionLevelHistogram; @@ -257,4 +257,38 @@ public SizeStatistics copy() { public boolean isValid() { return valid; } + + /** + * Creates a no-op size statistics builder that collects no data. + * Used when size statistics collection is disabled. + */ + private static class NoopBuilder extends Builder { + private NoopBuilder(PrimitiveType type, int maxRepetitionLevel, int maxDefinitionLevel) { + super(type, maxRepetitionLevel, maxDefinitionLevel); + } + + @Override + public void add(int repetitionLevel, int definitionLevel) { + // Do nothing + } + + @Override + public void add(int repetitionLevel, int definitionLevel, Binary value) { + // Do nothing + } + + @Override + public SizeStatistics build() { + SizeStatistics stats = new SizeStatistics(type, 0L, Collections.emptyList(), Collections.emptyList()); + stats.valid = false; // Mark as invalid since this is a noop builder + return stats; + } + } + + /** + * Creates a builder that doesn't collect any statistics. + */ + public static Builder noopBuilder(PrimitiveType type, int maxRepetitionLevel, int maxDefinitionLevel) { + return new NoopBuilder(type, maxRepetitionLevel, maxDefinitionLevel); + } } diff --git a/parquet-hadoop/README.md b/parquet-hadoop/README.md index 5817069f4b..fc12a57f27 100644 --- a/parquet-hadoop/README.md +++ b/parquet-hadoop/README.md @@ -525,3 +525,20 @@ conf.set("parquet.column.statistics.enabled", true); conf.set("parquet.column.statistics.enabled#column.path", false); // The final configuration will be: Enable statistics for all columns except 'column.path' ``` + +--- + +**Property:** `parquet.size.statistics.enabled` +**Description:** Whether to enable size statistics collection. +If `true`, size statistics will be collected for all columns unless explicitly disabled for specific columns. +If `false`, size statistics will be disabled for all columns regardless of column-specific settings. +It is possible to enable or disable size statistics for specific columns by appending `#` followed by the column path. +**Default value:** `true` +**Example:** +```java +// Enable size statistics for all columns +conf.set("parquet.size.statistics.enabled", true); +// Disable size statistics for 'column.path' +conf.set("parquet.size.statistics.enabled#column.path", false); +// The final configuration will be: Enable size statistics for all columns except 'column.path' +``` diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java index c13781a685..f00fde9aa1 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java @@ -157,6 +157,7 @@ public static enum JobSummaryLevel { public static final String PAGE_ROW_COUNT_LIMIT = "parquet.page.row.count.limit"; public static final String PAGE_WRITE_CHECKSUM_ENABLED = "parquet.page.write-checksum.enabled"; public static final String STATISTICS_ENABLED = "parquet.column.statistics.enabled"; + public static final String SIZE_STATISTICS_ENABLED = "parquet.size.statistics.enabled"; public static JobSummaryLevel getJobSummaryLevel(Configuration conf) { String level = conf.get(JOB_SUMMARY_LEVEL); @@ -409,6 +410,22 @@ public static boolean getStatisticsEnabled(Configuration conf, String columnPath return conf.getBoolean(STATISTICS_ENABLED, ParquetProperties.DEFAULT_STATISTICS_ENABLED); } + public static void setSizeStatisticsEnabled(Configuration conf, boolean enabled) { + conf.setBoolean(SIZE_STATISTICS_ENABLED, enabled); + } + + public static void setSizeStatisticsEnabled(Configuration conf, String path, boolean enabled) { + conf.setBoolean(SIZE_STATISTICS_ENABLED + "#" + path, enabled); + } + + public static boolean getSizeStatisticsEnabled(Configuration conf) { + return conf.getBoolean(SIZE_STATISTICS_ENABLED, ParquetProperties.DEFAULT_SIZE_STATISTICS_ENABLED); + } + + public static boolean getSizeStatisticsEnabled(Configuration conf, String path) { + return conf.getBoolean(SIZE_STATISTICS_ENABLED + "#" + path, getSizeStatisticsEnabled(conf)); + } + private WriteSupport writeSupport; private ParquetOutputCommitter committer; diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetWriter.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetWriter.java index f3c50c16c3..d4c4bc1040 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetWriter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetWriter.java @@ -918,6 +918,30 @@ public SELF withStatisticsEnabled(boolean enabled) { return self(); } + /** + * Sets the size statistics enabled/disabled for the specified column. All column size statistics are enabled by default. + * + * @param columnPath the path of the column (dot-string) + * @param enabled whether to collect size statistics for the column + * @return this builder for method chaining + */ + public SELF withSizeStatisticsEnabled(String columnPath, boolean enabled) { + encodingPropsBuilder.withSizeStatisticsEnabled(columnPath, enabled); + return self(); + } + + /** + * Sets whether size statistics are enabled globally. When disabled, size statistics will not be collected + * for any column unless explicitly enabled for specific columns. + * + * @param enabled whether to collect size statistics globally + * @return this builder for method chaining + */ + public SELF withSizeStatisticsEnabled(boolean enabled) { + encodingPropsBuilder.withSizeStatisticsEnabled(enabled); + return self(); + } + /** * Build a {@link ParquetWriter} with the accumulated configuration. * diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java index 55d132c190..8b1d3e137c 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java @@ -32,8 +32,11 @@ import static org.apache.parquet.schema.LogicalTypeAnnotation.stringType; import static org.apache.parquet.schema.MessageTypeParser.parseMessageType; import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BOOLEAN; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; import static org.apache.parquet.schema.Type.Repetition.REQUIRED; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; import static org.junit.Assert.assertTrue; import com.google.common.collect.ImmutableMap; @@ -524,4 +527,71 @@ private void testParquetFileNumberOfBlocks( assertEquals(expectedNumberOfBlocks, footer.getBlocks().size()); } } + + @Test + public void testSizeStatisticsControl() throws Exception { + MessageType schema = Types.buildMessage() + .required(BINARY) + .named("string_field") + .required(BOOLEAN) + .named("boolean_field") + .required(INT32) + .named("int32_field") + .named("test_schema"); + + SimpleGroupFactory factory = new SimpleGroupFactory(schema); + + // Create test data + Group group = factory.newGroup() + .append("string_field", "test") + .append("boolean_field", true) + .append("int32_field", 42); + + // Test global disable + File file = temp.newFile(); + temp.delete(); + Path path = new Path(file.getAbsolutePath()); + ParquetWriter writer = ExampleParquetWriter.builder(path) + .withType(schema) + .withSizeStatisticsEnabled(false) + .build(); + writer.write(group); + writer.close(); + + ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration())); + ParquetMetadata footer = reader.getFooter(); + reader.close(); + // Verify size statistics are disabled globally + for (BlockMetaData block : footer.getBlocks()) { + for (ColumnChunkMetaData column : block.getColumns()) { + assertNull(column.getSizeStatistics()); + } + } + + // Test column-specific control + file = temp.newFile(); + temp.delete(); + path = new Path(file.getAbsolutePath()); + writer = ExampleParquetWriter.builder(path) + .withType(schema) + .withSizeStatisticsEnabled(true) // enable globally + .withSizeStatisticsEnabled("boolean_field", false) // disable for specific column + .build(); + writer.write(group); + writer.close(); + + reader = ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration())); + footer = reader.getFooter(); + reader.close(); + // Verify size statistics are enabled for all columns except boolean_field + for (BlockMetaData block : footer.getBlocks()) { + for (ColumnChunkMetaData column : block.getColumns()) { + if (column.getPath().toDotString().equals("boolean_field")) { + assertNull(column.getSizeStatistics()); + } else { + assertTrue(column.getSizeStatistics().isValid()); + } + } + } + } } From 0d52539a7631e527bde38477412ad2f464f9425e Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Fri, 15 Nov 2024 16:47:55 +0800 Subject: [PATCH 2/2] use try-with-resource --- .../parquet/hadoop/TestParquetWriter.java | 52 +++++++++---------- 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java index 8b1d3e137c..64001bcaf2 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java @@ -551,20 +551,19 @@ public void testSizeStatisticsControl() throws Exception { File file = temp.newFile(); temp.delete(); Path path = new Path(file.getAbsolutePath()); - ParquetWriter writer = ExampleParquetWriter.builder(path) + try (ParquetWriter writer = ExampleParquetWriter.builder(path) .withType(schema) .withSizeStatisticsEnabled(false) - .build(); - writer.write(group); - writer.close(); - - ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration())); - ParquetMetadata footer = reader.getFooter(); - reader.close(); - // Verify size statistics are disabled globally - for (BlockMetaData block : footer.getBlocks()) { - for (ColumnChunkMetaData column : block.getColumns()) { - assertNull(column.getSizeStatistics()); + .build()) { + writer.write(group); + } + + try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) { + // Verify size statistics are disabled globally + for (BlockMetaData block : reader.getFooter().getBlocks()) { + for (ColumnChunkMetaData column : block.getColumns()) { + assertNull(column.getSizeStatistics()); + } } } @@ -572,24 +571,23 @@ public void testSizeStatisticsControl() throws Exception { file = temp.newFile(); temp.delete(); path = new Path(file.getAbsolutePath()); - writer = ExampleParquetWriter.builder(path) + try (ParquetWriter writer = ExampleParquetWriter.builder(path) .withType(schema) .withSizeStatisticsEnabled(true) // enable globally .withSizeStatisticsEnabled("boolean_field", false) // disable for specific column - .build(); - writer.write(group); - writer.close(); - - reader = ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration())); - footer = reader.getFooter(); - reader.close(); - // Verify size statistics are enabled for all columns except boolean_field - for (BlockMetaData block : footer.getBlocks()) { - for (ColumnChunkMetaData column : block.getColumns()) { - if (column.getPath().toDotString().equals("boolean_field")) { - assertNull(column.getSizeStatistics()); - } else { - assertTrue(column.getSizeStatistics().isValid()); + .build()) { + writer.write(group); + } + + try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) { + // Verify size statistics are enabled for all columns except boolean_field + for (BlockMetaData block : reader.getFooter().getBlocks()) { + for (ColumnChunkMetaData column : block.getColumns()) { + if (column.getPath().toDotString().equals("boolean_field")) { + assertNull(column.getSizeStatistics()); + } else { + assertTrue(column.getSizeStatistics().isValid()); + } } } }