Skip to content

Commit ccac04f

Browse files
authored
GH-3059: Add configuration to disable size statistics (#3060)
1 parent 331aea0 commit ccac04f

File tree

7 files changed

+212
-4
lines changed

7 files changed

+212
-4
lines changed

parquet-column/src/main/java/org/apache/parquet/column/ParquetProperties.java

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ public class ParquetProperties {
6464
public static final boolean DEFAULT_ADAPTIVE_BLOOM_FILTER_ENABLED = false;
6565
public static final int DEFAULT_BLOOM_FILTER_CANDIDATES_NUMBER = 5;
6666
public static final boolean DEFAULT_STATISTICS_ENABLED = true;
67+
public static final boolean DEFAULT_SIZE_STATISTICS_ENABLED = true;
6768

6869
public static final boolean DEFAULT_PAGE_WRITE_CHECKSUM_ENABLED = true;
6970

@@ -112,6 +113,7 @@ public static WriterVersion fromString(String name) {
112113
private final int columnIndexTruncateLength;
113114
private final int statisticsTruncateLength;
114115
private final boolean statisticsEnabled;
116+
private final boolean sizeStatisticsEnabled;
115117

116118
// The expected NDV (number of distinct values) for each columns
117119
private final ColumnProperty<Long> bloomFilterNDVs;
@@ -125,6 +127,7 @@ public static WriterVersion fromString(String name) {
125127
private final ColumnProperty<ByteStreamSplitMode> byteStreamSplitEnabled;
126128
private final Map<String, String> extraMetaData;
127129
private final ColumnProperty<Boolean> statistics;
130+
private final ColumnProperty<Boolean> sizeStatistics;
128131

129132
private ParquetProperties(Builder builder) {
130133
this.pageSizeThreshold = builder.pageSize;
@@ -143,6 +146,7 @@ private ParquetProperties(Builder builder) {
143146
this.columnIndexTruncateLength = builder.columnIndexTruncateLength;
144147
this.statisticsTruncateLength = builder.statisticsTruncateLength;
145148
this.statisticsEnabled = builder.statisticsEnabled;
149+
this.sizeStatisticsEnabled = builder.sizeStatisticsEnabled;
146150
this.bloomFilterNDVs = builder.bloomFilterNDVs.build();
147151
this.bloomFilterFPPs = builder.bloomFilterFPPs.build();
148152
this.bloomFilterEnabled = builder.bloomFilterEnabled.build();
@@ -154,6 +158,7 @@ private ParquetProperties(Builder builder) {
154158
this.byteStreamSplitEnabled = builder.byteStreamSplitEnabled.build();
155159
this.extraMetaData = builder.extraMetaData;
156160
this.statistics = builder.statistics.build();
161+
this.sizeStatistics = builder.sizeStatistics.build();
157162
}
158163

159164
public static Builder builder() {
@@ -345,6 +350,14 @@ public boolean getStatisticsEnabled(ColumnDescriptor column) {
345350
return statisticsEnabled;
346351
}
347352

353+
public boolean getSizeStatisticsEnabled(ColumnDescriptor column) {
354+
Boolean columnSetting = sizeStatistics.getValue(column);
355+
if (columnSetting != null) {
356+
return columnSetting;
357+
}
358+
return sizeStatisticsEnabled;
359+
}
360+
348361
@Override
349362
public String toString() {
350363
return "Parquet page size to " + getPageSizeThreshold() + '\n'
@@ -361,7 +374,9 @@ public String toString() {
361374
+ "Bloom filter expected number of distinct values are: " + bloomFilterNDVs + '\n'
362375
+ "Bloom filter false positive probabilities are: " + bloomFilterFPPs + '\n'
363376
+ "Page row count limit to " + getPageRowCountLimit() + '\n'
364-
+ "Writing page checksums is: " + (getPageWriteChecksumEnabled() ? "on" : "off");
377+
+ "Writing page checksums is: " + (getPageWriteChecksumEnabled() ? "on" : "off") + '\n'
378+
+ "Statistics enabled: " + statisticsEnabled + '\n'
379+
+ "Size statistics enabled: " + sizeStatisticsEnabled;
365380
}
366381

367382
public static class Builder {
@@ -378,6 +393,7 @@ public static class Builder {
378393
private int columnIndexTruncateLength = DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH;
379394
private int statisticsTruncateLength = DEFAULT_STATISTICS_TRUNCATE_LENGTH;
380395
private boolean statisticsEnabled = DEFAULT_STATISTICS_ENABLED;
396+
private boolean sizeStatisticsEnabled = DEFAULT_SIZE_STATISTICS_ENABLED;
381397
private final ColumnProperty.Builder<Long> bloomFilterNDVs;
382398
private final ColumnProperty.Builder<Double> bloomFilterFPPs;
383399
private int maxBloomFilterBytes = DEFAULT_MAX_BLOOM_FILTER_BYTES;
@@ -389,6 +405,7 @@ public static class Builder {
389405
private final ColumnProperty.Builder<ByteStreamSplitMode> byteStreamSplitEnabled;
390406
private Map<String, String> extraMetaData = new HashMap<>();
391407
private final ColumnProperty.Builder<Boolean> statistics;
408+
private final ColumnProperty.Builder<Boolean> sizeStatistics;
392409

393410
private Builder() {
394411
enableDict = ColumnProperty.<Boolean>builder().withDefaultValue(DEFAULT_IS_DICTIONARY_ENABLED);
@@ -405,6 +422,7 @@ private Builder() {
405422
numBloomFilterCandidates =
406423
ColumnProperty.<Integer>builder().withDefaultValue(DEFAULT_BLOOM_FILTER_CANDIDATES_NUMBER);
407424
statistics = ColumnProperty.<Boolean>builder().withDefaultValue(DEFAULT_STATISTICS_ENABLED);
425+
sizeStatistics = ColumnProperty.<Boolean>builder().withDefaultValue(DEFAULT_SIZE_STATISTICS_ENABLED);
408426
}
409427

410428
private Builder(ParquetProperties toCopy) {
@@ -428,6 +446,7 @@ private Builder(ParquetProperties toCopy) {
428446
this.byteStreamSplitEnabled = ColumnProperty.builder(toCopy.byteStreamSplitEnabled);
429447
this.extraMetaData = toCopy.extraMetaData;
430448
this.statistics = ColumnProperty.builder(toCopy.statistics);
449+
this.sizeStatistics = ColumnProperty.builder(toCopy.sizeStatistics);
431450
}
432451

433452
/**
@@ -693,6 +712,30 @@ public Builder withStatisticsEnabled(boolean enabled) {
693712
return this;
694713
}
695714

715+
/**
716+
* Sets whether size statistics are enabled globally. When disabled, size statistics will not be collected
717+
* for any column unless explicitly enabled for specific columns.
718+
*
719+
* @param enabled whether to collect size statistics globally
720+
* @return this builder for method chaining
721+
*/
722+
public Builder withSizeStatisticsEnabled(boolean enabled) {
723+
this.sizeStatistics.withDefaultValue(enabled);
724+
return this;
725+
}
726+
727+
/**
728+
* Sets the size statistics enabled/disabled for the specified column. All column size statistics are enabled by default.
729+
*
730+
* @param columnPath the path of the column (dot-string)
731+
* @param enabled whether to collect size statistics for the column
732+
* @return this builder for method chaining
733+
*/
734+
public Builder withSizeStatisticsEnabled(String columnPath, boolean enabled) {
735+
this.sizeStatistics.withValue(columnPath, enabled);
736+
return this;
737+
}
738+
696739
public ParquetProperties build() {
697740
ParquetProperties properties = new ParquetProperties(this);
698741
// we pass a constructed but uninitialized factory to ParquetProperties above as currently

parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnValueCollector.java

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ class ColumnValueCollector {
3737

3838
private final ColumnDescriptor path;
3939
private final boolean statisticsEnabled;
40+
private final boolean sizeStatisticsEnabled;
4041
private BloomFilterWriter bloomFilterWriter;
4142
private BloomFilter bloomFilter;
4243
private Statistics<?> statistics;
@@ -45,6 +46,7 @@ class ColumnValueCollector {
4546
/**
 * Creates a collector for one column.
 *
 * @param path the descriptor of the column being collected
 * @param bloomFilterWriter passed on to {@code initBloomFilter}
 * @param props source of the per-column statistics and size-statistics switches
 */
ColumnValueCollector(ColumnDescriptor path, BloomFilterWriter bloomFilterWriter, ParquetProperties props) {
  this.path = path;

  // Both switches must be assigned before resetPageStatistics() runs, because that method
  // reads them to choose between the real and the no-op implementations.
  this.statisticsEnabled = props.getStatisticsEnabled(path);
  this.sizeStatisticsEnabled = props.getSizeStatisticsEnabled(path);

  resetPageStatistics();
  initBloomFilter(bloomFilterWriter, props);
}
@@ -53,8 +55,11 @@ void resetPageStatistics() {
5355
this.statistics = statisticsEnabled
5456
? Statistics.createStats(path.getPrimitiveType())
5557
: Statistics.noopStats(path.getPrimitiveType());
56-
this.sizeStatisticsBuilder = SizeStatistics.newBuilder(
57-
path.getPrimitiveType(), path.getMaxRepetitionLevel(), path.getMaxDefinitionLevel());
58+
this.sizeStatisticsBuilder = sizeStatisticsEnabled
59+
? SizeStatistics.newBuilder(
60+
path.getPrimitiveType(), path.getMaxRepetitionLevel(), path.getMaxDefinitionLevel())
61+
: SizeStatistics.noopBuilder(
62+
path.getPrimitiveType(), path.getMaxRepetitionLevel(), path.getMaxDefinitionLevel());
5863
}
5964

6065
void writeNull(int repetitionLevel, int definitionLevel) {

parquet-column/src/main/java/org/apache/parquet/column/statistics/SizeStatistics.java

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ public class SizeStatistics {
5252
* Builder to create a SizeStatistics.
5353
*/
5454
public static class Builder {
55-
private final PrimitiveType type;
55+
protected final PrimitiveType type;
5656
private long unencodedByteArrayDataBytes;
5757
private final long[] repetitionLevelHistogram;
5858
private final long[] definitionLevelHistogram;
@@ -257,4 +257,38 @@ public SizeStatistics copy() {
257257
public boolean isValid() {
258258
return valid;
259259
}
260+
261+
/**
262+
* Creates a no-op size statistics builder that collects no data.
263+
* Used when size statistics collection is disabled.
264+
*/
265+
private static class NoopBuilder extends Builder {
266+
private NoopBuilder(PrimitiveType type, int maxRepetitionLevel, int maxDefinitionLevel) {
267+
super(type, maxRepetitionLevel, maxDefinitionLevel);
268+
}
269+
270+
@Override
271+
public void add(int repetitionLevel, int definitionLevel) {
272+
// Do nothing
273+
}
274+
275+
@Override
276+
public void add(int repetitionLevel, int definitionLevel, Binary value) {
277+
// Do nothing
278+
}
279+
280+
@Override
281+
public SizeStatistics build() {
282+
SizeStatistics stats = new SizeStatistics(type, 0L, Collections.emptyList(), Collections.emptyList());
283+
stats.valid = false; // Mark as invalid since this is a noop builder
284+
return stats;
285+
}
286+
}
287+
288+
/**
289+
* Creates a builder that doesn't collect any statistics.
290+
*/
291+
public static Builder noopBuilder(PrimitiveType type, int maxRepetitionLevel, int maxDefinitionLevel) {
292+
return new NoopBuilder(type, maxRepetitionLevel, maxDefinitionLevel);
293+
}
260294
}

parquet-hadoop/README.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -525,3 +525,20 @@ conf.set("parquet.column.statistics.enabled", true);
525525
conf.set("parquet.column.statistics.enabled#column.path", false);
526526
// The final configuration will be: Enable statistics for all columns except 'column.path'
527527
```
528+
529+
---
530+
531+
**Property:** `parquet.size.statistics.enabled`
532+
**Description:** Whether to enable size statistics collection.
533+
If `true`, size statistics will be collected for all columns unless explicitly disabled for specific columns.
534+
If `false`, size statistics will be disabled for all columns unless explicitly enabled for specific columns.
535+
It is possible to enable or disable size statistics for specific columns by appending `#` followed by the column path.
536+
**Default value:** `true`
537+
**Example:**
538+
```java
539+
// Enable size statistics for all columns
540+
conf.set("parquet.size.statistics.enabled", true);
541+
// Disable size statistics for 'column.path'
542+
conf.set("parquet.size.statistics.enabled#column.path", false);
543+
// The final configuration will be: Enable size statistics for all columns except 'column.path'
544+
```

parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,7 @@ public static enum JobSummaryLevel {
157157
public static final String PAGE_ROW_COUNT_LIMIT = "parquet.page.row.count.limit";
158158
public static final String PAGE_WRITE_CHECKSUM_ENABLED = "parquet.page.write-checksum.enabled";
159159
public static final String STATISTICS_ENABLED = "parquet.column.statistics.enabled";
160+
public static final String SIZE_STATISTICS_ENABLED = "parquet.size.statistics.enabled";
160161

161162
public static JobSummaryLevel getJobSummaryLevel(Configuration conf) {
162163
String level = conf.get(JOB_SUMMARY_LEVEL);
@@ -409,6 +410,22 @@ public static boolean getStatisticsEnabled(Configuration conf, String columnPath
409410
return conf.getBoolean(STATISTICS_ENABLED, ParquetProperties.DEFAULT_STATISTICS_ENABLED);
410411
}
411412

413+
public static void setSizeStatisticsEnabled(Configuration conf, boolean enabled) {
414+
conf.setBoolean(SIZE_STATISTICS_ENABLED, enabled);
415+
}
416+
417+
public static void setSizeStatisticsEnabled(Configuration conf, String path, boolean enabled) {
418+
conf.setBoolean(SIZE_STATISTICS_ENABLED + "#" + path, enabled);
419+
}
420+
421+
public static boolean getSizeStatisticsEnabled(Configuration conf) {
422+
return conf.getBoolean(SIZE_STATISTICS_ENABLED, ParquetProperties.DEFAULT_SIZE_STATISTICS_ENABLED);
423+
}
424+
425+
public static boolean getSizeStatisticsEnabled(Configuration conf, String path) {
426+
return conf.getBoolean(SIZE_STATISTICS_ENABLED + "#" + path, getSizeStatisticsEnabled(conf));
427+
}
428+
412429
private WriteSupport<T> writeSupport;
413430
private ParquetOutputCommitter committer;
414431

parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetWriter.java

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -918,6 +918,30 @@ public SELF withStatisticsEnabled(boolean enabled) {
918918
return self();
919919
}
920920

921+
/**
922+
* Sets the size statistics enabled/disabled for the specified column. All column size statistics are enabled by default.
923+
*
924+
* @param columnPath the path of the column (dot-string)
925+
* @param enabled whether to collect size statistics for the column
926+
* @return this builder for method chaining
927+
*/
928+
public SELF withSizeStatisticsEnabled(String columnPath, boolean enabled) {
929+
encodingPropsBuilder.withSizeStatisticsEnabled(columnPath, enabled);
930+
return self();
931+
}
932+
933+
/**
934+
* Sets whether size statistics are enabled globally. When disabled, size statistics will not be collected
935+
* for any column unless explicitly enabled for specific columns.
936+
*
937+
* @param enabled whether to collect size statistics globally
938+
* @return this builder for method chaining
939+
*/
940+
public SELF withSizeStatisticsEnabled(boolean enabled) {
941+
encodingPropsBuilder.withSizeStatisticsEnabled(enabled);
942+
return self();
943+
}
944+
921945
/**
922946
* Build a {@link ParquetWriter} with the accumulated configuration.
923947
*

parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,11 @@
3232
import static org.apache.parquet.schema.LogicalTypeAnnotation.stringType;
3333
import static org.apache.parquet.schema.MessageTypeParser.parseMessageType;
3434
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;
35+
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BOOLEAN;
36+
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32;
3537
import static org.apache.parquet.schema.Type.Repetition.REQUIRED;
3638
import static org.junit.Assert.assertEquals;
39+
import static org.junit.Assert.assertNull;
3740
import static org.junit.Assert.assertTrue;
3841

3942
import com.google.common.collect.ImmutableMap;
@@ -524,4 +527,69 @@ private void testParquetFileNumberOfBlocks(
524527
assertEquals(expectedNumberOfBlocks, footer.getBlocks().size());
525528
}
526529
}
530+
531+
@Test
532+
public void testSizeStatisticsControl() throws Exception {
533+
MessageType schema = Types.buildMessage()
534+
.required(BINARY)
535+
.named("string_field")
536+
.required(BOOLEAN)
537+
.named("boolean_field")
538+
.required(INT32)
539+
.named("int32_field")
540+
.named("test_schema");
541+
542+
SimpleGroupFactory factory = new SimpleGroupFactory(schema);
543+
544+
// Create test data
545+
Group group = factory.newGroup()
546+
.append("string_field", "test")
547+
.append("boolean_field", true)
548+
.append("int32_field", 42);
549+
550+
// Test global disable
551+
File file = temp.newFile();
552+
temp.delete();
553+
Path path = new Path(file.getAbsolutePath());
554+
try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
555+
.withType(schema)
556+
.withSizeStatisticsEnabled(false)
557+
.build()) {
558+
writer.write(group);
559+
}
560+
561+
try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) {
562+
// Verify size statistics are disabled globally
563+
for (BlockMetaData block : reader.getFooter().getBlocks()) {
564+
for (ColumnChunkMetaData column : block.getColumns()) {
565+
assertNull(column.getSizeStatistics());
566+
}
567+
}
568+
}
569+
570+
// Test column-specific control
571+
file = temp.newFile();
572+
temp.delete();
573+
path = new Path(file.getAbsolutePath());
574+
try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
575+
.withType(schema)
576+
.withSizeStatisticsEnabled(true) // enable globally
577+
.withSizeStatisticsEnabled("boolean_field", false) // disable for specific column
578+
.build()) {
579+
writer.write(group);
580+
}
581+
582+
try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) {
583+
// Verify size statistics are enabled for all columns except boolean_field
584+
for (BlockMetaData block : reader.getFooter().getBlocks()) {
585+
for (ColumnChunkMetaData column : block.getColumns()) {
586+
if (column.getPath().toDotString().equals("boolean_field")) {
587+
assertNull(column.getSizeStatistics());
588+
} else {
589+
assertTrue(column.getSizeStatistics().isValid());
590+
}
591+
}
592+
}
593+
}
594+
}
527595
}

0 commit comments

Comments
 (0)