Skip to content

Commit 5b02b42

Browse files
committed
Adaptive compression for v2 page
1 parent ee2c751 commit 5b02b42

File tree

6 files changed

+355
-113
lines changed

6 files changed

+355
-113
lines changed

parquet-column/src/main/java/org/apache/parquet/column/ParquetProperties.java

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ public class ParquetProperties {
6868
public static final boolean DEFAULT_SIZE_STATISTICS_ENABLED = true;
6969

7070
public static final boolean DEFAULT_PAGE_WRITE_CHECKSUM_ENABLED = true;
71+
public static final double DEFAULT_V2_PAGE_COMPRESS_THRESHOLD = 0.98;
7172

7273
/**
7374
* @deprecated This shared instance can cause thread safety issues when used by multiple builders concurrently.
@@ -120,6 +121,7 @@ public static WriterVersion fromString(String name) {
120121
private final int statisticsTruncateLength;
121122
private final boolean statisticsEnabled;
122123
private final boolean sizeStatisticsEnabled;
124+
private final double v2PageCompressThreshold;
123125

124126
// The expected NDV (number of distinct values) for each columns
125127
private final ColumnProperty<Long> bloomFilterNDVs;
@@ -154,6 +156,8 @@ private ParquetProperties(Builder builder) {
154156
this.statisticsTruncateLength = builder.statisticsTruncateLength;
155157
this.statisticsEnabled = builder.statisticsEnabled;
156158
this.sizeStatisticsEnabled = builder.sizeStatisticsEnabled;
159+
this.v2PageCompressThreshold = builder.v2PageCompressThreshold;
160+
157161
this.bloomFilterNDVs = builder.bloomFilterNDVs.build();
158162
this.bloomFilterFPPs = builder.bloomFilterFPPs.build();
159163
this.bloomFilterEnabled = builder.bloomFilterEnabled.build();
@@ -322,6 +326,10 @@ public boolean getPageWriteChecksumEnabled() {
322326
return pageWriteChecksumEnabled;
323327
}
324328

329+
/**
 * Returns the compression threshold configured for V2 data pages.
 *
 * @return the threshold value copied from the builder when this instance was constructed
 */
public double v2PageCompressThreshold() {
330+
return v2PageCompressThreshold;
331+
}
332+
325333
public OptionalLong getBloomFilterNDV(ColumnDescriptor column) {
326334
Long ndv = bloomFilterNDVs.getValue(column);
327335
return ndv == null ? OptionalLong.empty() : OptionalLong.of(ndv);
@@ -388,7 +396,8 @@ public String toString() {
388396
+ "Page row count limit to " + getPageRowCountLimit() + '\n'
389397
+ "Writing page checksums is: " + (getPageWriteChecksumEnabled() ? "on" : "off") + '\n'
390398
+ "Statistics enabled: " + statisticsEnabled + '\n'
391-
+ "Size statistics enabled: " + sizeStatisticsEnabled;
399+
+ "Size statistics enabled: " + sizeStatisticsEnabled + '\n'
400+
+ "V2 page compress threshold: " + v2PageCompressThreshold;
392401
}
393402

394403
public static class Builder {
@@ -406,6 +415,7 @@ public static class Builder {
406415
private int statisticsTruncateLength = DEFAULT_STATISTICS_TRUNCATE_LENGTH;
407416
private boolean statisticsEnabled = DEFAULT_STATISTICS_ENABLED;
408417
private boolean sizeStatisticsEnabled = DEFAULT_SIZE_STATISTICS_ENABLED;
418+
private double v2PageCompressThreshold = DEFAULT_V2_PAGE_COMPRESS_THRESHOLD;
409419
private final ColumnProperty.Builder<Long> bloomFilterNDVs;
410420
private final ColumnProperty.Builder<Double> bloomFilterFPPs;
411421
private int maxBloomFilterBytes = DEFAULT_MAX_BLOOM_FILTER_BYTES;
@@ -460,6 +470,7 @@ private Builder(ParquetProperties toCopy) {
460470
this.extraMetaData = toCopy.extraMetaData;
461471
this.statistics = ColumnProperty.builder(toCopy.statistics);
462472
this.sizeStatistics = ColumnProperty.builder(toCopy.sizeStatistics);
473+
this.v2PageCompressThreshold = toCopy.v2PageCompressThreshold();
463474
}
464475

465476
/**
@@ -756,6 +767,21 @@ public Builder withSizeStatisticsEnabled(String columnPath, boolean enabled) {
756767
return this;
757768
}
758769

770+
/**
771+
* Sets the compression threshold for V2 data pages.
772+
*
773+
* <p>When the compression ratio (compressed size / uncompressed size) exceeds this threshold,
774+
* the uncompressed data will be used instead. For example, with a threshold of 0.98, if
775+
* compression only saves 2% of space, the data will not be compressed.
776+
*
777+
* @param threshold the compression ratio threshold, default is {@value #DEFAULT_V2_PAGE_COMPRESS_THRESHOLD}
778+
* @return this builder for method chaining
779+
*/
780+
public Builder withV2PageCompressThreshold(double threshold) {
781+
this.v2PageCompressThreshold = threshold;
782+
return this;
783+
}
784+
759785
public ParquetProperties build() {
760786
ParquetProperties properties = new ParquetProperties(this);
761787
// we pass a constructed but uninitialized factory to ParquetProperties above as currently

0 commit comments

Comments
 (0)