Skip to content

Commit 9e231dc

Browse files
authored
Allow bytestreamsplit available via Hadoop Configuration (#3340)
1 parent 04e2f19 commit 9e231dc

File tree

2 files changed

+62
-0
lines changed

2 files changed

+62
-0
lines changed

parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,9 @@
8080
* # To enable/disable dictionary encoding
8181
* parquet.enable.dictionary=true # false to disable dictionary encoding
8282
*
83+
* # To enable/disable BYTE_STREAM_SPLIT encoding
84+
* parquet.enable.bytestreamsplit=false # true to enable BYTE_STREAM_SPLIT encoding
85+
*
8386
* # To enable/disable summary metadata aggregation at the end of a MR job
8487
* # The default is true (enabled)
8588
* parquet.enable.summary-metadata=true # false to disable summary aggregation
@@ -137,6 +140,7 @@ public static enum JobSummaryLevel {
137140
public static final String WRITE_SUPPORT_CLASS = "parquet.write.support.class";
138141
public static final String DICTIONARY_PAGE_SIZE = "parquet.dictionary.page.size";
139142
public static final String ENABLE_DICTIONARY = "parquet.enable.dictionary";
143+
public static final String ENABLE_BYTE_STREAM_SPLIT = "parquet.enable.bytestreamsplit";
140144
public static final String VALIDATION = "parquet.validation";
141145
public static final String WRITER_VERSION = "parquet.writer.version";
142146
public static final String MEMORY_POOL_RATIO = "parquet.memory.pool.ratio";
@@ -270,6 +274,11 @@ public static boolean getEnableDictionary(Configuration configuration) {
270274
return configuration.getBoolean(ENABLE_DICTIONARY, ParquetProperties.DEFAULT_IS_DICTIONARY_ENABLED);
271275
}
272276

277+
public static boolean getByteStreamSplitEnabled(Configuration configuration) {
278+
return configuration.getBoolean(
279+
ENABLE_BYTE_STREAM_SPLIT, ParquetProperties.DEFAULT_IS_BYTE_STREAM_SPLIT_ENABLED);
280+
}
281+
273282
public static int getMinRowCountForPageSizeCheck(Configuration configuration) {
274283
return configuration.getInt(
275284
MIN_ROW_COUNT_FOR_PAGE_SIZE_CHECK, ParquetProperties.DEFAULT_MINIMUM_RECORD_COUNT_FOR_CHECK);
@@ -503,6 +512,7 @@ public RecordWriter<Void, T> getRecordWriter(Configuration conf, Path file, Comp
503512
.withPageSize(getPageSize(conf))
504513
.withDictionaryPageSize(getDictionaryPageSize(conf))
505514
.withDictionaryEncoding(getEnableDictionary(conf))
515+
.withByteStreamSplitEncoding(getByteStreamSplitEnabled(conf))
506516
.withWriterVersion(getWriterVersion(conf))
507517
.estimateRowCountForPageSizeCheck(getEstimatePageSizeCheck(conf))
508518
.withMinRowCountForPageSizeCheck(getMinRowCountForPageSizeCheck(conf))
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
package org.apache.parquet.hadoop;
20+
21+
import static org.junit.Assert.assertEquals;
22+
import static org.junit.Assert.assertFalse;
23+
import static org.junit.Assert.assertTrue;
24+
25+
import org.apache.hadoop.conf.Configuration;
26+
import org.apache.parquet.column.ParquetProperties;
27+
import org.junit.Test;
28+
29+
public class TestByteStreamSplitConfiguration {
30+
@Test
31+
public void testDefault() throws Exception {
32+
Configuration conf = new Configuration();
33+
// default should be false
34+
assertEquals(
35+
ParquetProperties.DEFAULT_IS_BYTE_STREAM_SPLIT_ENABLED,
36+
ParquetOutputFormat.getByteStreamSplitEnabled(conf));
37+
}
38+
39+
@Test
40+
public void testSetTrue() throws Exception {
41+
Configuration conf = new Configuration();
42+
conf.setBoolean(ParquetOutputFormat.ENABLE_BYTE_STREAM_SPLIT, true);
43+
assertTrue(ParquetOutputFormat.getByteStreamSplitEnabled(conf));
44+
}
45+
46+
@Test
47+
public void testSetFalse() throws Exception {
48+
Configuration conf = new Configuration();
49+
conf.setBoolean(ParquetOutputFormat.ENABLE_BYTE_STREAM_SPLIT, false);
50+
assertFalse(ParquetOutputFormat.getByteStreamSplitEnabled(conf));
51+
}
52+
}

0 commit comments

Comments
 (0)