Skip to content

Commit 9f4b270

Browse files
committed
PARQUET-34: Fix DictionaryFilter logic
1 parent 9241ce2 commit 9f4b270

File tree

2 files changed

+16
-15
lines changed

2 files changed

+16
-15
lines changed

parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -506,18 +506,19 @@ public Boolean visit(Size size) {
506506
}
507507

508508
try {
509-
// We know the block has at most `dictSize` array element values
509+
// We know the block has at least as many array elements as the dictionary has distinct values
510510
final Set<?> dict = expandDictionary(meta);
511511
if (dict == null) {
512512
return BLOCK_MIGHT_MATCH;
513513
}
514-
int dictSize = dict.size();
514+
int numDistinctValues = dict.size();
515515
final boolean blockCannotMatch = size.filter(
516-
(eq) -> eq > dictSize,
517-
(lt) -> false,
518-
(lte) -> false,
519-
(gt) -> gt >= dictSize,
520-
(gte) -> gte > dictSize);
516+
(eq) -> eq < numDistinctValues,
517+
(lt) -> lt <= numDistinctValues,
518+
(lte) -> lte < numDistinctValues,
519+
(gt) -> false,
520+
(gte) -> false);
521+
521522
return blockCannotMatch ? BLOCK_CANNOT_MATCH : BLOCK_MIGHT_MATCH;
522523
} catch (IOException e) {
523524
LOG.warn("Failed to process dictionary for filter evaluation.", e);

parquet-hadoop/src/test/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilterTest.java

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -511,14 +511,14 @@ public void testGtEqDouble() throws Exception {
511511
public void testSizeBinary() throws Exception {
512512
BinaryColumn b = binaryColumn("repeated_binary_field");
513513

514-
// DictionaryFilter knows that `repeated_binary_field` column has at most 26 element values
515-
assertTrue(canDrop(size(b, Operators.Size.Operator.GT, 26), ccmd, dictionaries));
516-
assertTrue(canDrop(size(b, Operators.Size.Operator.GTE, 27), ccmd, dictionaries));
517-
assertTrue(canDrop(size(b, Operators.Size.Operator.EQ, 27), ccmd, dictionaries));
518-
519-
assertFalse(canDrop(size(b, Operators.Size.Operator.LT, 27), ccmd, dictionaries));
520-
assertFalse(canDrop(size(b, Operators.Size.Operator.LTE, 26), ccmd, dictionaries));
521-
assertFalse(canDrop(size(b, Operators.Size.Operator.EQ, 26), ccmd, dictionaries));
514+
// DictionaryFilter knows that the `repeated_binary_field` column has at least 26 element values
515+
assertFalse(canDrop(size(b, Operators.Size.Operator.GT, 26), ccmd, dictionaries));
516+
assertFalse(canDrop(size(b, Operators.Size.Operator.GTE, 27), ccmd, dictionaries));
517+
assertFalse(canDrop(size(b, Operators.Size.Operator.EQ, 27), ccmd, dictionaries));
518+
519+
assertTrue(canDrop(size(b, Operators.Size.Operator.LT, 26), ccmd, dictionaries));
520+
assertTrue(canDrop(size(b, Operators.Size.Operator.LTE, 25), ccmd, dictionaries));
521+
assertTrue(canDrop(size(b, Operators.Size.Operator.EQ, 25), ccmd, dictionaries));
522522

523523
// If column doesn't exist in meta, it should be treated as having size 0
524524
BinaryColumn nonExistentColumn = binaryColumn("nonexistant_col");

0 commit comments

Comments
 (0)