Skip to content

Commit

Permalink
PARQUET-34: Fix DictionaryFilter logic
Browse files: browse the repository at this point in the history
  • Loading branch information
clairemcginty committed Dec 6, 2024
1 parent 9241ce2 commit 9f4b270
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 15 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -506,18 +506,19 @@ public Boolean visit(Size size) {
}

try {
// We know the block has at most `dictSize` array element values
// We know the block has at least as many array elements as the dictionary sizes
final Set<?> dict = expandDictionary(meta);
if (dict == null) {
return BLOCK_MIGHT_MATCH;
}
int dictSize = dict.size();
int numDistinctValues = dict.size();
final boolean blockCannotMatch = size.filter(
(eq) -> eq > dictSize,
(lt) -> false,
(lte) -> false,
(gt) -> gt >= dictSize,
(gte) -> gte > dictSize);
(eq) -> eq < numDistinctValues,
(lt) -> lt <= numDistinctValues,
(lte) -> lte < numDistinctValues,
(gt) -> false,
(gte) -> false);

return blockCannotMatch ? BLOCK_CANNOT_MATCH : BLOCK_MIGHT_MATCH;
} catch (IOException e) {
LOG.warn("Failed to process dictionary for filter evaluation.", e);
Expand Down
Original file line number | Diff line number | Diff line change
Expand Up @@ -511,14 +511,14 @@ public void testGtEqDouble() throws Exception {
public void testSizeBinary() throws Exception {
BinaryColumn b = binaryColumn("repeated_binary_field");

// DictionaryFilter knows that `repeated_binary_field` column has at most 26 element values
assertTrue(canDrop(size(b, Operators.Size.Operator.GT, 26), ccmd, dictionaries));
assertTrue(canDrop(size(b, Operators.Size.Operator.GTE, 27), ccmd, dictionaries));
assertTrue(canDrop(size(b, Operators.Size.Operator.EQ, 27), ccmd, dictionaries));

assertFalse(canDrop(size(b, Operators.Size.Operator.LT, 27), ccmd, dictionaries));
assertFalse(canDrop(size(b, Operators.Size.Operator.LTE, 26), ccmd, dictionaries));
assertFalse(canDrop(size(b, Operators.Size.Operator.EQ, 26), ccmd, dictionaries));
// DictionaryFilter knows that `repeated_binary_field` column has at least 26 element values
assertFalse(canDrop(size(b, Operators.Size.Operator.GT, 26), ccmd, dictionaries));
assertFalse(canDrop(size(b, Operators.Size.Operator.GTE, 27), ccmd, dictionaries));
assertFalse(canDrop(size(b, Operators.Size.Operator.EQ, 27), ccmd, dictionaries));

assertTrue(canDrop(size(b, Operators.Size.Operator.LT, 26), ccmd, dictionaries));
assertTrue(canDrop(size(b, Operators.Size.Operator.LTE, 25), ccmd, dictionaries));
assertTrue(canDrop(size(b, Operators.Size.Operator.EQ, 25), ccmd, dictionaries));

// If column doesn't exist in meta, it should be treated as having size 0
BinaryColumn nonExistentColumn = binaryColumn("nonexistant_col");
Expand Down

0 comments on commit 9f4b270

Please sign in to comment.