-
Notifications
You must be signed in to change notification settings - Fork 1.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
GH-1452: implement Size() filter for repeated columns #3098
base: master
Are you sure you want to change the base?
Changes from all commits
58ab077
b0e3526
8c6a11a
ee219e5
d3323e2
c555fa2
5d08d3d
685c918
495d504
6a7207b
d2e8dc3
2e8ba22
9586427
39981e9
3b46d8f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -505,6 +505,82 @@ public <R> R filter( | |||||||||||||
} | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
public static final class Size implements FilterPredicate, Serializable { | ||||||||||||||
public enum Operator { | ||||||||||||||
EQ, | ||||||||||||||
LT, | ||||||||||||||
LTE, | ||||||||||||||
GT, | ||||||||||||||
GTE | ||||||||||||||
Comment on lines
+512
to
+514
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
IIRC, these are commonly used abbreviations? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think every project uses a slightly different abbreviation: https://github.com/apache/iceberg/blob/c0bd4bfbceeaf3cb6e4ead675fcb47232361af3c/api/src/main/java/org/apache/iceberg/transforms/ProjectionUtil.java#L42-L61 👯 |
||||||||||||||
} | ||||||||||||||
|
||||||||||||||
private final Column<?> column; | ||||||||||||||
private final Operator operator; | ||||||||||||||
private final int value; | ||||||||||||||
|
||||||||||||||
/**
 * Creates a size predicate for a column.
 *
 * @param column   the column whose number of values is compared
 * @param operator the comparison to apply to the number of values
 * @param value    the size to compare against
 * @throws IllegalArgumentException if {@code value} is negative, or if the predicate is
 *                                  {@code LT} with a value of 0
 */
Size(Column<?> column, Operator operator, int value) {
  // Validate before assigning any field. The message is built from the arguments rather than
  // from `this`: at this point `this.value` would still hold its default of 0, so toString()
  // would report the wrong size in the error.
  if (value < 0 || (operator == Operator.LT && value == 0)) {
    throw new IllegalArgumentException("Invalid predicate size("
        + column.getColumnPath().toDotString() + " "
        + operator.toString().toLowerCase() + " " + value
        + "): array size can never be negative");
  }
  this.column = column;
  this.operator = operator;
  this.value = value;
}
|
||||||||||||||
@Override | ||||||||||||||
public <R> R accept(Visitor<R> visitor) { | ||||||||||||||
return visitor.visit(this); | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
public int getValue() { | ||||||||||||||
return value; | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
public Column<?> getColumn() { | ||||||||||||||
return column; | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
public <R> R filter( | ||||||||||||||
Function<Integer, R> onEq, | ||||||||||||||
Function<Integer, R> onLt, | ||||||||||||||
Function<Integer, R> onLtEq, | ||||||||||||||
Function<Integer, R> onGt, | ||||||||||||||
Function<Integer, R> onGtEq) { | ||||||||||||||
if (operator == Operator.EQ) { | ||||||||||||||
return onEq.apply(value); | ||||||||||||||
} else if (operator == Operator.LT) { | ||||||||||||||
return onLt.apply(value); | ||||||||||||||
} else if (operator == Operator.LTE) { | ||||||||||||||
return onLtEq.apply(value); | ||||||||||||||
} else if (operator == Operator.GT) { | ||||||||||||||
return onGt.apply(value); | ||||||||||||||
} else if (operator == Operator.GTE) { | ||||||||||||||
return onGtEq.apply(value); | ||||||||||||||
} else { | ||||||||||||||
throw new UnsupportedOperationException("Operator " + operator + " cannot be used with size() filter"); | ||||||||||||||
} | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
@Override | ||||||||||||||
public boolean equals(Object o) { | ||||||||||||||
if (this == o) return true; | ||||||||||||||
if (o == null || getClass() != o.getClass()) return false; | ||||||||||||||
|
||||||||||||||
return column.equals(((Size) o).column) && operator == ((Size) o).operator && value == ((Size) o).value; | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
@Override | ||||||||||||||
public int hashCode() { | ||||||||||||||
return Objects.hash(column, operator, value); | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
@Override | ||||||||||||||
public String toString() { | ||||||||||||||
return "size(" + column.getColumnPath().toDotString() + " " | ||||||||||||||
+ operator.toString().toLowerCase() + " " + value + ")"; | ||||||||||||||
} | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
public static final class NotIn<T extends Comparable<T>> extends SetColumnFilterPredicate<T> { | ||||||||||||||
|
||||||||||||||
NotIn(Column<T> column, Set<T> values) { | ||||||||||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -38,9 +38,11 @@ | |
import org.apache.parquet.filter2.predicate.Operators.NotIn; | ||
import org.apache.parquet.filter2.predicate.Operators.Or; | ||
import org.apache.parquet.filter2.predicate.Operators.SetColumnFilterPredicate; | ||
import org.apache.parquet.filter2.predicate.Operators.Size; | ||
import org.apache.parquet.filter2.predicate.Operators.UserDefined; | ||
import org.apache.parquet.hadoop.metadata.ColumnPath; | ||
import org.apache.parquet.schema.MessageType; | ||
import org.apache.parquet.schema.Type; | ||
|
||
/** | ||
* Inspects the column types found in the provided {@link FilterPredicate} and compares them | ||
|
@@ -135,6 +137,12 @@ public <T extends Comparable<T>> Void visit(Contains<T> pred) { | |
return null; | ||
} | ||
|
||
@Override | ||
public Void visit(Size size) { | ||
validateColumn(size.getColumn(), true, true); | ||
return null; | ||
} | ||
|
||
@Override | ||
public Void visit(And and) { | ||
and.getLeft().accept(this); | ||
|
@@ -175,14 +183,15 @@ private <T extends Comparable<T>> void validateColumnFilterPredicate(SetColumnFi | |
} | ||
|
||
private <T extends Comparable<T>> void validateColumnFilterPredicate(Contains<T> pred) { | ||
validateColumn(pred.getColumn(), true); | ||
validateColumn(pred.getColumn(), true, false); | ||
} | ||
|
||
private <T extends Comparable<T>> void validateColumn(Column<T> column) { | ||
validateColumn(column, false); | ||
validateColumn(column, false, false); | ||
} | ||
|
||
private <T extends Comparable<T>> void validateColumn(Column<T> column, boolean shouldBeRepeated) { | ||
private <T extends Comparable<T>> void validateColumn( | ||
Column<T> column, boolean isRepeatedColumn, boolean mustBeRequired) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is it precise to rename these parameters as below?
|
||
ColumnPath path = column.getColumnPath(); | ||
|
||
Class<?> alreadySeen = columnTypesEncountered.get(path); | ||
|
@@ -204,15 +213,21 @@ private <T extends Comparable<T>> void validateColumn(Column<T> column, boolean | |
return; | ||
} | ||
|
||
if (shouldBeRepeated && descriptor.getMaxRepetitionLevel() == 0) { | ||
if (isRepeatedColumn && descriptor.getMaxRepetitionLevel() == 0) { | ||
throw new IllegalArgumentException( | ||
"FilterPredicate for column " + path.toDotString() + " requires a repeated " | ||
+ "schema, but found max repetition level " + descriptor.getMaxRepetitionLevel()); | ||
} else if (!shouldBeRepeated && descriptor.getMaxRepetitionLevel() > 0) { | ||
} else if (!isRepeatedColumn && descriptor.getMaxRepetitionLevel() > 0) { | ||
throw new IllegalArgumentException("FilterPredicates do not currently support repeated columns. " | ||
+ "Column " + path.toDotString() + " is repeated."); | ||
} | ||
|
||
if (mustBeRequired && descriptor.getPrimitiveType().isRepetition(Type.Repetition.OPTIONAL)) { | ||
throw new IllegalArgumentException("FilterPredicate for column " + path.toDotString() | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If the column is |
||
+ " requires schema to have repetition REQUIRED, but found " | ||
+ descriptor.getPrimitiveType().getRepetition() + "."); | ||
} | ||
|
||
ValidTypeMap.assertTypeValid(column, descriptor.getType()); | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,6 +20,7 @@ | |
|
||
import java.util.Arrays; | ||
import java.util.Objects; | ||
import java.util.function.Function; | ||
import org.apache.parquet.io.api.Binary; | ||
|
||
/** | ||
|
@@ -223,6 +224,83 @@ public void reset() { | |
} | ||
} | ||
|
||
class CountingValueInspector extends ValueInspector { | ||
private int observedValueCount; | ||
private final ValueInspector delegate; | ||
|
||
/** | ||
* Triggering function to update the underlying delegate. We want to be careful not to trigger before | ||
* all relevant column values have been considered. | ||
* | ||
* For example, given the predicate `size(col, LT, 3)` and a record with 4 array values, we don't want the | ||
* underlying `lt(3)` predicate to be evaluated on the first or second elements of the array, since it would | ||
* return a premature True value. | ||
*/ | ||
private final Function<Integer, Boolean> shouldUpdateDelegate; | ||
|
||
public CountingValueInspector(ValueInspector delegate, Function<Integer, Boolean> shouldUpdateDelegate) { | ||
this.observedValueCount = 0; | ||
this.delegate = delegate; | ||
this.shouldUpdateDelegate = shouldUpdateDelegate; | ||
clairemcginty marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
|
||
@Override | ||
public void updateNull() { | ||
delegate.update(observedValueCount); | ||
if (!delegate.isKnown()) { | ||
delegate.updateNull(); | ||
} | ||
setResult(delegate.getResult()); | ||
} | ||
|
||
@Override | ||
public void update(int value) { | ||
incrementCount(); | ||
} | ||
|
||
@Override | ||
public void update(long value) { | ||
incrementCount(); | ||
} | ||
|
||
@Override | ||
public void update(double value) { | ||
incrementCount(); | ||
} | ||
|
||
@Override | ||
public void update(float value) { | ||
incrementCount(); | ||
} | ||
|
||
@Override | ||
public void update(boolean value) { | ||
incrementCount(); | ||
} | ||
|
||
@Override | ||
public void update(Binary value) { | ||
incrementCount(); | ||
} | ||
|
||
@Override | ||
public void reset() { | ||
super.reset(); | ||
delegate.reset(); | ||
observedValueCount = 0; | ||
} | ||
|
||
private void incrementCount() { | ||
observedValueCount++; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is it better to check whether the result is known before doing anything? |
||
if (!delegate.isKnown() && shouldUpdateDelegate.apply(observedValueCount)) { | ||
delegate.update(observedValueCount); | ||
if (delegate.isKnown()) { | ||
setResult(delegate.getResult()); | ||
} | ||
} | ||
} | ||
} | ||
|
||
// base class for and / or | ||
abstract static class BinaryLogical implements IncrementallyUpdatedFilterPredicate { | ||
private final IncrementallyUpdatedFilterPredicate left; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should we support notEqual for completeness, though not that useful?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I was thinking the same for the
LogicalInverter
above