From 1ccaa6d98bcf8ead9e25fdd358fc0e398b05c8f2 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Thu, 24 Oct 2024 13:04:51 -0700 Subject: [PATCH] Cleanup and remove redundant parquet stats skipping tests --- .../engine/parquet_stats_skipping/tests.rs | 852 ++++-------------- kernel/src/scan/data_skipping.rs | 22 +- kernel/src/scan/data_skipping/tests.rs | 99 +- 3 files changed, 223 insertions(+), 750 deletions(-) diff --git a/kernel/src/engine/parquet_stats_skipping/tests.rs b/kernel/src/engine/parquet_stats_skipping/tests.rs index 069db64cb..5e4e4130b 100644 --- a/kernel/src/engine/parquet_stats_skipping/tests.rs +++ b/kernel/src/engine/parquet_stats_skipping/tests.rs @@ -1,9 +1,26 @@ use super::*; -use crate::expressions::{ArrayData, Expression, StructData}; +use crate::expressions::Expression as Expr; use crate::predicates::PredicateEvaluator; -use crate::schema::ArrayType; use crate::DataType; +const TRUE: Option = Some(true); +const FALSE: Option = Some(false); +const NULL: Option = None; + +macro_rules! expect_eq { + ( $expr: expr, $expect: expr, $fmt: literal ) => { + let expect = ($expect); + let result = ($expr); + assert!( + result == expect, + "Expected {} = {:?}, got {:?}", + format!($fmt), + expect, + result + ); + }; +} + struct UnimplementedTestFilter; impl ParquetStatsProvider for UnimplementedTestFilter { fn get_parquet_min_stat(&self, _col: &str, _data_type: &DataType) -> Option { @@ -23,284 +40,88 @@ impl ParquetStatsProvider for UnimplementedTestFilter { } } -struct JunctionTest { - inputs: &'static [Option], - expect_and: Option, - expect_or: Option, -} +/// Tests apply_variadic and apply_scalar +#[test] +fn test_junctions() { + use VariadicOperator::*; -macro_rules! expect_eq { - ( $expr: expr, $expect: expr, $fmt: literal ) => { - let expect = ($expect); - let result = ($expr); - assert!( - result == expect, - "Expected {} = {:?}, got {:?}", - format!($fmt), - expect, - result - ); - }; -} -impl JunctionTest { - fn new( - inputs: &'static [Option], - expect_and: Option, - expect_or: Option, - ) -> Self { - Self { - inputs, - expect_and, - expect_or, - } - } - fn do_test(&self) { - use VariadicOperator::*; - let filter = UnimplementedTestFilter; - let inputs: Vec<_> = self - .inputs + + let test_cases = &[ + // Every combo of 0, 1 and 2 inputs + (&[] as &[Option], TRUE, FALSE), + (&[TRUE], TRUE, TRUE), + (&[FALSE], FALSE, FALSE), + (&[NULL], NULL, NULL), + (&[TRUE, TRUE], TRUE, TRUE), + (&[TRUE, FALSE], FALSE, TRUE), + (&[TRUE, NULL], NULL, TRUE), + (&[FALSE, TRUE], FALSE, TRUE), + (&[FALSE, FALSE], FALSE, FALSE), + (&[FALSE, NULL], FALSE, NULL), + (&[NULL, TRUE], NULL, TRUE), + (&[NULL, FALSE], FALSE, NULL), + (&[NULL, NULL], NULL, NULL), + // Every combo of 1:2 + (&[TRUE, FALSE, FALSE], FALSE, TRUE), + (&[FALSE, TRUE, FALSE], FALSE, TRUE), + (&[FALSE, FALSE, TRUE], FALSE, TRUE), + (&[TRUE, NULL, NULL], NULL, TRUE), + (&[NULL, TRUE, NULL], NULL, TRUE), + (&[NULL, NULL, TRUE], NULL, TRUE), + (&[FALSE, TRUE, TRUE], FALSE, TRUE), + (&[TRUE, FALSE, TRUE], FALSE, TRUE), + (&[TRUE, TRUE, FALSE], FALSE, TRUE), + (&[FALSE, NULL, NULL], FALSE, NULL), + (&[NULL, FALSE, NULL], FALSE, NULL), + (&[NULL, NULL, FALSE], FALSE, NULL), + (&[NULL, TRUE, TRUE], NULL, TRUE), + (&[TRUE, NULL, TRUE], NULL, TRUE), + (&[TRUE, TRUE, NULL], NULL, TRUE), + (&[NULL, FALSE, FALSE], FALSE, NULL), + (&[FALSE, NULL, FALSE], FALSE, NULL), + (&[FALSE, FALSE, NULL], FALSE, NULL), + // Every unique ordering of 3 + (&[TRUE, FALSE, NULL], FALSE, TRUE), + (&[TRUE, NULL, FALSE], FALSE, TRUE), + (&[FALSE, TRUE, NULL], FALSE, TRUE), + (&[FALSE, NULL, TRUE], FALSE, TRUE), + (&[NULL, TRUE, FALSE], FALSE, TRUE), + (&[NULL, FALSE, TRUE], FALSE, TRUE), + ]; + + let filter = UnimplementedTestFilter; + for (inputs, expect_and, expect_or) in test_cases { + let inputs: Vec<_> = inputs .iter() .map(|val| match val { - Some(v) => Expression::literal(v), - None => Expression::null_literal(DataType::BOOLEAN), + Some(v) => Expr::literal(v), + None => Expr::null_literal(DataType::BOOLEAN), }) .collect(); expect_eq!( filter.eval_variadic(And, &inputs, false), - self.expect_and, + *expect_and, "AND({inputs:?})" ); expect_eq!( filter.eval_variadic(Or, &inputs, false), - self.expect_or, + *expect_or, "OR({inputs:?})" ); expect_eq!( filter.eval_variadic(And, &inputs, true), - self.expect_and.map(|val| !val), + expect_and.map(|val| !val), "NOT(AND({inputs:?}))" ); expect_eq!( filter.eval_variadic(Or, &inputs, true), - self.expect_or.map(|val| !val), + expect_or.map(|val| !val), "NOT(OR({inputs:?}))" ); } } -/// Tests apply_variadic and apply_scalar -#[test] -fn test_junctions() { - let test_case = JunctionTest::new; - const TRUE: Option = Some(true); - const FALSE: Option = Some(false); - const NULL: Option = None; - let test_cases = &[ - // Every combo of 0, 1 and 2 inputs - test_case(&[] as &[Option], TRUE, FALSE), - test_case(&[TRUE], TRUE, TRUE), - test_case(&[FALSE], FALSE, FALSE), - test_case(&[NULL], NULL, NULL), - test_case(&[TRUE, TRUE], TRUE, TRUE), - test_case(&[TRUE, FALSE], FALSE, TRUE), - test_case(&[TRUE, NULL], NULL, TRUE), - test_case(&[FALSE, TRUE], FALSE, TRUE), - test_case(&[FALSE, FALSE], FALSE, FALSE), - test_case(&[FALSE, NULL], FALSE, NULL), - test_case(&[NULL, TRUE], NULL, TRUE), - test_case(&[NULL, FALSE], FALSE, NULL), - test_case(&[NULL, NULL], NULL, NULL), - // Every combo of 1:2 - test_case(&[TRUE, FALSE, FALSE], FALSE, TRUE), - test_case(&[FALSE, TRUE, FALSE], FALSE, TRUE), - test_case(&[FALSE, FALSE, TRUE], FALSE, TRUE), - test_case(&[TRUE, NULL, NULL], NULL, TRUE), - test_case(&[NULL, TRUE, NULL], NULL, TRUE), - test_case(&[NULL, NULL, TRUE], NULL, TRUE), - test_case(&[FALSE, TRUE, TRUE], FALSE, TRUE), - test_case(&[TRUE, FALSE, TRUE], FALSE, TRUE), - test_case(&[TRUE, TRUE, FALSE], FALSE, TRUE), - test_case(&[FALSE, NULL, NULL], FALSE, NULL), - test_case(&[NULL, FALSE, NULL], FALSE, NULL), - test_case(&[NULL, NULL, FALSE], FALSE, NULL), - test_case(&[NULL, TRUE, TRUE], NULL, TRUE), - test_case(&[TRUE, NULL, TRUE], NULL, TRUE), - test_case(&[TRUE, TRUE, NULL], NULL, TRUE), - test_case(&[NULL, FALSE, FALSE], FALSE, NULL), - test_case(&[FALSE, NULL, FALSE], FALSE, NULL), - test_case(&[FALSE, FALSE, NULL], FALSE, NULL), - // Every unique ordering of 3 - test_case(&[TRUE, FALSE, NULL], FALSE, TRUE), - test_case(&[TRUE, NULL, FALSE], FALSE, TRUE), - test_case(&[FALSE, TRUE, NULL], FALSE, TRUE), - test_case(&[FALSE, NULL, TRUE], FALSE, TRUE), - test_case(&[NULL, TRUE, FALSE], FALSE, TRUE), - test_case(&[NULL, FALSE, TRUE], FALSE, TRUE), - ]; - for test_case in test_cases { - test_case.do_test(); - } -} - -// tests apply_binary_scalars -#[test] -fn test_binary_scalars() { - use Scalar::*; - let smaller_values = &[ - Integer(1), - Long(1), - Short(1), - Byte(1), - Float(1.0), - Double(1.0), - String("1".into()), - Boolean(false), - Timestamp(1), - TimestampNtz(1), - Date(1), - Binary(vec![1]), - Decimal(1, 10, 10), // invalid value, - Null(DataType::LONG), - Struct(StructData::try_new(vec![], vec![]).unwrap()), - Array(ArrayData::new( - ArrayType::new(DataType::LONG, false), - vec![], - )), - ]; - let larger_values = &[ - Integer(10), - Long(10), - Short(10), - Byte(10), - Float(10.0), - Double(10.0), - String("10".into()), - Boolean(true), - Timestamp(10), - TimestampNtz(10), - Date(10), - Binary(vec![10]), - Decimal(10, 10, 10), // invalid value - Null(DataType::LONG), - Struct(StructData::try_new(vec![], vec![]).unwrap()), - Array(ArrayData::new( - ArrayType::new(DataType::LONG, false), - vec![], - )), - ]; - - // scalars of different types are always incomparable - use BinaryOperator::*; - let binary_ops = [ - Equal, - NotEqual, - LessThan, - LessThanOrEqual, - GreaterThan, - GreaterThanOrEqual, - ]; - let compare = |op, a, b| { - PredicateEvaluator::eval_binary_scalars(&UnimplementedTestFilter, op, a, b, false) - }; - for (i, a) in smaller_values.iter().enumerate() { - for b in smaller_values.iter().skip(i + 1) { - for op in binary_ops { - let result = compare(op, a, b); - let a_type = a.data_type(); - let b_type = b.data_type(); - assert!( - result.is_none(), - "{a_type:?} should not be comparable to {b_type:?}" - ); - } - } - } - - let expect_if_comparable_type = |s: &_, expect| match s { - Null(_) | Decimal(..) | Struct(_) | Array(_) => None, - _ => Some(expect), - }; - - // Test same-type comparisons where a == b - for (a, b) in smaller_values.iter().zip(smaller_values.iter()) { - expect_eq!( - compare(Equal, a, b), - expect_if_comparable_type(a, true), - "{a:?} == {b:?}" - ); - - expect_eq!( - compare(NotEqual, a, b), - expect_if_comparable_type(a, false), - "{a:?} != {b:?}" - ); - - expect_eq!( - compare(LessThan, a, b), - expect_if_comparable_type(a, false), - "{a:?} < {b:?}" - ); - - expect_eq!( - compare(GreaterThan, a, b), - expect_if_comparable_type(a, false), - "{a:?} > {b:?}" - ); - - expect_eq!( - compare(LessThanOrEqual, a, b), - expect_if_comparable_type(a, true), - "{a:?} <= {b:?}" - ); - - expect_eq!( - compare(GreaterThanOrEqual, a, b), - expect_if_comparable_type(a, true), - "{a:?} >= {b:?}" - ); - } - - // Test same-type comparisons where a < b - for (a, b) in smaller_values.iter().zip(larger_values.iter()) { - expect_eq!( - compare(Equal, a, b), - expect_if_comparable_type(a, false), - "{a:?} == {b:?}" - ); - - expect_eq!( - compare(NotEqual, a, b), - expect_if_comparable_type(a, true), - "{a:?} != {b:?}" - ); - - expect_eq!( - compare(LessThan, a, b), - expect_if_comparable_type(a, true), - "{a:?} < {b:?}" - ); - - expect_eq!( - compare(GreaterThan, a, b), - expect_if_comparable_type(a, false), - "{a:?} > {b:?}" - ); - - expect_eq!( - compare(LessThanOrEqual, a, b), - expect_if_comparable_type(a, true), - "{a:?} <= {b:?}" - ); - - expect_eq!( - compare(GreaterThanOrEqual, a, b), - expect_if_comparable_type(a, false), - "{a:?} >= {b:?}" - ); - } -} - struct MinMaxTestFilter { min: Option, max: Option, @@ -333,376 +154,53 @@ impl ParquetStatsProvider for MinMaxTestFilter { } } -#[test] -fn test_binary_eq_ne() { - use BinaryOperator::*; - - const LO: Scalar = Scalar::Long(1); - const MID: Scalar = Scalar::Long(10); - const HI: Scalar = Scalar::Long(100); - let col = &Expression::column("x"); - - for inverted in [false, true] { - // negative test -- mismatched column type - expect_eq!( - MinMaxTestFilter::new(MID.into(), MID.into()).eval_binary( - Equal, - col, - &Expression::literal("10"), - inverted, - ), - None, - "{col} == '10' (min: {MID}, max: {MID}, inverted: {inverted})" - ); - - // quick test for literal-literal comparisons - expect_eq!( - MinMaxTestFilter::new(MID.into(), MID.into()).eval_binary( - Equal, - &MID.into(), - &MID.into(), - inverted, - ), - Some(!inverted), - "{MID} == {MID} (min: {MID}, max: {MID}, inverted: {inverted})" - ); - - // quick test for literal-column comparisons - expect_eq!( - MinMaxTestFilter::new(MID.into(), MID.into()).eval_binary( - Equal, - &MID.into(), - col, - inverted, - ), - Some(!inverted), - "{MID} == {col} (min: {MID}, max: {MID}, inverted: {inverted})" - ); - - expect_eq!( - MinMaxTestFilter::new(MID.into(), MID.into()).eval_binary( - Equal, - col, - &MID.into(), - inverted, - ), - Some(!inverted), - "{col} == {MID} (min: {MID}, max: {MID}, inverted: {inverted})" - ); - - expect_eq!( - MinMaxTestFilter::new(LO.into(), HI.into()).eval_binary( - Equal, - col, - &MID.into(), - inverted, - ), - Some(true), // min..max range includes both EQ and NE - "{col} == {MID} (min: {LO}, max: {HI}, inverted: {inverted})" - ); - - expect_eq!( - MinMaxTestFilter::new(LO.into(), MID.into()).eval_binary( - Equal, - col, - &HI.into(), - inverted, - ), - Some(inverted), - "{col} == {HI} (min: {LO}, max: {MID}, inverted: {inverted})" - ); - - expect_eq!( - MinMaxTestFilter::new(MID.into(), HI.into()).eval_binary( - Equal, - col, - &LO.into(), - inverted, - ), - Some(inverted), - "{col} == {LO} (min: {MID}, max: {HI}, inverted: {inverted})" - ); - - // negative test -- mismatched column type - expect_eq!( - MinMaxTestFilter::new(MID.into(), MID.into()).eval_binary( - NotEqual, - col, - &Expression::literal("10"), - inverted, - ), - None, - "{col} != '10' (min: {MID}, max: {MID}, inverted: {inverted})" - ); - - expect_eq!( - MinMaxTestFilter::new(MID.into(), MID.into()).eval_binary( - NotEqual, - col, - &MID.into(), - inverted, - ), - Some(inverted), - "{col} != {MID} (min: {MID}, max: {MID}, inverted: {inverted})" - ); - - expect_eq!( - MinMaxTestFilter::new(LO.into(), HI.into()).eval_binary( - NotEqual, - col, - &MID.into(), - inverted, - ), - Some(true), // min..max range includes both EQ and NE - "{col} != {MID} (min: {LO}, max: {HI}, inverted: {inverted})" - ); - - expect_eq!( - MinMaxTestFilter::new(LO.into(), MID.into()).eval_binary( - NotEqual, - col, - &HI.into(), - inverted, - ), - Some(!inverted), - "{col} != {HI} (min: {LO}, max: {MID}, inverted: {inverted})" - ); - - expect_eq!( - MinMaxTestFilter::new(MID.into(), HI.into()).eval_binary( - NotEqual, - col, - &LO.into(), - inverted, - ), - Some(!inverted), - "{col} != {LO} (min: {MID}, max: {HI}, inverted: {inverted})" - ); - } -} #[test] -fn test_binary_lt_ge() { - use BinaryOperator::*; - - const LO: Scalar = Scalar::Long(1); - const MID: Scalar = Scalar::Long(10); - const HI: Scalar = Scalar::Long(100); - let col = &Expression::column("x"); - - for inverted in [false, true] { - expect_eq!( - MinMaxTestFilter::new(MID.into(), MID.into()).eval_binary( - LessThan, - col, - &MID.into(), - inverted, - ), - Some(inverted), - "{col} < {MID} (min: {MID}, max: {MID}, inverted: {inverted})" - ); - - expect_eq!( - MinMaxTestFilter::new(LO.into(), HI.into()).eval_binary( - LessThan, - col, - &MID.into(), - inverted, - ), - Some(true), // min..max range includes both LT and GE - "{col} < {MID} (min: {LO}, max: {HI}, inverted: {inverted})" - ); - - expect_eq!( - MinMaxTestFilter::new(LO.into(), MID.into()).eval_binary( - LessThan, - col, - &HI.into(), - inverted, - ), - Some(!inverted), - "{col} < {HI} (min: {LO}, max: {MID}, inverted: {inverted})" - ); - - expect_eq!( - MinMaxTestFilter::new(MID.into(), HI.into()).eval_binary( - LessThan, - col, - &LO.into(), - inverted, - ), - Some(inverted), - "{col} < {LO} (min: {MID}, max: {HI}, inverted: {inverted})" - ); - - expect_eq!( - MinMaxTestFilter::new(MID.into(), MID.into()).eval_binary( - GreaterThanOrEqual, - col, - &MID.into(), - inverted, - ), - Some(!inverted), - "{col} >= {MID} (min: {MID}, max: {MID}, inverted: {inverted})" - ); - - expect_eq!( - MinMaxTestFilter::new(LO.into(), HI.into()).eval_binary( - GreaterThanOrEqual, - col, - &MID.into(), - inverted, - ), - Some(true), // min..max range includes both EQ and NE - "{col} >= {MID} (min: {LO}, max: {HI}, inverted: {inverted})" - ); - - expect_eq!( - MinMaxTestFilter::new(LO.into(), MID.into()).eval_binary( - GreaterThanOrEqual, - col, - &HI.into(), - inverted, - ), - Some(inverted), - "{col} >= {HI} (min: {LO}, max: {MID}, inverted: {inverted})" - ); - - expect_eq!( - MinMaxTestFilter::new(MID.into(), HI.into()).eval_binary( - GreaterThanOrEqual, - col, - &LO.into(), - inverted, - ), - Some(!inverted), - "{col} >= {LO} (min: {MID}, max: {HI}, inverted: {inverted})" - ); - } -} - -#[test] -fn test_binary_le_gt() { - use BinaryOperator::*; - - const LO: Scalar = Scalar::Long(1); - const MID: Scalar = Scalar::Long(10); - const HI: Scalar = Scalar::Long(100); - let col = &Expression::column("x"); - - for inverted in [false, true] { - // negative test -- mismatched column type - expect_eq!( - MinMaxTestFilter::new(MID.into(), MID.into()).eval_binary( - LessThanOrEqual, - col, - &Expression::literal("10"), - inverted, - ), - None, - "{col} <= '10' (min: {MID}, max: {MID}, inverted: {inverted})" - ); - - expect_eq!( - MinMaxTestFilter::new(MID.into(), MID.into()).eval_binary( - LessThanOrEqual, - col, - &MID.into(), - inverted, - ), - Some(!inverted), - "{col} <= {MID} (min: {MID}, max: {MID}, inverted: {inverted})" - ); +fn test_eval_binary_comparisons() { + let col = &Expr::column("x"); + let five = &Scalar::from(5); + let ten = &Scalar::from(10); + let fifteen = &Scalar::from(15); + let null = &Scalar::Null(DataType::INTEGER); + + let expressions = [ + Expr::lt(col.clone(), ten.clone()), + Expr::le(col.clone(), ten.clone()), + Expr::eq(col.clone(), ten.clone()), + Expr::ne(col.clone(), ten.clone()), + Expr::gt(col.clone(), ten.clone()), + Expr::ge(col.clone(), ten.clone()), + ]; - expect_eq!( - MinMaxTestFilter::new(LO.into(), HI.into()).eval_binary( - LessThanOrEqual, - col, - &MID.into(), - inverted, - ), - Some(true), // min..max range includes both LT and GE - "{col} <= {MID} (min: {LO}, max: {HI}, inverted: {inverted})" - ); + let do_test = |min: &Scalar, max: &Scalar, expected: &[Option]| { + let filter = MinMaxTestFilter::new(Some(min.clone()), Some(max.clone())); + for (expr, expect) in expressions.iter().zip(expected.iter()) { + expect_eq!( + filter.eval_expr(&expr, false), + *expect, + "{expr:#?} with [{min}..{max}]" + ); + } + }; - expect_eq!( - MinMaxTestFilter::new(LO.into(), MID.into()).eval_binary( - LessThanOrEqual, - col, - &HI.into(), - inverted, - ), - Some(!inverted), - "{col} <= {HI} (min: {LO}, max: {MID}, inverted: {inverted})" - ); + // value < min = max (15..15 = 10, 15..15 <= 10, etc) + do_test(fifteen, fifteen, &[FALSE, FALSE, FALSE, TRUE, TRUE, TRUE]); - expect_eq!( - MinMaxTestFilter::new(MID.into(), HI.into()).eval_binary( - LessThanOrEqual, - col, - &LO.into(), - inverted, - ), - Some(inverted), - "{col} <= {LO} (min: {MID}, max: {HI}, inverted: {inverted})" - ); + // min = max = value (10..10 = 10, 10..10 <= 10, etc) + // + // NOTE: missing min or max stat produces NULL output if the expression needed it. + do_test(ten, ten, &[FALSE, TRUE, TRUE, FALSE, FALSE, TRUE]); + do_test(null, ten, &[NULL, NULL, NULL, NULL, FALSE, TRUE]); + do_test(ten, null, &[FALSE, TRUE, NULL, NULL, NULL, NULL]); - // negative test -- mismatched column type - expect_eq!( - MinMaxTestFilter::new(MID.into(), MID.into()).eval_binary( - GreaterThan, - col, - &Expression::literal("10"), - inverted, - ), - None, - "{col} > '10' (min: {MID}, max: {MID}, inverted: {inverted})" - ); + // min = max < value (5..5 = 10, 5..5 <= 10, etc) + do_test(five, five, &[TRUE, TRUE, FALSE, TRUE, FALSE, FALSE]); - expect_eq!( - MinMaxTestFilter::new(MID.into(), MID.into()).eval_binary( - GreaterThan, - col, - &MID.into(), - inverted, - ), - Some(inverted), - "{col} > {MID} (min: {MID}, max: {MID}, inverted: {inverted})" - ); + // value = min < max (5..15 = 10, 5..15 <= 10, etc) + do_test(ten, fifteen, &[FALSE, TRUE, TRUE, TRUE, TRUE, TRUE]); - expect_eq!( - MinMaxTestFilter::new(LO.into(), HI.into()).eval_binary( - GreaterThan, - col, - &MID.into(), - inverted, - ), - Some(true), // min..max range includes both EQ and NE - "{col} > {MID} (min: {LO}, max: {HI}, inverted: {inverted})" - ); - - expect_eq!( - MinMaxTestFilter::new(LO.into(), MID.into()).eval_binary( - GreaterThan, - col, - &HI.into(), - inverted, - ), - Some(inverted), - "{col} > {HI} (min: {LO}, max: {MID}, inverted: {inverted})" - ); - - expect_eq!( - MinMaxTestFilter::new(MID.into(), HI.into()).eval_binary( - GreaterThan, - col, - &LO.into(), - inverted, - ), - Some(!inverted), - "{col} > {LO} (min: {MID}, max: {HI}, inverted: {inverted})" - ); - } + // min < value < max (5..15 = 10, 5..15 <= 10, etc) + do_test(five, fifteen, &[TRUE, TRUE, TRUE, TRUE, TRUE, TRUE]); } struct NullCountTestFilter { @@ -736,59 +234,29 @@ impl ParquetStatsProvider for NullCountTestFilter { } #[test] -fn test_not_null() { - use UnaryOperator::IsNull; - - let col = &Expression::column("x"); - for inverted in [false, true] { - expect_eq!( - NullCountTestFilter::new(None, 10).eval_unary(IsNull, col, inverted), - None, - "{col} IS NULL (nullcount: None, rowcount: 10, inverted: {inverted})" - ); - - expect_eq!( - NullCountTestFilter::new(Some(0), 10).eval_unary(IsNull, col, inverted), - Some(inverted), - "{col} IS NULL (nullcount: 0, rowcount: 10, inverted: {inverted})" - ); +fn test_eval_is_null() { + let col = &Expr::column("x"); + let expressions = [Expr::is_null(col.clone()), !Expr::is_null(col.clone())]; + + let do_test = |nullcount: i64, expected: &[Option]| { + let filter = NullCountTestFilter::new(Some(nullcount), 2); + for (expr, expect) in expressions.iter().zip(expected) { + expect_eq!( + filter.eval_expr(expr, false), + *expect, + "{expr:#?} ({nullcount} nulls)" + ); + } + }; - expect_eq!( - NullCountTestFilter::new(Some(5), 10).eval_unary(IsNull, col, inverted), - Some(true), - "{col} IS NULL (nullcount: 5, rowcount: 10, inverted: {inverted})" - ); + // no nulls + do_test(0, &[FALSE, TRUE]); - expect_eq!( - NullCountTestFilter::new(Some(10), 10).eval_unary(IsNull, col, inverted), - Some(!inverted), - "{col} IS NULL (nullcount: 10, rowcount: 10, inverted: {inverted})" - ); - } -} + // some nulls + do_test(1, &[TRUE, TRUE]); -#[test] -fn test_bool_col() { - use Scalar::Boolean; - const TRUE: Scalar = Boolean(true); - const FALSE: Scalar = Boolean(false); - for inverted in [false, true] { - expect_eq!( - MinMaxTestFilter::new(TRUE.into(), TRUE.into()).eval_column("x", inverted), - Some(!inverted), - "x as boolean (min: TRUE, max: TRUE, inverted: {inverted})" - ); - expect_eq!( - MinMaxTestFilter::new(FALSE.into(), TRUE.into()).eval_column("x", inverted), - Some(true), - "x as boolean (min: FALSE, max: TRUE, inverted: {inverted})" - ); - expect_eq!( - MinMaxTestFilter::new(FALSE.into(), FALSE.into()).eval_column("x", inverted), - Some(inverted), - "x as boolean (min: FALSE, max: FALSE, inverted: {inverted})" - ); - } + // all nulls + do_test(2, &[TRUE, FALSE]); } struct AllNullTestFilter; @@ -812,44 +280,44 @@ impl ParquetStatsProvider for AllNullTestFilter { #[test] fn test_sql_where() { - let col = &Expression::column("x"); - let val = &Expression::literal(1); - const NULL: Expression = Expression::Literal(Scalar::Null(DataType::BOOLEAN)); - const FALSE: Expression = Expression::Literal(Scalar::Boolean(false)); - const TRUE: Expression = Expression::Literal(Scalar::Boolean(true)); + let col = &Expr::column("x"); + let val = &Expr::literal(1); + const NULL: Expr = Expr::Literal(Scalar::Null(DataType::BOOLEAN)); + const FALSE: Expr = Expr::Literal(Scalar::Boolean(false)); + const TRUE: Expr = Expr::Literal(Scalar::Boolean(true)); // Basic sanity checks expect_eq!(AllNullTestFilter.eval_sql_where(val), None, "WHERE {val}"); expect_eq!(AllNullTestFilter.eval_sql_where(col), None, "WHERE {col}"); expect_eq!( - AllNullTestFilter.eval_sql_where(&Expression::is_null(col.clone())), + AllNullTestFilter.eval_sql_where(&Expr::is_null(col.clone())), Some(true), // No injected NULL checks "WHERE {col} IS NULL" ); expect_eq!( - AllNullTestFilter.eval_sql_where(&Expression::lt(TRUE, FALSE)), + AllNullTestFilter.eval_sql_where(&Expr::lt(TRUE, FALSE)), Some(false), // Injected NULL checks don't short circuit when inputs are NOT NULL "WHERE {TRUE} < {FALSE}" ); // Constrast normal vs SQL WHERE semantics - comparison expect_eq!( - AllNullTestFilter.eval_expr(&Expression::lt(col.clone(), val.clone()), false), + AllNullTestFilter.eval_expr(&Expr::lt(col.clone(), val.clone()), false), None, "{col} < {val}" ); expect_eq!( - AllNullTestFilter.eval_sql_where(&Expression::lt(col.clone(), val.clone())), + AllNullTestFilter.eval_sql_where(&Expr::lt(col.clone(), val.clone())), Some(false), "WHERE {col} < {val}" ); expect_eq!( - AllNullTestFilter.eval_expr(&Expression::lt(val.clone(), col.clone()), false), + AllNullTestFilter.eval_expr(&Expr::lt(val.clone(), col.clone()), false), None, "{val} < {col}" ); expect_eq!( - AllNullTestFilter.eval_sql_where(&Expression::lt(val.clone(), col.clone())), + AllNullTestFilter.eval_sql_where(&Expr::lt(val.clone(), col.clone())), Some(false), "WHERE {val} < {col}" ); @@ -857,16 +325,16 @@ fn test_sql_where() { // Constrast normal vs SQL WHERE semantics - comparison inside AND expect_eq!( AllNullTestFilter.eval_expr( - &Expression::and(NULL, Expression::lt(col.clone(), val.clone())), + &Expr::and(NULL, Expr::lt(col.clone(), val.clone())), false ), None, "{NULL} AND {col} < {val}" ); expect_eq!( - AllNullTestFilter.eval_sql_where(&Expression::and( + AllNullTestFilter.eval_sql_where(&Expr::and( NULL, - Expression::lt(col.clone(), val.clone()), + Expr::lt(col.clone(), val.clone()), )), Some(false), "WHERE {NULL} AND {col} < {val}" @@ -874,16 +342,16 @@ fn test_sql_where() { expect_eq!( AllNullTestFilter.eval_expr( - &Expression::and(TRUE, Expression::lt(col.clone(), val.clone())), + &Expr::and(TRUE, Expr::lt(col.clone(), val.clone())), false ), None, // NULL (from the NULL check) is stronger than TRUE "{TRUE} AND {col} < {val}" ); expect_eq!( - AllNullTestFilter.eval_sql_where(&Expression::and( + AllNullTestFilter.eval_sql_where(&Expr::and( TRUE, - Expression::lt(col.clone(), val.clone()), + Expr::lt(col.clone(), val.clone()), )), Some(false), // FALSE (from the NULL check) is stronger than TRUE "WHERE {TRUE} AND {col} < {val}" @@ -892,9 +360,9 @@ fn test_sql_where() { // Contrast normal vs. SQL WHERE semantics - comparison inside AND inside AND expect_eq!( AllNullTestFilter.eval_expr( - &Expression::and( + &Expr::and( TRUE, - Expression::and(NULL, Expression::lt(col.clone(), val.clone())), + Expr::and(NULL, Expr::lt(col.clone(), val.clone())), ), false, ), @@ -902,9 +370,9 @@ fn test_sql_where() { "{TRUE} AND ({NULL} AND {col} < {val})" ); expect_eq!( - AllNullTestFilter.eval_sql_where(&Expression::and( + AllNullTestFilter.eval_sql_where(&Expr::and( TRUE, - Expression::and(NULL, Expression::lt(col.clone(), val.clone())), + Expr::and(NULL, Expr::lt(col.clone(), val.clone())), )), Some(false), "WHERE {TRUE} AND ({NULL} AND {col} < {val})" @@ -913,9 +381,9 @@ fn test_sql_where() { // Semantics are the same for comparison inside OR inside AND expect_eq!( AllNullTestFilter.eval_expr( - &Expression::or( + &Expr::or( FALSE, - Expression::and(NULL, Expression::lt(col.clone(), val.clone())), + Expr::and(NULL, Expr::lt(col.clone(), val.clone())), ), false, ), @@ -923,9 +391,9 @@ fn test_sql_where() { "{FALSE} OR ({NULL} AND {col} < {val})" ); expect_eq!( - AllNullTestFilter.eval_sql_where(&Expression::or( + AllNullTestFilter.eval_sql_where(&Expr::or( FALSE, - Expression::and(NULL, Expression::lt(col.clone(), val.clone())), + Expr::and(NULL, Expr::lt(col.clone(), val.clone())), )), None, "WHERE {FALSE} OR ({NULL} AND {col} < {val})" diff --git a/kernel/src/scan/data_skipping.rs b/kernel/src/scan/data_skipping.rs index 094c215b4..34ca7e9ed 100644 --- a/kernel/src/scan/data_skipping.rs +++ b/kernel/src/scan/data_skipping.rs @@ -260,13 +260,21 @@ impl DataSkippingPredicateEvaluator for DataSkippingPredicateCreator { if inverted { op = op.invert(); } - // NOTE: By flattening, AND can produce TRUE where NULL would otherwise be expected. But - // this is for data skipping, which only cares about FALSE. So we accept and keep it simple. - let exprs = exprs.into_iter(); - let exprs: Vec<_> = match op { - VariadicOperator::And => exprs.flatten().collect(), - VariadicOperator::Or => exprs.collect::>()?, - }; + // NOTE: We can potentially see a LOT of NULL inputs in a big WHERE clause with lots of + // unsupported data skipping operations. We can't "just" flatten them all away for AND, + // because could produce TRUE where NULL would otherwise be expected. Similarly, we don't + // want to "just" try_collect inputs for OR, because that can cause OR to produce NULL where + // FALSE would otherwise be expected. So, we filter out all nulls except the first, + // observing that one NULL is enough to produce the correct behavior during predicate eval. + let mut have_null = false; + let exprs: Vec<_> = exprs.into_iter().flat_map(|e| match e { + Some(expr) => Some(expr), + None if have_null => None, + None => { + have_null = true; + Some(Expr::null_literal(DataType::BOOLEAN)) + } + }).collect(); Some(Expr::variadic(op, exprs)) } } diff --git a/kernel/src/scan/data_skipping/tests.rs b/kernel/src/scan/data_skipping/tests.rs index 175ea0b72..018eb2f76 100644 --- a/kernel/src/scan/data_skipping/tests.rs +++ b/kernel/src/scan/data_skipping/tests.rs @@ -105,52 +105,49 @@ fn test_eval_binary_comparisons() { #[test] fn test_eval_variadic() { - // NOTE: The data skipping logic can produce TRUE instead of NULL in some cases. This is - // harmless, because data skipping only cares about FALSE (NULL and TRUE are treated the same). let test_cases = &[ - // Every combo of 0, 1 and 2 inputs - (&[] as &[Option], TRUE, FALSE, FALSE, TRUE), - (&[TRUE], TRUE, TRUE, FALSE, FALSE), - (&[FALSE], FALSE, FALSE, TRUE, TRUE), - (&[NULL], TRUE, NULL, NULL, TRUE), // NULL -> TRUE - (&[TRUE, TRUE], TRUE, TRUE, FALSE, FALSE), - (&[TRUE, FALSE], FALSE, TRUE, TRUE, FALSE), - (&[TRUE, NULL], TRUE, NULL, NULL, FALSE), // NULL -> TRUE - (&[FALSE, TRUE], FALSE, TRUE, TRUE, FALSE), - (&[FALSE, FALSE], FALSE, FALSE, TRUE, TRUE), - (&[FALSE, NULL], FALSE, NULL, NULL, TRUE), // NULL -> TRUE - (&[NULL, TRUE], TRUE, NULL, NULL, FALSE), // NULL -> TRUE - (&[NULL, FALSE], FALSE, NULL, NULL, TRUE), // NULL -> TRUE - (&[NULL, NULL], TRUE, NULL, NULL, TRUE), // NULL -> TRUE + (&[]as &[Option], TRUE, FALSE), + (&[TRUE], TRUE, TRUE), + (&[FALSE], FALSE, FALSE), + (&[NULL], NULL, NULL), + (&[TRUE, TRUE], TRUE, TRUE), + (&[TRUE, FALSE], FALSE, TRUE), + (&[TRUE, NULL], NULL, TRUE), + (&[FALSE, TRUE], FALSE, TRUE), + (&[FALSE, FALSE], FALSE, FALSE), + (&[FALSE, NULL], FALSE, NULL), + (&[NULL, TRUE], NULL, TRUE), + (&[NULL, FALSE], FALSE, NULL), + (&[NULL, NULL], NULL, NULL), // Every combo of 1:2 - (&[TRUE, FALSE, FALSE], FALSE, TRUE, TRUE, FALSE), - (&[FALSE, TRUE, FALSE], FALSE, TRUE, TRUE, FALSE), - (&[FALSE, FALSE, TRUE], FALSE, TRUE, TRUE, FALSE), - (&[TRUE, NULL, NULL], TRUE, NULL, NULL, FALSE), // NULL -> TRUE - (&[NULL, TRUE, NULL], TRUE, NULL, NULL, FALSE), // NULL -> TRUE - (&[NULL, NULL, TRUE], TRUE, NULL, NULL, FALSE), // NULL -> TRUE - (&[FALSE, TRUE, TRUE], FALSE, TRUE, TRUE, FALSE), - (&[TRUE, FALSE, TRUE], FALSE, TRUE, TRUE, FALSE), - (&[TRUE, TRUE, FALSE], FALSE, TRUE, TRUE, FALSE), - (&[FALSE, NULL, NULL], FALSE, NULL, NULL, TRUE), // NULL -> TRUE - (&[NULL, FALSE, NULL], FALSE, NULL, NULL, TRUE), // NULL -> TRUE - (&[NULL, NULL, FALSE], FALSE, NULL, NULL, TRUE), // NULL -> TRUE - (&[NULL, TRUE, TRUE], TRUE, NULL, NULL, FALSE), // NULL -> TRUE - (&[TRUE, NULL, TRUE], TRUE, NULL, NULL, FALSE), // NULL -> TRUE - (&[TRUE, TRUE, NULL], TRUE, NULL, NULL, FALSE), // NULL -> TRUE - (&[NULL, FALSE, FALSE], FALSE, NULL, NULL, TRUE), // NULL -> TRUE - (&[FALSE, NULL, FALSE], FALSE, NULL, NULL, TRUE), // NULL -> TRUE - (&[FALSE, FALSE, NULL], FALSE, NULL, NULL, TRUE), // NULL -> TRUE + (&[TRUE, FALSE, FALSE], FALSE, TRUE), + (&[FALSE, TRUE, FALSE], FALSE, TRUE), + (&[FALSE, FALSE, TRUE], FALSE, TRUE), + (&[TRUE, NULL, NULL], NULL, TRUE), + (&[NULL, TRUE, NULL], NULL, TRUE), + (&[NULL, NULL, TRUE], NULL, TRUE), + (&[FALSE, TRUE, TRUE], FALSE, TRUE), + (&[TRUE, FALSE, TRUE], FALSE, TRUE), + (&[TRUE, TRUE, FALSE], FALSE, TRUE), + (&[FALSE, NULL, NULL], FALSE, NULL), + (&[NULL, FALSE, NULL], FALSE, NULL), + (&[NULL, NULL, FALSE], FALSE, NULL), + (&[NULL, TRUE, TRUE], NULL, TRUE), + (&[TRUE, NULL, TRUE], NULL, TRUE), + (&[TRUE, TRUE, NULL], NULL, TRUE), + (&[NULL, FALSE, FALSE], FALSE, NULL), + (&[FALSE, NULL, FALSE], FALSE, NULL), + (&[FALSE, FALSE, NULL], FALSE, NULL), // Every unique ordering of 3 - (&[TRUE, FALSE, NULL], FALSE, NULL, NULL, FALSE), - (&[TRUE, NULL, FALSE], FALSE, NULL, NULL, FALSE), - (&[FALSE, TRUE, NULL], FALSE, NULL, NULL, FALSE), - (&[FALSE, NULL, TRUE], FALSE, NULL, NULL, FALSE), - (&[NULL, TRUE, FALSE], FALSE, NULL, NULL, FALSE), - (&[NULL, FALSE, TRUE], FALSE, NULL, NULL, FALSE), + (&[TRUE, FALSE, NULL], FALSE, TRUE), + (&[TRUE, NULL, FALSE], FALSE, TRUE), + (&[FALSE, TRUE, NULL], FALSE, TRUE), + (&[FALSE, NULL, TRUE], FALSE, TRUE), + (&[NULL, TRUE, FALSE], FALSE, TRUE), + (&[NULL, FALSE, TRUE], FALSE, TRUE), ]; let filter = DefaultPredicateEvaluator::from(UnimplementedColumnResolver); - for (inputs, expect_and, expect_or, expect_not_and, expect_not_or) in test_cases { + for (inputs, expect_and, expect_or) in test_cases { let inputs: Vec<_> = inputs .iter() .map(|val| match val { @@ -160,34 +157,34 @@ fn test_eval_variadic() { .collect(); let expr = Expr::and_from(inputs.clone()); - let pred = as_data_skipping_predicate(&expr, false); + let pred = as_data_skipping_predicate(&expr, false).unwrap(); expect_eq!( - pred.and_then(|p| filter.eval_expr(&p, false)), + filter.eval_expr(&pred, false), *expect_and, "AND({inputs:?})" ); let expr = Expr::or_from(inputs.clone()); - let pred = as_data_skipping_predicate(&expr, false); + let pred = as_data_skipping_predicate(&expr, false).unwrap(); expect_eq!( - pred.and_then(|p| filter.eval_expr(&p, false)), + filter.eval_expr(&pred, false), *expect_or, "OR({inputs:?})" ); let expr = Expr::and_from(inputs.clone()); - let pred = as_data_skipping_predicate(&expr, true); + let pred = as_data_skipping_predicate(&expr, true).unwrap(); expect_eq!( - pred.and_then(|p| filter.eval_expr(&p, false)), - *expect_not_and, + filter.eval_expr(&pred, false), + expect_and.map(|val| !val), "NOT AND({inputs:?})" ); let expr = Expr::or_from(inputs.clone()); - let pred = as_data_skipping_predicate(&expr, true); + let pred = as_data_skipping_predicate(&expr, true).unwrap(); expect_eq!( - pred.and_then(|p| filter.eval_expr(&p, false)), - *expect_not_or, + filter.eval_expr(&pred, false), + expect_or.map(|val| !val), "NOT OR({inputs:?})" ); }