From 47d7201de7831a912b8192a2d5207191d8196660 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Tue, 15 Oct 2024 14:28:37 -0700 Subject: [PATCH 1/6] [Chore] Use existing helpers to simplify expression handling code --- ffi/src/expressions.rs | 15 ++-- kernel/src/actions/deletion_vector.rs | 2 +- kernel/src/actions/visitors.rs | 2 +- kernel/src/engine/arrow_conversion.rs | 26 +++---- kernel/src/engine/arrow_expression.rs | 77 +++++++------------ kernel/src/engine/arrow_utils.rs | 6 +- .../engine/parquet_stats_skipping/tests.rs | 36 ++++----- kernel/src/expressions/mod.rs | 14 ++-- kernel/src/scan/mod.rs | 2 +- kernel/tests/read.rs | 6 +- 10 files changed, 72 insertions(+), 114 deletions(-) diff --git a/ffi/src/expressions.rs b/ffi/src/expressions.rs index 9bd1a01cb..087cae163 100644 --- a/ffi/src/expressions.rs +++ b/ffi/src/expressions.rs @@ -5,7 +5,7 @@ use crate::{ ReferenceSet, TryFromStringSlice, }; use delta_kernel::{ - expressions::{BinaryOperator, Expression, Scalar, UnaryOperator}, + expressions::{BinaryOperator, Expression, UnaryOperator}, DeltaResult, }; @@ -56,12 +56,10 @@ fn visit_expression_binary( a: usize, b: usize, ) -> usize { - let left = unwrap_kernel_expression(state, a).map(Box::new); - let right = unwrap_kernel_expression(state, b).map(Box::new); + let left = unwrap_kernel_expression(state, a); + let right = unwrap_kernel_expression(state, b); match left.zip(right) { - Some((left, right)) => { - wrap_expression(state, Expression::BinaryOperation { op, left, right }) - } + Some((left, right)) => wrap_expression(state, Expression::binary(op, left, right)), None => 0, // invalid child => invalid node } } @@ -182,10 +180,7 @@ fn visit_expression_literal_string_impl( state: &mut KernelExpressionVisitorState, value: DeltaResult, ) -> DeltaResult { - Ok(wrap_expression( - state, - Expression::Literal(Scalar::from(value?)), - )) + Ok(wrap_expression(state, Expression::literal(value?))) } // We need to get parse.expand working to be able to macro everything below, see issue #255 diff --git a/kernel/src/actions/deletion_vector.rs b/kernel/src/actions/deletion_vector.rs index 566bc1d37..8d1b7b0ee 100644 --- a/kernel/src/actions/deletion_vector.rs +++ b/kernel/src/actions/deletion_vector.rs @@ -59,7 +59,7 @@ impl DeletionVectorDescriptor { let path_len = self.path_or_inline_dv.len(); require!( path_len >= 20, - Error::deletion_vector("Invalid length {path_len}, must be >= 20",) + Error::deletion_vector("Invalid length {path_len}, must be >= 20") ); let prefix_len = path_len - 20; let decoded = z85::decode(&self.path_or_inline_dv[prefix_len..]) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index b01723c0b..ef47968ef 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -513,7 +513,7 @@ mod tests { app_id: "myApp2".to_string(), version: 4, last_updated: Some(1670892998177), - },) + }) ); assert_eq!( actual.remove("myApp"), diff --git a/kernel/src/engine/arrow_conversion.rs b/kernel/src/engine/arrow_conversion.rs index a1f5273e1..0d7c2775a 100644 --- a/kernel/src/engine/arrow_conversion.rs +++ b/kernel/src/engine/arrow_conversion.rs @@ -213,27 +213,21 @@ impl TryFrom<&ArrowDataType> for DataType { ArrowDataType::Struct(fields) => { DataType::try_struct_type(fields.iter().map(|field| field.as_ref().try_into())) } - ArrowDataType::List(field) => Ok(DataType::Array(Box::new(ArrayType::new( - (*field).data_type().try_into()?, - (*field).is_nullable(), - )))), - ArrowDataType::LargeList(field) => Ok(DataType::Array(Box::new(ArrayType::new( - (*field).data_type().try_into()?, - (*field).is_nullable(), - )))), - ArrowDataType::FixedSizeList(field, _) => Ok(DataType::Array(Box::new( - ArrayType::new((*field).data_type().try_into()?, (*field).is_nullable()), - ))), + ArrowDataType::List(field) => { + Ok(ArrayType::new((*field).data_type().try_into()?, (*field).is_nullable()).into()) + } + ArrowDataType::LargeList(field) => { + Ok(ArrayType::new((*field).data_type().try_into()?, (*field).is_nullable()).into()) + } + ArrowDataType::FixedSizeList(field, _) => { + Ok(ArrayType::new((*field).data_type().try_into()?, (*field).is_nullable()).into()) + } ArrowDataType::Map(field, _) => { if let ArrowDataType::Struct(struct_fields) = field.data_type() { let key_type = DataType::try_from(struct_fields[0].data_type())?; let value_type = DataType::try_from(struct_fields[1].data_type())?; let value_type_nullable = struct_fields[1].is_nullable(); - Ok(DataType::Map(Box::new(MapType::new( - key_type, - value_type, - value_type_nullable, - )))) + Ok(MapType::new(key_type, value_type, value_type_nullable).into()) } else { panic!("DataType::Map should contain a struct field child"); } diff --git a/kernel/src/engine/arrow_expression.rs b/kernel/src/engine/arrow_expression.rs index 93cc247ac..092f13571 100644 --- a/kernel/src/engine/arrow_expression.rs +++ b/kernel/src/engine/arrow_expression.rs @@ -447,17 +447,9 @@ mod tests { let array = ListArray::new(field.clone(), offsets, Arc::new(values), None); let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(array.clone())]).unwrap(); - let not_op = Expression::binary( - BinaryOperator::NotIn, - Expression::literal(5), - Expression::column("item"), - ); + let not_op = Expression::binary(BinaryOperator::NotIn, 5, Expression::column("item")); - let in_op = Expression::binary( - BinaryOperator::NotIn, - Expression::literal(5), - Expression::column("item"), - ); + let in_op = Expression::binary(BinaryOperator::NotIn, 5, Expression::column("item")); let result = evaluate_expression(¬_op, &batch, None).unwrap(); let expected = BooleanArray::from(vec![true, false, true]); @@ -475,11 +467,7 @@ mod tests { let schema = Schema::new([field.clone()]); let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(values.clone())]).unwrap(); - let in_op = Expression::binary( - BinaryOperator::NotIn, - Expression::literal(5), - Expression::column("item"), - ); + let in_op = Expression::binary(BinaryOperator::NotIn, 5, Expression::column("item")); let in_result = evaluate_expression(&in_op, &batch, None); @@ -498,11 +486,11 @@ mod tests { let in_op = Expression::binary( BinaryOperator::NotIn, - Expression::literal(5), - Expression::literal(Scalar::Array(ArrayData::new( + 5, + Scalar::Array(ArrayData::new( ArrayType::new(DeltaDataTypes::INTEGER, false), vec![Scalar::Integer(1), Scalar::Integer(2)], - ))), + )), ); let in_result = evaluate_expression(&in_op, &batch, None).unwrap(); @@ -549,17 +537,10 @@ mod tests { let array = ListArray::new(field.clone(), offsets, Arc::new(values), None); let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(array.clone())]).unwrap(); - let str_not_op = Expression::binary( - BinaryOperator::NotIn, - Expression::literal("bye"), - Expression::column("item"), - ); + let str_not_op = + Expression::binary(BinaryOperator::NotIn, "bye", Expression::column("item")); - let str_in_op = Expression::binary( - BinaryOperator::In, - Expression::literal("hi"), - Expression::column("item"), - ); + let str_in_op = Expression::binary(BinaryOperator::In, "hi", Expression::column("item")); let result = evaluate_expression(&str_in_op, &batch, None).unwrap(); let expected = BooleanArray::from(vec![true, true, true]); @@ -609,23 +590,23 @@ mod tests { let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(values)]).unwrap(); let column = Expression::column("a"); - let expression = Box::new(column.clone().add(Expression::Literal(Scalar::Integer(1)))); + let expression = column.clone().add(Expression::literal(1)); let results = evaluate_expression(&expression, &batch, None).unwrap(); let expected = Arc::new(Int32Array::from(vec![2, 3, 4])); assert_eq!(results.as_ref(), expected.as_ref()); - let expression = Box::new(column.clone().sub(Expression::Literal(Scalar::Integer(1)))); + let expression = column.clone().sub(Expression::literal(1)); let results = evaluate_expression(&expression, &batch, None).unwrap(); let expected = Arc::new(Int32Array::from(vec![0, 1, 2])); assert_eq!(results.as_ref(), expected.as_ref()); - let expression = Box::new(column.clone().mul(Expression::Literal(Scalar::Integer(2)))); + let expression = column.clone().mul(Expression::literal(2)); let results = evaluate_expression(&expression, &batch, None).unwrap(); let expected = Arc::new(Int32Array::from(vec![2, 4, 6])); assert_eq!(results.as_ref(), expected.as_ref()); // TODO handle type casting - let expression = Box::new(column.div(Expression::Literal(Scalar::Integer(1)))); + let expression = column.div(Expression::literal(1)); let results = evaluate_expression(&expression, &batch, None).unwrap(); let expected = Arc::new(Int32Array::from(vec![1, 2, 3])); assert_eq!(results.as_ref(), expected.as_ref()) @@ -646,17 +627,17 @@ mod tests { let column_a = Expression::column("a"); let column_b = Expression::column("b"); - let expression = Box::new(column_a.clone().add(column_b.clone())); + let expression = column_a.clone().add(column_b.clone()); let results = evaluate_expression(&expression, &batch, None).unwrap(); let expected = Arc::new(Int32Array::from(vec![2, 4, 6])); assert_eq!(results.as_ref(), expected.as_ref()); - let expression = Box::new(column_a.clone().sub(column_b.clone())); + let expression = column_a.clone().sub(column_b.clone()); let results = evaluate_expression(&expression, &batch, None).unwrap(); let expected = Arc::new(Int32Array::from(vec![0, 0, 0])); assert_eq!(results.as_ref(), expected.as_ref()); - let expression = Box::new(column_a.clone().mul(column_b)); + let expression = column_a.clone().mul(column_b); let results = evaluate_expression(&expression, &batch, None).unwrap(); let expected = Arc::new(Int32Array::from(vec![1, 4, 9])); assert_eq!(results.as_ref(), expected.as_ref()); @@ -668,34 +649,34 @@ mod tests { let values = Int32Array::from(vec![1, 2, 3]); let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(values)]).unwrap(); let column = Expression::column("a"); - let lit = Expression::Literal(Scalar::Integer(2)); + let lit = Expression::literal(2); - let expression = Box::new(column.clone().lt(lit.clone())); + let expression = column.clone().lt(lit.clone()); let results = evaluate_expression(&expression, &batch, None).unwrap(); let expected = Arc::new(BooleanArray::from(vec![true, false, false])); assert_eq!(results.as_ref(), expected.as_ref()); - let expression = Box::new(column.clone().lt_eq(lit.clone())); + let expression = column.clone().lt_eq(lit.clone()); let results = evaluate_expression(&expression, &batch, None).unwrap(); let expected = Arc::new(BooleanArray::from(vec![true, true, false])); assert_eq!(results.as_ref(), expected.as_ref()); - let expression = Box::new(column.clone().gt(lit.clone())); + let expression = column.clone().gt(lit.clone()); let results = evaluate_expression(&expression, &batch, None).unwrap(); let expected = Arc::new(BooleanArray::from(vec![false, false, true])); assert_eq!(results.as_ref(), expected.as_ref()); - let expression = Box::new(column.clone().gt_eq(lit.clone())); + let expression = column.clone().gt_eq(lit.clone()); let results = evaluate_expression(&expression, &batch, None).unwrap(); let expected = Arc::new(BooleanArray::from(vec![false, true, true])); assert_eq!(results.as_ref(), expected.as_ref()); - let expression = Box::new(column.clone().eq(lit.clone())); + let expression = column.clone().eq(lit.clone()); let results = evaluate_expression(&expression, &batch, None).unwrap(); let expected = Arc::new(BooleanArray::from(vec![false, true, false])); assert_eq!(results.as_ref(), expected.as_ref()); - let expression = Box::new(column.clone().ne(lit.clone())); + let expression = column.clone().ne(lit.clone()); let results = evaluate_expression(&expression, &batch, None).unwrap(); let expected = Arc::new(BooleanArray::from(vec![true, false, true])); assert_eq!(results.as_ref(), expected.as_ref()); @@ -718,32 +699,28 @@ mod tests { let column_a = Expression::column("a"); let column_b = Expression::column("b"); - let expression = Box::new(column_a.clone().and(column_b.clone())); + let expression = column_a.clone().and(column_b.clone()); let results = evaluate_expression(&expression, &batch, Some(&crate::schema::DataType::BOOLEAN)) .unwrap(); let expected = Arc::new(BooleanArray::from(vec![false, false])); assert_eq!(results.as_ref(), expected.as_ref()); - let expression = Box::new(column_a.clone().and(Expression::literal(true))); + let expression = column_a.clone().and(Expression::literal(true)); let results = evaluate_expression(&expression, &batch, Some(&crate::schema::DataType::BOOLEAN)) .unwrap(); let expected = Arc::new(BooleanArray::from(vec![true, false])); assert_eq!(results.as_ref(), expected.as_ref()); - let expression = Box::new(column_a.clone().or(column_b)); + let expression = column_a.clone().or(column_b); let results = evaluate_expression(&expression, &batch, Some(&crate::schema::DataType::BOOLEAN)) .unwrap(); let expected = Arc::new(BooleanArray::from(vec![true, true])); assert_eq!(results.as_ref(), expected.as_ref()); - let expression = Box::new( - column_a - .clone() - .or(Expression::literal(Scalar::Boolean(false))), - ); + let expression = column_a.clone().or(Expression::literal(false)); let results = evaluate_expression(&expression, &batch, Some(&crate::schema::DataType::BOOLEAN)) .unwrap(); diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index c0c9b0cae..7edbe2828 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -952,11 +952,7 @@ mod tests { fn mask_with_map() { let requested_schema = Arc::new(StructType::new([StructField::new( "map", - DataType::Map(Box::new(MapType::new( - DataType::INTEGER, - DataType::STRING, - false, - ))), + MapType::new(DataType::INTEGER, DataType::STRING, false), false, )])); let parquet_schema = Arc::new(ArrowSchema::new(vec![ArrowField::new_map( diff --git a/kernel/src/engine/parquet_stats_skipping/tests.rs b/kernel/src/engine/parquet_stats_skipping/tests.rs index fc7f05eef..a95ac4102 100644 --- a/kernel/src/engine/parquet_stats_skipping/tests.rs +++ b/kernel/src/engine/parquet_stats_skipping/tests.rs @@ -854,34 +854,34 @@ fn test_sql_where() { // Constrast normal vs SQL WHERE semantics - comparison inside AND expect_eq!( AllNullTestFilter.apply_expr( - &Expression::and_from([NULL, Expression::lt(col.clone(), val.clone()),]), + &Expression::and(NULL, Expression::lt(col.clone(), val.clone())), false ), None, "{NULL} AND {col} < {val}" ); expect_eq!( - AllNullTestFilter.apply_sql_where(&Expression::and_from([ + AllNullTestFilter.apply_sql_where(&Expression::and( NULL, Expression::lt(col.clone(), val.clone()), - ])), + )), Some(false), "WHERE {NULL} AND {col} < {val}" ); expect_eq!( AllNullTestFilter.apply_expr( - &Expression::and_from([TRUE, Expression::lt(col.clone(), val.clone()),]), + &Expression::and(TRUE, Expression::lt(col.clone(), val.clone())), false ), None, // NULL (from the NULL check) is stronger than TRUE "{TRUE} AND {col} < {val}" ); expect_eq!( - AllNullTestFilter.apply_sql_where(&Expression::and_from([ + AllNullTestFilter.apply_sql_where(&Expression::and( TRUE, Expression::lt(col.clone(), val.clone()), - ])), + )), Some(false), // FALSE (from the NULL check) is stronger than TRUE "WHERE {TRUE} AND {col} < {val}" ); @@ -889,20 +889,20 @@ fn test_sql_where() { // Contrast normal vs. SQL WHERE semantics - comparison inside AND inside AND expect_eq!( AllNullTestFilter.apply_expr( - &Expression::and_from([ + &Expression::and( TRUE, - Expression::and_from([NULL, Expression::lt(col.clone(), val.clone()),]), - ]), + Expression::and(NULL, Expression::lt(col.clone(), val.clone())), + ), false, ), None, "{TRUE} AND ({NULL} AND {col} < {val})" ); expect_eq!( - AllNullTestFilter.apply_sql_where(&Expression::and_from([ + AllNullTestFilter.apply_sql_where(&Expression::and( TRUE, - Expression::and_from([NULL, Expression::lt(col.clone(), val.clone()),]), - ])), + Expression::and(NULL, Expression::lt(col.clone(), val.clone())), + )), Some(false), "WHERE {TRUE} AND ({NULL} AND {col} < {val})" ); @@ -910,20 +910,20 @@ fn test_sql_where() { // Semantics are the same for comparison inside OR inside AND expect_eq!( AllNullTestFilter.apply_expr( - &Expression::or_from([ + &Expression::or( FALSE, - Expression::and_from([NULL, Expression::lt(col.clone(), val.clone()),]), - ]), + Expression::and(NULL, Expression::lt(col.clone(), val.clone())), + ), false, ), None, "{FALSE} OR ({NULL} AND {col} < {val})" ); expect_eq!( - AllNullTestFilter.apply_sql_where(&Expression::or_from([ + AllNullTestFilter.apply_sql_where(&Expression::or( FALSE, - Expression::and_from([NULL, Expression::lt(col.clone(), val.clone()),]), - ])), + Expression::and(NULL, Expression::lt(col.clone(), val.clone())), + )), None, "WHERE {FALSE} OR ({NULL} AND {col} < {val})" ); diff --git a/kernel/src/expressions/mod.rs b/kernel/src/expressions/mod.rs index 7482c3bb7..e745167ed 100644 --- a/kernel/src/expressions/mod.rs +++ b/kernel/src/expressions/mod.rs @@ -134,6 +134,13 @@ pub enum Expression { Column(String), /// A struct computed from a Vec of expressions Struct(Vec), + /// A unary operation. + UnaryOperation { + /// The operator. + op: UnaryOperator, + /// The expression. + expr: Box, + }, /// A binary operation. BinaryOperation { /// The operator. @@ -143,13 +150,6 @@ pub enum Expression { /// The right-hand side of the operation. right: Box, }, - /// A unary operation. - UnaryOperation { - /// The operator. - op: UnaryOperator, - /// The expression. - expr: Box, - }, VariadicOperation { /// The operator. op: VariadicOperator, diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs index d10a7dd13..b37452668 100644 --- a/kernel/src/scan/mod.rs +++ b/kernel/src/scan/mod.rs @@ -496,7 +496,7 @@ fn transform_to_logical_internal( partition_values.get(name), field.data_type(), )?; - Ok::(Expression::Literal(value_expression)) + Ok::(value_expression.into()) } ColumnType::Selected(field_name) => Ok(Expression::column(field_name)), }) diff --git a/kernel/tests/read.rs b/kernel/tests/read.rs index d1e673d9c..068547761 100644 --- a/kernel/tests/read.rs +++ b/kernel/tests/read.rs @@ -339,11 +339,7 @@ async fn stats() -> Result<(), Box> { (NotEqual, 8, vec![&batch2, &batch1]), ]; for (op, value, expected_batches) in test_cases { - let predicate = Expression::BinaryOperation { - op, - left: Box::new(Expression::column("id")), - right: Box::new(Expression::literal(value)), - }; + let predicate = Expression::binary(op, Expression::column("id"), value); let scan = snapshot .clone() .scan_builder() From 69605440ffcd78de27a7fc19908b8a9ea617ce69 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Tue, 15 Oct 2024 15:11:58 -0700 Subject: [PATCH 2/6] add into-expr support for binary operators and use it --- kernel/src/engine/arrow_expression.rs | 25 ++++---- kernel/src/expressions/mod.rs | 72 ++++++++++------------- kernel/src/scan/data_skipping.rs | 23 ++++---- kernel/tests/read.rs | 82 ++++++++++++--------------- 4 files changed, 88 insertions(+), 114 deletions(-) diff --git a/kernel/src/engine/arrow_expression.rs b/kernel/src/engine/arrow_expression.rs index 092f13571..ca4be0cfc 100644 --- a/kernel/src/engine/arrow_expression.rs +++ b/kernel/src/engine/arrow_expression.rs @@ -590,23 +590,23 @@ mod tests { let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(values)]).unwrap(); let column = Expression::column("a"); - let expression = column.clone().add(Expression::literal(1)); + let expression = column.clone().add(1); let results = evaluate_expression(&expression, &batch, None).unwrap(); let expected = Arc::new(Int32Array::from(vec![2, 3, 4])); assert_eq!(results.as_ref(), expected.as_ref()); - let expression = column.clone().sub(Expression::literal(1)); + let expression = column.clone().sub(1); let results = evaluate_expression(&expression, &batch, None).unwrap(); let expected = Arc::new(Int32Array::from(vec![0, 1, 2])); assert_eq!(results.as_ref(), expected.as_ref()); - let expression = column.clone().mul(Expression::literal(2)); + let expression = column.clone().mul(2); let results = evaluate_expression(&expression, &batch, None).unwrap(); let expected = Arc::new(Int32Array::from(vec![2, 4, 6])); assert_eq!(results.as_ref(), expected.as_ref()); // TODO handle type casting - let expression = column.div(Expression::literal(1)); + let expression = column.div(1); let results = evaluate_expression(&expression, &batch, None).unwrap(); let expected = Arc::new(Int32Array::from(vec![1, 2, 3])); assert_eq!(results.as_ref(), expected.as_ref()) @@ -649,34 +649,33 @@ mod tests { let values = Int32Array::from(vec![1, 2, 3]); let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(values)]).unwrap(); let column = Expression::column("a"); - let lit = Expression::literal(2); - let expression = column.clone().lt(lit.clone()); + let expression = column.clone().lt(2); let results = evaluate_expression(&expression, &batch, None).unwrap(); let expected = Arc::new(BooleanArray::from(vec![true, false, false])); assert_eq!(results.as_ref(), expected.as_ref()); - let expression = column.clone().lt_eq(lit.clone()); + let expression = column.clone().lt_eq(2); let results = evaluate_expression(&expression, &batch, None).unwrap(); let expected = Arc::new(BooleanArray::from(vec![true, true, false])); assert_eq!(results.as_ref(), expected.as_ref()); - let expression = column.clone().gt(lit.clone()); + let expression = column.clone().gt(2); let results = evaluate_expression(&expression, &batch, None).unwrap(); let expected = Arc::new(BooleanArray::from(vec![false, false, true])); assert_eq!(results.as_ref(), expected.as_ref()); - let expression = column.clone().gt_eq(lit.clone()); + let expression = column.clone().gt_eq(2); let results = evaluate_expression(&expression, &batch, None).unwrap(); let expected = Arc::new(BooleanArray::from(vec![false, true, true])); assert_eq!(results.as_ref(), expected.as_ref()); - let expression = column.clone().eq(lit.clone()); + let expression = column.clone().eq(2); let results = evaluate_expression(&expression, &batch, None).unwrap(); let expected = Arc::new(BooleanArray::from(vec![false, true, false])); assert_eq!(results.as_ref(), expected.as_ref()); - let expression = column.clone().ne(lit.clone()); + let expression = column.clone().ne(2); let results = evaluate_expression(&expression, &batch, None).unwrap(); let expected = Arc::new(BooleanArray::from(vec![true, false, true])); assert_eq!(results.as_ref(), expected.as_ref()); @@ -706,7 +705,7 @@ mod tests { let expected = Arc::new(BooleanArray::from(vec![false, false])); assert_eq!(results.as_ref(), expected.as_ref()); - let expression = column_a.clone().and(Expression::literal(true)); + let expression = column_a.clone().and(true); let results = evaluate_expression(&expression, &batch, Some(&crate::schema::DataType::BOOLEAN)) .unwrap(); @@ -720,7 +719,7 @@ mod tests { let expected = Arc::new(BooleanArray::from(vec![true, true])); assert_eq!(results.as_ref(), expected.as_ref()); - let expression = column_a.clone().or(Expression::literal(false)); + let expression = column_a.clone().or(false); let results = evaluate_expression(&expression, &batch, Some(&crate::schema::DataType::BOOLEAN)) .unwrap(); diff --git a/kernel/src/expressions/mod.rs b/kernel/src/expressions/mod.rs index e745167ed..24bdf84fd 100644 --- a/kernel/src/expressions/mod.rs +++ b/kernel/src/expressions/mod.rs @@ -286,57 +286,57 @@ impl Expression { } /// Create a new expression `self == other` - pub fn eq(self, other: Self) -> Self { + pub fn eq(self, other: impl Into) -> Self { Self::binary(BinaryOperator::Equal, self, other) } /// Create a new expression `self != other` - pub fn ne(self, other: Self) -> Self { + pub fn ne(self, other: impl Into) -> Self { Self::binary(BinaryOperator::NotEqual, self, other) } /// Create a new expression `self <= other` - pub fn le(self, other: Self) -> Self { + pub fn le(self, other: impl Into) -> Self { Self::binary(BinaryOperator::LessThanOrEqual, self, other) } /// Create a new expression `self < other` - pub fn lt(self, other: Self) -> Self { + pub fn lt(self, other: impl Into) -> Self { Self::binary(BinaryOperator::LessThan, self, other) } /// Create a new expression `self >= other` - pub fn ge(self, other: Self) -> Self { + pub fn ge(self, other: impl Into) -> Self { Self::binary(BinaryOperator::GreaterThanOrEqual, self, other) } /// Create a new expression `self > other` - pub fn gt(self, other: Self) -> Self { + pub fn gt(self, other: impl Into) -> Self { Self::binary(BinaryOperator::GreaterThan, self, other) } /// Create a new expression `self >= other` - pub fn gt_eq(self, other: Self) -> Self { + pub fn gt_eq(self, other: impl Into) -> Self { Self::binary(BinaryOperator::GreaterThanOrEqual, self, other) } /// Create a new expression `self <= other` - pub fn lt_eq(self, other: Self) -> Self { + pub fn lt_eq(self, other: impl Into) -> Self { Self::binary(BinaryOperator::LessThanOrEqual, self, other) } /// Create a new expression `self AND other` - pub fn and(self, other: Self) -> Self { - Self::and_from([self, other]) + pub fn and(self, other: impl Into) -> Self { + Self::and_from([self, other.into()]) } /// Create a new expression `self OR other` - pub fn or(self, other: Self) -> Self { - Self::or_from([self, other]) + pub fn or(self, other: impl Into) -> Self { + Self::or_from([self, other.into()]) } /// Create a new expression `DISTINCT(self, other)` - pub fn distinct(self, other: Self) -> Self { + pub fn distinct(self, other: impl Into) -> Self { Self::binary(BinaryOperator::Distinct, self, other) } @@ -374,34 +374,34 @@ impl std::ops::Not for Expression { } } -impl std::ops::Add for Expression { +impl> std::ops::Add for Expression { type Output = Self; - fn add(self, rhs: Expression) -> Self::Output { + fn add(self, rhs: R) -> Self::Output { Self::binary(BinaryOperator::Plus, self, rhs) } } -impl std::ops::Sub for Expression { +impl> std::ops::Sub for Expression { type Output = Self; - fn sub(self, rhs: Expression) -> Self { + fn sub(self, rhs: R) -> Self { Self::binary(BinaryOperator::Minus, self, rhs) } } -impl std::ops::Mul for Expression { +impl> std::ops::Mul for Expression { type Output = Self; - fn mul(self, rhs: Expression) -> Self { + fn mul(self, rhs: R) -> Self { Self::binary(BinaryOperator::Multiply, self, rhs) } } -impl std::ops::Div for Expression { +impl> std::ops::Div for Expression { type Output = Self; - fn div(self, rhs: Expression) -> Self { + fn div(self, rhs: R) -> Self { Self::binary(BinaryOperator::Divide, self, rhs) } } @@ -415,38 +415,26 @@ mod tests { let col_ref = Expr::column("x"); let cases = [ (col_ref.clone(), "Column(x)"), - (col_ref.clone().eq(Expr::literal(2)), "Column(x) = 2"), + (col_ref.clone().eq(2), "Column(x) = 2"), + ((col_ref.clone() - 4).lt(10), "Column(x) - 4 < 10"), + ((col_ref.clone() + 4) / 10 * 42, "Column(x) + 4 / 10 * 42"), ( - (col_ref.clone() - Expr::literal(4)).lt(Expr::literal(10)), - "Column(x) - 4 < 10", - ), - ( - (col_ref.clone() + Expr::literal(4)) / Expr::literal(10) * Expr::literal(42), - "Column(x) + 4 / 10 * 42", - ), - ( - col_ref - .clone() - .gt_eq(Expr::literal(2)) - .and(col_ref.clone().lt_eq(Expr::literal(10))), + col_ref.clone().gt_eq(2).and(col_ref.clone().lt_eq(10)), "AND(Column(x) >= 2, Column(x) <= 10)", ), ( Expr::and_from([ - col_ref.clone().gt_eq(Expr::literal(2)), - col_ref.clone().lt_eq(Expr::literal(10)), - col_ref.clone().lt_eq(Expr::literal(100)), + col_ref.clone().gt_eq(2), + col_ref.clone().lt_eq(10), + col_ref.clone().lt_eq(100), ]), "AND(Column(x) >= 2, Column(x) <= 10, Column(x) <= 100)", ), ( - col_ref - .clone() - .gt(Expr::literal(2)) - .or(col_ref.clone().lt(Expr::literal(10))), + col_ref.clone().gt(2).or(col_ref.clone().lt(10)), "OR(Column(x) > 2, Column(x) < 10)", ), - (col_ref.eq(Expr::literal("foo")), "Column(x) = 'foo'"), + (col_ref.eq("foo"), "Column(x) = 'foo'"), ]; for (expr, expected) in cases { diff --git a/kernel/src/scan/data_skipping.rs b/kernel/src/scan/data_skipping.rs index 1619d98aa..0bc8e3e08 100644 --- a/kernel/src/scan/data_skipping.rs +++ b/kernel/src/scan/data_skipping.rs @@ -17,8 +17,8 @@ use crate::{Engine, EngineData, ExpressionEvaluator, JsonHandler}; /// by the default for tightBounds being true, so we have to check if it's EITHER `null` OR `true` fn get_tight_null_expr(null_col: String) -> Expr { Expr::and( - Expr::distinct(Expr::column("tightBounds"), Expr::literal(false)), - Expr::gt(Expr::column(null_col), Expr::literal(0i64)), + Expr::distinct(Expr::column("tightBounds"), false), + Expr::gt(Expr::column(null_col), 0i64), ) } @@ -28,7 +28,7 @@ fn get_tight_null_expr(null_col: String) -> Expr { /// doesn't help us) fn get_wide_null_expr(null_col: String) -> Expr { Expr::and( - Expr::eq(Expr::column("tightBounds"), Expr::literal(false)), + Expr::eq(Expr::column("tightBounds"), false), Expr::eq(Expr::column("numRecords"), Expr::column(null_col)), ) } @@ -38,7 +38,7 @@ fn get_wide_null_expr(null_col: String) -> Expr { /// the default for tightBounds being true, so we have to check if it's EITHER `null` OR `true` fn get_tight_not_null_expr(null_col: String) -> Expr { Expr::and( - Expr::distinct(Expr::column("tightBounds"), Expr::literal(false)), + Expr::distinct(Expr::column("tightBounds"), false), Expr::lt(Expr::column(null_col), Expr::column("numRecords")), ) } @@ -49,7 +49,7 @@ fn get_tight_not_null_expr(null_col: String) -> Expr { /// there is a possibility of a NOT null fn get_wide_not_null_expr(null_col: String) -> Expr { Expr::and( - Expr::eq(Expr::column("tightBounds"), Expr::literal(false)), + Expr::eq(Expr::column("tightBounds"), false), Expr::ne(Expr::column("numRecords"), Expr::column(null_col)), ) } @@ -126,14 +126,14 @@ fn as_data_skipping_predicate(expr: &Expr) -> Option { } NotEqual => { return Some(Expr::or( - Expr::gt(Column(format!("minValues.{}", col)), Literal(val.clone())), - Expr::lt(Column(format!("maxValues.{}", col)), Literal(val.clone())), + Expr::gt(Column(format!("minValues.{}", col)), val.clone()), + Expr::lt(Column(format!("maxValues.{}", col)), val.clone()), )); } _ => return None, // unsupported operation }; let col = format!("{}.{}", stats_col, col); - Some(Expr::binary(op, Column(col), Literal(val.clone()))) + Some(Expr::binary(op, Column(col), val.clone())) } // push down Not by inverting everything below it UnaryOperation { op: Not, expr } => as_inverted_data_skipping_predicate(expr), @@ -186,11 +186,10 @@ impl DataSkippingFilter { }); static STATS_EXPR: LazyLock = LazyLock::new(|| Expr::column("add.stats")); static FILTER_EXPR: LazyLock = - LazyLock::new(|| Expr::column("predicate").distinct(Expr::literal(false))); + LazyLock::new(|| Expr::column("predicate").distinct(false)); - let predicate = match predicate { - Some(predicate) => predicate, - None => return None, + let Some(predicate) = predicate else { + return None; }; debug!("Creating a data skipping filter for {}", &predicate); diff --git a/kernel/tests/read.rs b/kernel/tests/read.rs index 068547761..0314321f1 100644 --- a/kernel/tests/read.rs +++ b/kernel/tests/read.rs @@ -654,27 +654,27 @@ fn table_for_numbers(nums: Vec) -> Vec { fn predicate_on_number() -> Result<(), Box> { let cases = vec![ ( - Expression::column("number").lt(Expression::literal(4i64)), + Expression::column("number").lt(4i64), table_for_numbers(vec![1, 2, 3]), ), ( - Expression::column("number").le(Expression::literal(4i64)), + Expression::column("number").le(4i64), table_for_numbers(vec![1, 2, 3, 4]), ), ( - Expression::column("number").gt(Expression::literal(4i64)), + Expression::column("number").gt(4i64), table_for_numbers(vec![5, 6]), ), ( - Expression::column("number").ge(Expression::literal(4i64)), + Expression::column("number").ge(4i64), table_for_numbers(vec![4, 5, 6]), ), ( - Expression::column("number").eq(Expression::literal(4i64)), + Expression::column("number").eq(4i64), table_for_numbers(vec![4]), ), ( - Expression::column("number").ne(Expression::literal(4i64)), + Expression::column("number").ne(4i64), table_for_numbers(vec![1, 2, 3, 5, 6]), ), ]; @@ -694,27 +694,27 @@ fn predicate_on_number() -> Result<(), Box> { fn predicate_on_number_not() -> Result<(), Box> { let cases = vec![ ( - Expression::not(Expression::column("number").lt(Expression::literal(4i64))), + Expression::not(Expression::column("number").lt(4i64)), table_for_numbers(vec![4, 5, 6]), ), ( - Expression::not(Expression::column("number").le(Expression::literal(4i64))), + Expression::not(Expression::column("number").le(4i64)), table_for_numbers(vec![5, 6]), ), ( - Expression::not(Expression::column("number").gt(Expression::literal(4i64))), + Expression::not(Expression::column("number").gt(4i64)), table_for_numbers(vec![1, 2, 3, 4]), ), ( - Expression::not(Expression::column("number").ge(Expression::literal(4i64))), + Expression::not(Expression::column("number").ge(4i64)), table_for_numbers(vec![1, 2, 3]), ), ( - Expression::not(Expression::column("number").eq(Expression::literal(4i64))), + Expression::not(Expression::column("number").eq(4i64)), table_for_numbers(vec![1, 2, 3, 5, 6]), ), ( - Expression::not(Expression::column("number").ne(Expression::literal(4i64))), + Expression::not(Expression::column("number").ne(4i64)), table_for_numbers(vec![4]), ), ]; @@ -822,30 +822,26 @@ fn and_or_predicates() -> Result<(), Box> { let cases = vec![ ( Expression::column("number") - .gt(Expression::literal(4i64)) - .and(Expression::column("a_float").gt(Expression::literal(5.5))), + .gt(4i64) + .and(Expression::column("a_float").gt(5.5)), table_for_numbers(vec![6]), ), ( Expression::column("number") - .gt(Expression::literal(4i64)) - .and(Expression::not( - Expression::column("a_float").gt(Expression::literal(5.5)), - )), + .gt(4i64) + .and(Expression::not(Expression::column("a_float").gt(5.5))), table_for_numbers(vec![5]), ), ( Expression::column("number") - .gt(Expression::literal(4i64)) - .or(Expression::column("a_float").gt(Expression::literal(5.5))), + .gt(4i64) + .or(Expression::column("a_float").gt(5.5)), table_for_numbers(vec![5, 6]), ), ( Expression::column("number") - .gt(Expression::literal(4i64)) - .or(Expression::not( - Expression::column("a_float").gt(Expression::literal(5.5)), - )), + .gt(4i64) + .or(Expression::not(Expression::column("a_float").gt(5.5))), table_for_numbers(vec![1, 2, 3, 4, 5, 6]), ), ]; @@ -866,36 +862,32 @@ fn not_and_or_predicates() -> Result<(), Box> { ( Expression::not( Expression::column("number") - .gt(Expression::literal(4i64)) - .and(Expression::column("a_float").gt(Expression::literal(5.5))), + .gt(4i64) + .and(Expression::column("a_float").gt(5.5)), ), table_for_numbers(vec![1, 2, 3, 4, 5]), ), ( Expression::not( Expression::column("number") - .gt(Expression::literal(4i64)) - .and(Expression::not( - Expression::column("a_float").gt(Expression::literal(5.5)), - )), + .gt(4i64) + .and(Expression::not(Expression::column("a_float").gt(5.5))), ), table_for_numbers(vec![1, 2, 3, 4, 6]), ), ( Expression::not( Expression::column("number") - .gt(Expression::literal(4i64)) - .or(Expression::column("a_float").gt(Expression::literal(5.5))), + .gt(4i64) + .or(Expression::column("a_float").gt(5.5)), ), table_for_numbers(vec![1, 2, 3, 4]), ), ( Expression::not( Expression::column("number") - .gt(Expression::literal(4i64)) - .or(Expression::not( - Expression::column("a_float").gt(Expression::literal(5.5)), - )), + .gt(4i64) + .or(Expression::not(Expression::column("a_float").gt(5.5))), ), vec![], ), @@ -913,31 +905,30 @@ fn not_and_or_predicates() -> Result<(), Box> { #[test] fn invalid_skips_none_predicates() -> Result<(), Box> { + let empty_struct = Expression::struct_expr(vec![]); let cases = vec![ ( Expression::literal(3i64), table_for_numbers(vec![1, 2, 3, 4, 5, 6]), ), ( - Expression::column("number").distinct(Expression::literal(3i64)), + Expression::column("number").distinct(3i64), table_for_numbers(vec![1, 2, 3, 4, 5, 6]), ), ( - Expression::column("number").gt(Expression::struct_expr(vec![])), + Expression::column("number").gt(empty_struct.clone()), table_for_numbers(vec![1, 2, 3, 4, 5, 6]), ), ( - Expression::column("number").and(Expression::struct_expr(vec![]).is_null()), + Expression::column("number").and(empty_struct.clone().is_null()), table_for_numbers(vec![1, 2, 3, 4, 5, 6]), ), ( - Expression::not(Expression::column("number").gt(Expression::struct_expr(vec![]))), + Expression::not(Expression::column("number").gt(empty_struct.clone())), table_for_numbers(vec![1, 2, 3, 4, 5, 6]), ), ( - Expression::not( - Expression::column("number").and(Expression::struct_expr(vec![]).is_null()), - ), + Expression::not(Expression::column("number").and(empty_struct.clone().is_null())), table_for_numbers(vec![1, 2, 3, 4, 5, 6]), ), ]; @@ -971,10 +962,7 @@ fn with_predicate_and_removes() -> Result<(), Box> { read_table_data_str( "./tests/data/table-with-dv-small/", None, - Some(Expression::gt( - Expression::column("value"), - Expression::literal(3), - )), + Some(Expression::gt(Expression::column("value"), 3)), expected, )?; Ok(()) From 1687d557859fa12ca4adc71f9cfbfafb75b36e6c Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Tue, 15 Oct 2024 16:10:17 -0700 Subject: [PATCH 3/6] Expression::Column references a ColumnName object instead of a String --- ffi/src/expressions.rs | 3 +- kernel/src/actions/set_transaction.rs | 2 +- kernel/src/engine/arrow_expression.rs | 37 ++++--- .../parquet_row_group_skipping/tests.rs | 30 +++--- .../engine/parquet_stats_skipping/tests.rs | 10 +- kernel/src/expressions/column_name.rs | 99 +++++++++++++++++++ kernel/src/expressions/mod.rs | 41 +++++--- kernel/src/expressions/scalars.rs | 2 +- kernel/src/scan/data_skipping.rs | 58 ++++++----- kernel/src/scan/log_replay.rs | 12 +-- kernel/src/scan/mod.rs | 10 +- kernel/src/snapshot.rs | 4 +- kernel/tests/read.rs | 90 +++++++++-------- 13 files changed, 268 insertions(+), 130 deletions(-) create mode 100644 kernel/src/expressions/column_name.rs diff --git a/ffi/src/expressions.rs b/ffi/src/expressions.rs index 087cae163..64495e9de 100644 --- a/ffi/src/expressions.rs +++ b/ffi/src/expressions.rs @@ -146,7 +146,8 @@ fn visit_expression_column_impl( state: &mut KernelExpressionVisitorState, name: DeltaResult, ) -> DeltaResult { - Ok(wrap_expression(state, Expression::Column(name?))) + // TODO: We can't actually assume the column name is so easily splittable! + Ok(wrap_expression(state, Expression::split_column(name?))) } #[no_mangle] diff --git a/kernel/src/actions/set_transaction.rs b/kernel/src/actions/set_transaction.rs index d6701c19a..a6e20e8c8 100644 --- a/kernel/src/actions/set_transaction.rs +++ b/kernel/src/actions/set_transaction.rs @@ -52,7 +52,7 @@ impl SetTransactionScanner { // checkpoint part when patitioned by `add.path` like the Delta spec requires. There's no // point filtering by a particular app id, even if we have one, because app ids are all in // the a single checkpoint part having large min/max range (because they're usually uuids). - let meta_predicate = Expr::column("txn.appId").is_not_null(); + let meta_predicate = Expr::split_column("txn.appId").is_not_null(); self.snapshot .log_segment .replay(engine, schema.clone(), schema, Some(meta_predicate)) diff --git a/kernel/src/engine/arrow_expression.rs b/kernel/src/engine/arrow_expression.rs index ca4be0cfc..708f48f8e 100644 --- a/kernel/src/engine/arrow_expression.rs +++ b/kernel/src/engine/arrow_expression.rs @@ -447,9 +447,10 @@ mod tests { let array = ListArray::new(field.clone(), offsets, Arc::new(values), None); let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(array.clone())]).unwrap(); - let not_op = Expression::binary(BinaryOperator::NotIn, 5, Expression::column("item")); + let not_op = + Expression::binary(BinaryOperator::NotIn, 5, Expression::simple_column("item")); - let in_op = Expression::binary(BinaryOperator::NotIn, 5, Expression::column("item")); + let in_op = Expression::binary(BinaryOperator::NotIn, 5, Expression::simple_column("item")); let result = evaluate_expression(¬_op, &batch, None).unwrap(); let expected = BooleanArray::from(vec![true, false, true]); @@ -467,7 +468,7 @@ mod tests { let schema = Schema::new([field.clone()]); let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(values.clone())]).unwrap(); - let in_op = Expression::binary(BinaryOperator::NotIn, 5, Expression::column("item")); + let in_op = Expression::binary(BinaryOperator::NotIn, 5, Expression::simple_column("item")); let in_result = evaluate_expression(&in_op, &batch, None); @@ -512,8 +513,8 @@ mod tests { let in_op = Expression::binary( BinaryOperator::NotIn, - Expression::column("item"), - Expression::column("item"), + Expression::simple_column("item"), + Expression::simple_column("item"), ); let in_result = evaluate_expression(&in_op, &batch, None); @@ -537,10 +538,14 @@ mod tests { let array = ListArray::new(field.clone(), offsets, Arc::new(values), None); let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(array.clone())]).unwrap(); - let str_not_op = - Expression::binary(BinaryOperator::NotIn, "bye", Expression::column("item")); + let str_not_op = Expression::binary( + BinaryOperator::NotIn, + "bye", + Expression::simple_column("item"), + ); - let str_in_op = Expression::binary(BinaryOperator::In, "hi", Expression::column("item")); + let str_in_op = + Expression::binary(BinaryOperator::In, "hi", Expression::simple_column("item")); let result = evaluate_expression(&str_in_op, &batch, None).unwrap(); let expected = BooleanArray::from(vec![true, true, true]); @@ -557,7 +562,7 @@ mod tests { let values = Int32Array::from(vec![1, 2, 3]); let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(values.clone())]).unwrap(); - let column = Expression::column("a"); + let column = Expression::simple_column("a"); let results = evaluate_expression(&column, &batch, None).unwrap(); assert_eq!(results.as_ref(), &values); @@ -578,7 +583,7 @@ mod tests { vec![Arc::new(struct_array.clone())], ) .unwrap(); - let column = Expression::column("b.a"); + let column = Expression::split_column("b.a"); let results = evaluate_expression(&column, &batch, None).unwrap(); assert_eq!(results.as_ref(), &values); } @@ -588,7 +593,7 @@ mod tests { let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]); let values = Int32Array::from(vec![1, 2, 3]); let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(values)]).unwrap(); - let column = Expression::column("a"); + let column = Expression::simple_column("a"); let expression = column.clone().add(1); let results = evaluate_expression(&expression, &batch, None).unwrap(); @@ -624,8 +629,8 @@ mod tests { vec![Arc::new(values.clone()), Arc::new(values)], ) .unwrap(); - let column_a = Expression::column("a"); - let column_b = Expression::column("b"); + let column_a = Expression::simple_column("a"); + let column_b = Expression::simple_column("b"); let expression = column_a.clone().add(column_b.clone()); let results = evaluate_expression(&expression, &batch, None).unwrap(); @@ -648,7 +653,7 @@ mod tests { let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]); let values = Int32Array::from(vec![1, 2, 3]); let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(values)]).unwrap(); - let column = Expression::column("a"); + let column = Expression::simple_column("a"); let expression = column.clone().lt(2); let results = evaluate_expression(&expression, &batch, None).unwrap(); @@ -695,8 +700,8 @@ mod tests { ], ) .unwrap(); - let column_a = Expression::column("a"); - let column_b = Expression::column("b"); + let column_a = Expression::simple_column("a"); + let column_b = Expression::simple_column("b"); let expression = column_a.clone().and(column_b.clone()); let results = diff --git a/kernel/src/engine/parquet_row_group_skipping/tests.rs b/kernel/src/engine/parquet_row_group_skipping/tests.rs index 6f5dd3a48..370b0c115 100644 --- a/kernel/src/engine/parquet_row_group_skipping/tests.rs +++ b/kernel/src/engine/parquet_row_group_skipping/tests.rs @@ -39,21 +39,21 @@ fn test_get_stat_values() { // The expression doesn't matter -- it just needs to mention all the columns we care about. let columns = Expression::and_from(vec![ - Expression::column("varlen.utf8"), - Expression::column("numeric.ints.int64"), - Expression::column("numeric.ints.int32"), - Expression::column("numeric.ints.int16"), - Expression::column("numeric.ints.int8"), - Expression::column("numeric.floats.float32"), - Expression::column("numeric.floats.float64"), - Expression::column("bool"), - Expression::column("varlen.binary"), - Expression::column("numeric.decimals.decimal32"), - Expression::column("numeric.decimals.decimal64"), - Expression::column("numeric.decimals.decimal128"), - Expression::column("chrono.date32"), - Expression::column("chrono.timestamp"), - Expression::column("chrono.timestamp_ntz"), + Expression::split_column("varlen.utf8"), + Expression::split_column("numeric.ints.int64"), + Expression::split_column("numeric.ints.int32"), + Expression::split_column("numeric.ints.int16"), + Expression::split_column("numeric.ints.int8"), + Expression::split_column("numeric.floats.float32"), + Expression::split_column("numeric.floats.float64"), + Expression::simple_column("bool"), + Expression::split_column("varlen.binary"), + Expression::split_column("numeric.decimals.decimal32"), + Expression::split_column("numeric.decimals.decimal64"), + Expression::split_column("numeric.decimals.decimal128"), + Expression::split_column("chrono.date32"), + Expression::split_column("chrono.timestamp"), + Expression::split_column("chrono.timestamp_ntz"), ]); let filter = RowGroupFilter::new(metadata.metadata().row_group(0), &columns); diff --git a/kernel/src/engine/parquet_stats_skipping/tests.rs b/kernel/src/engine/parquet_stats_skipping/tests.rs index a95ac4102..f28cfc6f5 100644 --- a/kernel/src/engine/parquet_stats_skipping/tests.rs +++ b/kernel/src/engine/parquet_stats_skipping/tests.rs @@ -337,7 +337,7 @@ fn test_binary_eq_ne() { const LO: Scalar = Scalar::Long(1); const MID: Scalar = Scalar::Long(10); const HI: Scalar = Scalar::Long(100); - let col = &Expression::column("x"); + let col = &Expression::simple_column("x"); for inverted in [false, true] { // negative test -- mismatched column type @@ -485,7 +485,7 @@ fn test_binary_lt_ge() { const LO: Scalar = Scalar::Long(1); const MID: Scalar = Scalar::Long(10); const HI: Scalar = Scalar::Long(100); - let col = &Expression::column("x"); + let col = &Expression::simple_column("x"); for inverted in [false, true] { expect_eq!( @@ -585,7 +585,7 @@ fn test_binary_le_gt() { const LO: Scalar = Scalar::Long(1); const MID: Scalar = Scalar::Long(10); const HI: Scalar = Scalar::Long(100); - let col = &Expression::column("x"); + let col = &Expression::simple_column("x"); for inverted in [false, true] { // negative test -- mismatched column type @@ -736,7 +736,7 @@ impl ParquetStatsSkippingFilter for NullCountTestFilter { fn test_not_null() { use UnaryOperator::IsNull; - let col = &Expression::column("x"); + let col = &Expression::simple_column("x"); for inverted in [false, true] { expect_eq!( NullCountTestFilter::new(None, 10).apply_unary(IsNull, col, inverted), @@ -809,7 +809,7 @@ impl ParquetStatsSkippingFilter for AllNullTestFilter { #[test] fn test_sql_where() { - let col = &Expression::column("x"); + let col = &Expression::simple_column("x"); let val = &Expression::literal(1); const NULL: Expression = Expression::Literal(Scalar::Null(DataType::BOOLEAN)); const FALSE: Expression = Expression::Literal(Scalar::Boolean(false)); diff --git a/kernel/src/expressions/column_name.rs b/kernel/src/expressions/column_name.rs new file mode 100644 index 000000000..affaa1c54 --- /dev/null +++ b/kernel/src/expressions/column_name.rs @@ -0,0 +1,99 @@ +use std::borrow::Borrow; +use std::fmt::{Display, Formatter}; +use std::hash::{Hash, Hasher}; +use std::ops::Deref; + +/// A (possibly nested) column name. +// TODO: Track name as a path rather than a single string +#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)] +pub struct ColumnName { + path: String, +} + +impl ColumnName { + /// Creates a simple (non-nested) column name. + // TODO: A simple path name containing periods will be treated as nested right now. + pub fn simple(name: impl Into) -> Self { + Self { path: name.into() } + } + + /// Parses a nested column name by splitting it at periods. Caller affirms that field names in + /// the input string do not contain any periods. + pub fn split(path: impl AsRef) -> Self { + // TODO: Actually parse this! + Self::simple(path.as_ref()) + } + + /// Joins two column names + pub fn join(left: impl Into, right: impl Into) -> ColumnName { + Self::simple(format!("{}.{}", left.into(), right.into())) + } + + /// The path of field names for this column name + pub fn path(&self) -> &String { + &self.path + } + + /// Consumes this column name and returns the path of field names as a vector. + pub fn into_inner(self) -> String { + self.path + } +} + +impl> From for ColumnName { + fn from(value: T) -> Self { + Self::simple(value) + } +} + +impl Display for ColumnName { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + (**self).fmt(f) + } +} + +impl Deref for ColumnName { + type Target = String; + + fn deref(&self) -> &String { + &self.path + } +} + +// Allows searching collections of `ColumnName` without an owned key value +impl Borrow for ColumnName { + fn borrow(&self) -> &String { + self + } +} + +// Allows searching collections of `&ColumnName` without an owned key value. Needed because there is +// apparently no blanket `impl Borrow for &T where T: Borrow`, even tho `Eq` [1] and +// `Hash` [2] both have blanket impl for treating `&T` like `T`. +// +// [1] https://doc.rust-lang.org/std/cmp/trait.Eq.html#impl-Eq-for-%26A +// [2] https://doc.rust-lang.org/std/hash/trait.Hash.html#impl-Hash-for-%26T +impl Borrow for &ColumnName { + fn borrow(&self) -> &String { + self + } +} + +impl Hash for ColumnName { + fn hash(&self, hasher: &mut H) { + (**self).hash(hasher) + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_column_name() { + assert_eq!( + ColumnName::join(ColumnName::split("a.b.c"), "d"), + ColumnName::split("a.b.c.d") + ); + } +} diff --git a/kernel/src/expressions/mod.rs b/kernel/src/expressions/mod.rs index 24bdf84fd..e665b42ae 100644 --- a/kernel/src/expressions/mod.rs +++ b/kernel/src/expressions/mod.rs @@ -5,9 +5,11 @@ use std::fmt::{Display, Formatter}; use itertools::Itertools; +pub use self::column_name::ColumnName; pub use self::scalars::{ArrayData, Scalar, StructData}; use crate::DataType; +mod column_name; mod scalars; #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] @@ -131,7 +133,7 @@ pub enum Expression { /// A literal value. Literal(Scalar), /// A column reference by name. - Column(String), + Column(ColumnName), /// A struct computed from a Vec of expressions Struct(Vec), /// A unary operation. @@ -165,11 +167,17 @@ impl> From for Expression { } } +impl From for Expression { + fn from(value: ColumnName) -> Self { + Self::Column(value) + } +} + impl Display for Expression { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { match self { - Self::Literal(l) => write!(f, "{}", l), - Self::Column(name) => write!(f, "Column({})", name), + Self::Literal(l) => write!(f, "{l}"), + Self::Column(name) => write!(f, "Column({name})"), Self::Struct(exprs) => write!( f, "Struct({})", @@ -179,11 +187,11 @@ impl Display for Expression { op: BinaryOperator::Distinct, left, right, - } => write!(f, "DISTINCT({}, {})", left, right), - Self::BinaryOperation { op, left, right } => write!(f, "{} {} {}", left, op, right), + } => write!(f, "DISTINCT({left}, {right})"), + Self::BinaryOperation { op, left, right } => write!(f, "{left} {op} {right}"), Self::UnaryOperation { op, expr } => match op { - UnaryOperator::Not => write!(f, "NOT {}", expr), - UnaryOperator::IsNull => write!(f, "{} IS NULL", expr), + UnaryOperator::Not => write!(f, "NOT {expr}"), + UnaryOperator::IsNull => write!(f, "{expr} IS NULL"), }, Self::VariadicOperation { op, exprs } => match op { VariadicOperator::And => { @@ -207,21 +215,28 @@ impl Display for Expression { impl Expression { /// Returns a set of columns referenced by this expression. - pub fn references(&self) -> HashSet<&str> { + pub fn references(&self) -> HashSet<&ColumnName> { let mut set = HashSet::new(); for expr in self.walk() { if let Self::Column(name) = expr { - set.insert(name.as_str()); + set.insert(name); } } set } - /// Create an new expression for a column reference - pub fn column(name: impl ToString) -> Self { - Self::Column(name.to_string()) + /// Creates a new column reference, for a simple (not nested) column name. + /// See [`ColumnName::simple`] for details. + pub fn simple_column(name: impl Into) -> Self { + ColumnName::simple(name).into() + } + + /// Creates a new column reference, for a splittable nested column name. + /// See [`ColumnName::split`] for details. + pub fn split_column(name: impl AsRef) -> Self { + ColumnName::split(name.as_ref()).into() } /// Create a new expression for a literal value @@ -412,7 +427,7 @@ mod tests { #[test] fn test_expression_format() { - let col_ref = Expr::column("x"); + let col_ref = Expr::simple_column("x"); let cases = [ (col_ref.clone(), "Column(x)"), (col_ref.clone().eq(2), "Column(x) = 2"), diff --git a/kernel/src/expressions/scalars.rs b/kernel/src/expressions/scalars.rs index 8c934aa3a..9ec7b8cfd 100644 --- a/kernel/src/expressions/scalars.rs +++ b/kernel/src/expressions/scalars.rs @@ -555,7 +555,7 @@ mod tests { elements: vec![Scalar::Integer(1), Scalar::Integer(2), Scalar::Integer(3)], }); - let column = Expression::column("item"); + let column = Expression::simple_column("item"); let array_op = Expression::binary(BinaryOperator::In, 10, array.clone()); let array_not_op = Expression::binary(BinaryOperator::NotIn, 10, array); let column_op = Expression::binary(BinaryOperator::In, PI, column.clone()); diff --git a/kernel/src/scan/data_skipping.rs b/kernel/src/scan/data_skipping.rs index 0bc8e3e08..50171d0e7 100644 --- a/kernel/src/scan/data_skipping.rs +++ b/kernel/src/scan/data_skipping.rs @@ -8,17 +8,19 @@ use tracing::debug; use crate::actions::visitors::SelectionVectorVisitor; use crate::actions::{get_log_schema, ADD_NAME}; use crate::error::DeltaResult; -use crate::expressions::{BinaryOperator, Expression as Expr, UnaryOperator, VariadicOperator}; +use crate::expressions::{ + BinaryOperator, ColumnName, Expression as Expr, UnaryOperator, VariadicOperator, +}; use crate::schema::{DataType, PrimitiveType, SchemaRef, SchemaTransform, StructField, StructType}; use crate::{Engine, EngineData, ExpressionEvaluator, JsonHandler}; /// Get the expression that checks if a col could be null, assuming tight_bounds = true. In this /// case a column can contain null if any value > 0 is in the nullCount. This is further complicated /// by the default for tightBounds being true, so we have to check if it's EITHER `null` OR `true` -fn get_tight_null_expr(null_col: String) -> Expr { +fn get_tight_null_expr(null_col: ColumnName) -> Expr { Expr::and( - Expr::distinct(Expr::column("tightBounds"), false), - Expr::gt(Expr::column(null_col), 0i64), + Expr::distinct(Expr::simple_column("tightBounds"), false), + Expr::gt(null_col.into(), 0i64), ) } @@ -26,20 +28,20 @@ fn get_tight_null_expr(null_col: String) -> Expr { /// case, we can only check if the WHOLE column is null, by checking if the number of records is /// equal to the null count, since all other values of nullCount must be ignored (except 0, which /// doesn't help us) -fn get_wide_null_expr(null_col: String) -> Expr { +fn get_wide_null_expr(null_col: ColumnName) -> Expr { Expr::and( - Expr::eq(Expr::column("tightBounds"), false), - Expr::eq(Expr::column("numRecords"), Expr::column(null_col)), + Expr::eq(Expr::simple_column("tightBounds"), false), + Expr::eq(Expr::simple_column("numRecords"), null_col), ) } /// Get the expression that checks if a col could NOT be null, assuming tight_bounds = true. In this /// case a column has a NOT NULL record if nullCount < numRecords. This is further complicated by /// the default for tightBounds being true, so we have to check if it's EITHER `null` OR `true` -fn get_tight_not_null_expr(null_col: String) -> Expr { +fn get_tight_not_null_expr(null_col: ColumnName) -> Expr { Expr::and( - Expr::distinct(Expr::column("tightBounds"), false), - Expr::lt(Expr::column(null_col), Expr::column("numRecords")), + Expr::distinct(Expr::simple_column("tightBounds"), false), + Expr::lt(null_col.into(), Expr::simple_column("numRecords")), ) } @@ -47,10 +49,10 @@ fn get_tight_not_null_expr(null_col: String) -> Expr { /// this case, we can only check if the WHOLE column null, by checking if the nullCount == /// numRecords. So by inverting that check and seeing if nullCount != numRecords, we can check if /// there is a possibility of a NOT null -fn get_wide_not_null_expr(null_col: String) -> Expr { +fn get_wide_not_null_expr(null_col: ColumnName) -> Expr { Expr::and( - Expr::eq(Expr::column("tightBounds"), false), - Expr::ne(Expr::column("numRecords"), Expr::column(null_col)), + Expr::eq(Expr::simple_column("tightBounds"), false), + Expr::ne(Expr::simple_column("numRecords"), null_col), ) } @@ -64,7 +66,7 @@ fn as_inverted_data_skipping_predicate(expr: &Expr) -> Option { // to check if a column could NOT have a null, we need two different checks, to see // if the bounds are tight and then to actually do the check if let Column(col) = expr.as_ref() { - let null_col = format!("nullCount.{col}"); + let null_col = ColumnName::join("nullCount", col.clone()); Some(Expr::or( get_tight_not_null_expr(null_col.clone()), get_wide_not_null_expr(null_col), @@ -126,14 +128,20 @@ fn as_data_skipping_predicate(expr: &Expr) -> Option { } NotEqual => { return Some(Expr::or( - Expr::gt(Column(format!("minValues.{}", col)), val.clone()), - Expr::lt(Column(format!("maxValues.{}", col)), val.clone()), + Expr::gt( + ColumnName::join("minValues", col.clone()).into(), + val.clone(), + ), + Expr::lt( + ColumnName::join("maxValues", col.clone()).into(), + val.clone(), + ), )); } _ => return None, // unsupported operation }; - let col = format!("{}.{}", stats_col, col); - Some(Expr::binary(op, Column(col), val.clone())) + let col = ColumnName::join(stats_col, col.clone()); + Some(Expr::binary(op, col, val.clone())) } // push down Not by inverting everything below it UnaryOperation { op: Not, expr } => as_inverted_data_skipping_predicate(expr), @@ -141,7 +149,7 @@ fn as_data_skipping_predicate(expr: &Expr) -> Option { // to check if a column could have a null, we need two different checks, to see if // the bounds are tight and then to actually do the check if let Column(col) = expr.as_ref() { - let null_col = format!("nullCount.{col}"); + let null_col = ColumnName::join("nullCount", col.clone()); Some(Expr::or( get_tight_null_expr(null_col.clone()), get_wide_null_expr(null_col), @@ -184,9 +192,9 @@ impl DataSkippingFilter { static PREDICATE_SCHEMA: LazyLock = LazyLock::new(|| { DataType::struct_type([StructField::new("predicate", DataType::BOOLEAN, true)]) }); - static STATS_EXPR: LazyLock = LazyLock::new(|| Expr::column("add.stats")); + static STATS_EXPR: LazyLock = LazyLock::new(|| Expr::split_column("add.stats")); static FILTER_EXPR: LazyLock = - LazyLock::new(|| Expr::column("predicate").distinct(false)); + LazyLock::new(|| Expr::simple_column("predicate").distinct(false)); let Some(predicate) = predicate else { return None; @@ -199,7 +207,7 @@ impl DataSkippingFilter { // extracting the corresponding field from the table schema, and inserting that field. let data_fields: Vec<_> = table_schema .fields() - .filter(|field| field_names.contains(&field.name.as_str())) + .filter(|field| field_names.contains(&field.name)) .cloned() .collect(); if data_fields.is_empty() { @@ -310,10 +318,10 @@ mod tests { #[test] fn test_rewrite_basic_comparison() { - let column = Expr::column("a"); + let column = Expr::simple_column("a"); let lit_int = Expr::literal(1_i32); - let min_col = Expr::column("minValues.a"); - let max_col = Expr::column("maxValues.a"); + let min_col = Expr::split_column("minValues.a"); + let max_col = Expr::split_column("maxValues.a"); let cases = [ ( diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 4c9ee34ac..92b1dc583 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -161,12 +161,12 @@ impl LogReplayScanner { fn get_add_transform_expr(&self) -> Expression { Expression::Struct(vec![ - Expression::column("add.path"), - Expression::column("add.size"), - Expression::column("add.modificationTime"), - Expression::column("add.stats"), - Expression::column("add.deletionVector"), - Expression::Struct(vec![Expression::column("add.partitionValues")]), + Expression::split_column("add.path"), + Expression::split_column("add.size"), + Expression::split_column("add.modificationTime"), + Expression::split_column("add.stats"), + Expression::split_column("add.deletionVector"), + Expression::Struct(vec![Expression::split_column("add.partitionValues")]), ]) } diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs index b37452668..a91dd26c6 100644 --- a/kernel/src/scan/mod.rs +++ b/kernel/src/scan/mod.rs @@ -9,7 +9,7 @@ use url::Url; use crate::actions::deletion_vector::{split_vector, treemap_to_bools, DeletionVectorDescriptor}; use crate::actions::{get_log_schema, ADD_NAME, REMOVE_NAME}; -use crate::expressions::{Expression, Scalar}; +use crate::expressions::{ColumnName, Expression, Scalar}; use crate::features::ColumnMappingMode; use crate::scan::state::{DvInfo, Stats}; use crate::schema::{DataType, Schema, SchemaRef, StructField, StructType}; @@ -167,7 +167,7 @@ impl ScanResult { /// to materialize the partition column. pub enum ColumnType { // A column, selected from the data, as is - Selected(String), + Selected(ColumnName), // A partition column that needs to be added back in Partition(usize), } @@ -427,7 +427,7 @@ fn get_state_info( let physical_field = logical_field.make_physical(column_mapping_mode)?; let physical_name = physical_field.name.clone(); read_fields.push(physical_field); - Ok(ColumnType::Selected(physical_name)) + Ok(ColumnType::Selected(ColumnName::simple(physical_name))) } }) .try_collect()?; @@ -498,7 +498,7 @@ fn transform_to_logical_internal( )?; Ok::(value_expression.into()) } - ColumnType::Selected(field_name) => Ok(Expression::column(field_name)), + ColumnType::Selected(field_name) => Ok(field_name.clone().into()), }) .try_collect()?; let read_expression = Expression::Struct(all_fields); @@ -763,7 +763,7 @@ mod tests { assert_eq!(data.len(), 1); // Ineffective predicate pushdown attempted, so the one data file should be returned. - let int_col = Expression::column("numeric.ints.int32"); + let int_col = Expression::split_column("numeric.ints.int32"); let value = Expression::literal(1000i32); let predicate = int_col.clone().gt(value.clone()); let scan = snapshot diff --git a/kernel/src/snapshot.rs b/kernel/src/snapshot.rs index 6c3aa2140..a3ffc7787 100644 --- a/kernel/src/snapshot.rs +++ b/kernel/src/snapshot.rs @@ -110,8 +110,8 @@ impl LogSegment { // filter out log files that do not contain metadata or protocol information use Expression as Expr; let meta_predicate = Expr::or( - Expr::column("metaData.id").is_not_null(), - Expr::column("protocol.minReaderVersion").is_not_null(), + Expr::split_column("metaData.id").is_not_null(), + Expr::split_column("protocol.minReaderVersion").is_not_null(), ); // read the same protocol and metadata schema for both commits and checkpoints self.replay(engine, schema.clone(), schema, Some(meta_predicate)) diff --git a/kernel/tests/read.rs b/kernel/tests/read.rs index 0314321f1..0fb9916b8 100644 --- a/kernel/tests/read.rs +++ b/kernel/tests/read.rs @@ -339,7 +339,7 @@ async fn stats() -> Result<(), Box> { (NotEqual, 8, vec![&batch2, &batch1]), ]; for (op, value, expected_batches) in test_cases { - let predicate = Expression::binary(op, Expression::column("id"), value); + let predicate = Expression::binary(op, Expression::simple_column("id"), value); let scan = snapshot .clone() .scan_builder() @@ -654,27 +654,27 @@ fn table_for_numbers(nums: Vec) -> Vec { fn predicate_on_number() -> Result<(), Box> { let cases = vec![ ( - Expression::column("number").lt(4i64), + Expression::simple_column("number").lt(4i64), table_for_numbers(vec![1, 2, 3]), ), ( - Expression::column("number").le(4i64), + Expression::simple_column("number").le(4i64), table_for_numbers(vec![1, 2, 3, 4]), ), ( - Expression::column("number").gt(4i64), + Expression::simple_column("number").gt(4i64), table_for_numbers(vec![5, 6]), ), ( - Expression::column("number").ge(4i64), + Expression::simple_column("number").ge(4i64), table_for_numbers(vec![4, 5, 6]), ), ( - Expression::column("number").eq(4i64), + Expression::simple_column("number").eq(4i64), table_for_numbers(vec![4]), ), ( - Expression::column("number").ne(4i64), + Expression::simple_column("number").ne(4i64), table_for_numbers(vec![1, 2, 3, 5, 6]), ), ]; @@ -694,27 +694,27 @@ fn predicate_on_number() -> Result<(), Box> { fn predicate_on_number_not() -> Result<(), Box> { let cases = vec![ ( - Expression::not(Expression::column("number").lt(4i64)), + Expression::not(Expression::simple_column("number").lt(4i64)), table_for_numbers(vec![4, 5, 6]), ), ( - Expression::not(Expression::column("number").le(4i64)), + Expression::not(Expression::simple_column("number").le(4i64)), table_for_numbers(vec![5, 6]), ), ( - Expression::not(Expression::column("number").gt(4i64)), + Expression::not(Expression::simple_column("number").gt(4i64)), table_for_numbers(vec![1, 2, 3, 4]), ), ( - Expression::not(Expression::column("number").ge(4i64)), + Expression::not(Expression::simple_column("number").ge(4i64)), table_for_numbers(vec![1, 2, 3]), ), ( - Expression::not(Expression::column("number").eq(4i64)), + Expression::not(Expression::simple_column("number").eq(4i64)), table_for_numbers(vec![1, 2, 3, 5, 6]), ), ( - Expression::not(Expression::column("number").ne(4i64)), + Expression::not(Expression::simple_column("number").ne(4i64)), table_for_numbers(vec![4]), ), ]; @@ -743,8 +743,8 @@ fn predicate_on_number_with_not_null() -> Result<(), Box> "./tests/data/basic_partitioned", Some(&["a_float", "number"]), Some(Expression::and( - Expression::column("number").is_not_null(), - Expression::column("number").lt(Expression::literal(3i64)), + Expression::simple_column("number").is_not_null(), + Expression::simple_column("number").lt(Expression::literal(3i64)), )), expected, )?; @@ -757,7 +757,7 @@ fn predicate_null() -> Result<(), Box> { read_table_data_str( "./tests/data/basic_partitioned", Some(&["a_float", "number"]), - Some(Expression::column("number").is_null()), + Some(Expression::simple_column("number").is_null()), expected, )?; Ok(()) @@ -784,7 +784,7 @@ fn mixed_null() -> Result<(), Box> { read_table_data_str( "./tests/data/mixed-nulls", Some(&["part", "n"]), - Some(Expression::column("n").is_null()), + Some(Expression::simple_column("n").is_null()), expected, )?; Ok(()) @@ -811,7 +811,7 @@ fn mixed_not_null() -> Result<(), Box> { read_table_data_str( "./tests/data/mixed-nulls", Some(&["part", "n"]), - Some(Expression::column("n").is_not_null()), + Some(Expression::simple_column("n").is_not_null()), expected, )?; Ok(()) @@ -821,27 +821,31 @@ fn mixed_not_null() -> Result<(), Box> { fn and_or_predicates() -> Result<(), Box> { let cases = vec![ ( - Expression::column("number") + Expression::simple_column("number") .gt(4i64) - .and(Expression::column("a_float").gt(5.5)), + .and(Expression::simple_column("a_float").gt(5.5)), table_for_numbers(vec![6]), ), ( - Expression::column("number") + Expression::simple_column("number") .gt(4i64) - .and(Expression::not(Expression::column("a_float").gt(5.5))), + .and(Expression::not( + Expression::simple_column("a_float").gt(5.5), + )), table_for_numbers(vec![5]), ), ( - Expression::column("number") + Expression::simple_column("number") .gt(4i64) - .or(Expression::column("a_float").gt(5.5)), + .or(Expression::simple_column("a_float").gt(5.5)), table_for_numbers(vec![5, 6]), ), ( - Expression::column("number") + Expression::simple_column("number") .gt(4i64) - .or(Expression::not(Expression::column("a_float").gt(5.5))), + .or(Expression::not( + Expression::simple_column("a_float").gt(5.5), + )), table_for_numbers(vec![1, 2, 3, 4, 5, 6]), ), ]; @@ -861,33 +865,37 @@ fn not_and_or_predicates() -> Result<(), Box> { let cases = vec![ ( Expression::not( - Expression::column("number") + Expression::simple_column("number") .gt(4i64) - .and(Expression::column("a_float").gt(5.5)), + .and(Expression::simple_column("a_float").gt(5.5)), ), table_for_numbers(vec![1, 2, 3, 4, 5]), ), ( Expression::not( - Expression::column("number") + Expression::simple_column("number") .gt(4i64) - .and(Expression::not(Expression::column("a_float").gt(5.5))), + .and(Expression::not( + Expression::simple_column("a_float").gt(5.5), + )), ), table_for_numbers(vec![1, 2, 3, 4, 6]), ), ( Expression::not( - Expression::column("number") + Expression::simple_column("number") .gt(4i64) - .or(Expression::column("a_float").gt(5.5)), + .or(Expression::simple_column("a_float").gt(5.5)), ), table_for_numbers(vec![1, 2, 3, 4]), ), ( Expression::not( - Expression::column("number") + Expression::simple_column("number") .gt(4i64) - .or(Expression::not(Expression::column("a_float").gt(5.5))), + .or(Expression::not( + Expression::simple_column("a_float").gt(5.5), + )), ), vec![], ), @@ -912,23 +920,25 @@ fn invalid_skips_none_predicates() -> Result<(), Box> { table_for_numbers(vec![1, 2, 3, 4, 5, 6]), ), ( - Expression::column("number").distinct(3i64), + Expression::simple_column("number").distinct(3i64), table_for_numbers(vec![1, 2, 3, 4, 5, 6]), ), ( - Expression::column("number").gt(empty_struct.clone()), + Expression::simple_column("number").gt(empty_struct.clone()), table_for_numbers(vec![1, 2, 3, 4, 5, 6]), ), ( - Expression::column("number").and(empty_struct.clone().is_null()), + Expression::simple_column("number").and(empty_struct.clone().is_null()), table_for_numbers(vec![1, 2, 3, 4, 5, 6]), ), ( - Expression::not(Expression::column("number").gt(empty_struct.clone())), + Expression::not(Expression::simple_column("number").gt(empty_struct.clone())), table_for_numbers(vec![1, 2, 3, 4, 5, 6]), ), ( - Expression::not(Expression::column("number").and(empty_struct.clone().is_null())), + Expression::not( + Expression::simple_column("number").and(empty_struct.clone().is_null()), + ), table_for_numbers(vec![1, 2, 3, 4, 5, 6]), ), ]; @@ -962,7 +972,7 @@ fn with_predicate_and_removes() -> Result<(), Box> { read_table_data_str( "./tests/data/table-with-dv-small/", None, - Some(Expression::gt(Expression::column("value"), 3)), + Some(Expression::gt(Expression::simple_column("value"), 3)), expected, )?; Ok(()) From 491d946d5db2bbeb8ce285278c02892859c4b739 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Mon, 21 Oct 2024 14:40:19 -0700 Subject: [PATCH 4/6] use proc macro for safety --- derive-macros/src/lib.rs | 41 ++- ffi/src/expressions.rs | 7 +- kernel/src/actions/set_transaction.rs | 10 +- kernel/src/engine/arrow_expression.rs | 36 +-- .../parquet_row_group_skipping/tests.rs | 31 ++- .../engine/parquet_stats_skipping/tests.rs | 12 +- kernel/src/expressions/column_name.rs | 258 ++++++++++++++++-- kernel/src/expressions/mod.rs | 21 +- kernel/src/expressions/scalars.rs | 4 +- kernel/src/scan/data_skipping.rs | 55 ++-- kernel/src/scan/log_replay.rs | 14 +- kernel/src/scan/mod.rs | 6 +- kernel/src/snapshot.rs | 5 +- kernel/tests/read.rs | 92 +++---- 14 files changed, 409 insertions(+), 183 deletions(-) diff --git a/derive-macros/src/lib.rs b/derive-macros/src/lib.rs index 671bf9220..968a9c150 100644 --- a/derive-macros/src/lib.rs +++ b/derive-macros/src/lib.rs @@ -1,7 +1,46 @@ use proc_macro2::{Ident, TokenStream}; use quote::{quote, quote_spanned}; use syn::spanned::Spanned; -use syn::{parse_macro_input, Data, DataStruct, DeriveInput, Fields, Meta, PathArguments, Type}; +use syn::{ + parse_macro_input, Data, DataStruct, DeriveInput, Error, Fields, Meta, PathArguments, Type, +}; + +fn parse_column_name( + input: proc_macro::TokenStream, + allow_dots: bool, + transform: impl FnOnce(syn::LitStr) -> TokenStream, +) -> proc_macro::TokenStream { + let is_valid = |c: char| c.is_ascii_alphanumeric() || c == '_' || (allow_dots && c == '.'); + let err = match syn::parse(input) { + Ok(syn::Lit::Str(name)) => match name.value().chars().find(|c| !is_valid(*c)) { + None => return transform(name).into(), + Some(bad_char) => Error::new(name.span(), format!("Invalid character: {bad_char:?}")), + }, + Ok(lit) => Error::new(lit.span(), "Expected a string literal"), + Err(err) => err, + }; + err.into_compile_error().into() +} + +/// Parses a simple column name into a single-element array of field names. See +/// [`delta_kernel::expressions::column_name::simple_column_name`] for details. +#[proc_macro] +pub fn parse_simple_column_name(input: proc_macro::TokenStream) -> proc_macro::TokenStream { + parse_column_name(input, false, |name| { + quote_spanned! { name.span() => [#name] } + }) +} + +/// Parses a nested column name into an array of simple field names. See +/// [`delta_kernel::expressions::column_name::nested_column_name`] for details. +#[proc_macro] +pub fn parse_nested_column_name(input: proc_macro::TokenStream) -> proc_macro::TokenStream { + parse_column_name(input, true, |name| { + let path = name.value(); + let path = path.split('.').map(proc_macro2::Literal::string); + quote_spanned! { name.span() => [#(#path),*] } + }) +} /// Derive a `delta_kernel::schemas::ToDataType` implementation for the annotated struct. The actual /// field names in the schema (and therefore of the struct members) are all mandated by the Delta diff --git a/ffi/src/expressions.rs b/ffi/src/expressions.rs index 64495e9de..19c334038 100644 --- a/ffi/src/expressions.rs +++ b/ffi/src/expressions.rs @@ -5,7 +5,7 @@ use crate::{ ReferenceSet, TryFromStringSlice, }; use delta_kernel::{ - expressions::{BinaryOperator, Expression, UnaryOperator}, + expressions::{BinaryOperator, ColumnName, Expression, UnaryOperator}, DeltaResult, }; @@ -146,8 +146,9 @@ fn visit_expression_column_impl( state: &mut KernelExpressionVisitorState, name: DeltaResult, ) -> DeltaResult { - // TODO: We can't actually assume the column name is so easily splittable! - Ok(wrap_expression(state, Expression::split_column(name?))) + // TODO: FIXME: This is incorrect if any field name in the column path contains a period. + let name = ColumnName::new(name?.split('.')).into(); + Ok(wrap_expression(state, name)) } #[no_mangle] diff --git a/kernel/src/actions/set_transaction.rs b/kernel/src/actions/set_transaction.rs index 06c85d274..c2134ee43 100644 --- a/kernel/src/actions/set_transaction.rs +++ b/kernel/src/actions/set_transaction.rs @@ -2,8 +2,9 @@ use std::sync::{Arc, LazyLock}; use crate::actions::visitors::SetTransactionVisitor; use crate::actions::{get_log_schema, SetTransaction, SET_TRANSACTION_NAME}; +use crate::expressions::nested_column; use crate::snapshot::Snapshot; -use crate::{DeltaResult, Engine, EngineData, Expression, ExpressionRef, SchemaRef}; +use crate::{DeltaResult, Engine, EngineData, ExpressionRef, SchemaRef}; pub use crate::actions::visitors::SetTransactionMap; pub struct SetTransactionScanner { @@ -52,11 +53,8 @@ impl SetTransactionScanner { // checkpoint part when patitioned by `add.path` like the Delta spec requires. There's no // point filtering by a particular app id, even if we have one, because app ids are all in // the a single checkpoint part having large min/max range (because they're usually uuids). - static META_PREDICATE: LazyLock> = LazyLock::new(|| { - Some(Arc::new( - Expression::split_column("txn.appId").is_not_null(), - )) - }); + static META_PREDICATE: LazyLock> = + LazyLock::new(|| Some(Arc::new(nested_column!("txn.appId").is_not_null()))); self.snapshot .log_segment .replay(engine, schema.clone(), schema, META_PREDICATE.clone()) diff --git a/kernel/src/engine/arrow_expression.rs b/kernel/src/engine/arrow_expression.rs index 755f9ae49..35fc04499 100644 --- a/kernel/src/engine/arrow_expression.rs +++ b/kernel/src/engine/arrow_expression.rs @@ -431,10 +431,9 @@ mod tests { let array = ListArray::new(field.clone(), offsets, Arc::new(values), None); let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(array.clone())]).unwrap(); - let not_op = - Expression::binary(BinaryOperator::NotIn, 5, Expression::simple_column("item")); + let not_op = Expression::binary(BinaryOperator::NotIn, 5, simple_column!("item")); - let in_op = Expression::binary(BinaryOperator::In, 5, Expression::simple_column("item")); + let in_op = Expression::binary(BinaryOperator::In, 5, simple_column!("item")); let result = evaluate_expression(¬_op, &batch, None).unwrap(); let expected = BooleanArray::from(vec![true, false, true]); @@ -452,7 +451,7 @@ mod tests { let schema = Schema::new([field.clone()]); let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(values.clone())]).unwrap(); - let in_op = Expression::binary(BinaryOperator::NotIn, 5, Expression::simple_column("item")); + let in_op = Expression::binary(BinaryOperator::NotIn, 5, simple_column!("item")); let in_result = evaluate_expression(&in_op, &batch, None); @@ -497,8 +496,8 @@ mod tests { let in_op = Expression::binary( BinaryOperator::NotIn, - Expression::simple_column("item"), - Expression::simple_column("item"), + simple_column!("item"), + simple_column!("item"), ); let in_result = evaluate_expression(&in_op, &batch, None); @@ -522,14 +521,9 @@ mod tests { let array = ListArray::new(field.clone(), offsets, Arc::new(values), None); let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(array.clone())]).unwrap(); - let str_not_op = Expression::binary( - BinaryOperator::NotIn, - "bye", - Expression::simple_column("item"), - ); + let str_not_op = Expression::binary(BinaryOperator::NotIn, "bye", simple_column!("item")); - let str_in_op = - Expression::binary(BinaryOperator::In, "hi", Expression::simple_column("item")); + let str_in_op = Expression::binary(BinaryOperator::In, "hi", simple_column!("item")); let result = evaluate_expression(&str_in_op, &batch, None).unwrap(); let expected = BooleanArray::from(vec![true, true, true]); @@ -546,7 +540,7 @@ mod tests { let values = Int32Array::from(vec![1, 2, 3]); let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(values.clone())]).unwrap(); - let column = Expression::simple_column("a"); + let column = simple_column!("a"); let results = evaluate_expression(&column, &batch, None).unwrap(); assert_eq!(results.as_ref(), &values); @@ -567,7 +561,7 @@ mod tests { vec![Arc::new(struct_array.clone())], ) .unwrap(); - let column = Expression::split_column("b.a"); + let column = nested_column!("b.a"); let results = evaluate_expression(&column, &batch, None).unwrap(); assert_eq!(results.as_ref(), &values); } @@ -577,7 +571,7 @@ mod tests { let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]); let values = Int32Array::from(vec![1, 2, 3]); let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(values)]).unwrap(); - let column = Expression::simple_column("a"); + let column = simple_column!("a"); let expression = column.clone().add(1); let results = evaluate_expression(&expression, &batch, None).unwrap(); @@ -613,8 +607,8 @@ mod tests { vec![Arc::new(values.clone()), Arc::new(values)], ) .unwrap(); - let column_a = Expression::simple_column("a"); - let column_b = Expression::simple_column("b"); + let column_a = simple_column!("a"); + let column_b = simple_column!("b"); let expression = column_a.clone().add(column_b.clone()); let results = evaluate_expression(&expression, &batch, None).unwrap(); @@ -637,7 +631,7 @@ mod tests { let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]); let values = Int32Array::from(vec![1, 2, 3]); let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(values)]).unwrap(); - let column = Expression::simple_column("a"); + let column = simple_column!("a"); let expression = column.clone().lt(2); let results = evaluate_expression(&expression, &batch, None).unwrap(); @@ -684,8 +678,8 @@ mod tests { ], ) .unwrap(); - let column_a = Expression::simple_column("a"); - let column_b = Expression::simple_column("b"); + let column_a = simple_column!("a"); + let column_b = simple_column!("b"); let expression = column_a.clone().and(column_b.clone()); let results = diff --git a/kernel/src/engine/parquet_row_group_skipping/tests.rs b/kernel/src/engine/parquet_row_group_skipping/tests.rs index 370b0c115..9c2e0e630 100644 --- a/kernel/src/engine/parquet_row_group_skipping/tests.rs +++ b/kernel/src/engine/parquet_row_group_skipping/tests.rs @@ -1,4 +1,5 @@ use super::*; +use crate::expressions::{nested_column, simple_column}; use crate::Expression; use parquet::arrow::arrow_reader::ArrowReaderMetadata; use std::fs::File; @@ -39,21 +40,21 @@ fn test_get_stat_values() { // The expression doesn't matter -- it just needs to mention all the columns we care about. let columns = Expression::and_from(vec![ - Expression::split_column("varlen.utf8"), - Expression::split_column("numeric.ints.int64"), - Expression::split_column("numeric.ints.int32"), - Expression::split_column("numeric.ints.int16"), - Expression::split_column("numeric.ints.int8"), - Expression::split_column("numeric.floats.float32"), - Expression::split_column("numeric.floats.float64"), - Expression::simple_column("bool"), - Expression::split_column("varlen.binary"), - Expression::split_column("numeric.decimals.decimal32"), - Expression::split_column("numeric.decimals.decimal64"), - Expression::split_column("numeric.decimals.decimal128"), - Expression::split_column("chrono.date32"), - Expression::split_column("chrono.timestamp"), - Expression::split_column("chrono.timestamp_ntz"), + nested_column!("varlen.utf8"), + nested_column!("numeric.ints.int64"), + nested_column!("numeric.ints.int32"), + nested_column!("numeric.ints.int16"), + nested_column!("numeric.ints.int8"), + nested_column!("numeric.floats.float32"), + nested_column!("numeric.floats.float64"), + simple_column!("bool"), + nested_column!("varlen.binary"), + nested_column!("numeric.decimals.decimal32"), + nested_column!("numeric.decimals.decimal64"), + nested_column!("numeric.decimals.decimal128"), + nested_column!("chrono.date32"), + nested_column!("chrono.timestamp"), + nested_column!("chrono.timestamp_ntz"), ]); let filter = RowGroupFilter::new(metadata.metadata().row_group(0), &columns); diff --git a/kernel/src/engine/parquet_stats_skipping/tests.rs b/kernel/src/engine/parquet_stats_skipping/tests.rs index f28cfc6f5..9aa8c9228 100644 --- a/kernel/src/engine/parquet_stats_skipping/tests.rs +++ b/kernel/src/engine/parquet_stats_skipping/tests.rs @@ -1,5 +1,5 @@ use super::*; -use crate::expressions::{ArrayData, StructData}; +use crate::expressions::{simple_column, ArrayData, StructData}; use crate::schema::ArrayType; use crate::DataType; @@ -337,7 +337,7 @@ fn test_binary_eq_ne() { const LO: Scalar = Scalar::Long(1); const MID: Scalar = Scalar::Long(10); const HI: Scalar = Scalar::Long(100); - let col = &Expression::simple_column("x"); + let col = &simple_column!("x"); for inverted in [false, true] { // negative test -- mismatched column type @@ -485,7 +485,7 @@ fn test_binary_lt_ge() { const LO: Scalar = Scalar::Long(1); const MID: Scalar = Scalar::Long(10); const HI: Scalar = Scalar::Long(100); - let col = &Expression::simple_column("x"); + let col = &simple_column!("x"); for inverted in [false, true] { expect_eq!( @@ -585,7 +585,7 @@ fn test_binary_le_gt() { const LO: Scalar = Scalar::Long(1); const MID: Scalar = Scalar::Long(10); const HI: Scalar = Scalar::Long(100); - let col = &Expression::simple_column("x"); + let col = &simple_column!("x"); for inverted in [false, true] { // negative test -- mismatched column type @@ -736,7 +736,7 @@ impl ParquetStatsSkippingFilter for NullCountTestFilter { fn test_not_null() { use UnaryOperator::IsNull; - let col = &Expression::simple_column("x"); + let col = &simple_column!("x"); for inverted in [false, true] { expect_eq!( NullCountTestFilter::new(None, 10).apply_unary(IsNull, col, inverted), @@ -809,7 +809,7 @@ impl ParquetStatsSkippingFilter for AllNullTestFilter { #[test] fn test_sql_where() { - let col = &Expression::simple_column("x"); + let col = &simple_column!("x"); let val = &Expression::literal(1); const NULL: Expression = Expression::Literal(Scalar::Null(DataType::BOOLEAN)); const FALSE: Expression = Expression::Literal(Scalar::Boolean(false)); diff --git a/kernel/src/expressions/column_name.rs b/kernel/src/expressions/column_name.rs index affaa1c54..e0f1f718e 100644 --- a/kernel/src/expressions/column_name.rs +++ b/kernel/src/expressions/column_name.rs @@ -2,7 +2,6 @@ use std::borrow::Borrow; use std::fmt::{Display, Formatter}; use std::hash::{Hash, Hasher}; use std::ops::Deref; - /// A (possibly nested) column name. // TODO: Track name as a path rather than a single string #[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)] @@ -11,22 +10,17 @@ pub struct ColumnName { } impl ColumnName { - /// Creates a simple (non-nested) column name. - // TODO: A simple path name containing periods will be treated as nested right now. - pub fn simple(name: impl Into) -> Self { - Self { path: name.into() } - } - - /// Parses a nested column name by splitting it at periods. Caller affirms that field names in - /// the input string do not contain any periods. - pub fn split(path: impl AsRef) -> Self { - // TODO: Actually parse this! - Self::simple(path.as_ref()) + /// Constructs a new column name from an iterator of field names. The field names are joined + /// together to make a single path. + pub fn new(path: impl IntoIterator>) -> Self { + let path: Vec<_> = path.into_iter().map(Into::into).collect(); + let path = path.join("."); + Self { path } } - /// Joins two column names - pub fn join(left: impl Into, right: impl Into) -> ColumnName { - Self::simple(format!("{}.{}", left.into(), right.into())) + /// Joins two column names, concatenating their fields into a single nested column path. + pub fn join(left: ColumnName, right: ColumnName) -> ColumnName { + [left, right].into_iter().collect() } /// The path of field names for this column name @@ -34,15 +28,30 @@ impl ColumnName { &self.path } - /// Consumes this column name and returns the path of field names as a vector. + /// Consumes this column name and returns the path of field names. pub fn into_inner(self) -> String { self.path } } -impl> From for ColumnName { - fn from(value: T) -> Self { - Self::simple(value) +/// Creates a new column name from a path of field names. Each field name is taken as-is, and may +/// contain arbitrary characters (including periods, spaces, etc.). +impl> FromIterator for ColumnName { + fn from_iter(iter: T) -> Self + where + T: IntoIterator, + { + Self::new(iter) + } +} + +/// Creates a new column name by joining multiple column names together. +impl FromIterator for ColumnName { + fn from_iter(iter: T) -> Self + where + T: IntoIterator, + { + Self::new(iter.into_iter().map(ColumnName::into_inner)) } } @@ -85,15 +94,220 @@ impl Hash for ColumnName { } } +/// Creates a simple (non-nested) column name. +/// +/// To avoid accidental misuse, the argument must be a string literal and must not contain any +/// dots. Thus, the following invocations would both fail to compile: +/// +/// ```fail_compile +/// # use delta_kernel::expressions::simple_column_name; +/// let s = "a"; +/// let name = simple_column_name!(s); // not a string literal +/// ``` +/// +/// ```fail_compile +/// # use delta_kernel::expressions::simple_column_name; +/// let name = simple_column_name!("a.b"); // contains dots +/// ``` +/// +/// NOTE: `simple_column_name!("a.b.c")` produces a non-nested column whose name happens to contain +/// two period characters. To interpret the column name as nested with field names "a", "b" and "c", +/// use [`nested_column_name!`] instead. +// NOTE: Macros are only public if exported, which defines them at the root of the crate. But we +// don't want it there. So, we export a hidden macro and pub use it here where we actually want it. +#[macro_export] +#[doc(hidden)] +macro_rules! __simple_column_name { + ( $($name:tt)* ) => { + $crate::expressions::ColumnName::new(delta_kernel_derive::parse_simple_column_name!($($name)*)) + }; +} +#[doc(inline)] +pub use __simple_column_name as simple_column_name; + +/// Creates a nested column name whose field names are all simple column names. This macro is +/// provided as a convenience for the common case where the caller knows the column contains only +/// simple field names and that splitting by periods is safe. +/// +/// To avoid accidental misuse, the argument must be a string literal. Thus, the following +/// invocation would fail to compile: +/// +/// ```fail_compile +/// # use delta_kernel::expressions::nested_column_name; +/// let s = "a.b"; +/// let name = nested_column_name!(s); +/// ``` +/// +/// NOTE: `nested_column_name!("a.b.c")` produces a path with field names "a", "b", and "c". +#[macro_export] +#[doc(hidden)] +macro_rules! __nested_column_name { + ( $($name:tt)* ) => { + $crate::expressions::ColumnName::new(delta_kernel_derive::parse_nested_column_name!($($name)*)) + }; +} +#[doc(inline)] +pub use __nested_column_name as nested_column_name; + +/// Joins two column names together, when one or both inputs might be literal strings representing +/// simple (non-nested) column names. For example, the invocation `joined_column_name!("a", "b")` is +/// equivalent to `ColumnName::join(simple_column_name!("a"), simple_column_name!("b"))` +/// +/// To avoid accidental misuse, at least one argument must be a string literal. Thus, the following +/// invocation would fail to compile: +/// +/// ```fail_compile +/// # use delta_kernel::expressions::joined_column_name; +/// let s = "s"; +/// let name = joined_column_name!(s, s); +/// ``` +/// +/// NOTE: `joined_column_name!("a.b", "c")` produces a path with field names "a.b" and "c". If +/// either argument is a nested path delimited by periods, it should be split first and the result +/// passed to `joined_column_name!`, e.g. `joined_column_name!(nested_column_name!("a.b"), "c")`. +#[macro_export] +#[doc(hidden)] +macro_rules! __joined_column_name { + ( $left:literal, $right:literal ) => { + $crate::expressions::ColumnName::join( + $crate::__simple_column_name!($left), + $crate::__simple_column_name!($right), + ) + }; + ( $left:literal, $right:expr ) => { + $crate::expressions::ColumnName::join($crate::__simple_column_name!($left), $right) + }; + ( $left:expr, $right:literal) => { + $crate::expressions::ColumnName::join($left, $crate::__simple_column_name!($right)) + }; + ( $($other:tt)* ) => { + compile_error!("joined_column_name!() requires at least one string literal input") + }; +} +#[doc(inline)] +pub use __joined_column_name as joined_column_name; + +#[macro_export] +#[doc(hidden)] +macro_rules! __simple_column_expr { + ( $($name:tt)* ) => { + $crate::expressions::Expression::from($crate::expressions::simple_column_name!($($name)*)) + }; +} +#[doc(inline)] +pub use __simple_column_expr as simple_column; + +#[macro_export] +#[doc(hidden)] +macro_rules! __nested_column_expr { + ( $($name:tt)* ) => { + $crate::expressions::Expression::from($crate::expressions::nested_column_name!($($name)*)) + }; +} +#[doc(inline)] +pub use __nested_column_expr as nested_column; + +#[macro_export] +#[doc(hidden)] +macro_rules! __joined_column_expr { + ( $($name:tt)* ) => { + $crate::expressions::Expression::from($crate::expressions::joined_column_name!($($name)*)) + }; +} +#[doc(inline)] +pub use __joined_column_expr as joined_column; + #[cfg(test)] mod test { use super::*; + use delta_kernel_derive::{parse_nested_column_name, parse_simple_column_name}; #[test] - fn test_column_name() { + fn test_parse_column_name_macros() { + assert_eq!(parse_simple_column_name!("a"), ["a"]); + + assert_eq!(parse_nested_column_name!("a"), ["a"]); + assert_eq!(parse_nested_column_name!("a.b"), ["a", "b"]); + assert_eq!(parse_nested_column_name!("a.b.c"), ["a", "b", "c"]); + } + + #[test] + fn test_column_name_macros() { + let simple = simple_column_name!("x"); + let nested = nested_column_name!("x.y"); + + // fails to compile (argument is not a literal string, or contains illegal characters) + //simple_column_name!("a.b"); + //simple_column_name!("a b"); + //simple_column_name!(simple); + //simple_column_name!(simple.clone()); + + assert_eq!(simple_column_name!("a"), ColumnName::new(["a"])); + + // fails to compile (argument is not a literal string, or contains illegal characters) + //nested_column_name!("a b.c d"); + //nested_column_name!(simple); + //nested_column_name!(simple.clone()); + + assert_eq!(nested_column_name!("a"), ColumnName::new(["a"])); // Silly, but legal. + assert_eq!(nested_column_name!("a.b"), ColumnName::new(["a", "b"])); + assert_eq!( + nested_column_name!("a.b.c"), + ColumnName::new(["a", "b", "c"]) + ); + + assert_eq!(joined_column_name!("a", "b"), ColumnName::new(["a", "b"])); + assert_eq!(joined_column_name!("a", "b"), ColumnName::new(["a", "b"])); + + assert_eq!( + joined_column_name!(simple.clone(), "b"), + ColumnName::new(["x", "b"]) + ); + assert_eq!( + joined_column_name!(nested.clone(), "b"), + ColumnName::new(["x.y", "b"]) + ); + assert_eq!( - ColumnName::join(ColumnName::split("a.b.c"), "d"), - ColumnName::split("a.b.c.d") + joined_column_name!("a", simple.clone()), + ColumnName::new(["a", "x"]) ); + assert_eq!( + joined_column_name!("a", nested.clone()), + ColumnName::new(["a", "x.y"]) + ); + + // fails to compile (none of the arguments is a literal, or one of the literals contains illegal characters) + //joined_column_name!(simple, simple); + //joined_column_name!(simple.clone(), simple.clone()); + //joined_column_name!("a.b", "c"); + //joined_column_name!("a", "b.c"); + //joined_column_name!("a.b", "c.d"); + } + + #[test] + fn test_column_name_methods() { + let simple = simple_column_name!("x"); + let nested = nested_column_name!("x.y"); + + // path() + assert_eq!(simple.path(), "x"); + assert_eq!(nested.path(), "x.y"); + + // into_inner() + assert_eq!(simple.clone().into_inner(), "x"); + assert_eq!(nested.clone().into_inner(), "x.y"); + + // impl Deref + let name: &str = &nested; + assert_eq!(name, "x.y"); + + // impl> FromIterator + let name: ColumnName = ["x", "y"].into_iter().collect(); + assert_eq!(name, nested); + + // impl FromIterator + let name: ColumnName = [nested, simple].into_iter().collect(); + assert_eq!(name, nested_column_name!("x.y.x")); } } diff --git a/kernel/src/expressions/mod.rs b/kernel/src/expressions/mod.rs index 03b59d786..eea578ac7 100644 --- a/kernel/src/expressions/mod.rs +++ b/kernel/src/expressions/mod.rs @@ -5,7 +5,10 @@ use std::fmt::{Display, Formatter}; use itertools::Itertools; -pub use self::column_name::ColumnName; +pub use self::column_name::{ + joined_column, joined_column_name, nested_column, nested_column_name, simple_column, + simple_column_name, ColumnName, +}; pub use self::scalars::{ArrayData, Scalar, StructData}; use crate::DataType; @@ -229,18 +232,6 @@ impl Expression { set } - /// Creates a new column reference, for a simple (not nested) column name. - /// See [`ColumnName::simple`] for details. - pub fn simple_column(name: impl Into) -> Self { - ColumnName::simple(name).into() - } - - /// Creates a new column reference, for a splittable nested column name. - /// See [`ColumnName::split`] for details. - pub fn split_column(name: impl AsRef) -> Self { - ColumnName::split(name.as_ref()).into() - } - /// Create a new expression for a literal value pub fn literal(value: impl Into) -> Self { Self::Literal(value.into()) @@ -425,11 +416,11 @@ impl> std::ops::Div for Expression { #[cfg(test)] mod tests { - use super::Expression as Expr; + use super::{simple_column, Expression as Expr}; #[test] fn test_expression_format() { - let col_ref = Expr::simple_column("x"); + let col_ref = simple_column!("x"); let cases = [ (col_ref.clone(), "Column(x)"), (col_ref.clone().eq(2), "Column(x) = 2"), diff --git a/kernel/src/expressions/scalars.rs b/kernel/src/expressions/scalars.rs index 9ec7b8cfd..e8f19984c 100644 --- a/kernel/src/expressions/scalars.rs +++ b/kernel/src/expressions/scalars.rs @@ -463,7 +463,7 @@ impl PrimitiveType { mod tests { use std::f32::consts::PI; - use crate::expressions::BinaryOperator; + use crate::expressions::{simple_column, BinaryOperator}; use crate::Expression; use super::*; @@ -555,7 +555,7 @@ mod tests { elements: vec![Scalar::Integer(1), Scalar::Integer(2), Scalar::Integer(3)], }); - let column = Expression::simple_column("item"); + let column = simple_column!("item"); let array_op = Expression::binary(BinaryOperator::In, 10, array.clone()); let array_not_op = Expression::binary(BinaryOperator::NotIn, 10, array); let column_op = Expression::binary(BinaryOperator::In, PI, column.clone()); diff --git a/kernel/src/scan/data_skipping.rs b/kernel/src/scan/data_skipping.rs index 1e78dd749..1c250700a 100644 --- a/kernel/src/scan/data_skipping.rs +++ b/kernel/src/scan/data_skipping.rs @@ -8,7 +8,8 @@ use crate::actions::visitors::SelectionVectorVisitor; use crate::actions::{get_log_schema, ADD_NAME}; use crate::error::DeltaResult; use crate::expressions::{ - BinaryOperator, ColumnName, Expression as Expr, ExpressionRef, UnaryOperator, VariadicOperator, + joined_column, nested_column, simple_column, simple_column_name, BinaryOperator, ColumnName, + Expression as Expr, ExpressionRef, UnaryOperator, VariadicOperator, }; use crate::schema::{DataType, PrimitiveType, SchemaRef, SchemaTransform, StructField, StructType}; use crate::{Engine, EngineData, ExpressionEvaluator, JsonHandler}; @@ -16,10 +17,10 @@ use crate::{Engine, EngineData, ExpressionEvaluator, JsonHandler}; /// Get the expression that checks if a col could be null, assuming tight_bounds = true. In this /// case a column can contain null if any value > 0 is in the nullCount. This is further complicated /// by the default for tightBounds being true, so we have to check if it's EITHER `null` OR `true` -fn get_tight_null_expr(null_col: ColumnName) -> Expr { +fn get_tight_null_expr(null_col: Expr) -> Expr { Expr::and( - Expr::distinct(Expr::simple_column("tightBounds"), false), - Expr::gt(null_col.into(), 0i64), + Expr::distinct(simple_column!("tightBounds"), false), + Expr::gt(null_col, 0i64), ) } @@ -27,20 +28,20 @@ fn get_tight_null_expr(null_col: ColumnName) -> Expr { /// case, we can only check if the WHOLE column is null, by checking if the number of records is /// equal to the null count, since all other values of nullCount must be ignored (except 0, which /// doesn't help us) -fn get_wide_null_expr(null_col: ColumnName) -> Expr { +fn get_wide_null_expr(null_col: Expr) -> Expr { Expr::and( - Expr::eq(Expr::simple_column("tightBounds"), false), - Expr::eq(Expr::simple_column("numRecords"), null_col), + Expr::eq(simple_column!("tightBounds"), false), + Expr::eq(simple_column!("numRecords"), null_col), ) } /// Get the expression that checks if a col could NOT be null, assuming tight_bounds = true. In this /// case a column has a NOT NULL record if nullCount < numRecords. This is further complicated by /// the default for tightBounds being true, so we have to check if it's EITHER `null` OR `true` -fn get_tight_not_null_expr(null_col: ColumnName) -> Expr { +fn get_tight_not_null_expr(null_col: Expr) -> Expr { Expr::and( - Expr::distinct(Expr::simple_column("tightBounds"), false), - Expr::lt(null_col.into(), Expr::simple_column("numRecords")), + Expr::distinct(simple_column!("tightBounds"), false), + Expr::lt(null_col, simple_column!("numRecords")), ) } @@ -48,10 +49,10 @@ fn get_tight_not_null_expr(null_col: ColumnName) -> Expr { /// this case, we can only check if the WHOLE column null, by checking if the nullCount == /// numRecords. So by inverting that check and seeing if nullCount != numRecords, we can check if /// there is a possibility of a NOT null -fn get_wide_not_null_expr(null_col: ColumnName) -> Expr { +fn get_wide_not_null_expr(null_col: Expr) -> Expr { Expr::and( - Expr::eq(Expr::simple_column("tightBounds"), false), - Expr::ne(Expr::simple_column("numRecords"), null_col), + Expr::eq(simple_column!("tightBounds"), false), + Expr::ne(simple_column!("numRecords"), null_col), ) } @@ -65,7 +66,7 @@ fn as_inverted_data_skipping_predicate(expr: &Expr) -> Option { // to check if a column could NOT have a null, we need two different checks, to see // if the bounds are tight and then to actually do the check if let Column(col) = expr.as_ref() { - let null_col = ColumnName::join("nullCount", col.clone()); + let null_col = joined_column!("nullCount", col.clone()); Some(Expr::or( get_tight_not_null_expr(null_col.clone()), get_wide_not_null_expr(null_col), @@ -117,8 +118,8 @@ fn as_data_skipping_predicate(expr: &Expr) -> Option { _ => return None, // unsupported combination of operands }; let stats_col = match op { - LessThan | LessThanOrEqual => "minValues", - GreaterThan | GreaterThanOrEqual => "maxValues", + LessThan | LessThanOrEqual => simple_column_name!("minValues"), + GreaterThan | GreaterThanOrEqual => simple_column_name!("maxValues"), Equal => { return as_data_skipping_predicate(&Expr::and( Expr::le(Column(col.clone()), Literal(val.clone())), @@ -127,14 +128,8 @@ fn as_data_skipping_predicate(expr: &Expr) -> Option { } NotEqual => { return Some(Expr::or( - Expr::gt( - ColumnName::join("minValues", col.clone()).into(), - val.clone(), - ), - Expr::lt( - ColumnName::join("maxValues", col.clone()).into(), - val.clone(), - ), + Expr::gt(joined_column!("minValues", col.clone()), val.clone()), + Expr::lt(joined_column!("maxValues", col.clone()), val.clone()), )); } _ => return None, // unsupported operation @@ -148,7 +143,7 @@ fn as_data_skipping_predicate(expr: &Expr) -> Option { // to check if a column could have a null, we need two different checks, to see if // the bounds are tight and then to actually do the check if let Column(col) = expr.as_ref() { - let null_col = ColumnName::join("nullCount", col.clone()); + let null_col = joined_column!("nullCount", col.clone()); Some(Expr::or( get_tight_null_expr(null_col.clone()), get_wide_null_expr(null_col), @@ -191,9 +186,9 @@ impl DataSkippingFilter { static PREDICATE_SCHEMA: LazyLock = LazyLock::new(|| { DataType::struct_type([StructField::new("predicate", DataType::BOOLEAN, true)]) }); - static STATS_EXPR: LazyLock = LazyLock::new(|| Expr::split_column("add.stats")); + static STATS_EXPR: LazyLock = LazyLock::new(|| nested_column!("add.stats")); static FILTER_EXPR: LazyLock = - LazyLock::new(|| Expr::simple_column("predicate").distinct(false)); + LazyLock::new(|| simple_column!("predicate").distinct(false)); let predicate = predicate.as_deref()?; debug!("Creating a data skipping filter for {}", &predicate); @@ -314,10 +309,10 @@ mod tests { #[test] fn test_rewrite_basic_comparison() { - let column = Expr::simple_column("a"); + let column = simple_column!("a"); let lit_int = Expr::literal(1_i32); - let min_col = Expr::split_column("minValues.a"); - let max_col = Expr::split_column("maxValues.a"); + let min_col = nested_column!("minValues.a"); + let max_col = nested_column!("maxValues.a"); let cases = [ ( diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 46ceb8fbd..c5c608dab 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -9,7 +9,7 @@ use super::ScanData; use crate::actions::{get_log_schema, ADD_NAME, REMOVE_NAME}; use crate::actions::{visitors::AddVisitor, visitors::RemoveVisitor, Add, Remove}; use crate::engine_data::{GetData, TypedGetData}; -use crate::expressions::{Expression, ExpressionRef}; +use crate::expressions::{nested_column, Expression, ExpressionRef}; use crate::schema::{DataType, MapType, SchemaRef, StructField, StructType}; use crate::{DataVisitor, DeltaResult, Engine, EngineData, ExpressionHandler}; @@ -124,12 +124,12 @@ impl LogReplayScanner { fn get_add_transform_expr(&self) -> Expression { Expression::Struct(vec![ - Expression::split_column("add.path"), - Expression::split_column("add.size"), - Expression::split_column("add.modificationTime"), - Expression::split_column("add.stats"), - Expression::split_column("add.deletionVector"), - Expression::Struct(vec![Expression::split_column("add.partitionValues")]), + nested_column!("add.path"), + nested_column!("add.size"), + nested_column!("add.modificationTime"), + nested_column!("add.stats"), + nested_column!("add.deletionVector"), + Expression::Struct(vec![nested_column!("add.partitionValues")]), ]) } diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs index 0abaee97b..a8341c656 100644 --- a/kernel/src/scan/mod.rs +++ b/kernel/src/scan/mod.rs @@ -420,7 +420,8 @@ fn get_state_info( let physical_field = logical_field.make_physical(column_mapping_mode)?; let physical_name = physical_field.name.clone(); read_fields.push(physical_field); - Ok(ColumnType::Selected(ColumnName::simple(physical_name))) + // TODO: Support nested columns! + Ok(ColumnType::Selected(ColumnName::new([physical_name]))) } }) .try_collect()?; @@ -613,6 +614,7 @@ mod tests { use std::path::PathBuf; use crate::engine::sync::SyncEngine; + use crate::expressions::nested_column; use crate::schema::PrimitiveType; use crate::Table; @@ -756,7 +758,7 @@ mod tests { assert_eq!(data.len(), 1); // Ineffective predicate pushdown attempted, so the one data file should be returned. - let int_col = Expression::split_column("numeric.ints.int32"); + let int_col = nested_column!("numeric.ints.int32"); let value = Expression::literal(1000i32); let predicate = Arc::new(int_col.clone().gt(value.clone())); let scan = snapshot diff --git a/kernel/src/snapshot.rs b/kernel/src/snapshot.rs index a17efde3e..722c8a7c4 100644 --- a/kernel/src/snapshot.rs +++ b/kernel/src/snapshot.rs @@ -11,6 +11,7 @@ use tracing::{debug, warn}; use url::Url; use crate::actions::{get_log_schema, Metadata, Protocol, METADATA_NAME, PROTOCOL_NAME}; +use crate::expressions::nested_column; use crate::features::{ColumnMappingMode, COLUMN_MAPPING_MODE_KEY}; use crate::path::ParsedLogPath; use crate::scan::ScanBuilder; @@ -111,8 +112,8 @@ impl LogSegment { use Expression as Expr; static META_PREDICATE: LazyLock> = LazyLock::new(|| { Some(Arc::new(Expr::or( - Expr::split_column("metaData.id").is_not_null(), - Expr::split_column("protocol.minReaderVersion").is_not_null(), + nested_column!("metaData.id").is_not_null(), + nested_column!("protocol.minReaderVersion").is_not_null(), ))) }); // read the same protocol and metadata schema for both commits and checkpoints diff --git a/kernel/tests/read.rs b/kernel/tests/read.rs index eef599e15..c017f4caf 100644 --- a/kernel/tests/read.rs +++ b/kernel/tests/read.rs @@ -13,7 +13,7 @@ use delta_kernel::actions::deletion_vector::split_vector; use delta_kernel::engine::arrow_data::ArrowEngineData; use delta_kernel::engine::default::executor::tokio::TokioBackgroundExecutor; use delta_kernel::engine::default::DefaultEngine; -use delta_kernel::expressions::{BinaryOperator, Expression}; +use delta_kernel::expressions::{simple_column, BinaryOperator, Expression}; use delta_kernel::scan::state::{visit_scan_files, DvInfo, Stats}; use delta_kernel::scan::{transform_to_logical, Scan}; use delta_kernel::schema::Schema; @@ -339,7 +339,7 @@ async fn stats() -> Result<(), Box> { (NotEqual, 8, vec![&batch2, &batch1]), ]; for (op, value, expected_batches) in test_cases { - let predicate = Expression::binary(op, Expression::simple_column("id"), value); + let predicate = Expression::binary(op, simple_column!("id"), value); let scan = snapshot .clone() .scan_builder() @@ -655,27 +655,27 @@ fn table_for_numbers(nums: Vec) -> Vec { fn predicate_on_number() -> Result<(), Box> { let cases = vec![ ( - Expression::simple_column("number").lt(4i64), + simple_column!("number").lt(4i64), table_for_numbers(vec![1, 2, 3]), ), ( - Expression::simple_column("number").le(4i64), + simple_column!("number").le(4i64), table_for_numbers(vec![1, 2, 3, 4]), ), ( - Expression::simple_column("number").gt(4i64), + simple_column!("number").gt(4i64), table_for_numbers(vec![5, 6]), ), ( - Expression::simple_column("number").ge(4i64), + simple_column!("number").ge(4i64), table_for_numbers(vec![4, 5, 6]), ), ( - Expression::simple_column("number").eq(4i64), + simple_column!("number").eq(4i64), table_for_numbers(vec![4]), ), ( - Expression::simple_column("number").ne(4i64), + simple_column!("number").ne(4i64), table_for_numbers(vec![1, 2, 3, 5, 6]), ), ]; @@ -695,27 +695,27 @@ fn predicate_on_number() -> Result<(), Box> { fn predicate_on_number_not() -> Result<(), Box> { let cases = vec![ ( - Expression::not(Expression::simple_column("number").lt(4i64)), + Expression::not(simple_column!("number").lt(4i64)), table_for_numbers(vec![4, 5, 6]), ), ( - Expression::not(Expression::simple_column("number").le(4i64)), + Expression::not(simple_column!("number").le(4i64)), table_for_numbers(vec![5, 6]), ), ( - Expression::not(Expression::simple_column("number").gt(4i64)), + Expression::not(simple_column!("number").gt(4i64)), table_for_numbers(vec![1, 2, 3, 4]), ), ( - Expression::not(Expression::simple_column("number").ge(4i64)), + Expression::not(simple_column!("number").ge(4i64)), table_for_numbers(vec![1, 2, 3]), ), ( - Expression::not(Expression::simple_column("number").eq(4i64)), + Expression::not(simple_column!("number").eq(4i64)), table_for_numbers(vec![1, 2, 3, 5, 6]), ), ( - Expression::not(Expression::simple_column("number").ne(4i64)), + Expression::not(simple_column!("number").ne(4i64)), table_for_numbers(vec![4]), ), ]; @@ -744,8 +744,8 @@ fn predicate_on_number_with_not_null() -> Result<(), Box> "./tests/data/basic_partitioned", Some(&["a_float", "number"]), Some(Expression::and( - Expression::simple_column("number").is_not_null(), - Expression::simple_column("number").lt(Expression::literal(3i64)), + simple_column!("number").is_not_null(), + simple_column!("number").lt(Expression::literal(3i64)), )), expected, )?; @@ -758,7 +758,7 @@ fn predicate_null() -> Result<(), Box> { read_table_data_str( "./tests/data/basic_partitioned", Some(&["a_float", "number"]), - Some(Expression::simple_column("number").is_null()), + Some(simple_column!("number").is_null()), expected, )?; Ok(()) @@ -785,7 +785,7 @@ fn mixed_null() -> Result<(), Box> { read_table_data_str( "./tests/data/mixed-nulls", Some(&["part", "n"]), - Some(Expression::simple_column("n").is_null()), + Some(simple_column!("n").is_null()), expected, )?; Ok(()) @@ -812,7 +812,7 @@ fn mixed_not_null() -> Result<(), Box> { read_table_data_str( "./tests/data/mixed-nulls", Some(&["part", "n"]), - Some(Expression::simple_column("n").is_not_null()), + Some(simple_column!("n").is_not_null()), expected, )?; Ok(()) @@ -822,31 +822,27 @@ fn mixed_not_null() -> Result<(), Box> { fn and_or_predicates() -> Result<(), Box> { let cases = vec![ ( - Expression::simple_column("number") + simple_column!("number") .gt(4i64) - .and(Expression::simple_column("a_float").gt(5.5)), + .and(simple_column!("a_float").gt(5.5)), table_for_numbers(vec![6]), ), ( - Expression::simple_column("number") + simple_column!("number") .gt(4i64) - .and(Expression::not( - Expression::simple_column("a_float").gt(5.5), - )), + .and(Expression::not(simple_column!("a_float").gt(5.5))), table_for_numbers(vec![5]), ), ( - Expression::simple_column("number") + simple_column!("number") .gt(4i64) - .or(Expression::simple_column("a_float").gt(5.5)), + .or(simple_column!("a_float").gt(5.5)), table_for_numbers(vec![5, 6]), ), ( - Expression::simple_column("number") + simple_column!("number") .gt(4i64) - .or(Expression::not( - Expression::simple_column("a_float").gt(5.5), - )), + .or(Expression::not(simple_column!("a_float").gt(5.5))), table_for_numbers(vec![1, 2, 3, 4, 5, 6]), ), ]; @@ -866,37 +862,33 @@ fn not_and_or_predicates() -> Result<(), Box> { let cases = vec![ ( Expression::not( - Expression::simple_column("number") + simple_column!("number") .gt(4i64) - .and(Expression::simple_column("a_float").gt(5.5)), + .and(simple_column!("a_float").gt(5.5)), ), table_for_numbers(vec![1, 2, 3, 4, 5]), ), ( Expression::not( - Expression::simple_column("number") + simple_column!("number") .gt(4i64) - .and(Expression::not( - Expression::simple_column("a_float").gt(5.5), - )), + .and(Expression::not(simple_column!("a_float").gt(5.5))), ), table_for_numbers(vec![1, 2, 3, 4, 6]), ), ( Expression::not( - Expression::simple_column("number") + simple_column!("number") .gt(4i64) - .or(Expression::simple_column("a_float").gt(5.5)), + .or(simple_column!("a_float").gt(5.5)), ), table_for_numbers(vec![1, 2, 3, 4]), ), ( Expression::not( - Expression::simple_column("number") + simple_column!("number") .gt(4i64) - .or(Expression::not( - Expression::simple_column("a_float").gt(5.5), - )), + .or(Expression::not(simple_column!("a_float").gt(5.5))), ), vec![], ), @@ -921,25 +913,23 @@ fn invalid_skips_none_predicates() -> Result<(), Box> { table_for_numbers(vec![1, 2, 3, 4, 5, 6]), ), ( - Expression::simple_column("number").distinct(3i64), + simple_column!("number").distinct(3i64), table_for_numbers(vec![1, 2, 3, 4, 5, 6]), ), ( - Expression::simple_column("number").gt(empty_struct.clone()), + simple_column!("number").gt(empty_struct.clone()), table_for_numbers(vec![1, 2, 3, 4, 5, 6]), ), ( - Expression::simple_column("number").and(empty_struct.clone().is_null()), + simple_column!("number").and(empty_struct.clone().is_null()), table_for_numbers(vec![1, 2, 3, 4, 5, 6]), ), ( - Expression::not(Expression::simple_column("number").gt(empty_struct.clone())), + Expression::not(simple_column!("number").gt(empty_struct.clone())), table_for_numbers(vec![1, 2, 3, 4, 5, 6]), ), ( - Expression::not( - Expression::simple_column("number").and(empty_struct.clone().is_null()), - ), + Expression::not(simple_column!("number").and(empty_struct.clone().is_null())), table_for_numbers(vec![1, 2, 3, 4, 5, 6]), ), ]; @@ -973,7 +963,7 @@ fn with_predicate_and_removes() -> Result<(), Box> { read_table_data_str( "./tests/data/table-with-dv-small/", None, - Some(Expression::gt(Expression::simple_column("value"), 3)), + Some(Expression::gt(simple_column!("value"), 3)), expected, )?; Ok(()) From 5629e18fc2c5fbceea286935affbe598ee01121a Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Wed, 23 Oct 2024 08:32:00 -0700 Subject: [PATCH 5/6] join takes references now --- kernel/src/expressions/column_name.rs | 35 +++++++++++++++++---------- kernel/src/scan/data_skipping.rs | 13 +++++----- 2 files changed, 28 insertions(+), 20 deletions(-) diff --git a/kernel/src/expressions/column_name.rs b/kernel/src/expressions/column_name.rs index e0f1f718e..14033d502 100644 --- a/kernel/src/expressions/column_name.rs +++ b/kernel/src/expressions/column_name.rs @@ -18,9 +18,21 @@ impl ColumnName { Self { path } } - /// Joins two column names, concatenating their fields into a single nested column path. - pub fn join(left: ColumnName, right: ColumnName) -> ColumnName { - [left, right].into_iter().collect() + /// Joins this column with another, concatenating their fields into a single nested column path. + /// + /// NOTE: This is a convenience method that copies two arguments without consuming them. If more + /// arguments are needed, or if performance is a concern, it is recommended to use + /// [`FromIterator for ColumnName`](#impl-FromIterator-for-ColumnName) instead: + /// + /// ``` + /// # use delta_kernel::expressions::ColumnName; + /// let x = ColumnName::new(["a", "b"]); + /// let y = ColumnName::new(["c", "d"]); + /// let joined: ColumnName = [x, y].into_iter().collect(); + /// assert_eq!(joined, ColumnName::new(["a", "b", "c", "d"])); + /// ``` + pub fn join(&self, right: &ColumnName) -> ColumnName { + [self.clone(), right.clone()].into_iter().collect() } /// The path of field names for this column name @@ -169,16 +181,13 @@ pub use __nested_column_name as nested_column_name; #[doc(hidden)] macro_rules! __joined_column_name { ( $left:literal, $right:literal ) => { - $crate::expressions::ColumnName::join( - $crate::__simple_column_name!($left), - $crate::__simple_column_name!($right), - ) + $crate::__simple_column_name!($left).join(&$crate::__simple_column_name!($right)) }; ( $left:literal, $right:expr ) => { - $crate::expressions::ColumnName::join($crate::__simple_column_name!($left), $right) + $crate::__simple_column_name!($left).join(&$right) }; ( $left:expr, $right:literal) => { - $crate::expressions::ColumnName::join($left, $crate::__simple_column_name!($right)) + $left.join(&$crate::__simple_column_name!($right)) }; ( $($other:tt)* ) => { compile_error!("joined_column_name!() requires at least one string literal input") @@ -260,20 +269,20 @@ mod test { assert_eq!(joined_column_name!("a", "b"), ColumnName::new(["a", "b"])); assert_eq!( - joined_column_name!(simple.clone(), "b"), + joined_column_name!(simple, "b"), ColumnName::new(["x", "b"]) ); assert_eq!( - joined_column_name!(nested.clone(), "b"), + joined_column_name!(nested, "b"), ColumnName::new(["x.y", "b"]) ); assert_eq!( - joined_column_name!("a", simple.clone()), + joined_column_name!("a", &simple), ColumnName::new(["a", "x"]) ); assert_eq!( - joined_column_name!("a", nested.clone()), + joined_column_name!("a", &nested), ColumnName::new(["a", "x.y"]) ); diff --git a/kernel/src/scan/data_skipping.rs b/kernel/src/scan/data_skipping.rs index 1c250700a..4fa3c2d08 100644 --- a/kernel/src/scan/data_skipping.rs +++ b/kernel/src/scan/data_skipping.rs @@ -8,7 +8,7 @@ use crate::actions::visitors::SelectionVectorVisitor; use crate::actions::{get_log_schema, ADD_NAME}; use crate::error::DeltaResult; use crate::expressions::{ - joined_column, nested_column, simple_column, simple_column_name, BinaryOperator, ColumnName, + joined_column, nested_column, simple_column, simple_column_name, BinaryOperator, Expression as Expr, ExpressionRef, UnaryOperator, VariadicOperator, }; use crate::schema::{DataType, PrimitiveType, SchemaRef, SchemaTransform, StructField, StructType}; @@ -66,7 +66,7 @@ fn as_inverted_data_skipping_predicate(expr: &Expr) -> Option { // to check if a column could NOT have a null, we need two different checks, to see // if the bounds are tight and then to actually do the check if let Column(col) = expr.as_ref() { - let null_col = joined_column!("nullCount", col.clone()); + let null_col = joined_column!("nullCount", col); Some(Expr::or( get_tight_not_null_expr(null_col.clone()), get_wide_not_null_expr(null_col), @@ -128,14 +128,13 @@ fn as_data_skipping_predicate(expr: &Expr) -> Option { } NotEqual => { return Some(Expr::or( - Expr::gt(joined_column!("minValues", col.clone()), val.clone()), - Expr::lt(joined_column!("maxValues", col.clone()), val.clone()), + Expr::gt(joined_column!("minValues", col), val.clone()), + Expr::lt(joined_column!("maxValues", col), val.clone()), )); } _ => return None, // unsupported operation }; - let col = ColumnName::join(stats_col, col.clone()); - Some(Expr::binary(op, col, val.clone())) + Some(Expr::binary(op, stats_col.join(col), val.clone())) } // push down Not by inverting everything below it UnaryOperation { op: Not, expr } => as_inverted_data_skipping_predicate(expr), @@ -143,7 +142,7 @@ fn as_data_skipping_predicate(expr: &Expr) -> Option { // to check if a column could have a null, we need two different checks, to see if // the bounds are tight and then to actually do the check if let Column(col) = expr.as_ref() { - let null_col = joined_column!("nullCount", col.clone()); + let null_col = joined_column!("nullCount", col); Some(Expr::or( get_tight_null_expr(null_col.clone()), get_wide_null_expr(null_col), From 74f5f7fa50408dfd22d8dcce848338d7b0c4e459 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Wed, 23 Oct 2024 10:44:29 -0700 Subject: [PATCH 6/6] merge simple and nested column name concept into just column name --- derive-macros/src/lib.rs | 37 ++--- kernel/src/actions/set_transaction.rs | 4 +- kernel/src/engine/arrow_expression.rs | 30 ++-- .../parquet_row_group_skipping/tests.rs | 32 ++-- .../engine/parquet_stats_skipping/tests.rs | 12 +- .../{column_name.rs => column_names.rs} | 141 ++++++------------ kernel/src/expressions/mod.rs | 11 +- kernel/src/expressions/scalars.rs | 4 +- kernel/src/scan/data_skipping.rs | 40 ++--- kernel/src/scan/log_replay.rs | 14 +- kernel/src/scan/mod.rs | 4 +- kernel/src/snapshot.rs | 6 +- kernel/tests/read.rs | 85 +++++------ 13 files changed, 173 insertions(+), 247 deletions(-) rename kernel/src/expressions/{column_name.rs => column_names.rs} (58%) diff --git a/derive-macros/src/lib.rs b/derive-macros/src/lib.rs index 5c16fd145..3b2e35aa4 100644 --- a/derive-macros/src/lib.rs +++ b/derive-macros/src/lib.rs @@ -5,16 +5,19 @@ use syn::{ parse_macro_input, Data, DataStruct, DeriveInput, Error, Fields, Meta, PathArguments, Type, }; -fn parse_column_name( - input: proc_macro::TokenStream, - allow_dots: bool, - transform: impl FnOnce(syn::LitStr) -> TokenStream, -) -> proc_macro::TokenStream { - let is_valid = |c: char| c.is_ascii_alphanumeric() || c == '_' || (allow_dots && c == '.'); +/// Parses a dot-delimited column name into an array of field names. See +/// [`delta_kernel::expressions::column_name::column_name`] macro for details. +#[proc_macro] +pub fn parse_column_name(input: proc_macro::TokenStream) -> proc_macro::TokenStream { + let is_valid = |c: char| c.is_ascii_alphanumeric() || c == '_' || c == '.'; let err = match syn::parse(input) { Ok(syn::Lit::Str(name)) => match name.value().chars().find(|c| !is_valid(*c)) { - None => return transform(name).into(), Some(bad_char) => Error::new(name.span(), format!("Invalid character: {bad_char:?}")), + _ => { + let path = name.value(); + let path = path.split('.').map(proc_macro2::Literal::string); + return quote_spanned! { name.span() => [#(#path),*] }.into(); + } }, Ok(lit) => Error::new(lit.span(), "Expected a string literal"), Err(err) => err, @@ -22,26 +25,6 @@ fn parse_column_name( err.into_compile_error().into() } -/// Parses a simple column name into a single-element array of field names. See -/// [`delta_kernel::expressions::column_name::simple_column_name`] for details. -#[proc_macro] -pub fn parse_simple_column_name(input: proc_macro::TokenStream) -> proc_macro::TokenStream { - parse_column_name(input, false, |name| { - quote_spanned! { name.span() => [#name] } - }) -} - -/// Parses a nested column name into an array of simple field names. See -/// [`delta_kernel::expressions::column_name::nested_column_name`] for details. -#[proc_macro] -pub fn parse_nested_column_name(input: proc_macro::TokenStream) -> proc_macro::TokenStream { - parse_column_name(input, true, |name| { - let path = name.value(); - let path = path.split('.').map(proc_macro2::Literal::string); - quote_spanned! { name.span() => [#(#path),*] } - }) -} - /// Derive a `delta_kernel::schemas::ToDataType` implementation for the annotated struct. The actual /// field names in the schema (and therefore of the struct members) are all mandated by the Delta /// spec, and so the user of this macro is responsible for ensuring that diff --git a/kernel/src/actions/set_transaction.rs b/kernel/src/actions/set_transaction.rs index c2134ee43..5cfae8863 100644 --- a/kernel/src/actions/set_transaction.rs +++ b/kernel/src/actions/set_transaction.rs @@ -2,7 +2,7 @@ use std::sync::{Arc, LazyLock}; use crate::actions::visitors::SetTransactionVisitor; use crate::actions::{get_log_schema, SetTransaction, SET_TRANSACTION_NAME}; -use crate::expressions::nested_column; +use crate::expressions::column_expr; use crate::snapshot::Snapshot; use crate::{DeltaResult, Engine, EngineData, ExpressionRef, SchemaRef}; @@ -54,7 +54,7 @@ impl SetTransactionScanner { // point filtering by a particular app id, even if we have one, because app ids are all in // the a single checkpoint part having large min/max range (because they're usually uuids). static META_PREDICATE: LazyLock> = - LazyLock::new(|| Some(Arc::new(nested_column!("txn.appId").is_not_null()))); + LazyLock::new(|| Some(Arc::new(column_expr!("txn.appId").is_not_null()))); self.snapshot .log_segment .replay(engine, schema.clone(), schema, META_PREDICATE.clone()) diff --git a/kernel/src/engine/arrow_expression.rs b/kernel/src/engine/arrow_expression.rs index 7d2d36132..cb9138299 100644 --- a/kernel/src/engine/arrow_expression.rs +++ b/kernel/src/engine/arrow_expression.rs @@ -589,9 +589,9 @@ mod tests { let array = ListArray::new(field.clone(), offsets, Arc::new(values), None); let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(array.clone())]).unwrap(); - let not_op = Expression::binary(BinaryOperator::NotIn, 5, simple_column!("item")); + let not_op = Expression::binary(BinaryOperator::NotIn, 5, column_expr!("item")); - let in_op = Expression::binary(BinaryOperator::In, 5, simple_column!("item")); + let in_op = Expression::binary(BinaryOperator::In, 5, column_expr!("item")); let result = evaluate_expression(¬_op, &batch, None).unwrap(); let expected = BooleanArray::from(vec![true, false, true]); @@ -609,7 +609,7 @@ mod tests { let schema = Schema::new([field.clone()]); let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(values.clone())]).unwrap(); - let in_op = Expression::binary(BinaryOperator::NotIn, 5, simple_column!("item")); + let in_op = Expression::binary(BinaryOperator::NotIn, 5, column_expr!("item")); let in_result = evaluate_expression(&in_op, &batch, None); @@ -654,8 +654,8 @@ mod tests { let in_op = Expression::binary( BinaryOperator::NotIn, - simple_column!("item"), - simple_column!("item"), + column_expr!("item"), + column_expr!("item"), ); let in_result = evaluate_expression(&in_op, &batch, None); @@ -679,9 +679,9 @@ mod tests { let array = ListArray::new(field.clone(), offsets, Arc::new(values), None); let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(array.clone())]).unwrap(); - let str_not_op = Expression::binary(BinaryOperator::NotIn, "bye", simple_column!("item")); + let str_not_op = Expression::binary(BinaryOperator::NotIn, "bye", column_expr!("item")); - let str_in_op = Expression::binary(BinaryOperator::In, "hi", simple_column!("item")); + let str_in_op = Expression::binary(BinaryOperator::In, "hi", column_expr!("item")); let result = evaluate_expression(&str_in_op, &batch, None).unwrap(); let expected = BooleanArray::from(vec![true, true, true]); @@ -698,7 +698,7 @@ mod tests { let values = Int32Array::from(vec![1, 2, 3]); let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(values.clone())]).unwrap(); - let column = simple_column!("a"); + let column = column_expr!("a"); let results = evaluate_expression(&column, &batch, None).unwrap(); assert_eq!(results.as_ref(), &values); @@ -719,7 +719,7 @@ mod tests { vec![Arc::new(struct_array.clone())], ) .unwrap(); - let column = nested_column!("b.a"); + let column = column_expr!("b.a"); let results = evaluate_expression(&column, &batch, None).unwrap(); assert_eq!(results.as_ref(), &values); } @@ -729,7 +729,7 @@ mod tests { let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]); let values = Int32Array::from(vec![1, 2, 3]); let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(values)]).unwrap(); - let column = simple_column!("a"); + let column = column_expr!("a"); let expression = column.clone().add(1); let results = evaluate_expression(&expression, &batch, None).unwrap(); @@ -765,8 +765,8 @@ mod tests { vec![Arc::new(values.clone()), Arc::new(values)], ) .unwrap(); - let column_a = simple_column!("a"); - let column_b = simple_column!("b"); + let column_a = column_expr!("a"); + let column_b = column_expr!("b"); let expression = column_a.clone().add(column_b.clone()); let results = evaluate_expression(&expression, &batch, None).unwrap(); @@ -789,7 +789,7 @@ mod tests { let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]); let values = Int32Array::from(vec![1, 2, 3]); let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(values)]).unwrap(); - let column = simple_column!("a"); + let column = column_expr!("a"); let expression = column.clone().lt(2); let results = evaluate_expression(&expression, &batch, None).unwrap(); @@ -836,8 +836,8 @@ mod tests { ], ) .unwrap(); - let column_a = simple_column!("a"); - let column_b = simple_column!("b"); + let column_a = column_expr!("a"); + let column_b = column_expr!("b"); let expression = column_a.clone().and(column_b.clone()); let results = diff --git a/kernel/src/engine/parquet_row_group_skipping/tests.rs b/kernel/src/engine/parquet_row_group_skipping/tests.rs index 9c2e0e630..19bd2b5bf 100644 --- a/kernel/src/engine/parquet_row_group_skipping/tests.rs +++ b/kernel/src/engine/parquet_row_group_skipping/tests.rs @@ -1,5 +1,5 @@ use super::*; -use crate::expressions::{nested_column, simple_column}; +use crate::expressions::column_expr; use crate::Expression; use parquet::arrow::arrow_reader::ArrowReaderMetadata; use std::fs::File; @@ -40,21 +40,21 @@ fn test_get_stat_values() { // The expression doesn't matter -- it just needs to mention all the columns we care about. let columns = Expression::and_from(vec![ - nested_column!("varlen.utf8"), - nested_column!("numeric.ints.int64"), - nested_column!("numeric.ints.int32"), - nested_column!("numeric.ints.int16"), - nested_column!("numeric.ints.int8"), - nested_column!("numeric.floats.float32"), - nested_column!("numeric.floats.float64"), - simple_column!("bool"), - nested_column!("varlen.binary"), - nested_column!("numeric.decimals.decimal32"), - nested_column!("numeric.decimals.decimal64"), - nested_column!("numeric.decimals.decimal128"), - nested_column!("chrono.date32"), - nested_column!("chrono.timestamp"), - nested_column!("chrono.timestamp_ntz"), + column_expr!("varlen.utf8"), + column_expr!("numeric.ints.int64"), + column_expr!("numeric.ints.int32"), + column_expr!("numeric.ints.int16"), + column_expr!("numeric.ints.int8"), + column_expr!("numeric.floats.float32"), + column_expr!("numeric.floats.float64"), + column_expr!("bool"), + column_expr!("varlen.binary"), + column_expr!("numeric.decimals.decimal32"), + column_expr!("numeric.decimals.decimal64"), + column_expr!("numeric.decimals.decimal128"), + column_expr!("chrono.date32"), + column_expr!("chrono.timestamp"), + column_expr!("chrono.timestamp_ntz"), ]); let filter = RowGroupFilter::new(metadata.metadata().row_group(0), &columns); diff --git a/kernel/src/engine/parquet_stats_skipping/tests.rs b/kernel/src/engine/parquet_stats_skipping/tests.rs index 9aa8c9228..b4bdd97d3 100644 --- a/kernel/src/engine/parquet_stats_skipping/tests.rs +++ b/kernel/src/engine/parquet_stats_skipping/tests.rs @@ -1,5 +1,5 @@ use super::*; -use crate::expressions::{simple_column, ArrayData, StructData}; +use crate::expressions::{column_expr, ArrayData, StructData}; use crate::schema::ArrayType; use crate::DataType; @@ -337,7 +337,7 @@ fn test_binary_eq_ne() { const LO: Scalar = Scalar::Long(1); const MID: Scalar = Scalar::Long(10); const HI: Scalar = Scalar::Long(100); - let col = &simple_column!("x"); + let col = &column_expr!("x"); for inverted in [false, true] { // negative test -- mismatched column type @@ -485,7 +485,7 @@ fn test_binary_lt_ge() { const LO: Scalar = Scalar::Long(1); const MID: Scalar = Scalar::Long(10); const HI: Scalar = Scalar::Long(100); - let col = &simple_column!("x"); + let col = &column_expr!("x"); for inverted in [false, true] { expect_eq!( @@ -585,7 +585,7 @@ fn test_binary_le_gt() { const LO: Scalar = Scalar::Long(1); const MID: Scalar = Scalar::Long(10); const HI: Scalar = Scalar::Long(100); - let col = &simple_column!("x"); + let col = &column_expr!("x"); for inverted in [false, true] { // negative test -- mismatched column type @@ -736,7 +736,7 @@ impl ParquetStatsSkippingFilter for NullCountTestFilter { fn test_not_null() { use UnaryOperator::IsNull; - let col = &simple_column!("x"); + let col = &column_expr!("x"); for inverted in [false, true] { expect_eq!( NullCountTestFilter::new(None, 10).apply_unary(IsNull, col, inverted), @@ -809,7 +809,7 @@ impl ParquetStatsSkippingFilter for AllNullTestFilter { #[test] fn test_sql_where() { - let col = &simple_column!("x"); + let col = &column_expr!("x"); let val = &Expression::literal(1); const NULL: Expression = Expression::Literal(Scalar::Null(DataType::BOOLEAN)); const FALSE: Expression = Expression::Literal(Scalar::Boolean(false)); diff --git a/kernel/src/expressions/column_name.rs b/kernel/src/expressions/column_names.rs similarity index 58% rename from kernel/src/expressions/column_name.rs rename to kernel/src/expressions/column_names.rs index 14033d502..4129abb30 100644 --- a/kernel/src/expressions/column_name.rs +++ b/kernel/src/expressions/column_names.rs @@ -106,64 +106,48 @@ impl Hash for ColumnName { } } -/// Creates a simple (non-nested) column name. +/// Creates a nested column name whose field names are all simple column names (containing only +/// alphanumeric characters and underscores), delimited by dots. This macro is provided as a +/// convenience for the common case where the caller knows the column name contains only simple +/// field names and that splitting by periods is safe: /// -/// To avoid accidental misuse, the argument must be a string literal and must not contain any -/// dots. Thus, the following invocations would both fail to compile: +/// ``` +/// # use delta_kernel::expressions::{column_name, ColumnName}; +/// assert_eq!(column_name!("a.b.c"), ColumnName::new(["a", "b", "c"])); +/// ``` +/// +/// To avoid accidental misuse, the argument must be a string literal, so the compiler can validate +/// the safety conditions. Thus, the following uses would fail to compile: /// /// ```fail_compile -/// # use delta_kernel::expressions::simple_column_name; -/// let s = "a"; -/// let name = simple_column_name!(s); // not a string literal +/// # use delta_kernel::expressions::column_name; +/// let s = "a.b"; +/// let name = column_name!(s); // not a string literal /// ``` /// /// ```fail_compile /// # use delta_kernel::expressions::simple_column_name; -/// let name = simple_column_name!("a.b"); // contains dots +/// let name = simple_column_name!("a b"); // non-alphanumeric character /// ``` -/// -/// NOTE: `simple_column_name!("a.b.c")` produces a non-nested column whose name happens to contain -/// two period characters. To interpret the column name as nested with field names "a", "b" and "c", -/// use [`nested_column_name!`] instead. // NOTE: Macros are only public if exported, which defines them at the root of the crate. But we // don't want it there. So, we export a hidden macro and pub use it here where we actually want it. #[macro_export] #[doc(hidden)] -macro_rules! __simple_column_name { +macro_rules! __column_name { ( $($name:tt)* ) => { - $crate::expressions::ColumnName::new(delta_kernel_derive::parse_simple_column_name!($($name)*)) + $crate::expressions::ColumnName::new(delta_kernel_derive::parse_column_name!($($name)*)) }; } #[doc(inline)] -pub use __simple_column_name as simple_column_name; +pub use __column_name as column_name; -/// Creates a nested column name whose field names are all simple column names. This macro is -/// provided as a convenience for the common case where the caller knows the column contains only -/// simple field names and that splitting by periods is safe. -/// -/// To avoid accidental misuse, the argument must be a string literal. Thus, the following -/// invocation would fail to compile: +/// Joins two column names together, when one or both inputs might be literal strings representing +/// simple (non-nested) column names. For example: /// -/// ```fail_compile -/// # use delta_kernel::expressions::nested_column_name; -/// let s = "a.b"; -/// let name = nested_column_name!(s); /// ``` -/// -/// NOTE: `nested_column_name!("a.b.c")` produces a path with field names "a", "b", and "c". -#[macro_export] -#[doc(hidden)] -macro_rules! __nested_column_name { - ( $($name:tt)* ) => { - $crate::expressions::ColumnName::new(delta_kernel_derive::parse_nested_column_name!($($name)*)) - }; -} -#[doc(inline)] -pub use __nested_column_name as nested_column_name; - -/// Joins two column names together, when one or both inputs might be literal strings representing -/// simple (non-nested) column names. For example, the invocation `joined_column_name!("a", "b")` is -/// equivalent to `ColumnName::join(simple_column_name!("a"), simple_column_name!("b"))` +/// # use delta_kernel::expressions::{column_name, joined_column_name}; +/// assert_eq!(joined_column_name!("a.b", "c"), column_name!("a.b").join(&column_name!("c"))) +/// ``` /// /// To avoid accidental misuse, at least one argument must be a string literal. Thus, the following /// invocation would fail to compile: @@ -173,21 +157,17 @@ pub use __nested_column_name as nested_column_name; /// let s = "s"; /// let name = joined_column_name!(s, s); /// ``` -/// -/// NOTE: `joined_column_name!("a.b", "c")` produces a path with field names "a.b" and "c". If -/// either argument is a nested path delimited by periods, it should be split first and the result -/// passed to `joined_column_name!`, e.g. `joined_column_name!(nested_column_name!("a.b"), "c")`. #[macro_export] #[doc(hidden)] macro_rules! __joined_column_name { ( $left:literal, $right:literal ) => { - $crate::__simple_column_name!($left).join(&$crate::__simple_column_name!($right)) + $crate::__column_name!($left).join(&$crate::__column_name!($right)) }; ( $left:literal, $right:expr ) => { - $crate::__simple_column_name!($left).join(&$right) + $crate::__column_name!($left).join(&$right) }; ( $left:expr, $right:literal) => { - $left.join(&$crate::__simple_column_name!($right)) + $left.join(&$crate::__column_name!($right)) }; ( $($other:tt)* ) => { compile_error!("joined_column_name!() requires at least one string literal input") @@ -198,72 +178,46 @@ pub use __joined_column_name as joined_column_name; #[macro_export] #[doc(hidden)] -macro_rules! __simple_column_expr { +macro_rules! __column_expr { ( $($name:tt)* ) => { - $crate::expressions::Expression::from($crate::expressions::simple_column_name!($($name)*)) + $crate::expressions::Expression::from($crate::__column_name!($($name)*)) }; } #[doc(inline)] -pub use __simple_column_expr as simple_column; - -#[macro_export] -#[doc(hidden)] -macro_rules! __nested_column_expr { - ( $($name:tt)* ) => { - $crate::expressions::Expression::from($crate::expressions::nested_column_name!($($name)*)) - }; -} -#[doc(inline)] -pub use __nested_column_expr as nested_column; +pub use __column_expr as column_expr; #[macro_export] #[doc(hidden)] macro_rules! __joined_column_expr { ( $($name:tt)* ) => { - $crate::expressions::Expression::from($crate::expressions::joined_column_name!($($name)*)) + $crate::expressions::Expression::from($crate::__joined_column_name!($($name)*)) }; } #[doc(inline)] -pub use __joined_column_expr as joined_column; +pub use __joined_column_expr as joined_column_expr; #[cfg(test)] mod test { use super::*; - use delta_kernel_derive::{parse_nested_column_name, parse_simple_column_name}; + use delta_kernel_derive::parse_column_name; #[test] fn test_parse_column_name_macros() { - assert_eq!(parse_simple_column_name!("a"), ["a"]); + assert_eq!(parse_column_name!("a"), ["a"]); - assert_eq!(parse_nested_column_name!("a"), ["a"]); - assert_eq!(parse_nested_column_name!("a.b"), ["a", "b"]); - assert_eq!(parse_nested_column_name!("a.b.c"), ["a", "b", "c"]); + assert_eq!(parse_column_name!("a"), ["a"]); + assert_eq!(parse_column_name!("a.b"), ["a", "b"]); + assert_eq!(parse_column_name!("a.b.c"), ["a", "b", "c"]); } #[test] fn test_column_name_macros() { - let simple = simple_column_name!("x"); - let nested = nested_column_name!("x.y"); - - // fails to compile (argument is not a literal string, or contains illegal characters) - //simple_column_name!("a.b"); - //simple_column_name!("a b"); - //simple_column_name!(simple); - //simple_column_name!(simple.clone()); - - assert_eq!(simple_column_name!("a"), ColumnName::new(["a"])); + let simple = column_name!("x"); + let nested = column_name!("x.y"); - // fails to compile (argument is not a literal string, or contains illegal characters) - //nested_column_name!("a b.c d"); - //nested_column_name!(simple); - //nested_column_name!(simple.clone()); - - assert_eq!(nested_column_name!("a"), ColumnName::new(["a"])); // Silly, but legal. - assert_eq!(nested_column_name!("a.b"), ColumnName::new(["a", "b"])); - assert_eq!( - nested_column_name!("a.b.c"), - ColumnName::new(["a", "b", "c"]) - ); + assert_eq!(column_name!("a"), ColumnName::new(["a"])); + assert_eq!(column_name!("a.b"), ColumnName::new(["a", "b"])); + assert_eq!(column_name!("a.b.c"), ColumnName::new(["a", "b", "c"])); assert_eq!(joined_column_name!("a", "b"), ColumnName::new(["a", "b"])); assert_eq!(joined_column_name!("a", "b"), ColumnName::new(["a", "b"])); @@ -285,19 +239,12 @@ mod test { joined_column_name!("a", &nested), ColumnName::new(["a", "x.y"]) ); - - // fails to compile (none of the arguments is a literal, or one of the literals contains illegal characters) - //joined_column_name!(simple, simple); - //joined_column_name!(simple.clone(), simple.clone()); - //joined_column_name!("a.b", "c"); - //joined_column_name!("a", "b.c"); - //joined_column_name!("a.b", "c.d"); } #[test] fn test_column_name_methods() { - let simple = simple_column_name!("x"); - let nested = nested_column_name!("x.y"); + let simple = column_name!("x"); + let nested = column_name!("x.y"); // path() assert_eq!(simple.path(), "x"); @@ -317,6 +264,6 @@ mod test { // impl FromIterator let name: ColumnName = [nested, simple].into_iter().collect(); - assert_eq!(name, nested_column_name!("x.y.x")); + assert_eq!(name, column_name!("x.y.x")); } } diff --git a/kernel/src/expressions/mod.rs b/kernel/src/expressions/mod.rs index eea578ac7..54b90ff5f 100644 --- a/kernel/src/expressions/mod.rs +++ b/kernel/src/expressions/mod.rs @@ -5,14 +5,13 @@ use std::fmt::{Display, Formatter}; use itertools::Itertools; -pub use self::column_name::{ - joined_column, joined_column_name, nested_column, nested_column_name, simple_column, - simple_column_name, ColumnName, +pub use self::column_names::{ + column_expr, column_name, joined_column_expr, joined_column_name, ColumnName, }; pub use self::scalars::{ArrayData, Scalar, StructData}; use crate::DataType; -mod column_name; +mod column_names; mod scalars; #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] @@ -416,11 +415,11 @@ impl> std::ops::Div for Expression { #[cfg(test)] mod tests { - use super::{simple_column, Expression as Expr}; + use super::{column_expr, Expression as Expr}; #[test] fn test_expression_format() { - let col_ref = simple_column!("x"); + let col_ref = column_expr!("x"); let cases = [ (col_ref.clone(), "Column(x)"), (col_ref.clone().eq(2), "Column(x) = 2"), diff --git a/kernel/src/expressions/scalars.rs b/kernel/src/expressions/scalars.rs index e8f19984c..3578258e9 100644 --- a/kernel/src/expressions/scalars.rs +++ b/kernel/src/expressions/scalars.rs @@ -463,7 +463,7 @@ impl PrimitiveType { mod tests { use std::f32::consts::PI; - use crate::expressions::{simple_column, BinaryOperator}; + use crate::expressions::{column_expr, BinaryOperator}; use crate::Expression; use super::*; @@ -555,7 +555,7 @@ mod tests { elements: vec![Scalar::Integer(1), Scalar::Integer(2), Scalar::Integer(3)], }); - let column = simple_column!("item"); + let column = column_expr!("item"); let array_op = Expression::binary(BinaryOperator::In, 10, array.clone()); let array_not_op = Expression::binary(BinaryOperator::NotIn, 10, array); let column_op = Expression::binary(BinaryOperator::In, PI, column.clone()); diff --git a/kernel/src/scan/data_skipping.rs b/kernel/src/scan/data_skipping.rs index 187091bef..31aaaab15 100644 --- a/kernel/src/scan/data_skipping.rs +++ b/kernel/src/scan/data_skipping.rs @@ -8,8 +8,8 @@ use crate::actions::get_log_add_schema; use crate::actions::visitors::SelectionVectorVisitor; use crate::error::DeltaResult; use crate::expressions::{ - joined_column, nested_column, simple_column, simple_column_name, BinaryOperator, - Expression as Expr, ExpressionRef, UnaryOperator, VariadicOperator, + column_expr, column_name, joined_column_expr, BinaryOperator, Expression as Expr, + ExpressionRef, UnaryOperator, VariadicOperator, }; use crate::schema::{DataType, PrimitiveType, SchemaRef, SchemaTransform, StructField, StructType}; use crate::{Engine, EngineData, ExpressionEvaluator, JsonHandler}; @@ -19,7 +19,7 @@ use crate::{Engine, EngineData, ExpressionEvaluator, JsonHandler}; /// by the default for tightBounds being true, so we have to check if it's EITHER `null` OR `true` fn get_tight_null_expr(null_col: Expr) -> Expr { Expr::and( - Expr::distinct(simple_column!("tightBounds"), false), + Expr::distinct(column_expr!("tightBounds"), false), Expr::gt(null_col, 0i64), ) } @@ -30,8 +30,8 @@ fn get_tight_null_expr(null_col: Expr) -> Expr { /// doesn't help us) fn get_wide_null_expr(null_col: Expr) -> Expr { Expr::and( - Expr::eq(simple_column!("tightBounds"), false), - Expr::eq(simple_column!("numRecords"), null_col), + Expr::eq(column_expr!("tightBounds"), false), + Expr::eq(column_expr!("numRecords"), null_col), ) } @@ -40,8 +40,8 @@ fn get_wide_null_expr(null_col: Expr) -> Expr { /// the default for tightBounds being true, so we have to check if it's EITHER `null` OR `true` fn get_tight_not_null_expr(null_col: Expr) -> Expr { Expr::and( - Expr::distinct(simple_column!("tightBounds"), false), - Expr::lt(null_col, simple_column!("numRecords")), + Expr::distinct(column_expr!("tightBounds"), false), + Expr::lt(null_col, column_expr!("numRecords")), ) } @@ -51,8 +51,8 @@ fn get_tight_not_null_expr(null_col: Expr) -> Expr { /// there is a possibility of a NOT null fn get_wide_not_null_expr(null_col: Expr) -> Expr { Expr::and( - Expr::eq(simple_column!("tightBounds"), false), - Expr::ne(simple_column!("numRecords"), null_col), + Expr::eq(column_expr!("tightBounds"), false), + Expr::ne(column_expr!("numRecords"), null_col), ) } @@ -66,7 +66,7 @@ fn as_inverted_data_skipping_predicate(expr: &Expr) -> Option { // to check if a column could NOT have a null, we need two different checks, to see // if the bounds are tight and then to actually do the check if let Column(col) = expr.as_ref() { - let null_col = joined_column!("nullCount", col); + let null_col = joined_column_expr!("nullCount", col); Some(Expr::or( get_tight_not_null_expr(null_col.clone()), get_wide_not_null_expr(null_col), @@ -118,8 +118,8 @@ fn as_data_skipping_predicate(expr: &Expr) -> Option { _ => return None, // unsupported combination of operands }; let stats_col = match op { - LessThan | LessThanOrEqual => simple_column_name!("minValues"), - GreaterThan | GreaterThanOrEqual => simple_column_name!("maxValues"), + LessThan | LessThanOrEqual => column_name!("minValues"), + GreaterThan | GreaterThanOrEqual => column_name!("maxValues"), Equal => { return as_data_skipping_predicate(&Expr::and( Expr::le(Column(col.clone()), Literal(val.clone())), @@ -128,8 +128,8 @@ fn as_data_skipping_predicate(expr: &Expr) -> Option { } NotEqual => { return Some(Expr::or( - Expr::gt(joined_column!("minValues", col), val.clone()), - Expr::lt(joined_column!("maxValues", col), val.clone()), + Expr::gt(joined_column_expr!("minValues", col), val.clone()), + Expr::lt(joined_column_expr!("maxValues", col), val.clone()), )); } _ => return None, // unsupported operation @@ -142,7 +142,7 @@ fn as_data_skipping_predicate(expr: &Expr) -> Option { // to check if a column could have a null, we need two different checks, to see if // the bounds are tight and then to actually do the check if let Column(col) = expr.as_ref() { - let null_col = joined_column!("nullCount", col); + let null_col = joined_column_expr!("nullCount", col); Some(Expr::or( get_tight_null_expr(null_col.clone()), get_wide_null_expr(null_col), @@ -185,9 +185,9 @@ impl DataSkippingFilter { static PREDICATE_SCHEMA: LazyLock = LazyLock::new(|| { DataType::struct_type([StructField::new("predicate", DataType::BOOLEAN, true)]) }); - static STATS_EXPR: LazyLock = LazyLock::new(|| nested_column!("add.stats")); + static STATS_EXPR: LazyLock = LazyLock::new(|| column_expr!("add.stats")); static FILTER_EXPR: LazyLock = - LazyLock::new(|| simple_column!("predicate").distinct(false)); + LazyLock::new(|| column_expr!("predicate").distinct(false)); let predicate = predicate.as_deref()?; debug!("Creating a data skipping filter for {}", &predicate); @@ -308,10 +308,10 @@ mod tests { #[test] fn test_rewrite_basic_comparison() { - let column = simple_column!("a"); + let column = column_expr!("a"); let lit_int = Expr::literal(1_i32); - let min_col = nested_column!("minValues.a"); - let max_col = nested_column!("maxValues.a"); + let min_col = column_expr!("minValues.a"); + let max_col = column_expr!("maxValues.a"); let cases = [ ( diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 6957f17ae..f872a8eca 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -9,7 +9,7 @@ use super::ScanData; use crate::actions::{get_log_schema, ADD_NAME, REMOVE_NAME}; use crate::actions::{visitors::AddVisitor, visitors::RemoveVisitor, Add, Remove}; use crate::engine_data::{GetData, TypedGetData}; -use crate::expressions::{nested_column, Expression, ExpressionRef}; +use crate::expressions::{column_expr, Expression, ExpressionRef}; use crate::schema::{DataType, MapType, SchemaRef, StructField, StructType}; use crate::{DataVisitor, DeltaResult, Engine, EngineData, ExpressionHandler}; @@ -125,12 +125,12 @@ impl LogReplayScanner { fn get_add_transform_expr(&self) -> Expression { Expression::Struct(vec![ - nested_column!("add.path"), - nested_column!("add.size"), - nested_column!("add.modificationTime"), - nested_column!("add.stats"), - nested_column!("add.deletionVector"), - Expression::Struct(vec![nested_column!("add.partitionValues")]), + column_expr!("add.path"), + column_expr!("add.size"), + column_expr!("add.modificationTime"), + column_expr!("add.stats"), + column_expr!("add.deletionVector"), + Expression::Struct(vec![column_expr!("add.partitionValues")]), ]) } diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs index 36383998e..b5c01e0a4 100644 --- a/kernel/src/scan/mod.rs +++ b/kernel/src/scan/mod.rs @@ -615,7 +615,7 @@ mod tests { use std::path::PathBuf; use crate::engine::sync::SyncEngine; - use crate::expressions::nested_column; + use crate::expressions::column_expr; use crate::schema::PrimitiveType; use crate::Table; @@ -759,7 +759,7 @@ mod tests { assert_eq!(data.len(), 1); // Ineffective predicate pushdown attempted, so the one data file should be returned. - let int_col = nested_column!("numeric.ints.int32"); + let int_col = column_expr!("numeric.ints.int32"); let value = Expression::literal(1000i32); let predicate = Arc::new(int_col.clone().gt(value.clone())); let scan = snapshot diff --git a/kernel/src/snapshot.rs b/kernel/src/snapshot.rs index 722c8a7c4..f119571e2 100644 --- a/kernel/src/snapshot.rs +++ b/kernel/src/snapshot.rs @@ -11,7 +11,7 @@ use tracing::{debug, warn}; use url::Url; use crate::actions::{get_log_schema, Metadata, Protocol, METADATA_NAME, PROTOCOL_NAME}; -use crate::expressions::nested_column; +use crate::expressions::column_expr; use crate::features::{ColumnMappingMode, COLUMN_MAPPING_MODE_KEY}; use crate::path::ParsedLogPath; use crate::scan::ScanBuilder; @@ -112,8 +112,8 @@ impl LogSegment { use Expression as Expr; static META_PREDICATE: LazyLock> = LazyLock::new(|| { Some(Arc::new(Expr::or( - nested_column!("metaData.id").is_not_null(), - nested_column!("protocol.minReaderVersion").is_not_null(), + column_expr!("metaData.id").is_not_null(), + column_expr!("protocol.minReaderVersion").is_not_null(), ))) }); // read the same protocol and metadata schema for both commits and checkpoints diff --git a/kernel/tests/read.rs b/kernel/tests/read.rs index c017f4caf..92bf70314 100644 --- a/kernel/tests/read.rs +++ b/kernel/tests/read.rs @@ -13,7 +13,7 @@ use delta_kernel::actions::deletion_vector::split_vector; use delta_kernel::engine::arrow_data::ArrowEngineData; use delta_kernel::engine::default::executor::tokio::TokioBackgroundExecutor; use delta_kernel::engine::default::DefaultEngine; -use delta_kernel::expressions::{simple_column, BinaryOperator, Expression}; +use delta_kernel::expressions::{column_expr, BinaryOperator, Expression}; use delta_kernel::scan::state::{visit_scan_files, DvInfo, Stats}; use delta_kernel::scan::{transform_to_logical, Scan}; use delta_kernel::schema::Schema; @@ -339,7 +339,7 @@ async fn stats() -> Result<(), Box> { (NotEqual, 8, vec![&batch2, &batch1]), ]; for (op, value, expected_batches) in test_cases { - let predicate = Expression::binary(op, simple_column!("id"), value); + let predicate = Expression::binary(op, column_expr!("id"), value); let scan = snapshot .clone() .scan_builder() @@ -655,27 +655,24 @@ fn table_for_numbers(nums: Vec) -> Vec { fn predicate_on_number() -> Result<(), Box> { let cases = vec![ ( - simple_column!("number").lt(4i64), + column_expr!("number").lt(4i64), table_for_numbers(vec![1, 2, 3]), ), ( - simple_column!("number").le(4i64), + column_expr!("number").le(4i64), table_for_numbers(vec![1, 2, 3, 4]), ), ( - simple_column!("number").gt(4i64), + column_expr!("number").gt(4i64), table_for_numbers(vec![5, 6]), ), ( - simple_column!("number").ge(4i64), + column_expr!("number").ge(4i64), table_for_numbers(vec![4, 5, 6]), ), + (column_expr!("number").eq(4i64), table_for_numbers(vec![4])), ( - simple_column!("number").eq(4i64), - table_for_numbers(vec![4]), - ), - ( - simple_column!("number").ne(4i64), + column_expr!("number").ne(4i64), table_for_numbers(vec![1, 2, 3, 5, 6]), ), ]; @@ -695,27 +692,27 @@ fn predicate_on_number() -> Result<(), Box> { fn predicate_on_number_not() -> Result<(), Box> { let cases = vec![ ( - Expression::not(simple_column!("number").lt(4i64)), + Expression::not(column_expr!("number").lt(4i64)), table_for_numbers(vec![4, 5, 6]), ), ( - Expression::not(simple_column!("number").le(4i64)), + Expression::not(column_expr!("number").le(4i64)), table_for_numbers(vec![5, 6]), ), ( - Expression::not(simple_column!("number").gt(4i64)), + Expression::not(column_expr!("number").gt(4i64)), table_for_numbers(vec![1, 2, 3, 4]), ), ( - Expression::not(simple_column!("number").ge(4i64)), + Expression::not(column_expr!("number").ge(4i64)), table_for_numbers(vec![1, 2, 3]), ), ( - Expression::not(simple_column!("number").eq(4i64)), + Expression::not(column_expr!("number").eq(4i64)), table_for_numbers(vec![1, 2, 3, 5, 6]), ), ( - Expression::not(simple_column!("number").ne(4i64)), + Expression::not(column_expr!("number").ne(4i64)), table_for_numbers(vec![4]), ), ]; @@ -744,8 +741,8 @@ fn predicate_on_number_with_not_null() -> Result<(), Box> "./tests/data/basic_partitioned", Some(&["a_float", "number"]), Some(Expression::and( - simple_column!("number").is_not_null(), - simple_column!("number").lt(Expression::literal(3i64)), + column_expr!("number").is_not_null(), + column_expr!("number").lt(Expression::literal(3i64)), )), expected, )?; @@ -758,7 +755,7 @@ fn predicate_null() -> Result<(), Box> { read_table_data_str( "./tests/data/basic_partitioned", Some(&["a_float", "number"]), - Some(simple_column!("number").is_null()), + Some(column_expr!("number").is_null()), expected, )?; Ok(()) @@ -785,7 +782,7 @@ fn mixed_null() -> Result<(), Box> { read_table_data_str( "./tests/data/mixed-nulls", Some(&["part", "n"]), - Some(simple_column!("n").is_null()), + Some(column_expr!("n").is_null()), expected, )?; Ok(()) @@ -812,7 +809,7 @@ fn mixed_not_null() -> Result<(), Box> { read_table_data_str( "./tests/data/mixed-nulls", Some(&["part", "n"]), - Some(simple_column!("n").is_not_null()), + Some(column_expr!("n").is_not_null()), expected, )?; Ok(()) @@ -822,27 +819,27 @@ fn mixed_not_null() -> Result<(), Box> { fn and_or_predicates() -> Result<(), Box> { let cases = vec![ ( - simple_column!("number") + column_expr!("number") .gt(4i64) - .and(simple_column!("a_float").gt(5.5)), + .and(column_expr!("a_float").gt(5.5)), table_for_numbers(vec![6]), ), ( - simple_column!("number") + column_expr!("number") .gt(4i64) - .and(Expression::not(simple_column!("a_float").gt(5.5))), + .and(Expression::not(column_expr!("a_float").gt(5.5))), table_for_numbers(vec![5]), ), ( - simple_column!("number") + column_expr!("number") .gt(4i64) - .or(simple_column!("a_float").gt(5.5)), + .or(column_expr!("a_float").gt(5.5)), table_for_numbers(vec![5, 6]), ), ( - simple_column!("number") + column_expr!("number") .gt(4i64) - .or(Expression::not(simple_column!("a_float").gt(5.5))), + .or(Expression::not(column_expr!("a_float").gt(5.5))), table_for_numbers(vec![1, 2, 3, 4, 5, 6]), ), ]; @@ -862,33 +859,33 @@ fn not_and_or_predicates() -> Result<(), Box> { let cases = vec![ ( Expression::not( - simple_column!("number") + column_expr!("number") .gt(4i64) - .and(simple_column!("a_float").gt(5.5)), + .and(column_expr!("a_float").gt(5.5)), ), table_for_numbers(vec![1, 2, 3, 4, 5]), ), ( Expression::not( - simple_column!("number") + column_expr!("number") .gt(4i64) - .and(Expression::not(simple_column!("a_float").gt(5.5))), + .and(Expression::not(column_expr!("a_float").gt(5.5))), ), table_for_numbers(vec![1, 2, 3, 4, 6]), ), ( Expression::not( - simple_column!("number") + column_expr!("number") .gt(4i64) - .or(simple_column!("a_float").gt(5.5)), + .or(column_expr!("a_float").gt(5.5)), ), table_for_numbers(vec![1, 2, 3, 4]), ), ( Expression::not( - simple_column!("number") + column_expr!("number") .gt(4i64) - .or(Expression::not(simple_column!("a_float").gt(5.5))), + .or(Expression::not(column_expr!("a_float").gt(5.5))), ), vec![], ), @@ -913,23 +910,23 @@ fn invalid_skips_none_predicates() -> Result<(), Box> { table_for_numbers(vec![1, 2, 3, 4, 5, 6]), ), ( - simple_column!("number").distinct(3i64), + column_expr!("number").distinct(3i64), table_for_numbers(vec![1, 2, 3, 4, 5, 6]), ), ( - simple_column!("number").gt(empty_struct.clone()), + column_expr!("number").gt(empty_struct.clone()), table_for_numbers(vec![1, 2, 3, 4, 5, 6]), ), ( - simple_column!("number").and(empty_struct.clone().is_null()), + column_expr!("number").and(empty_struct.clone().is_null()), table_for_numbers(vec![1, 2, 3, 4, 5, 6]), ), ( - Expression::not(simple_column!("number").gt(empty_struct.clone())), + Expression::not(column_expr!("number").gt(empty_struct.clone())), table_for_numbers(vec![1, 2, 3, 4, 5, 6]), ), ( - Expression::not(simple_column!("number").and(empty_struct.clone().is_null())), + Expression::not(column_expr!("number").and(empty_struct.clone().is_null())), table_for_numbers(vec![1, 2, 3, 4, 5, 6]), ), ]; @@ -963,7 +960,7 @@ fn with_predicate_and_removes() -> Result<(), Box> { read_table_data_str( "./tests/data/table-with-dv-small/", None, - Some(Expression::gt(simple_column!("value"), 3)), + Some(Expression::gt(column_expr!("value"), 3)), expected, )?; Ok(())