diff --git a/datafusion/physical-expr/Cargo.toml b/datafusion/physical-expr/Cargo.toml
index 067617a697a9..8436b5279bd7 100644
--- a/datafusion/physical-expr/Cargo.toml
+++ b/datafusion/physical-expr/Cargo.toml
@@ -81,3 +81,7 @@ name = "in_list"
 [[bench]]
 harness = false
 name = "case_when"
+
+[[bench]]
+harness = false
+name = "is_null"
diff --git a/datafusion/physical-expr/benches/is_null.rs b/datafusion/physical-expr/benches/is_null.rs
new file mode 100644
index 000000000000..3dad8e9b456a
--- /dev/null
+++ b/datafusion/physical-expr/benches/is_null.rs
@@ -0,0 +1,95 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::datatypes::{Field, Schema};
+use arrow::record_batch::RecordBatch;
+use arrow_array::builder::Int32Builder;
+use arrow_schema::DataType;
+use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use datafusion_physical_expr::expressions::{IsNotNullExpr, IsNullExpr};
+use datafusion_physical_expr_common::expressions::column::Column;
+use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
+use std::sync::Arc;
+
+fn criterion_benchmark(c: &mut Criterion) {
+    // create input data: three 1000-row Int32 columns with different null patterns
+    let mut c1 = Int32Builder::new();
+    let mut c2 = Int32Builder::new();
+    let mut c3 = Int32Builder::new();
+    for i in 0..1000 {
+        // c1 is always null
+        c1.append_null();
+        // c2 is never null
+        c2.append_value(i);
+        // c3 is a mix of values and nulls
+        if i % 7 == 0 {
+            c3.append_null();
+        } else {
+            c3.append_value(i);
+        }
+    }
+    let c1 = Arc::new(c1.finish());
+    let c2 = Arc::new(c2.finish());
+    let c3 = Arc::new(c3.finish());
+    let schema = Schema::new(vec![
+        Field::new("c1", DataType::Int32, true),
+        Field::new("c2", DataType::Int32, false),
+        Field::new("c3", DataType::Int32, true),
+    ]);
+    let batch = RecordBatch::try_new(Arc::new(schema), vec![c1, c2, c3]).unwrap();
+
+    c.bench_function("is_null: column is all nulls", |b| {
+        let expr = is_null("c1", 0);
+        b.iter(|| black_box(expr.evaluate(black_box(&batch)).unwrap()))
+    });
+
+    c.bench_function("is_null: column is never null", |b| {
+        let expr = is_null("c2", 1);
+        b.iter(|| black_box(expr.evaluate(black_box(&batch)).unwrap()))
+    });
+
+    c.bench_function("is_null: column is mix of values and nulls", |b| {
+        let expr = is_null("c3", 2);
+        b.iter(|| black_box(expr.evaluate(black_box(&batch)).unwrap()))
+    });
+
+    c.bench_function("is_not_null: column is all nulls", |b| {
+        let expr = is_not_null("c1", 0);
+        b.iter(|| black_box(expr.evaluate(black_box(&batch)).unwrap()))
+    });
+
+    c.bench_function("is_not_null: column is never null", |b| {
+        let expr = is_not_null("c2", 1);
+        b.iter(|| black_box(expr.evaluate(black_box(&batch)).unwrap()))
+    });
+
+    c.bench_function("is_not_null: column is mix of values and nulls", |b| {
+        let expr = is_not_null("c3", 2);
+        b.iter(|| black_box(expr.evaluate(black_box(&batch)).unwrap()))
+    });
+}
+
+fn is_null(name: &str, index: usize) -> Arc<dyn PhysicalExpr> {
+    Arc::new(IsNullExpr::new(Arc::new(Column::new(name, index))))
+}
+
+fn is_not_null(name: &str, index: usize) -> Arc<dyn PhysicalExpr> {
+    Arc::new(IsNotNullExpr::new(Arc::new(Column::new(name, index))))
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/physical-expr/src/expressions/is_not_null.rs b/datafusion/physical-expr/src/expressions/is_not_null.rs
index 9f7438d13e05..58559352d44c 100644
--- a/datafusion/physical-expr/src/expressions/is_not_null.rs
+++ b/datafusion/physical-expr/src/expressions/is_not_null.rs
@@ -22,7 +22,6 @@ use std::{any::Any, sync::Arc};
 
 use crate::physical_expr::down_cast_any_ref;
 use crate::PhysicalExpr;
-use arrow::compute;
 use arrow::{
     datatypes::{DataType, Schema},
     record_batch::RecordBatch,
@@ -74,8 +73,7 @@ impl PhysicalExpr for IsNotNullExpr {
         let arg = self.arg.evaluate(batch)?;
         match arg {
             ColumnarValue::Array(array) => {
-                let is_null = super::is_null::compute_is_null(array)?;
-                let is_not_null = compute::not(&is_null)?;
+                let is_not_null = super::is_null::compute_is_not_null(array)?;
                 Ok(ColumnarValue::Array(Arc::new(is_not_null)))
             }
             ColumnarValue::Scalar(scalar) => Ok(ColumnarValue::Scalar(
diff --git a/datafusion/physical-expr/src/expressions/is_null.rs b/datafusion/physical-expr/src/expressions/is_null.rs
index e2dc941e26bc..3cdb49bcab42 100644
--- a/datafusion/physical-expr/src/expressions/is_null.rs
+++ b/datafusion/physical-expr/src/expressions/is_null.rs
@@ -117,6 +117,16 @@ pub(crate) fn compute_is_null(array: ArrayRef) -> Result<BooleanArray> {
     }
 }
 
+/// Workaround for an upstream `arrow::compute::is_not_null` issue affecting `UnionArray`;
+/// this can be replaced with a direct call to `arrow::compute::is_not_null` once it's fixed.
+pub(crate) fn compute_is_not_null(array: ArrayRef) -> Result<BooleanArray> {
+    if array.as_any().is::<UnionArray>() {
+        compute::not(&compute_is_null(array)?).map_err(Into::into)
+    } else {
+        compute::is_not_null(array.as_ref()).map_err(Into::into)
+    }
+}
+
 fn dense_union_is_null(
     union_array: &UnionArray,
     offsets: &ScalarBuffer<i32>,