Skip to content

Commit

Permalink
perf: Optimize IsNotNullExpr (#11586)
Browse files Browse the repository at this point in the history
* add criterion benchmarks for IsNullExpr and IsNotNullExpr

* Improve IsNotNull performance by calling the is_not_null kernel directly instead of calling is_null and then not

* fast path if input array is all nulls or no nulls

* revert experimental change

* remove unused import

* simplify PR
  • Loading branch information
andygrove authored Jul 24, 2024
1 parent bcf715c commit 20b298e
Show file tree
Hide file tree
Showing 4 changed files with 110 additions and 3 deletions.
4 changes: 4 additions & 0 deletions datafusion/physical-expr/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -81,3 +81,7 @@ name = "in_list"
[[bench]]
harness = false
name = "case_when"

[[bench]]
harness = false
name = "is_null"
95 changes: 95 additions & 0 deletions datafusion/physical-expr/benches/is_null.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use arrow::datatypes::{Field, Schema};
use arrow::record_batch::RecordBatch;
use arrow_array::builder::Int32Builder;
use arrow_schema::DataType;
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use datafusion_physical_expr::expressions::{IsNotNullExpr, IsNullExpr};
use datafusion_physical_expr_common::expressions::column::Column;
use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
use std::sync::Arc;

fn criterion_benchmark(c: &mut Criterion) {
// create input data
let mut c1 = Int32Builder::new();
let mut c2 = Int32Builder::new();
let mut c3 = Int32Builder::new();
for i in 0..1000 {
// c1 is always null
c1.append_null();
// c2 is never null
c2.append_value(i);
// c3 is a mix of values and nulls
if i % 7 == 0 {
c3.append_null();
} else {
c3.append_value(i);
}
}
let c1 = Arc::new(c1.finish());
let c2 = Arc::new(c2.finish());
let c3 = Arc::new(c3.finish());
let schema = Schema::new(vec![
Field::new("c1", DataType::Int32, true),
Field::new("c2", DataType::Int32, false),
Field::new("c3", DataType::Int32, true),
]);
let batch = RecordBatch::try_new(Arc::new(schema), vec![c1, c2, c3]).unwrap();

c.bench_function("is_null: column is all nulls", |b| {
let expr = is_null("c1", 0);
b.iter(|| black_box(expr.evaluate(black_box(&batch)).unwrap()))
});

c.bench_function("is_null: column is never null", |b| {
let expr = is_null("c2", 1);
b.iter(|| black_box(expr.evaluate(black_box(&batch)).unwrap()))
});

c.bench_function("is_null: column is mix of values and nulls", |b| {
let expr = is_null("c3", 2);
b.iter(|| black_box(expr.evaluate(black_box(&batch)).unwrap()))
});

c.bench_function("is_not_null: column is all nulls", |b| {
let expr = is_not_null("c1", 0);
b.iter(|| black_box(expr.evaluate(black_box(&batch)).unwrap()))
});

c.bench_function("is_not_null: column is never null", |b| {
let expr = is_not_null("c2", 1);
b.iter(|| black_box(expr.evaluate(black_box(&batch)).unwrap()))
});

c.bench_function("is_not_null: column is mix of values and nulls", |b| {
let expr = is_not_null("c3", 2);
b.iter(|| black_box(expr.evaluate(black_box(&batch)).unwrap()))
});
}

/// Builds an `IsNullExpr` whose argument is `Column::new(name, index)`,
/// returned as a shared `PhysicalExpr` trait object.
fn is_null(name: &str, index: usize) -> Arc<dyn PhysicalExpr> {
    let column: Arc<dyn PhysicalExpr> = Arc::new(Column::new(name, index));
    let expr = IsNullExpr::new(column);
    Arc::new(expr)
}

/// Builds an `IsNotNullExpr` whose argument is `Column::new(name, index)`,
/// returned as a shared `PhysicalExpr` trait object.
fn is_not_null(name: &str, index: usize) -> Arc<dyn PhysicalExpr> {
    let column: Arc<dyn PhysicalExpr> = Arc::new(Column::new(name, index));
    let expr = IsNotNullExpr::new(column);
    Arc::new(expr)
}

criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);
4 changes: 1 addition & 3 deletions datafusion/physical-expr/src/expressions/is_not_null.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ use std::{any::Any, sync::Arc};

use crate::physical_expr::down_cast_any_ref;
use crate::PhysicalExpr;
use arrow::compute;
use arrow::{
datatypes::{DataType, Schema},
record_batch::RecordBatch,
Expand Down Expand Up @@ -74,8 +73,7 @@ impl PhysicalExpr for IsNotNullExpr {
let arg = self.arg.evaluate(batch)?;
match arg {
ColumnarValue::Array(array) => {
let is_null = super::is_null::compute_is_null(array)?;
let is_not_null = compute::not(&is_null)?;
let is_not_null = super::is_null::compute_is_not_null(array)?;
Ok(ColumnarValue::Array(Arc::new(is_not_null)))
}
ColumnarValue::Scalar(scalar) => Ok(ColumnarValue::Scalar(
Expand Down
10 changes: 10 additions & 0 deletions datafusion/physical-expr/src/expressions/is_null.rs
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,16 @@ pub(crate) fn compute_is_null(array: ArrayRef) -> Result<BooleanArray> {
}
}

/// Workaround for <https://github.com/apache/arrow-rs/issues/6017>.
///
/// `UnionArray` inputs are evaluated by negating the result of
/// [`compute_is_null`]; every other array type is passed straight to the
/// `arrow::compute::is_not_null` kernel. Once the upstream issue is fixed,
/// this function can be replaced with a direct call to that kernel.
pub(crate) fn compute_is_not_null(array: ArrayRef) -> Result<BooleanArray> {
    let is_union = array.as_any().is::<UnionArray>();
    let mask = if is_union {
        let nulls = compute_is_null(array)?;
        compute::not(&nulls)
    } else {
        compute::is_not_null(array.as_ref())
    };
    // Both kernels report Arrow errors; convert them to this crate's error type.
    mask.map_err(Into::into)
}

fn dense_union_is_null(
union_array: &UnionArray,
offsets: &ScalarBuffer<i32>,
Expand Down

0 comments on commit 20b298e

Please sign in to comment.