Skip to content

Commit

Permalink
Improve regexp kernels performance by avoiding cloning Regex (#5235)
Browse files Browse the repository at this point in the history
* Improve regexp_match performance by avoiding cloning Regex

* For review
  • Loading branch information
viirya authored Dec 23, 2023
1 parent 859edc6 commit 72c9505
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 6 deletions.
10 changes: 4 additions & 6 deletions arrow-string/src/regexp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -81,15 +81,14 @@ pub fn regexp_is_match_utf8<OffsetSize: OffsetSizeTrait>(
(Some(value), Some(pattern)) => {
let existing_pattern = patterns.get(&pattern);
let re = match existing_pattern {
Some(re) => re.clone(),
Some(re) => re,
None => {
let re = Regex::new(pattern.as_str()).map_err(|e| {
ArrowError::ComputeError(format!(
"Regular expression did not compile: {e:?}"
))
})?;
patterns.insert(pattern, re.clone());
re
patterns.entry(pattern).or_insert(re)
}
};
result.append(re.is_match(value));
Expand Down Expand Up @@ -216,15 +215,14 @@ pub fn regexp_match<OffsetSize: OffsetSizeTrait>(
(Some(value), Some(pattern)) => {
let existing_pattern = patterns.get(&pattern);
let re = match existing_pattern {
Some(re) => re.clone(),
Some(re) => re,
None => {
let re = Regex::new(pattern.as_str()).map_err(|e| {
ArrowError::ComputeError(format!(
"Regular expression did not compile: {e:?}"
))
})?;
patterns.insert(pattern, re.clone());
re
patterns.entry(pattern).or_insert(re)
}
};
match re.captures(value) {
Expand Down
5 changes: 5 additions & 0 deletions arrow/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,11 @@ name = "substring_kernels"
harness = false
required-features = ["test_utils"]

[[bench]]
name = "regexp_kernels"
harness = false
required-features = ["test_utils"]

[[bench]]
name = "array_data_validate"
harness = false
Expand Down
44 changes: 44 additions & 0 deletions arrow/benches/regexp_kernels.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#[macro_use]
extern crate criterion;
use criterion::Criterion;

extern crate arrow;

use arrow::array::*;
use arrow::compute::kernels::regexp::*;
use arrow::util::bench_util::*;

/// Runs the `regexp_match` kernel once over `arr` using the patterns in
/// `regex_array`, with no flags array supplied.
///
/// Both inputs and the kernel's output are passed through `black_box` so the
/// optimizer can neither specialize on the benchmark inputs nor discard the
/// computed result as unused work.
///
/// # Panics
///
/// Panics if `regexp_match` returns an error.
fn bench_regexp(arr: &GenericStringArray<i32>, regex_array: &GenericStringArray<i32>) {
    let matches = regexp_match(
        criterion::black_box(arr),
        criterion::black_box(regex_array),
        None,
    )
    .unwrap();
    // Keep the result observable so the measured work cannot be elided.
    criterion::black_box(matches);
}

fn add_benchmark(c: &mut Criterion) {
let size = 65536;
let val_len = 1000;

let arr_string = create_string_array_with_len::<i32>(size, 0.0, val_len);
let pattern_values = vec![r".*-(\d*)-.*"; size];
let pattern = GenericStringArray::<i32>::from(pattern_values);

c.bench_function("regexp", |b| b.iter(|| bench_regexp(&arr_string, &pattern)));
}

criterion_group!(benches, add_benchmark);
criterion_main!(benches);

0 comments on commit 72c9505

Please sign in to comment.