Skip to content

Commit 812bb35

Browse files
authored
Make array_reverse faster for List and FixedSizeList (#18500)
## Rationale for this change Noticed while doing #18424 that the list types `List` and `FixedSizeList` uses `MutableData` to build the reverse array. Using `take` turns out to be a lot faster, ~70% for both `List` and `FixedSizeList`. This PR also reworks the benchmark added in #18425, and these are the results on that compared to the implementation on main: ``` # cargo bench --bench array_reverse Compiling datafusion-functions-nested v50.3.0 (/Users/vegard/dev/datafusion/datafusion/functions-nested) Finished `bench` profile [optimized] target(s) in 42.08s Running benches/array_reverse.rs (target/release/deps/array_reverse-2c473eed34a53d0a) Gnuplot not found, using plotters backend Benchmarking array_reverse_list: Warming up for 3.0000 s Warning: Unable to complete 100 samples in 5.0s. You may wish to increase target time to 6.3s, or reduce sample count to 70. array_reverse_list time: [62.201 ms 62.551 ms 62.946 ms] change: [−70.137% −69.965% −69.785%] (p = 0.00 < 0.05) Performance has improved. Found 8 outliers among 100 measurements (8.00%) 5 (5.00%) high mild 3 (3.00%) high severe Benchmarking array_reverse_list_view: Warming up for 3.0000 s Warning: Unable to complete 100 samples in 5.0s. You may wish to increase target time to 6.3s, or reduce sample count to 70. array_reverse_list_view time: [61.649 ms 61.905 ms 62.185 ms] change: [−16.122% −15.623% −15.087%] (p = 0.00 < 0.05) Performance has improved. Found 6 outliers among 100 measurements (6.00%) 5 (5.00%) high mild 1 (1.00%) high severe array_reverse_fixed_size_list time: [4.7936 ms 4.8292 ms 4.8741 ms] change: [−76.435% −76.196% −75.951%] (p = 0.00 < 0.05) Performance has improved. Found 20 outliers among 100 measurements (20.00%) 8 (8.00%) low mild 5 (5.00%) high mild 7 (7.00%) high severe ``` ## Are these changes tested? Covered by existing sqllogic tests, and one new test for `FixedSizeList`.
1 parent 1d8bc9b commit 812bb35

File tree

2 files changed

+138
-55
lines changed

2 files changed

+138
-55
lines changed

datafusion/functions-nested/benches/array_reverse.rs

Lines changed: 56 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ use std::{hint::black_box, sync::Arc};
2424
use crate::criterion::Criterion;
2525
use arrow::{
2626
array::{ArrayRef, FixedSizeListArray, Int32Array, ListArray, ListViewArray},
27-
buffer::{OffsetBuffer, ScalarBuffer},
27+
buffer::{NullBuffer, OffsetBuffer, ScalarBuffer},
2828
datatypes::{DataType, Field},
2929
};
3030
use datafusion_functions_nested::reverse::array_reverse_inner;
@@ -34,44 +34,80 @@ fn array_reverse(array: &ArrayRef) -> ArrayRef {
3434
}
3535

3636
fn criterion_benchmark(c: &mut Criterion) {
37-
// Construct large arrays for benchmarking
38-
let array_len = 100000;
39-
let step_size: usize = 1000;
40-
let offsets: Vec<i32> = (0..array_len as i32).step_by(step_size).collect();
37+
// Create array sizes with step size of 100, starting from 100.
38+
let number_of_arrays = 1000;
39+
let sizes = (0..number_of_arrays)
40+
.map(|i| 100 + i * 100)
41+
.collect::<Vec<i32>>();
42+
43+
// Calculate the total number of values
44+
let total_values = sizes.iter().sum::<i32>();
45+
46+
// Calculate sizes and offsets from array lengths
47+
let offsets = sizes
48+
.iter()
49+
.scan(0, |acc, &x| {
50+
let offset = *acc;
51+
*acc += x;
52+
Some(offset)
53+
})
54+
.collect::<Vec<i32>>();
4155
let offsets = ScalarBuffer::from(offsets);
42-
let sizes: Vec<i32> = vec![step_size as i32; array_len / step_size];
43-
let values = (0..array_len as i32).collect::<Vec<i32>>();
56+
// Set every 10th array to null
57+
let nulls = (0..number_of_arrays)
58+
.map(|i| i % 10 != 0)
59+
.collect::<Vec<bool>>();
60+
61+
let values = (0..total_values).collect::<Vec<i32>>();
62+
let values = Arc::new(Int32Array::from(values));
63+
64+
// Create ListArray and ListViewArray
65+
let nulls_list_array = Some(NullBuffer::from(
66+
nulls[..((number_of_arrays as usize) - 1)].to_vec(),
67+
));
4468
let list_array: ArrayRef = Arc::new(ListArray::new(
4569
Arc::new(Field::new("a", DataType::Int32, false)),
4670
OffsetBuffer::new(offsets.clone()),
47-
Arc::new(Int32Array::from(values.clone())),
48-
None,
71+
values.clone(),
72+
nulls_list_array,
4973
));
50-
let fixed_size_list_array: ArrayRef = Arc::new(FixedSizeListArray::new(
51-
Arc::new(Field::new("a", DataType::Int32, false)),
52-
step_size as i32,
53-
Arc::new(Int32Array::from(values.clone())),
54-
None,
74+
let nulls_list_view_array = Some(NullBuffer::from(
75+
nulls[..(number_of_arrays as usize)].to_vec(),
5576
));
5677
let list_view_array: ArrayRef = Arc::new(ListViewArray::new(
5778
Arc::new(Field::new("a", DataType::Int32, false)),
5879
offsets,
5980
ScalarBuffer::from(sizes),
60-
Arc::new(Int32Array::from(values)),
61-
None,
81+
values.clone(),
82+
nulls_list_view_array,
6283
));
6384

6485
c.bench_function("array_reverse_list", |b| {
6586
b.iter(|| array_reverse(&list_array))
6687
});
6788

68-
c.bench_function("array_reverse_fixed_size_list", |b| {
69-
b.iter(|| array_reverse(&fixed_size_list_array))
70-
});
71-
7289
c.bench_function("array_reverse_list_view", |b| {
7390
b.iter(|| array_reverse(&list_view_array))
7491
});
92+
93+
// Create FixedSizeListArray
94+
let array_len = 1000;
95+
let num_arrays = 5000;
96+
let total_values = num_arrays * array_len;
97+
let values = (0..total_values).collect::<Vec<i32>>();
98+
let values = Arc::new(Int32Array::from(values));
99+
// Set every 10th array to null
100+
let nulls = (0..num_arrays).map(|i| i % 10 != 0).collect::<Vec<bool>>();
101+
let nulls = Some(NullBuffer::from(nulls));
102+
let fixed_size_list_array: ArrayRef = Arc::new(FixedSizeListArray::new(
103+
Arc::new(Field::new("a", DataType::Int32, false)),
104+
array_len,
105+
values.clone(),
106+
nulls.clone(),
107+
));
108+
c.bench_function("array_reverse_fixed_size_list", |b| {
109+
b.iter(|| array_reverse(&fixed_size_list_array))
110+
});
75111
}
76112

77113
criterion_group!(benches, criterion_benchmark);

datafusion/functions-nested/src/reverse.rs

Lines changed: 82 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@
1919
2020
use crate::utils::make_scalar_function;
2121
use arrow::array::{
22-
Array, ArrayRef, Capacities, FixedSizeListArray, GenericListArray,
23-
GenericListViewArray, MutableArrayData, OffsetSizeTrait, UInt32Array,
22+
Array, ArrayRef, FixedSizeListArray, GenericListArray, GenericListViewArray,
23+
OffsetSizeTrait, UInt32Array, UInt64Array,
2424
};
2525
use arrow::buffer::{OffsetBuffer, ScalarBuffer};
2626
use arrow::compute::take;
@@ -155,11 +155,8 @@ fn general_array_reverse<O: OffsetSizeTrait>(
155155
field: &FieldRef,
156156
) -> Result<ArrayRef> {
157157
let values = array.values();
158-
let original_data = values.to_data();
159-
let capacity = Capacities::Array(original_data.len());
160158
let mut offsets = vec![O::usize_as(0)];
161-
let mut mutable =
162-
MutableArrayData::with_capacities(vec![&original_data], false, capacity);
159+
let mut indices: Vec<O> = Vec::with_capacity(values.len());
163160

164161
for (row_index, (&start, &end)) in array.offsets().iter().tuple_windows().enumerate()
165162
{
@@ -171,18 +168,34 @@ fn general_array_reverse<O: OffsetSizeTrait>(
171168

172169
let mut index = end - O::one();
173170
while index >= start {
174-
mutable.extend(0, index.to_usize().unwrap(), index.to_usize().unwrap() + 1);
171+
indices.push(index);
175172
index = index - O::one();
176173
}
177174
let size = end - start;
178175
offsets.push(offsets[row_index] + size);
179176
}
180177

181-
let data = mutable.freeze();
178+
// Materialize values from underlying array with take
179+
let indices_array: ArrayRef = if O::IS_LARGE {
180+
Arc::new(UInt64Array::from(
181+
indices
182+
.iter()
183+
.map(|i| i.as_usize() as u64)
184+
.collect::<Vec<_>>(),
185+
))
186+
} else {
187+
Arc::new(UInt32Array::from(
188+
indices
189+
.iter()
190+
.map(|i| i.as_usize() as u32)
191+
.collect::<Vec<_>>(),
192+
))
193+
};
194+
let values = take(&values, &indices_array, None)?;
182195
Ok(Arc::new(GenericListArray::<O>::try_new(
183196
Arc::clone(field),
184197
OffsetBuffer::<O>::new(offsets.into()),
185-
arrow::array::make_array(data),
198+
values,
186199
array.nulls().cloned(),
187200
)?))
188201
}
@@ -231,7 +244,7 @@ fn list_view_reverse<O: OffsetSizeTrait>(
231244

232245
// Materialize values from underlying array with take
233246
let indices_array: ArrayRef = if O::IS_LARGE {
234-
Arc::new(arrow::array::UInt64Array::from(
247+
Arc::new(UInt64Array::from(
235248
indices
236249
.iter()
237250
.map(|i| i.as_usize() as u64)
@@ -245,13 +258,12 @@ fn list_view_reverse<O: OffsetSizeTrait>(
245258
.collect::<Vec<_>>(),
246259
))
247260
};
248-
let values_reversed = take(&values, &indices_array, None)?;
249-
261+
let values = take(&values, &indices_array, None)?;
250262
Ok(Arc::new(GenericListViewArray::<O>::try_new(
251263
Arc::clone(field),
252264
ScalarBuffer::from(new_offsets),
253265
ScalarBuffer::from(new_sizes),
254-
values_reversed,
266+
values,
255267
array.nulls().cloned(),
256268
)?))
257269
}
@@ -260,42 +272,34 @@ fn fixed_size_array_reverse(
260272
array: &FixedSizeListArray,
261273
field: &FieldRef,
262274
) -> Result<ArrayRef> {
263-
let values = array.values();
264-
let original_data = values.to_data();
265-
let capacity = Capacities::Array(original_data.len());
266-
let mut mutable =
267-
MutableArrayData::with_capacities(vec![&original_data], false, capacity);
268-
let value_length = array.value_length() as usize;
275+
let values: &Arc<dyn Array> = array.values();
269276

270-
for row_index in 0..array.len() {
271-
// skip the null value
272-
if array.is_null(row_index) {
273-
mutable.extend(0, 0, value_length);
274-
continue;
275-
}
276-
let start = row_index * value_length;
277-
let end = start + value_length;
278-
for idx in (start..end).rev() {
279-
mutable.extend(0, idx, idx + 1);
280-
}
277+
// Since each fixed size list in the physical array is the same size and we keep the order
278+
// of the fixed size lists, we can reverse the indices for each fixed size list.
279+
let mut indices: Vec<u64> = (0..values.len() as u64).collect();
280+
for chunk in indices.chunks_mut(array.value_length() as usize) {
281+
chunk.reverse();
281282
}
282283

283-
let data = mutable.freeze();
284+
// Materialize values from underlying array with take
285+
let indices_array: ArrayRef = Arc::new(UInt64Array::from(indices));
286+
let values = take(&values, &indices_array, None)?;
287+
284288
Ok(Arc::new(FixedSizeListArray::try_new(
285289
Arc::clone(field),
286290
array.value_length(),
287-
arrow::array::make_array(data),
291+
values,
288292
array.nulls().cloned(),
289293
)?))
290294
}
291295

292296
#[cfg(test)]
293297
mod tests {
294-
use crate::reverse::list_view_reverse;
298+
use crate::reverse::{fixed_size_array_reverse, list_view_reverse};
295299
use arrow::{
296300
array::{
297-
AsArray, GenericListViewArray, Int32Array, LargeListViewArray, ListViewArray,
298-
OffsetSizeTrait,
301+
AsArray, FixedSizeListArray, GenericListViewArray, Int32Array,
302+
LargeListViewArray, ListViewArray, OffsetSizeTrait,
299303
},
300304
buffer::{NullBuffer, ScalarBuffer},
301305
datatypes::{DataType, Field, Int32Type},
@@ -312,6 +316,13 @@ mod tests {
312316
.collect()
313317
}
314318

319+
fn fixed_size_list_values(array: &FixedSizeListArray) -> Vec<Option<Vec<i32>>> {
320+
array
321+
.iter()
322+
.map(|x| x.map(|x| x.as_primitive::<Int32Type>().values().to_vec()))
323+
.collect()
324+
}
325+
315326
#[test]
316327
fn test_reverse_list_view() -> Result<()> {
317328
let field = Arc::new(Field::new("a", DataType::Int32, false));
@@ -450,4 +461,40 @@ mod tests {
450461
assert_eq!(expected, reversed);
451462
Ok(())
452463
}
464+
465+
#[test]
466+
fn test_reverse_fixed_size_list() -> Result<()> {
467+
let field = Arc::new(Field::new("a", DataType::Int32, false));
468+
let values = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9]));
469+
let result = fixed_size_array_reverse(
470+
&FixedSizeListArray::new(
471+
field,
472+
3,
473+
values,
474+
Some(NullBuffer::from(vec![true, false, true])),
475+
),
476+
&Arc::new(Field::new("test", DataType::Int32, true)),
477+
)?;
478+
let reversed = fixed_size_list_values(result.as_fixed_size_list());
479+
let expected = vec![Some(vec![3, 2, 1]), None, Some(vec![9, 8, 7])];
480+
assert_eq!(expected, reversed);
481+
Ok(())
482+
}
483+
484+
#[test]
485+
fn test_reverse_fixed_size_list_empty() -> Result<()> {
486+
let field = Arc::new(Field::new("a", DataType::Int32, false));
487+
let empty_array: Vec<i32> = vec![];
488+
let values = Arc::new(Int32Array::from(empty_array));
489+
let nulls = None;
490+
let fixed_size_list = FixedSizeListArray::new(field, 3, values, nulls);
491+
let result = fixed_size_array_reverse(
492+
&fixed_size_list,
493+
&Arc::new(Field::new("test", DataType::Int32, true)),
494+
)?;
495+
let reversed = fixed_size_list_values(result.as_fixed_size_list());
496+
let expected: Vec<Option<Vec<i32>>> = vec![];
497+
assert_eq!(expected, reversed);
498+
Ok(())
499+
}
453500
}

0 commit comments

Comments
 (0)