Skip to content

Commit

Permalink
faster iterators for nullable columns
Browse files Browse the repository at this point in the history
  • Loading branch information
korowa committed Jun 18, 2024
1 parent 74c46ae commit 2750458
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 52 deletions.
76 changes: 30 additions & 46 deletions arrow-row/src/fixed.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,10 @@
use crate::array::PrimitiveArray;
use crate::null_sentinel;
use arrow_array::builder::BufferBuilder;
use arrow_array::{Array, ArrowPrimitiveType, BooleanArray, FixedSizeBinaryArray};
use arrow_array::{ArrowPrimitiveType, BooleanArray, FixedSizeBinaryArray};
use arrow_buffer::{
bit_util, i256, ArrowNativeType, Buffer, IntervalDayTime, IntervalMonthDayNano, MutableBuffer,
bit_util, i256, ArrowNativeType, BooleanBuffer, Buffer, IntervalDayTime, IntervalMonthDayNano,
MutableBuffer, NullBuffer,
};
use arrow_data::{ArrayData, ArrayDataBuilder};
use arrow_schema::{DataType, SortOptions};
Expand Down Expand Up @@ -216,22 +217,20 @@ where
///
/// - 1 byte `0` if null or `1` if valid
/// - bytes of [`FixedLengthEncoding`]
pub fn encode<T: ArrowPrimitiveType>(
pub fn encode<T: FixedLengthEncoding>(
data: &mut [u8],
offsets: &mut [usize],
array: &PrimitiveArray<T>,
values: &[T],
nulls: &NullBuffer,
opts: SortOptions,
) where
T::Native: FixedLengthEncoding,
{
let mut offset_idx = 1;
for maybe_val in array {
let offset = &mut offsets[offset_idx];
let end_offset = *offset + T::Native::ENCODED_LEN;
if let Some(val) = maybe_val {
) {
for (value_idx, is_valid) in nulls.iter().enumerate() {
let offset = &mut offsets[value_idx + 1];
let end_offset = *offset + T::ENCODED_LEN;
if is_valid {
let to_write = &mut data[*offset..end_offset];
to_write[0] = 1;
let mut encoded = val.encode();
let mut encoded = values[value_idx].encode();
if opts.descending {
// Flip bits to reverse order
encoded.as_mut().iter_mut().for_each(|v| *v = !*v)
Expand All @@ -241,26 +240,20 @@ pub fn encode<T: ArrowPrimitiveType>(
data[*offset] = null_sentinel(opts);
}
*offset = end_offset;
offset_idx += 1;
}
}

/// Encoding for non-nullable primitive arrays.
/// Iterates directly over the `values`, and skips NULLs-checking.
pub fn encode_not_null<T: ArrowPrimitiveType>(
pub fn encode_not_null<T: FixedLengthEncoding>(
data: &mut [u8],
offsets: &mut [usize],
array: &PrimitiveArray<T>,
values: &[T],
opts: SortOptions,
) where
T::Native: FixedLengthEncoding,
{
assert!(!array.is_nullable());

let mut offset_idx = 1;
for val in array.values() {
let offset = &mut offsets[offset_idx];
let end_offset = *offset + T::Native::ENCODED_LEN;
) {
for (value_idx, val) in values.iter().enumerate() {
let offset = &mut offsets[value_idx + 1];
let end_offset = *offset + T::ENCODED_LEN;

let to_write = &mut data[*offset..end_offset];
to_write[0] = 1;
Expand All @@ -272,28 +265,27 @@ pub fn encode_not_null<T: ArrowPrimitiveType>(
to_write[1..].copy_from_slice(encoded.as_ref());

*offset = end_offset;
offset_idx += 1;
}
}

/// Boolean values are encoded as
///
/// - 1 byte `0` if null or `1` if valid
/// - bytes of [`FixedLengthEncoding`]
pub fn encode_bool(
pub fn encode_boolean(
data: &mut [u8],
offsets: &mut [usize],
array: &BooleanArray,
values: &BooleanBuffer,
nulls: &NullBuffer,
opts: SortOptions,
) {
let mut offset_idx = 1;
for maybe_val in array {
let offset = &mut offsets[offset_idx];
for (idx, is_valid) in nulls.iter().enumerate() {
let offset = &mut offsets[idx + 1];
let end_offset = *offset + bool::ENCODED_LEN;
if let Some(val) = maybe_val {
if is_valid {
let to_write = &mut data[*offset..end_offset];
to_write[0] = 1;
let mut encoded = val.encode();
let mut encoded = values.value(idx).encode();
if opts.descending {
// Flip bits to reverse order
encoded.as_mut().iter_mut().for_each(|v| *v = !*v)
Expand All @@ -303,23 +295,19 @@ pub fn encode_bool(
data[*offset] = null_sentinel(opts);
}
*offset = end_offset;
offset_idx += 1;
}
}

/// Encoding for non-nullable boolean arrays.
/// Iterates directly over `values`, and skips NULLs-checking.
pub fn encode_bool_not_null(
pub fn encode_boolean_not_null(
data: &mut [u8],
offsets: &mut [usize],
array: &BooleanArray,
values: &BooleanBuffer,
opts: SortOptions,
) {
assert!(!array.is_nullable());

let mut offset_idx = 1;
for val in array.values() {
let offset = &mut offsets[offset_idx];
for (value_idx, val) in values.iter().enumerate() {
let offset = &mut offsets[value_idx + 1];
let end_offset = *offset + bool::ENCODED_LEN;

let to_write = &mut data[*offset..end_offset];
Expand All @@ -332,7 +320,6 @@ pub fn encode_bool_not_null(
to_write[1..].copy_from_slice(encoded.as_ref());

*offset = end_offset;
offset_idx += 1;
}
}

Expand All @@ -343,9 +330,7 @@ pub fn encode_fixed_size_binary(
opts: SortOptions,
) {
let len = array.value_length() as usize;
let mut offset_idx = 1;
for maybe_val in array {
let offset = &mut offsets[offset_idx];
for (offset, maybe_val) in offsets.iter_mut().skip(1).zip(array.iter()) {
let end_offset = *offset + len + 1;
if let Some(val) = maybe_val {
let to_write = &mut data[*offset..end_offset];
Expand All @@ -359,7 +344,6 @@ pub fn encode_fixed_size_binary(
data[*offset] = null_sentinel(opts);
}
*offset = end_offset;
offset_idx += 1;
}
}

Expand Down
12 changes: 6 additions & 6 deletions arrow-row/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1147,18 +1147,18 @@ fn encode_column(
Encoder::Stateless => {
downcast_primitive_array! {
column => {
if column.is_nullable(){
fixed::encode(data, offsets, column, opts)
if let Some(nulls) = column.nulls().filter(|n| n.null_count() > 0){
fixed::encode(data, offsets, column.values(), nulls, opts)
} else {
fixed::encode_not_null(data, offsets, column, opts)
fixed::encode_not_null(data, offsets, column.values(), opts)
}
}
DataType::Null => {}
DataType::Boolean => {
if column.is_nullable(){
fixed::encode_bool(data, offsets, column.as_boolean(), opts)
if let Some(nulls) = column.nulls().filter(|n| n.null_count() > 0){
fixed::encode_boolean(data, offsets, column.as_boolean().values(), nulls, opts)
} else {
fixed::encode_bool_not_null(data, offsets, column.as_boolean(), opts)
fixed::encode_boolean_not_null(data, offsets, column.as_boolean().values(), opts)
}
}
DataType::Binary => {
Expand Down

0 comments on commit 2750458

Please sign in to comment.