From 519413a142b5a9308f741fb722d952503182cc17 Mon Sep 17 00:00:00 2001 From: Eduard Karacharov Date: Sat, 8 Jun 2024 11:58:58 +0300 Subject: [PATCH 1/3] skip iterator removed from primitive encoding --- arrow-row/src/fixed.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arrow-row/src/fixed.rs b/arrow-row/src/fixed.rs index 0f3c3d0912f6..b0807fa80fc9 100644 --- a/arrow-row/src/fixed.rs +++ b/arrow-row/src/fixed.rs @@ -222,7 +222,9 @@ pub fn encode>>( i: I, opts: SortOptions, ) { - for (offset, maybe_val) in offsets.iter_mut().skip(1).zip(i) { + let mut offset_idx = 1; + for maybe_val in i { + let offset = &mut offsets[offset_idx]; let end_offset = *offset + T::ENCODED_LEN; if let Some(val) = maybe_val { let to_write = &mut data[*offset..end_offset]; @@ -237,6 +239,7 @@ pub fn encode>>( data[*offset] = null_sentinel(opts); } *offset = end_offset; + offset_idx += 1; } } @@ -247,7 +250,9 @@ pub fn encode_fixed_size_binary( opts: SortOptions, ) { let len = array.value_length() as usize; - for (offset, maybe_val) in offsets.iter_mut().skip(1).zip(array.iter()) { + let mut offset_idx = 1; + for maybe_val in array { + let offset = &mut offsets[offset_idx]; let end_offset = *offset + len + 1; if let Some(val) = maybe_val { let to_write = &mut data[*offset..end_offset]; @@ -261,6 +266,7 @@ pub fn encode_fixed_size_binary( data[*offset] = null_sentinel(opts); } *offset = end_offset; + offset_idx += 1; } } From 99e15033861ccac142c4348784bf2416116b6d56 Mon Sep 17 00:00:00 2001 From: Eduard Karacharov Date: Sun, 9 Jun 2024 11:04:06 +0300 Subject: [PATCH 2/3] special cases for not-null primitives encoding --- arrow-row/src/fixed.rs | 103 +++++++++++++++++++++++++++++++++++++++-- arrow-row/src/lib.rs | 16 ++++++- 2 files changed, 112 insertions(+), 7 deletions(-) diff --git a/arrow-row/src/fixed.rs b/arrow-row/src/fixed.rs index b0807fa80fc9..461d9cd9c1c0 100644 --- a/arrow-row/src/fixed.rs +++ b/arrow-row/src/fixed.rs @@ -18,7 +18,7 @@ use crate::array::PrimitiveArray; use crate::null_sentinel; use arrow_array::builder::BufferBuilder; -use arrow_array::{ArrowPrimitiveType, BooleanArray, FixedSizeBinaryArray}; +use arrow_array::{Array, ArrowPrimitiveType, BooleanArray, FixedSizeBinaryArray}; use arrow_buffer::{ bit_util, i256, ArrowNativeType, Buffer, IntervalDayTime, IntervalMonthDayNano, MutableBuffer, }; @@ -216,16 +216,80 @@ where /// /// - 1 byte `0` if null or `1` if valid /// - bytes of [`FixedLengthEncoding`] -pub fn encode>>( +pub fn encode( data: &mut [u8], offsets: &mut [usize], - i: I, + array: &PrimitiveArray, + opts: SortOptions, +) where + T::Native: FixedLengthEncoding, +{ + let mut offset_idx = 1; + for maybe_val in array { + let offset = &mut offsets[offset_idx]; + let end_offset = *offset + T::Native::ENCODED_LEN; + if let Some(val) = maybe_val { + let to_write = &mut data[*offset..end_offset]; + to_write[0] = 1; + let mut encoded = val.encode(); + if opts.descending { + // Flip bits to reverse order + encoded.as_mut().iter_mut().for_each(|v| *v = !*v) + } + to_write[1..].copy_from_slice(encoded.as_ref()) + } else { + data[*offset] = null_sentinel(opts); + } + *offset = end_offset; + offset_idx += 1; + } +} + +/// Encoding for non-nullable primitive arrays. +/// Iterates directly over the `values`, and skips NULLs-checking. +pub fn encode_not_null( + data: &mut [u8], + offsets: &mut [usize], + array: &PrimitiveArray, + opts: SortOptions, +) where + T::Native: FixedLengthEncoding, +{ + assert!(!array.is_nullable()); + + let mut offset_idx = 1; + for val in array.values() { + let offset = &mut offsets[offset_idx]; + let end_offset = *offset + T::Native::ENCODED_LEN; + + let to_write = &mut data[*offset..end_offset]; + to_write[0] = 1; + let mut encoded = val.encode(); + if opts.descending { + // Flip bits to reverse order + encoded.as_mut().iter_mut().for_each(|v| *v = !*v) + } + to_write[1..].copy_from_slice(encoded.as_ref()); + + *offset = end_offset; + offset_idx += 1; + } +} + +/// Boolean values are encoded as +/// +/// - 1 byte `0` if null or `1` if valid +/// - bytes of [`FixedLengthEncoding`] +pub fn encode_bool( + data: &mut [u8], + offsets: &mut [usize], + array: &BooleanArray, opts: SortOptions, ) { let mut offset_idx = 1; - for maybe_val in i { + for maybe_val in array { let offset = &mut offsets[offset_idx]; - let end_offset = *offset + T::ENCODED_LEN; + let end_offset = *offset + bool::ENCODED_LEN; if let Some(val) = maybe_val { let to_write = &mut data[*offset..end_offset]; to_write[0] = 1; @@ -243,6 +307,35 @@ pub fn encode>>( } } +/// Encoding for non-nullable boolean arrays. +/// Iterates directly over `values`, and skips NULLs-checking. +pub fn encode_bool_not_null( + data: &mut [u8], + offsets: &mut [usize], + array: &BooleanArray, + opts: SortOptions, +) { + assert!(!array.is_nullable()); + + let mut offset_idx = 1; + for val in array.values() { + let offset = &mut offsets[offset_idx]; + let end_offset = *offset + bool::ENCODED_LEN; + + let to_write = &mut data[*offset..end_offset]; + to_write[0] = 1; + let mut encoded = val.encode(); + if opts.descending { + // Flip bits to reverse order + encoded.as_mut().iter_mut().for_each(|v| *v = !*v) + } + to_write[1..].copy_from_slice(encoded.as_ref()); + + *offset = end_offset; + offset_idx += 1; + } +} + pub fn encode_fixed_size_binary( data: &mut [u8], offsets: &mut [usize], diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index 8e1285493b0b..8e67858fa5af 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -1146,9 +1146,21 @@ fn encode_column( match encoder { Encoder::Stateless => { downcast_primitive_array! { - column => fixed::encode(data, offsets, column, opts), + column => { + if column.is_nullable(){ + fixed::encode(data, offsets, column, opts) + } else { + fixed::encode_not_null(data, offsets, column, opts) + } + } DataType::Null => {} - DataType::Boolean => fixed::encode(data, offsets, column.as_boolean(), opts), + DataType::Boolean => { + if column.is_nullable(){ + fixed::encode_bool(data, offsets, column.as_boolean(), opts) + } else { + fixed::encode_bool_not_null(data, offsets, column.as_boolean(), opts) + } + } DataType::Binary => { variable::encode(data, offsets, as_generic_binary_array::(column).iter(), opts) } From d681c3e034284a22ad499d1843ed7481978c5630 Mon Sep 17 00:00:00 2001 From: Eduard Karacharov Date: Tue, 18 Jun 2024 21:07:58 +0300 Subject: [PATCH 3/3] faster iterators for nullable columns --- arrow-row/src/fixed.rs | 76 +++++++++++++++++------------------------- arrow-row/src/lib.rs | 12 +++---- 2 files changed, 36 insertions(+), 52 deletions(-) diff --git a/arrow-row/src/fixed.rs b/arrow-row/src/fixed.rs index 461d9cd9c1c0..3d9920708f9b 100644 --- a/arrow-row/src/fixed.rs +++ b/arrow-row/src/fixed.rs @@ -18,9 +18,10 @@ use crate::array::PrimitiveArray; use crate::null_sentinel; use arrow_array::builder::BufferBuilder; -use arrow_array::{Array, ArrowPrimitiveType, BooleanArray, FixedSizeBinaryArray}; +use arrow_array::{ArrowPrimitiveType, BooleanArray, FixedSizeBinaryArray}; use arrow_buffer::{ - bit_util, i256, ArrowNativeType, Buffer, IntervalDayTime, IntervalMonthDayNano, MutableBuffer, + bit_util, i256, ArrowNativeType, BooleanBuffer, Buffer, IntervalDayTime, IntervalMonthDayNano, + MutableBuffer, NullBuffer, }; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{DataType, SortOptions}; @@ -216,22 +217,20 @@ where /// /// - 1 byte `0` if null or `1` if valid /// - bytes of [`FixedLengthEncoding`] -pub fn encode( +pub fn encode( data: &mut [u8], offsets: &mut [usize], - array: &PrimitiveArray, + values: &[T], + nulls: &NullBuffer, opts: SortOptions, -) where - T::Native: FixedLengthEncoding, -{ - let mut offset_idx = 1; - for maybe_val in array { - let offset = &mut offsets[offset_idx]; - let end_offset = *offset + T::Native::ENCODED_LEN; - if let Some(val) = maybe_val { +) { + for (value_idx, is_valid) in nulls.iter().enumerate() { + let offset = &mut offsets[value_idx + 1]; + let end_offset = *offset + T::ENCODED_LEN; + if is_valid { let to_write = &mut data[*offset..end_offset]; to_write[0] = 1; - let mut encoded = val.encode(); + let mut encoded = values[value_idx].encode(); if opts.descending { // Flip bits to reverse order encoded.as_mut().iter_mut().for_each(|v| *v = !*v) @@ -241,26 +240,20 @@ pub fn encode( data[*offset] = null_sentinel(opts); } *offset = end_offset; - offset_idx += 1; } } /// Encoding for non-nullable primitive arrays. /// Iterates directly over the `values`, and skips NULLs-checking. -pub fn encode_not_null( +pub fn encode_not_null( data: &mut [u8], offsets: &mut [usize], - array: &PrimitiveArray, + values: &[T], opts: SortOptions, -) where - T::Native: FixedLengthEncoding, -{ - assert!(!array.is_nullable()); - - let mut offset_idx = 1; - for val in array.values() { - let offset = &mut offsets[offset_idx]; - let end_offset = *offset + T::Native::ENCODED_LEN; +) { + for (value_idx, val) in values.iter().enumerate() { + let offset = &mut offsets[value_idx + 1]; + let end_offset = *offset + T::ENCODED_LEN; let to_write = &mut data[*offset..end_offset]; to_write[0] = 1; @@ -272,7 +265,6 @@ pub fn encode_not_null( to_write[1..].copy_from_slice(encoded.as_ref()); *offset = end_offset; - offset_idx += 1; } } @@ -280,20 +272,20 @@ pub fn encode_not_null( /// /// - 1 byte `0` if null or `1` if valid /// - bytes of [`FixedLengthEncoding`] -pub fn encode_bool( +pub fn encode_boolean( data: &mut [u8], offsets: &mut [usize], - array: &BooleanArray, + values: &BooleanBuffer, + nulls: &NullBuffer, opts: SortOptions, ) { - let mut offset_idx = 1; - for maybe_val in array { - let offset = &mut offsets[offset_idx]; + for (idx, is_valid) in nulls.iter().enumerate() { + let offset = &mut offsets[idx + 1]; let end_offset = *offset + bool::ENCODED_LEN; - if let Some(val) = maybe_val { + if is_valid { let to_write = &mut data[*offset..end_offset]; to_write[0] = 1; - let mut encoded = val.encode(); + let mut encoded = values.value(idx).encode(); if opts.descending { // Flip bits to reverse order encoded.as_mut().iter_mut().for_each(|v| *v = !*v) @@ -303,23 +295,19 @@ pub fn encode_bool( data[*offset] = null_sentinel(opts); } *offset = end_offset; - offset_idx += 1; } } /// Encoding for non-nullable boolean arrays. /// Iterates directly over `values`, and skips NULLs-checking. -pub fn encode_bool_not_null( +pub fn encode_boolean_not_null( data: &mut [u8], offsets: &mut [usize], - array: &BooleanArray, + values: &BooleanBuffer, opts: SortOptions, ) { - assert!(!array.is_nullable()); - - let mut offset_idx = 1; - for val in array.values() { - let offset = &mut offsets[offset_idx]; + for (value_idx, val) in values.iter().enumerate() { + let offset = &mut offsets[value_idx + 1]; let end_offset = *offset + bool::ENCODED_LEN; let to_write = &mut data[*offset..end_offset]; @@ -332,7 +320,6 @@ pub fn encode_bool_not_null( to_write[1..].copy_from_slice(encoded.as_ref()); *offset = end_offset; - offset_idx += 1; } } @@ -343,9 +330,7 @@ pub fn encode_fixed_size_binary( opts: SortOptions, ) { let len = array.value_length() as usize; - let mut offset_idx = 1; - for maybe_val in array { - let offset = &mut offsets[offset_idx]; + for (offset, maybe_val) in offsets.iter_mut().skip(1).zip(array.iter()) { let end_offset = *offset + len + 1; if let Some(val) = maybe_val { let to_write = &mut data[*offset..end_offset]; @@ -359,7 +344,6 @@ pub fn encode_fixed_size_binary( data[*offset] = null_sentinel(opts); } *offset = end_offset; - offset_idx += 1; } } diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index 8e67858fa5af..935db80385bb 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -1147,18 +1147,18 @@ fn encode_column( Encoder::Stateless => { downcast_primitive_array! { column => { - if column.is_nullable(){ - fixed::encode(data, offsets, column, opts) + if let Some(nulls) = column.nulls().filter(|n| n.null_count() > 0){ + fixed::encode(data, offsets, column.values(), nulls, opts) } else { - fixed::encode_not_null(data, offsets, column, opts) + fixed::encode_not_null(data, offsets, column.values(), opts) } } DataType::Null => {} DataType::Boolean => { - if column.is_nullable(){ - fixed::encode_bool(data, offsets, column.as_boolean(), opts) + if let Some(nulls) = column.nulls().filter(|n| n.null_count() > 0){ + fixed::encode_boolean(data, offsets, column.as_boolean().values(), nulls, opts) } else { - fixed::encode_bool_not_null(data, offsets, column.as_boolean(), opts) + fixed::encode_boolean_not_null(data, offsets, column.as_boolean().values(), opts) } } DataType::Binary => {