diff --git a/arrow-arith/src/arity.rs b/arrow-arith/src/arity.rs index 2dac33a4f28b..fdfb26f7f72a 100644 --- a/arrow-arith/src/arity.rs +++ b/arrow-arith/src/arity.rs @@ -198,7 +198,7 @@ where return Ok(PrimitiveArray::from(ArrayData::new_empty(&O::DATA_TYPE))); } - let nulls = NullBuffer::union(a.nulls(), b.nulls()); + let nulls = NullBuffer::union(a.logical_nulls().as_ref(), b.logical_nulls().as_ref()); let values = a.values().iter().zip(b.values()).map(|(l, r)| op(*l, *r)); // JUSTIFICATION @@ -248,7 +248,7 @@ where )))); } - let nulls = NullBuffer::union(a.nulls(), b.nulls()); + let nulls = NullBuffer::union(a.logical_nulls().as_ref(), b.logical_nulls().as_ref()); let mut builder = a.into_builder()?; @@ -296,7 +296,9 @@ where if a.null_count() == 0 && b.null_count() == 0 { try_binary_no_nulls(len, a, b, op) } else { - let nulls = NullBuffer::union(a.nulls(), b.nulls()).unwrap(); + let nulls = + NullBuffer::union(a.logical_nulls().as_ref(), b.logical_nulls().as_ref()) + .unwrap(); let mut buffer = BufferBuilder::::new(len); buffer.append_n_zeroed(len); @@ -355,7 +357,10 @@ where if a.null_count() == 0 && b.null_count() == 0 { try_binary_no_nulls_mut(len, a, b, op) } else { - let nulls = NullBuffer::union(a.nulls(), b.nulls()).unwrap(); + let nulls = + NullBuffer::union(a.logical_nulls().as_ref(), b.logical_nulls().as_ref()) + .unwrap(); + let mut builder = a.into_builder()?; let slice = builder.values_slice_mut(); diff --git a/arrow-arith/src/boolean.rs b/arrow-arith/src/boolean.rs index 61e591d51634..46e5998208f1 100644 --- a/arrow-arith/src/boolean.rs +++ b/arrow-arith/src/boolean.rs @@ -25,7 +25,7 @@ use arrow_array::*; use arrow_buffer::buffer::{bitwise_bin_op_helper, bitwise_quaternary_op_helper}; use arrow_buffer::{BooleanBuffer, NullBuffer}; -use arrow_schema::{ArrowError, DataType}; +use arrow_schema::ArrowError; /// Logical 'and' boolean values with Kleene logic /// @@ -311,11 +311,7 @@ pub fn not(left: &BooleanArray) -> Result { /// assert_eq!(a_is_null, BooleanArray::from(vec![false, false, true])); /// ``` pub fn is_null(input: &dyn Array) -> Result { - let values = match input.nulls() { - // NullArray has no nulls buffer yet all values are null - None if input.data_type() == &DataType::Null => { - BooleanBuffer::new_set(input.len()) - } + let values = match input.logical_nulls() { None => BooleanBuffer::new_unset(input.len()), Some(nulls) => !nulls.inner(), }; @@ -335,11 +331,7 @@ pub fn is_null(input: &dyn Array) -> Result { /// assert_eq!(a_is_not_null, BooleanArray::from(vec![true, true, false])); /// ``` pub fn is_not_null(input: &dyn Array) -> Result { - let values = match input.nulls() { - // NullArray has no nulls buffer yet all values are null - None if input.data_type() == &DataType::Null => { - BooleanBuffer::new_unset(input.len()) - } + let values = match input.logical_nulls() { None => BooleanBuffer::new_set(input.len()), Some(n) => n.inner().clone(), }; diff --git a/arrow-array/src/array/boolean_array.rs b/arrow-array/src/array/boolean_array.rs index 14fa87e138eb..995bb7d510d9 100644 --- a/arrow-array/src/array/boolean_array.rs +++ b/arrow-array/src/array/boolean_array.rs @@ -205,7 +205,7 @@ impl BooleanArray { where F: FnMut(T::Item) -> bool, { - let nulls = left.nulls().cloned(); + let nulls = left.logical_nulls(); let values = BooleanBuffer::collect_bool(left.len(), |i| unsafe { // SAFETY: i in range 0..len op(left.value_unchecked(i)) @@ -239,7 +239,10 @@ impl BooleanArray { { assert_eq!(left.len(), right.len()); - let nulls = NullBuffer::union(left.nulls(), right.nulls()); + let nulls = NullBuffer::union( + left.logical_nulls().as_ref(), + right.logical_nulls().as_ref(), + ); let values = BooleanBuffer::collect_bool(left.len(), |i| unsafe { // SAFETY: i in range 0..len op(left.value_unchecked(i), right.value_unchecked(i)) diff --git a/arrow-array/src/array/dictionary_array.rs b/arrow-array/src/array/dictionary_array.rs index 5a2f439a8e0f..2d80c75f073a 100644 --- a/arrow-array/src/array/dictionary_array.rs +++ b/arrow-array/src/array/dictionary_array.rs @@ -729,6 +729,31 @@ impl Array for DictionaryArray { self.keys.nulls() } + fn logical_nulls(&self) -> Option { + match self.values.nulls() { + None => self.nulls().cloned(), + Some(value_nulls) => { + let mut builder = BooleanBufferBuilder::new(self.len()); + match self.keys.nulls() { + Some(n) => builder.append_buffer(n.inner()), + None => builder.append_n(self.len(), true), + } + for (idx, k) in self.keys.values().iter().enumerate() { + let k = k.as_usize(); + // Check range to allow for nulls + if k < value_nulls.len() && value_nulls.is_null(k) { + builder.set_bit(idx, false); + } + } + Some(builder.finish().into()) + } + } + } + + fn is_nullable(&self) -> bool { + !self.is_empty() && (self.nulls().is_some() || self.values.is_nullable()) + } + fn get_buffer_memory_size(&self) -> usize { self.keys.get_buffer_memory_size() + self.values.get_buffer_memory_size() } @@ -843,6 +868,14 @@ impl<'a, K: ArrowDictionaryKeyType, V: Sync> Array for TypedDictionaryArray<'a, self.dictionary.nulls() } + fn logical_nulls(&self) -> Option { + self.dictionary.logical_nulls() + } + + fn is_nullable(&self) -> bool { + self.dictionary.is_nullable() + } + fn get_buffer_memory_size(&self) -> usize { self.dictionary.get_buffer_memory_size() } @@ -1253,4 +1286,20 @@ mod tests { assert_eq!(v, expected, "{idx}"); } } + + #[test] + fn test_iterator_nulls() { + let keys = Int32Array::new( + vec![0, 700, 1, 2].into(), + Some(NullBuffer::from(vec![true, false, true, true])), + ); + let values = Int32Array::from(vec![Some(50), None, Some(2)]); + let dict = DictionaryArray::new(keys, Arc::new(values)); + let values: Vec<_> = dict + .downcast_dict::() + .unwrap() + .into_iter() + .collect(); + assert_eq!(values, &[Some(50), None, None, Some(2)]) + } } diff --git a/arrow-array/src/array/fixed_size_list_array.rs b/arrow-array/src/array/fixed_size_list_array.rs index 6c3abb556ad6..8996fc8da408 100644 --- a/arrow-array/src/array/fixed_size_list_array.rs +++ b/arrow-array/src/array/fixed_size_list_array.rs @@ -147,7 +147,7 @@ impl FixedSizeListArray { /// * `size < 0` /// * `values.len() / size != nulls.len()` /// * `values.data_type() != field.data_type()` - /// * `!field.is_nullable() && !nulls.expand(size).contains(values.nulls())` + /// * `!field.is_nullable() && !nulls.expand(size).contains(values.logical_nulls())` pub fn try_new( field: FieldRef, size: i32, @@ -181,11 +181,11 @@ impl FixedSizeListArray { ))); } - if let Some(a) = values.nulls() { + if let Some(a) = values.logical_nulls() { let nulls_valid = field.is_nullable() || nulls .as_ref() - .map(|n| n.expand(size as _).contains(a)) + .map(|n| n.expand(size as _).contains(&a)) .unwrap_or_default(); if !nulls_valid { diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index 05628084c844..f5b7ae77c3f9 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -161,7 +161,7 @@ impl GenericListArray { /// /// * `offsets.len() - 1 != nulls.len()` /// * `offsets.last() > values.len()` - /// * `!field.is_nullable() && values.null_count() != 0` + /// * `!field.is_nullable() && values.is_nullable()` /// * `field.data_type() != values.data_type()` pub fn try_new( field: FieldRef, @@ -189,7 +189,7 @@ impl GenericListArray { ))); } } - if !field.is_nullable() && values.null_count() != 0 { + if !field.is_nullable() && values.is_nullable() { return Err(ArrowError::InvalidArgumentError(format!( "Non-nullable field of {}ListArray {:?} cannot contain nulls", OffsetSize::PREFIX, diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index 0157279dfe49..79240d105a44 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -173,12 +173,33 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// ``` fn offset(&self) -> usize; - /// Returns the null buffers of this array if any + /// Returns the null buffer of this array if any + /// + /// Note: some arrays can encode their nullability in their children, for example, + /// [`DictionaryArray::values`] values or [`RunArray::values`], or without a null buffer, + /// such as [`NullArray`]. Use [`Array::logical_nulls`] to obtain a computed mask encoding this fn nulls(&self) -> Option<&NullBuffer>; + /// Returns the logical null buffer of this array if any + /// + /// In most cases this will be the same as [`Array::nulls`], except for: + /// + /// * DictionaryArray where [`DictionaryArray::values`] contains nulls + /// * RunArray where [`RunArray::values`] contains nulls + /// * NullArray where all indices are nulls + /// + /// In these cases a logical [`NullBuffer`] will be computed, encoding the logical nullability + /// of these arrays, beyond what is encoded in [`Array::nulls`] + fn logical_nulls(&self) -> Option { + self.nulls().cloned() + } + /// Returns whether the element at `index` is null. /// When using this function on a slice, the index is relative to the slice. /// + /// Note: this method returns the physical nullability, i.e. that encoded in [`Array::nulls`] + /// see [`Array::logical_nulls`] for logical nullability + /// /// # Example: /// /// ``` @@ -196,6 +217,9 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// Returns whether the element at `index` is not null. /// When using this function on a slice, the index is relative to the slice. /// + /// Note: this method returns the physical nullability, i.e. that encoded in [`Array::nulls`] + /// see [`Array::logical_nulls`] for logical nullability + /// /// # Example: /// /// ``` @@ -210,7 +234,10 @@ pub trait Array: std::fmt::Debug + Send + Sync { !self.is_null(index) } - /// Returns the total number of null values in this array. + /// Returns the total number of physical null values in this array. + /// + /// Note: this method returns the physical null count, i.e. that encoded in [`Array::nulls`], + /// see [`Array::logical_nulls`] for logical nullability /// /// # Example: /// @@ -226,6 +253,19 @@ pub trait Array: std::fmt::Debug + Send + Sync { self.nulls().map(|n| n.null_count()).unwrap_or_default() } + /// Returns `false` if the array is guaranteed to not contain any logical nulls + /// + /// In general this will be equivalent to `Array::null_count() != 0` but may differ in the + /// presence of logical nullability, see [`Array::logical_nulls`]. + /// + /// Implementations will return `true` unless they can cheaply prove no logical nulls + /// are present. For example a [`DictionaryArray`] with nullable values will still return true, + /// even if the nulls present in [`DictionaryArray::values`] are not referenced by any key, + /// and therefore would not appear in [`Array::logical_nulls`]. + fn is_nullable(&self) -> bool { + self.null_count() != 0 + } + /// Returns the total number of bytes of memory pointed to by this array. /// The buffers store bytes in the Arrow memory format, and include the data as well as the validity map. fn get_buffer_memory_size(&self) -> usize; @@ -277,6 +317,10 @@ impl Array for ArrayRef { self.as_ref().nulls() } + fn logical_nulls(&self) -> Option { + self.as_ref().logical_nulls() + } + fn is_null(&self, index: usize) -> bool { self.as_ref().is_null(index) } @@ -289,6 +333,10 @@ impl Array for ArrayRef { self.as_ref().null_count() } + fn is_nullable(&self) -> bool { + self.as_ref().is_nullable() + } + fn get_buffer_memory_size(&self) -> usize { self.as_ref().get_buffer_memory_size() } @@ -335,6 +383,10 @@ impl<'a, T: Array> Array for &'a T { T::nulls(self) } + fn logical_nulls(&self) -> Option { + T::logical_nulls(self) + } + fn is_null(&self, index: usize) -> bool { T::is_null(self, index) } @@ -347,6 +399,10 @@ impl<'a, T: Array> Array for &'a T { T::null_count(self) } + fn is_nullable(&self) -> bool { + T::is_nullable(self) + } + fn get_buffer_memory_size(&self) -> usize { T::get_buffer_memory_size(self) } diff --git a/arrow-array/src/array/null_array.rs b/arrow-array/src/array/null_array.rs index c054c890431b..af3ec0b57d27 100644 --- a/arrow-array/src/array/null_array.rs +++ b/arrow-array/src/array/null_array.rs @@ -36,8 +36,10 @@ use std::sync::Arc; /// /// let array = NullArray::new(10); /// +/// assert!(array.is_nullable()); /// assert_eq!(array.len(), 10); -/// assert_eq!(array.null_count(), 10); +/// assert_eq!(array.null_count(), 0); +/// assert_eq!(array.logical_nulls().unwrap().null_count(), 10); /// ``` #[derive(Clone)] pub struct NullArray { @@ -107,22 +109,12 @@ impl Array for NullArray { None } - /// Returns whether the element at `index` is null. - /// All elements of a `NullArray` are always null. - fn is_null(&self, _index: usize) -> bool { - true + fn logical_nulls(&self) -> Option { + (self.len != 0).then(|| NullBuffer::new_null(self.len)) } - /// Returns whether the element at `index` is valid. - /// All elements of a `NullArray` are always invalid. - fn is_valid(&self, _index: usize) -> bool { - false - } - - /// Returns the total number of null values in this array. - /// The null count of a `NullArray` always equals its length. - fn null_count(&self) -> usize { - self.len() + fn is_nullable(&self) -> bool { + !self.is_empty() } fn get_buffer_memory_size(&self) -> usize { @@ -176,8 +168,10 @@ mod tests { let null_arr = NullArray::new(32); assert_eq!(null_arr.len(), 32); - assert_eq!(null_arr.null_count(), 32); - assert!(!null_arr.is_valid(0)); + assert_eq!(null_arr.null_count(), 0); + assert_eq!(null_arr.logical_nulls().unwrap().null_count(), 32); + assert!(null_arr.is_valid(0)); + assert!(null_arr.is_nullable()); } #[test] @@ -186,7 +180,10 @@ mod tests { let array2 = array1.slice(8, 16); assert_eq!(array2.len(), 16); - assert_eq!(array2.null_count(), 16); + assert_eq!(array2.null_count(), 0); + assert_eq!(array2.logical_nulls().unwrap().null_count(), 16); + assert!(array2.is_valid(0)); + assert!(array2.is_nullable()); } #[test] diff --git a/arrow-array/src/array/run_array.rs b/arrow-array/src/array/run_array.rs index 820d5c9ebfc1..30cefaeb4d46 100644 --- a/arrow-array/src/array/run_array.rs +++ b/arrow-array/src/array/run_array.rs @@ -18,7 +18,7 @@ use std::any::Any; use std::sync::Arc; -use arrow_buffer::{ArrowNativeType, NullBuffer, RunEndBuffer}; +use arrow_buffer::{ArrowNativeType, BooleanBufferBuilder, NullBuffer, RunEndBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType, Field}; @@ -349,6 +349,43 @@ impl Array for RunArray { None } + fn logical_nulls(&self) -> Option { + let len = self.len(); + let nulls = self.values.logical_nulls()?; + let mut out = BooleanBufferBuilder::new(len); + let offset = self.run_ends.offset(); + let mut valid_start = 0; + let mut last_end = 0; + for (idx, end) in self.run_ends.values().iter().enumerate() { + let end = end.as_usize(); + if end < offset { + continue; + } + let end = (end - offset).min(len); + if nulls.is_null(idx) { + if valid_start < last_end { + out.append_n(last_end - valid_start, true); + } + out.append_n(end - last_end, false); + valid_start = end; + } + last_end = end; + if end == len { + break; + } + } + if valid_start < len { + out.append_n(len - valid_start, true) + } + // Sanity check + assert_eq!(out.len(), len); + Some(out.finish().into()) + } + + fn is_nullable(&self) -> bool { + !self.is_empty() && self.values.is_nullable() + } + fn get_buffer_memory_size(&self) -> usize { self.run_ends.inner().inner().capacity() + self.values.get_buffer_memory_size() } @@ -569,6 +606,14 @@ impl<'a, R: RunEndIndexType, V: Sync> Array for TypedRunArray<'a, R, V> { self.run_array.nulls() } + fn logical_nulls(&self) -> Option { + self.run_array.logical_nulls() + } + + fn is_nullable(&self) -> bool { + self.run_array.is_nullable() + } + fn get_buffer_memory_size(&self) -> usize { self.run_array.get_buffer_memory_size() } @@ -1041,4 +1086,26 @@ mod tests { ); } } + + #[test] + fn test_logical_nulls() { + let run = Int32Array::from(vec![3, 6, 9, 12]); + let values = Int32Array::from(vec![Some(0), None, Some(1), None]); + let array = RunArray::try_new(&run, &values).unwrap(); + + let expected = vec![ + true, true, true, false, false, false, true, true, true, false, false, false, + ]; + + let n = array.logical_nulls().unwrap(); + assert_eq!(n.null_count(), 6); + + let slices = [(0, 12), (0, 2), (2, 5), (3, 0), (3, 3), (3, 4), (4, 8)]; + for (offset, length) in slices { + let a = array.slice(offset, length); + let n = a.logical_nulls().unwrap(); + let n = n.into_iter().collect::>(); + assert_eq!(&n, &expected[offset..offset + length], "{offset} {length}"); + } + } } diff --git a/arrow-array/src/array/struct_array.rs b/arrow-array/src/array/struct_array.rs index 1a79ebd95f37..284c3b26a946 100644 --- a/arrow-array/src/array/struct_array.rs +++ b/arrow-array/src/array/struct_array.rs @@ -143,15 +143,14 @@ impl StructArray { ))); } - if let Some(a) = a.nulls() { - let nulls_valid = f.is_nullable() - || nulls.as_ref().map(|n| n.contains(a)).unwrap_or_default(); - - if !nulls_valid { - return Err(ArrowError::InvalidArgumentError(format!( - "Found unmasked nulls for non-nullable StructArray field {:?}", - f.name() - ))); + if !f.is_nullable() { + if let Some(a) = a.logical_nulls() { + if !nulls.as_ref().map(|n| n.contains(&a)).unwrap_or_default() { + return Err(ArrowError::InvalidArgumentError(format!( + "Found unmasked nulls for non-nullable StructArray field {:?}", + f.name() + ))); + } } } } @@ -314,7 +313,7 @@ impl TryFrom> for StructArray { .into_iter() .map(|(name, array)| { ( - Field::new(name, array.data_type().clone(), array.nulls().is_some()), + Field::new(name, array.data_type().clone(), array.is_nullable()), array, ) }) diff --git a/arrow-array/src/builder/null_builder.rs b/arrow-array/src/builder/null_builder.rs index 94cb7f5cc281..53a6b103d541 100644 --- a/arrow-array/src/builder/null_builder.rs +++ b/arrow-array/src/builder/null_builder.rs @@ -40,7 +40,7 @@ use std::sync::Arc; /// let arr = b.finish(); /// /// assert_eq!(8, arr.len()); -/// assert_eq!(8, arr.null_count()); +/// assert_eq!(0, arr.null_count()); /// ``` #[derive(Debug)] pub struct NullBuilder { @@ -160,7 +160,8 @@ mod tests { let arr = builder.finish(); assert_eq!(20, arr.len()); assert_eq!(0, arr.offset()); - assert_eq!(20, arr.null_count()); + assert_eq!(0, arr.null_count()); + assert!(arr.is_nullable()); } #[test] @@ -170,10 +171,10 @@ mod tests { builder.append_empty_value(); builder.append_empty_values(3); let mut array = builder.finish_cloned(); - assert_eq!(21, array.null_count()); + assert_eq!(21, array.len()); builder.append_empty_values(5); array = builder.finish(); - assert_eq!(26, array.null_count()); + assert_eq!(26, array.len()); } } diff --git a/arrow-array/src/iterator.rs b/arrow-array/src/iterator.rs index 86f5d991288a..a198332ca5b5 100644 --- a/arrow-array/src/iterator.rs +++ b/arrow-array/src/iterator.rs @@ -22,6 +22,7 @@ use crate::array::{ GenericListArray, GenericStringArray, PrimitiveArray, }; use crate::{FixedSizeListArray, MapArray}; +use arrow_buffer::NullBuffer; /// An iterator that returns Some(T) or None, that can be used on any [`ArrayAccessor`] /// @@ -46,6 +47,7 @@ use crate::{FixedSizeListArray, MapArray}; #[derive(Debug)] pub struct ArrayIter { array: T, + logical_nulls: Option, current: usize, current_end: usize, } @@ -54,12 +56,22 @@ impl ArrayIter { /// create a new iterator pub fn new(array: T) -> Self { let len = array.len(); + let logical_nulls = array.logical_nulls(); ArrayIter { array, + logical_nulls, current: 0, current_end: len, } } + + #[inline] + fn is_null(&self, idx: usize) -> bool { + self.logical_nulls + .as_ref() + .map(|x| x.is_null(idx)) + .unwrap_or_default() + } } impl Iterator for ArrayIter { @@ -69,7 +81,7 @@ impl Iterator for ArrayIter { fn next(&mut self) -> Option { if self.current == self.current_end { None - } else if self.array.is_null(self.current) { + } else if self.is_null(self.current) { self.current += 1; Some(None) } else { @@ -98,7 +110,7 @@ impl DoubleEndedIterator for ArrayIter { None } else { self.current_end -= 1; - Some(if self.array.is_null(self.current_end) { + Some(if self.is_null(self.current_end) { None } else { // Safety: diff --git a/arrow-buffer/src/builder/boolean.rs b/arrow-buffer/src/builder/boolean.rs index f84cfa79c2dc..f0e7f0f13670 100644 --- a/arrow-buffer/src/builder/boolean.rs +++ b/arrow-buffer/src/builder/boolean.rs @@ -203,6 +203,12 @@ impl BooleanBufferBuilder { ); } + /// Append [`BooleanBuffer`] to this [`BooleanBufferBuilder`] + pub fn append_buffer(&mut self, buffer: &BooleanBuffer) { + let range = buffer.offset()..buffer.offset() + buffer.len(); + self.append_packed_range(range, buffer.values()) + } + /// Returns the packed bits pub fn as_slice(&self) -> &[u8] { self.buffer.as_slice() diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index c7fd082de2e6..a08a7a4fd413 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -7233,9 +7233,8 @@ mod tests { assert_eq!(array.data_type(), &data_type); let cast_array = cast(&array, &DataType::Null).expect("cast failed"); assert_eq!(cast_array.data_type(), &DataType::Null); - for i in 0..4 { - assert!(cast_array.is_null(i)); - } + assert_eq!(cast_array.len(), 4); + assert_eq!(cast_array.logical_nulls().unwrap().null_count(), 4); } #[test] diff --git a/arrow-ord/src/comparison.rs b/arrow-ord/src/comparison.rs index d18b0e36e930..21583fac08ff 100644 --- a/arrow-ord/src/comparison.rs +++ b/arrow-ord/src/comparison.rs @@ -6352,4 +6352,18 @@ mod tests { .to_string() .contains("Could not convert ToType with to_i128")); } + + #[test] + #[cfg(feature = "dyn_cmp_dict")] + fn test_dictionary_nested_nulls() { + let keys = Int32Array::from(vec![0, 1, 2]); + let v1 = Arc::new(Int32Array::from(vec![Some(0), None, Some(2)])); + let a = DictionaryArray::new(keys.clone(), v1); + let v2 = Arc::new(Int32Array::from(vec![None, Some(0), Some(2)])); + let b = DictionaryArray::new(keys, v2); + + let r = eq_dyn(&a, &b).unwrap(); + assert_eq!(r.null_count(), 2); + assert!(r.is_valid(2)); + } } diff --git a/arrow-ord/src/sort.rs b/arrow-ord/src/sort.rs index 648a7d7afcca..87858630599f 100644 --- a/arrow-ord/src/sort.rs +++ b/arrow-ord/src/sort.rs @@ -719,19 +719,19 @@ where } } -type LexicographicalCompareItem<'a> = ( - Option<&'a NullBuffer>, // nulls - DynComparator, // comparator - SortOptions, // sort_option +type LexicographicalCompareItem = ( + Option, // nulls + DynComparator, // comparator + SortOptions, // sort_option ); /// A lexicographical comparator that wraps given array data (columns) and can lexicographically compare data /// at given two indices. The lifetime is the same at the data wrapped. -pub struct LexicographicalComparator<'a> { - compare_items: Vec>, +pub struct LexicographicalComparator { + compare_items: Vec, } -impl LexicographicalComparator<'_> { +impl LexicographicalComparator { /// lexicographically compare values at the wrapped columns with given indices. pub fn compare(&self, a_idx: usize, b_idx: usize) -> Ordering { for (nulls, comparator, sort_option) in &self.compare_items { @@ -780,14 +780,14 @@ impl LexicographicalComparator<'_> { /// results with two indices. pub fn try_new( columns: &[SortColumn], - ) -> Result, ArrowError> { + ) -> Result { let compare_items = columns .iter() .map(|column| { // flatten and convert build comparators let values = column.values.as_ref(); Ok(( - values.nulls(), + values.logical_nulls(), build_compare(values, values)?, column.options.unwrap_or_default(), )) @@ -4016,4 +4016,30 @@ mod tests { vec![None, None, None, Some(5.1), Some(5.1), Some(3.0), Some(1.2)], ); } + + #[test] + fn test_lexicographic_comparator_null_dict_values() { + let values = Int32Array::new( + vec![1, 2, 3, 4].into(), + Some(NullBuffer::from(vec![true, false, false, true])), + ); + let keys = Int32Array::new( + vec![0, 1, 53, 3].into(), + Some(NullBuffer::from(vec![true, true, false, true])), + ); + // [1, NULL, NULL, 4] + let dict = DictionaryArray::new(keys, Arc::new(values)); + + let comparator = LexicographicalComparator::try_new(&[SortColumn { + values: Arc::new(dict), + options: None, + }]) + .unwrap(); + // 1.cmp(NULL) + assert_eq!(comparator.compare(0, 1), Ordering::Greater); + // NULL.cmp(NULL) + assert_eq!(comparator.compare(2, 1), Ordering::Equal); + // NULL.cmp(4) + assert_eq!(comparator.compare(2, 3), Ordering::Less); + } } diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs index 1223280e3769..9d3abea66fb1 100644 --- a/arrow-string/src/like.rs +++ b/arrow-string/src/like.rs @@ -581,7 +581,10 @@ where )); } - let nulls = NullBuffer::union(left.nulls(), right.nulls()); + let nulls = NullBuffer::union( + left.logical_nulls().as_ref(), + right.logical_nulls().as_ref(), + ); let mut result = BooleanBufferBuilder::new(left.len()); for i in 0..left.len() { diff --git a/parquet/src/arrow/arrow_writer/levels.rs b/parquet/src/arrow/arrow_writer/levels.rs index 47b01890301e..48615dc3d599 100644 --- a/parquet/src/arrow/arrow_writer/levels.rs +++ b/parquet/src/arrow/arrow_writer/levels.rs @@ -494,7 +494,7 @@ impl LevelInfoBuilder { def_levels.reserve(len); info.non_null_indices.reserve(len); - match array.nulls() { + match array.logical_nulls() { Some(nulls) => { // TODO: Faster bitmask iteration (#1757) for i in range { @@ -1751,7 +1751,6 @@ mod tests { builder.write(&a, 0..4); let levels = builder.finish(); - let list_level = levels.get(0).unwrap(); let expected_level = LevelInfo { def_levels: Some(vec![5, 4, 5, 2, 5, 3, 5, 5, 4, 4, 0]), rep_levels: Some(vec![0, 2, 2, 1, 0, 1, 0, 2, 1, 2, 0]), @@ -1760,6 +1759,35 @@ mod tests { max_rep_level: 2, }; - assert_eq!(list_level, &expected_level); + assert_eq!(levels[0], expected_level); + } + + #[test] + fn test_null_dictionary_values() { + let values = Int32Array::new( + vec![1, 2, 3, 4].into(), + Some(NullBuffer::from(vec![true, false, true, true])), + ); + let keys = Int32Array::new( + vec![1, 54, 2, 0].into(), + Some(NullBuffer::from(vec![true, false, true, true])), + ); + // [NULL, NULL, 3, 0] + let dict = DictionaryArray::new(keys, Arc::new(values)); + + let item_field = Field::new("item", dict.data_type().clone(), true); + + let mut builder = + LevelInfoBuilder::try_new(&item_field, Default::default()).unwrap(); + builder.write(&dict, 0..4); + let levels = builder.finish(); + let expected_level = LevelInfo { + def_levels: Some(vec![0, 0, 1, 1]), + rep_levels: None, + non_null_indices: vec![2, 3], + max_def_level: 1, + max_rep_level: 0, + }; + assert_eq!(levels[0], expected_level); } } diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index d3d4e2626fe3..c4d174b6adc1 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -1965,7 +1965,7 @@ mod tests { assert_eq!(a.value(0).len(), 0); assert_eq!(a.value(2).len(), 2); - assert_eq!(a.value(2).null_count(), 2); + assert_eq!(a.value(2).logical_nulls().unwrap().null_count(), 2); let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a)]).unwrap(); roundtrip(batch, None);