Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve Array Logical Nullability #4691

Merged
merged 1 commit into from
Aug 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions arrow-arith/src/arity.rs
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ where
return Ok(PrimitiveArray::from(ArrayData::new_empty(&O::DATA_TYPE)));
}

let nulls = NullBuffer::union(a.nulls(), b.nulls());
let nulls = NullBuffer::union(a.logical_nulls().as_ref(), b.logical_nulls().as_ref());

let values = a.values().iter().zip(b.values()).map(|(l, r)| op(*l, *r));
// JUSTIFICATION
Expand Down Expand Up @@ -248,7 +248,7 @@ where
))));
}

let nulls = NullBuffer::union(a.nulls(), b.nulls());
let nulls = NullBuffer::union(a.logical_nulls().as_ref(), b.logical_nulls().as_ref());

let mut builder = a.into_builder()?;

Expand Down Expand Up @@ -296,7 +296,9 @@ where
if a.null_count() == 0 && b.null_count() == 0 {
try_binary_no_nulls(len, a, b, op)
} else {
let nulls = NullBuffer::union(a.nulls(), b.nulls()).unwrap();
let nulls =
NullBuffer::union(a.logical_nulls().as_ref(), b.logical_nulls().as_ref())
.unwrap();

let mut buffer = BufferBuilder::<O::Native>::new(len);
buffer.append_n_zeroed(len);
Expand Down Expand Up @@ -355,7 +357,10 @@ where
if a.null_count() == 0 && b.null_count() == 0 {
try_binary_no_nulls_mut(len, a, b, op)
} else {
let nulls = NullBuffer::union(a.nulls(), b.nulls()).unwrap();
let nulls =
NullBuffer::union(a.logical_nulls().as_ref(), b.logical_nulls().as_ref())
.unwrap();

let mut builder = a.into_builder()?;

let slice = builder.values_slice_mut();
Expand Down
14 changes: 3 additions & 11 deletions arrow-arith/src/boolean.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
use arrow_array::*;
use arrow_buffer::buffer::{bitwise_bin_op_helper, bitwise_quaternary_op_helper};
use arrow_buffer::{BooleanBuffer, NullBuffer};
use arrow_schema::{ArrowError, DataType};
use arrow_schema::ArrowError;

/// Logical 'and' boolean values with Kleene logic
///
Expand Down Expand Up @@ -311,11 +311,7 @@ pub fn not(left: &BooleanArray) -> Result<BooleanArray, ArrowError> {
/// assert_eq!(a_is_null, BooleanArray::from(vec![false, false, true]));
/// ```
pub fn is_null(input: &dyn Array) -> Result<BooleanArray, ArrowError> {
let values = match input.nulls() {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We no longer need to special case NullArray, whilst also now correctly handling DictionaryArray with value nulls

// NullArray has no nulls buffer yet all values are null
None if input.data_type() == &DataType::Null => {
BooleanBuffer::new_set(input.len())
}
let values = match input.logical_nulls() {
None => BooleanBuffer::new_unset(input.len()),
Some(nulls) => !nulls.inner(),
};
Expand All @@ -335,11 +331,7 @@ pub fn is_null(input: &dyn Array) -> Result<BooleanArray, ArrowError> {
/// assert_eq!(a_is_not_null, BooleanArray::from(vec![true, true, false]));
/// ```
pub fn is_not_null(input: &dyn Array) -> Result<BooleanArray, ArrowError> {
let values = match input.nulls() {
// NullArray has no nulls buffer yet all values are null
None if input.data_type() == &DataType::Null => {
BooleanBuffer::new_unset(input.len())
}
let values = match input.logical_nulls() {
None => BooleanBuffer::new_set(input.len()),
Some(n) => n.inner().clone(),
};
Expand Down
7 changes: 5 additions & 2 deletions arrow-array/src/array/boolean_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ impl BooleanArray {
where
F: FnMut(T::Item) -> bool,
{
let nulls = left.nulls().cloned();
let nulls = left.logical_nulls();
let values = BooleanBuffer::collect_bool(left.len(), |i| unsafe {
// SAFETY: i in range 0..len
op(left.value_unchecked(i))
Expand Down Expand Up @@ -239,7 +239,10 @@ impl BooleanArray {
{
assert_eq!(left.len(), right.len());

let nulls = NullBuffer::union(left.nulls(), right.nulls());
let nulls = NullBuffer::union(
left.logical_nulls().as_ref(),
right.logical_nulls().as_ref(),
);
let values = BooleanBuffer::collect_bool(left.len(), |i| unsafe {
// SAFETY: i in range 0..len
op(left.value_unchecked(i), right.value_unchecked(i))
Expand Down
49 changes: 49 additions & 0 deletions arrow-array/src/array/dictionary_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -729,6 +729,31 @@ impl<T: ArrowDictionaryKeyType> Array for DictionaryArray<T> {
self.keys.nulls()
}

fn logical_nulls(&self) -> Option<NullBuffer> {
match self.values.nulls() {
None => self.nulls().cloned(),
Some(value_nulls) => {
let mut builder = BooleanBufferBuilder::new(self.len());
match self.keys.nulls() {
Some(n) => builder.append_buffer(n.inner()),
None => builder.append_n(self.len(), true),
}
for (idx, k) in self.keys.values().iter().enumerate() {
let k = k.as_usize();
// Check range to allow for nulls
if k < value_nulls.len() && value_nulls.is_null(k) {
builder.set_bit(idx, false);
}
}
Some(builder.finish().into())
}
}
}

fn is_nullable(&self) -> bool {
!self.is_empty() && (self.nulls().is_some() || self.values.is_nullable())
}

fn get_buffer_memory_size(&self) -> usize {
self.keys.get_buffer_memory_size() + self.values.get_buffer_memory_size()
}
Expand Down Expand Up @@ -843,6 +868,14 @@ impl<'a, K: ArrowDictionaryKeyType, V: Sync> Array for TypedDictionaryArray<'a,
self.dictionary.nulls()
}

fn logical_nulls(&self) -> Option<NullBuffer> {
self.dictionary.logical_nulls()
}

fn is_nullable(&self) -> bool {
self.dictionary.is_nullable()
}

fn get_buffer_memory_size(&self) -> usize {
self.dictionary.get_buffer_memory_size()
}
Expand Down Expand Up @@ -1253,4 +1286,20 @@ mod tests {
assert_eq!(v, expected, "{idx}");
}
}

#[test]
fn test_iterator_nulls() {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Test for #4616

let keys = Int32Array::new(
vec![0, 700, 1, 2].into(),
Some(NullBuffer::from(vec![true, false, true, true])),
);
let values = Int32Array::from(vec![Some(50), None, Some(2)]);
let dict = DictionaryArray::new(keys, Arc::new(values));
let values: Vec<_> = dict
.downcast_dict::<Int32Array>()
.unwrap()
.into_iter()
.collect();
assert_eq!(values, &[Some(50), None, None, Some(2)])
}
}
6 changes: 3 additions & 3 deletions arrow-array/src/array/fixed_size_list_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ impl FixedSizeListArray {
/// * `size < 0`
/// * `values.len() / size != nulls.len()`
/// * `values.data_type() != field.data_type()`
/// * `!field.is_nullable() && !nulls.expand(size).contains(values.nulls())`
/// * `!field.is_nullable() && !nulls.expand(size).contains(values.logical_nulls())`
pub fn try_new(
field: FieldRef,
size: i32,
Expand Down Expand Up @@ -181,11 +181,11 @@ impl FixedSizeListArray {
)));
}

if let Some(a) = values.nulls() {
if let Some(a) = values.logical_nulls() {
let nulls_valid = field.is_nullable()
|| nulls
.as_ref()
.map(|n| n.expand(size as _).contains(a))
.map(|n| n.expand(size as _).contains(&a))
.unwrap_or_default();

if !nulls_valid {
Expand Down
4 changes: 2 additions & 2 deletions arrow-array/src/array/list_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ impl<OffsetSize: OffsetSizeTrait> GenericListArray<OffsetSize> {
///
/// * `offsets.len() - 1 != nulls.len()`
/// * `offsets.last() > values.len()`
/// * `!field.is_nullable() && values.null_count() != 0`
/// * `!field.is_nullable() && values.is_nullable()`
/// * `field.data_type() != values.data_type()`
pub fn try_new(
field: FieldRef,
Expand Down Expand Up @@ -189,7 +189,7 @@ impl<OffsetSize: OffsetSizeTrait> GenericListArray<OffsetSize> {
)));
}
}
if !field.is_nullable() && values.null_count() != 0 {
if !field.is_nullable() && values.is_nullable() {
return Err(ArrowError::InvalidArgumentError(format!(
"Non-nullable field of {}ListArray {:?} cannot contain nulls",
OffsetSize::PREFIX,
Expand Down
60 changes: 58 additions & 2 deletions arrow-array/src/array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -173,12 +173,33 @@ pub trait Array: std::fmt::Debug + Send + Sync {
/// ```
fn offset(&self) -> usize;

/// Returns the null buffers of this array if any
/// Returns the null buffer of this array if any
///
/// Note: some arrays can encode their nullability in their children, for example,
/// [`DictionaryArray::values`] values or [`RunArray::values`], or without a null buffer,
/// such as [`NullArray`]. Use [`Array::logical_nulls`] to obtain a computed mask encoding this
fn nulls(&self) -> Option<&NullBuffer>;

/// Returns the logical null buffer of this array if any
///
/// In most cases this will be the same as [`Array::nulls`], except for:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think Array::nulls should be deprecated and a non-deprecated version physical_nulls should be introduced. Users will misuse nulls because:

  • most people will read over the small note within the doc string
  • nulls is easier to find than logical_nulls
  • there are two options of "which null", and we offer an implicit default (because it is not prefixed with a semantic) that is clearly the wrong one for many use cases

Copy link
Contributor Author

@tustvold tustvold Aug 14, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Whilst I do sort of agree, I'm not sure this wouldn't just cause more confusion, as this isn't a distinction found in the arrow spec. Would we also deprecate is_null, is_valid, etc... I think the current API hasn't been a major issue thus far, as the cases where it matters are rare.

FWIW once we have StringView I think there is basically no reason to use DictionaryArray or RunArray

///
/// * DictionaryArray where [`DictionaryArray::values`] contains nulls
/// * RunArray where [`RunArray::values`] contains nulls
/// * NullArray where all indices are nulls
///
/// In these cases a logical [`NullBuffer`] will be computed, encoding the logical nullability
/// of these arrays, beyond what is encoded in [`Array::nulls`]
fn logical_nulls(&self) -> Option<NullBuffer> {
self.nulls().cloned()
}

/// Returns whether the element at `index` is null.
/// When using this function on a slice, the index is relative to the slice.
///
/// Note: this method returns the physical nullability, i.e. that encoded in [`Array::nulls`]
/// see [`Array::logical_nulls`] for logical nullability
///
/// # Example:
///
/// ```
Expand All @@ -196,6 +217,9 @@ pub trait Array: std::fmt::Debug + Send + Sync {
/// Returns whether the element at `index` is not null.
/// When using this function on a slice, the index is relative to the slice.
///
/// Note: this method returns the physical nullability, i.e. that encoded in [`Array::nulls`]
/// see [`Array::logical_nulls`] for logical nullability
///
/// # Example:
///
/// ```
Expand All @@ -210,7 +234,10 @@ pub trait Array: std::fmt::Debug + Send + Sync {
!self.is_null(index)
}

/// Returns the total number of null values in this array.
/// Returns the total number of physical null values in this array.
///
/// Note: this method returns the physical null count, i.e. that encoded in [`Array::nulls`],
/// see [`Array::logical_nulls`] for logical nullability
///
/// # Example:
///
Expand All @@ -226,6 +253,19 @@ pub trait Array: std::fmt::Debug + Send + Sync {
self.nulls().map(|n| n.null_count()).unwrap_or_default()
}

/// Returns `false` if the array is guaranteed to not contain any logical nulls
///
/// In general this will be equivalent to `Array::null_count() != 0` but may differ in the
/// presence of logical nullability, see [`Array::logical_nulls`].
///
/// Implementations will return `true` unless they can cheaply prove no logical nulls
/// are present. For example a [`DictionaryArray`] with nullable values will still return true,
/// even if the nulls present in [`DictionaryArray::values`] are not referenced by any key,
/// and therefore would not appear in [`Array::logical_nulls`].
fn is_nullable(&self) -> bool {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I debated calling this method is_required instead, but figured it was better to stick with consistent terminology, and this matches the method on Field

self.null_count() != 0
}

/// Returns the total number of bytes of memory pointed to by this array.
/// The buffers store bytes in the Arrow memory format, and include the data as well as the validity map.
fn get_buffer_memory_size(&self) -> usize;
Expand Down Expand Up @@ -277,6 +317,10 @@ impl Array for ArrayRef {
self.as_ref().nulls()
}

fn logical_nulls(&self) -> Option<NullBuffer> {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I considered having this eagerly constructed and stored on the arrays, but I felt this would be more surprising as it would make the arrays diverge from the arrow specification. Whilst this does raise the prospect of computing this NullBuffer multiple times, the cases where this is required are fairly rare. We can always optimise in future should this become a problem

self.as_ref().logical_nulls()
}

fn is_null(&self, index: usize) -> bool {
self.as_ref().is_null(index)
}
Expand All @@ -289,6 +333,10 @@ impl Array for ArrayRef {
self.as_ref().null_count()
}

fn is_nullable(&self) -> bool {
self.as_ref().is_nullable()
}

fn get_buffer_memory_size(&self) -> usize {
self.as_ref().get_buffer_memory_size()
}
Expand Down Expand Up @@ -335,6 +383,10 @@ impl<'a, T: Array> Array for &'a T {
T::nulls(self)
}

fn logical_nulls(&self) -> Option<NullBuffer> {
T::logical_nulls(self)
}

fn is_null(&self, index: usize) -> bool {
T::is_null(self, index)
}
Expand All @@ -347,6 +399,10 @@ impl<'a, T: Array> Array for &'a T {
T::null_count(self)
}

fn is_nullable(&self) -> bool {
T::is_nullable(self)
}

fn get_buffer_memory_size(&self) -> usize {
T::get_buffer_memory_size(self)
}
Expand Down
33 changes: 15 additions & 18 deletions arrow-array/src/array/null_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,10 @@ use std::sync::Arc;
///
/// let array = NullArray::new(10);
///
/// assert!(array.is_nullable());
/// assert_eq!(array.len(), 10);
/// assert_eq!(array.null_count(), 10);
/// assert_eq!(array.null_count(), 0);
/// assert_eq!(array.logical_nulls().unwrap().null_count(), 10);
/// ```
#[derive(Clone)]
pub struct NullArray {
Expand Down Expand Up @@ -107,22 +109,12 @@ impl Array for NullArray {
None
}

/// Returns whether the element at `index` is null.
/// All elements of a `NullArray` are always null.
fn is_null(&self, _index: usize) -> bool {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a breaking change, although I expect the impact to minimal. I debated making is_null and friends always return logical nullability, but this would have been inconsistent with ArrayData and generally seemed like it might just cause more confusion

true
fn logical_nulls(&self) -> Option<NullBuffer> {
(self.len != 0).then(|| NullBuffer::new_null(self.len))
}

/// Returns whether the element at `index` is valid.
/// All elements of a `NullArray` are always invalid.
fn is_valid(&self, _index: usize) -> bool {
false
}

/// Returns the total number of null values in this array.
/// The null count of a `NullArray` always equals its length.
fn null_count(&self) -> usize {
self.len()
fn is_nullable(&self) -> bool {
!self.is_empty()
}

fn get_buffer_memory_size(&self) -> usize {
Expand Down Expand Up @@ -176,8 +168,10 @@ mod tests {
let null_arr = NullArray::new(32);

assert_eq!(null_arr.len(), 32);
assert_eq!(null_arr.null_count(), 32);
assert!(!null_arr.is_valid(0));
assert_eq!(null_arr.null_count(), 0);
assert_eq!(null_arr.logical_nulls().unwrap().null_count(), 32);
assert!(null_arr.is_valid(0));
assert!(null_arr.is_nullable());
}

#[test]
Expand All @@ -186,7 +180,10 @@ mod tests {

let array2 = array1.slice(8, 16);
assert_eq!(array2.len(), 16);
assert_eq!(array2.null_count(), 16);
assert_eq!(array2.null_count(), 0);
assert_eq!(array2.logical_nulls().unwrap().null_count(), 16);
assert!(array2.is_valid(0));
assert!(array2.is_nullable());
}

#[test]
Expand Down
Loading