-
Notifications
You must be signed in to change notification settings - Fork 847
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Improve Array
Logical Nullability
#4691
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -729,6 +729,31 @@ impl<T: ArrowDictionaryKeyType> Array for DictionaryArray<T> { | |
self.keys.nulls() | ||
} | ||
|
||
/// Computes the logical null mask of this dictionary: a position is null when its key is | ||
/// null or when its key references a null entry in the values | ||
/// | ||
/// When the values have no null buffer this is just a clone of [`Array::nulls`] | ||
fn logical_nulls(&self) -> Option<NullBuffer> { | ||
match self.values.nulls() { | ||
// The values carry no null buffer, so only the physical nulls apply | ||
None => self.nulls().cloned(), | ||
Some(value_nulls) => { | ||
let mut builder = BooleanBufferBuilder::new(self.len()); | ||
// Seed the mask with the key validity, or all-valid if the keys have no null buffer | ||
match self.keys.nulls() { | ||
Some(n) => builder.append_buffer(n.inner()), | ||
None => builder.append_n(self.len(), true), | ||
} | ||
// Clear the bit of every position whose key points at a null value | ||
for (idx, k) in self.keys.values().iter().enumerate() { | ||
let k = k.as_usize(); | ||
// Slots behind a null key may hold arbitrary, out-of-range values, so | ||
// bounds-check before consulting `value_nulls`; such slots are already null | ||
if k < value_nulls.len() && value_nulls.is_null(k) { | ||
builder.set_bit(idx, false); | ||
} | ||
} | ||
Some(builder.finish().into()) | ||
} | ||
} | ||
} | ||
|
||
/// Returns `false` when the array is empty, or when there is no physical null buffer and the | ||
/// values report themselves as not nullable; otherwise returns `true` | ||
/// | ||
/// This does not inspect whether any key actually references a null value | ||
fn is_nullable(&self) -> bool { | ||
!self.is_empty() && (self.nulls().is_some() || self.values.is_nullable()) | ||
} | ||
|
||
/// Returns the sum of the buffer memory sizes reported by the keys and by the values | ||
fn get_buffer_memory_size(&self) -> usize { | ||
self.keys.get_buffer_memory_size() + self.values.get_buffer_memory_size() | ||
} | ||
|
@@ -843,6 +868,14 @@ impl<'a, K: ArrowDictionaryKeyType, V: Sync> Array for TypedDictionaryArray<'a, | |
self.dictionary.nulls() | ||
} | ||
|
||
/// Returns the logical null mask computed by the wrapped dictionary | ||
fn logical_nulls(&self) -> Option<NullBuffer> { | ||
self.dictionary.logical_nulls() | ||
} | ||
|
||
/// Returns whatever the wrapped dictionary reports for `is_nullable` | ||
fn is_nullable(&self) -> bool { | ||
self.dictionary.is_nullable() | ||
} | ||
|
||
/// Returns the buffer memory size reported by the wrapped dictionary | ||
fn get_buffer_memory_size(&self) -> usize { | ||
self.dictionary.get_buffer_memory_size() | ||
} | ||
|
@@ -1253,4 +1286,20 @@ mod tests { | |
assert_eq!(v, expected, "{idx}"); | ||
} | ||
} | ||
|
||
// Checks that iterating a typed dictionary yields `None` both for null keys and for keys | ||
// that reference a null value | ||
#[test] | ||
fn test_iterator_nulls() { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Test for #4616 |
||
// Key 700 is out of range for the three values, but its slot is masked out as null | ||
let keys = Int32Array::new( | ||
vec![0, 700, 1, 2].into(), | ||
Some(NullBuffer::from(vec![true, false, true, true])), | ||
); | ||
// The value at index 1 is null, so the position holding key 1 must iterate as `None` | ||
let values = Int32Array::from(vec![Some(50), None, Some(2)]); | ||
let dict = DictionaryArray::new(keys, Arc::new(values)); | ||
let values: Vec<_> = dict | ||
.downcast_dict::<Int32Array>() | ||
.unwrap() | ||
.into_iter() | ||
.collect(); | ||
// Expected: valid key, null key, key referencing a null value, valid key | ||
assert_eq!(values, &[Some(50), None, None, Some(2)]) | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -173,12 +173,33 @@ pub trait Array: std::fmt::Debug + Send + Sync { | |
/// ``` | ||
fn offset(&self) -> usize; | ||
|
||
/// Returns the null buffers of this array if any | ||
/// Returns the null buffer of this array if any | ||
/// | ||
/// Note: some arrays can encode their nullability in their children, for example, | ||
/// [`DictionaryArray::values`] values or [`RunArray::values`], or without a null buffer, | ||
/// such as [`NullArray`]. Use [`Array::logical_nulls`] to obtain a computed mask encoding this | ||
fn nulls(&self) -> Option<&NullBuffer>; | ||
|
||
/// Returns the logical null buffer of this array if any | ||
/// | ||
/// In most cases this will be the same as [`Array::nulls`], except for: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Whilst I do sort of agree, I'm concerned this would just cause more confusion, as this isn't a distinction found in the arrow spec. Would we also deprecate is_null, is_valid, etc.? I think the current API hasn't been a major issue thus far, as the cases where it matters are rare. FWIW once we have StringView I think there is basically no reason to use DictionaryArray or RunArray |
||
/// | ||
/// * DictionaryArray where [`DictionaryArray::values`] contains nulls | ||
/// * RunArray where [`RunArray::values`] contains nulls | ||
/// * NullArray where all indices are nulls | ||
/// | ||
/// In these cases a logical [`NullBuffer`] will be computed, encoding the logical nullability | ||
/// of these arrays, beyond what is encoded in [`Array::nulls`] | ||
fn logical_nulls(&self) -> Option<NullBuffer> { | ||
// Default implementation: return a clone of the physical null buffer, if any | ||
self.nulls().cloned() | ||
} | ||
|
||
/// Returns whether the element at `index` is null. | ||
/// When using this function on a slice, the index is relative to the slice. | ||
/// | ||
/// Note: this method returns the physical nullability, i.e. that encoded in [`Array::nulls`] | ||
/// see [`Array::logical_nulls`] for logical nullability | ||
/// | ||
/// # Example: | ||
/// | ||
/// ``` | ||
|
@@ -196,6 +217,9 @@ pub trait Array: std::fmt::Debug + Send + Sync { | |
/// Returns whether the element at `index` is not null. | ||
/// When using this function on a slice, the index is relative to the slice. | ||
/// | ||
/// Note: this method returns the physical nullability, i.e. that encoded in [`Array::nulls`] | ||
/// see [`Array::logical_nulls`] for logical nullability | ||
/// | ||
/// # Example: | ||
/// | ||
/// ``` | ||
|
@@ -210,7 +234,10 @@ pub trait Array: std::fmt::Debug + Send + Sync { | |
!self.is_null(index) | ||
} | ||
|
||
/// Returns the total number of null values in this array. | ||
/// Returns the total number of physical null values in this array. | ||
/// | ||
/// Note: this method returns the physical null count, i.e. that encoded in [`Array::nulls`], | ||
/// see [`Array::logical_nulls`] for logical nullability | ||
/// | ||
/// # Example: | ||
/// | ||
|
@@ -226,6 +253,19 @@ pub trait Array: std::fmt::Debug + Send + Sync { | |
self.nulls().map(|n| n.null_count()).unwrap_or_default() | ||
} | ||
|
||
/// Returns `false` if the array is guaranteed to not contain any logical nulls | ||
/// | ||
/// In general this will be equivalent to `Array::null_count() != 0` but may differ in the | ||
/// presence of logical nullability, see [`Array::logical_nulls`]. | ||
/// | ||
/// Implementations will return `true` unless they can cheaply prove no logical nulls | ||
/// are present. For example a [`DictionaryArray`] with nullable values will still return true, | ||
/// even if the nulls present in [`DictionaryArray::values`] are not referenced by any key, | ||
/// and therefore would not appear in [`Array::logical_nulls`]. | ||
fn is_nullable(&self) -> bool { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I debated calling this method is_required instead, but figured it was better to stick with consistent terminology, and this matches the method on |
||
// Default implementation: nullable exactly when the physical null count is non-zero | ||
self.null_count() != 0 | ||
} | ||
|
||
/// Returns the total number of bytes of memory pointed to by this array. | ||
/// The buffers store bytes in the Arrow memory format, and include the data as well as the validity map. | ||
fn get_buffer_memory_size(&self) -> usize; | ||
|
@@ -277,6 +317,10 @@ impl Array for ArrayRef { | |
self.as_ref().nulls() | ||
} | ||
|
||
fn logical_nulls(&self) -> Option<NullBuffer> { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I considered having this eagerly constructed and stored on the arrays, but I felt this would be more surprising as it would make the arrays diverge from the arrow specification. Whilst this does raise the prospect of computing this NullBuffer multiple times, the cases where this is required are fairly rare. We can always optimise in future should this become a problem |
||
// Forward to the array behind this reference | ||
self.as_ref().logical_nulls() | ||
} | ||
|
||
fn is_null(&self, index: usize) -> bool { | ||
// Forward to the array behind this reference | ||
self.as_ref().is_null(index) | ||
} | ||
|
@@ -289,6 +333,10 @@ impl Array for ArrayRef { | |
self.as_ref().null_count() | ||
} | ||
|
||
fn is_nullable(&self) -> bool { | ||
// Forward to the array behind this reference | ||
self.as_ref().is_nullable() | ||
} | ||
|
||
fn get_buffer_memory_size(&self) -> usize { | ||
// Forward to the array behind this reference | ||
self.as_ref().get_buffer_memory_size() | ||
} | ||
|
@@ -335,6 +383,10 @@ impl<'a, T: Array> Array for &'a T { | |
T::nulls(self) | ||
} | ||
|
||
fn logical_nulls(&self) -> Option<NullBuffer> { | ||
// Forward to the implementation of the referenced type `T` | ||
T::logical_nulls(self) | ||
} | ||
|
||
fn is_null(&self, index: usize) -> bool { | ||
// Forward to the implementation of the referenced type `T` | ||
T::is_null(self, index) | ||
} | ||
|
@@ -347,6 +399,10 @@ impl<'a, T: Array> Array for &'a T { | |
T::null_count(self) | ||
} | ||
|
||
fn is_nullable(&self) -> bool { | ||
// Forward to the implementation of the referenced type `T` | ||
T::is_nullable(self) | ||
} | ||
|
||
fn get_buffer_memory_size(&self) -> usize { | ||
// Forward to the implementation of the referenced type `T` | ||
T::get_buffer_memory_size(self) | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -36,8 +36,10 @@ use std::sync::Arc; | |
/// | ||
/// let array = NullArray::new(10); | ||
/// | ||
/// assert!(array.is_nullable()); | ||
/// assert_eq!(array.len(), 10); | ||
/// assert_eq!(array.null_count(), 10); | ||
/// assert_eq!(array.null_count(), 0); | ||
/// assert_eq!(array.logical_nulls().unwrap().null_count(), 10); | ||
/// ``` | ||
#[derive(Clone)] | ||
pub struct NullArray { | ||
|
@@ -107,22 +109,12 @@ impl Array for NullArray { | |
None | ||
} | ||
|
||
/// Returns whether the element at `index` is null. | ||
/// All elements of a `NullArray` are always null. | ||
fn is_null(&self, _index: usize) -> bool { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is a breaking change, although I expect the impact to be minimal. I debated making is_null and friends always return logical nullability, but this would have been inconsistent with ArrayData and generally seemed like it might just cause more confusion. |
||
true | ||
fn logical_nulls(&self) -> Option<NullBuffer> { | ||
(self.len != 0).then(|| NullBuffer::new_null(self.len)) | ||
} | ||
|
||
/// Returns whether the element at `index` is valid. | ||
/// All elements of a `NullArray` are always invalid. | ||
fn is_valid(&self, _index: usize) -> bool { | ||
false | ||
} | ||
|
||
/// Returns the total number of null values in this array. | ||
/// The null count of a `NullArray` always equals its length. | ||
fn null_count(&self) -> usize { | ||
self.len() | ||
fn is_nullable(&self) -> bool { | ||
!self.is_empty() | ||
} | ||
|
||
fn get_buffer_memory_size(&self) -> usize { | ||
|
@@ -176,8 +168,10 @@ mod tests { | |
let null_arr = NullArray::new(32); | ||
|
||
assert_eq!(null_arr.len(), 32); | ||
assert_eq!(null_arr.null_count(), 32); | ||
assert!(!null_arr.is_valid(0)); | ||
assert_eq!(null_arr.null_count(), 0); | ||
assert_eq!(null_arr.logical_nulls().unwrap().null_count(), 32); | ||
assert!(null_arr.is_valid(0)); | ||
assert!(null_arr.is_nullable()); | ||
} | ||
|
||
#[test] | ||
|
@@ -186,7 +180,10 @@ mod tests { | |
|
||
let array2 = array1.slice(8, 16); | ||
assert_eq!(array2.len(), 16); | ||
assert_eq!(array2.null_count(), 16); | ||
assert_eq!(array2.null_count(), 0); | ||
assert_eq!(array2.logical_nulls().unwrap().null_count(), 16); | ||
assert!(array2.is_valid(0)); | ||
assert!(array2.is_nullable()); | ||
} | ||
|
||
#[test] | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We no longer need to special case NullArray, whilst also now correctly handling DictionaryArray with value nulls