Skip to content

Commit

Permalink
refactor: Zero-Field Structs and DataFrame with Height Property (#19123)
Browse files Browse the repository at this point in the history
  • Loading branch information
coastalwhite authored Oct 11, 2024
1 parent 922b764 commit dbbd93f
Show file tree
Hide file tree
Showing 125 changed files with 1,155 additions and 649 deletions.
7 changes: 7 additions & 0 deletions crates/polars-arrow/src/array/growable/structure.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ use crate::bitmap::MutableBitmap;
/// Concrete [`Growable`] for the [`StructArray`].
///
/// Besides the child growables, this keeps its own running row count so the
/// resulting [`StructArray`] can be built with an explicit length instead of
/// deriving it from a child array.
pub struct GrowableStruct<'a> {
// Source arrays; `extend` picks one by index, and the dtype of the first
// one is reused for the output array.
arrays: Vec<&'a StructArray>,
// Number of rows accumulated so far. Starts at 0 and is bumped by every
// extend call; handed to `StructArray::new` as the output length.
length: usize,
// Validity being built, if any; `None` means no validity bitmap is tracked.
validity: Option<MutableBitmap>,
// Child growables that the extend paths forward to.
values: Vec<Box<dyn Growable<'a> + 'a>>,
}
Expand Down Expand Up @@ -48,6 +49,7 @@ impl<'a> GrowableStruct<'a> {

Self {
arrays,
length: 0,
values,
validity: prepare_validity(use_validity, capacity),
}
Expand All @@ -60,6 +62,7 @@ impl<'a> GrowableStruct<'a> {

StructArray::new(
self.arrays[0].dtype().clone(),
self.length,
values,
validity.map(|v| v.into()),
)
Expand All @@ -71,6 +74,8 @@ impl<'a> Growable<'a> for GrowableStruct<'a> {
let array = *self.arrays.get_unchecked_release(index);
extend_validity(&mut self.validity, array, start, len);

self.length += len;

if array.null_count() == 0 {
self.values
.iter_mut()
Expand All @@ -97,6 +102,7 @@ impl<'a> Growable<'a> for GrowableStruct<'a> {
if let Some(validity) = &mut self.validity {
validity.extend_constant(additional, false);
}
self.length += additional;
}

#[inline]
Expand All @@ -123,6 +129,7 @@ impl<'a> From<GrowableStruct<'a>> for StructArray {

StructArray::new(
val.arrays[0].dtype().clone(),
val.length,
values,
val.validity.map(|v| v.into()),
)
Expand Down
1 change: 1 addition & 0 deletions crates/polars-arrow/src/array/struct_/data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ impl Arrow2Arrow for StructArray {

Self {
dtype,
length: data.len(),
values: data.child_data().iter().map(from_data).collect(),
validity: data.nulls().map(|n| Bitmap::from_null_buffer(n.clone())),
}
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-arrow/src/array/struct_/ffi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,6 @@ impl<A: ffi::ArrowArrayRef> FromFfi<A> for StructArray {
})
.collect::<PolarsResult<Vec<Box<dyn Array>>>>()?;

Self::try_new(dtype, values, validity)
Self::try_new(dtype, len, values, validity)
}
}
63 changes: 32 additions & 31 deletions crates/polars-arrow/src/array/struct_/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ pub(super) mod fmt;
mod iterator;
mod mutable;
pub use mutable::*;
use polars_error::{polars_bail, PolarsResult};
use polars_error::{polars_bail, polars_ensure, PolarsResult};

use crate::compute::utils::combine_validities_and;

Expand All @@ -27,13 +27,15 @@ use crate::compute::utils::combine_validities_and;
/// Field::new("c".into(), ArrowDataType::Int32, false),
/// ];
///
/// let array = StructArray::new(ArrowDataType::Struct(fields), vec![boolean, int], None);
/// let array = StructArray::new(ArrowDataType::Struct(fields), 4, vec![boolean, int], None);
/// ```
#[derive(Clone)]
pub struct StructArray {
// Logical type; `try_new` extracts the struct fields from it and requires
// one child array per field.
dtype: ArrowDataType,
// invariant: each array has the same length
values: Vec<Box<dyn Array>>,
// invariant: for each v in values: length == v.len()
// Stored explicitly (rather than read from a child) and returned by `len()`.
length: usize,
// Optional validity; when present, `try_new` requires its length to equal `length`.
validity: Option<Bitmap>,
}

Expand All @@ -49,22 +51,17 @@ impl StructArray {
/// * the validity's length is not equal to the given `length`
pub fn try_new(
dtype: ArrowDataType,
length: usize,
values: Vec<Box<dyn Array>>,
validity: Option<Bitmap>,
) -> PolarsResult<Self> {
let fields = Self::try_get_fields(&dtype)?;
if fields.is_empty() {
assert!(values.is_empty(), "invalid struct");
assert_eq!(validity.map(|v| v.len()).unwrap_or(0), 0, "invalid struct");
return Ok(Self {
dtype,
values,
validity: None,
});
}
if fields.len() != values.len() {
polars_bail!(ComputeError:"a StructArray must have a number of fields in its DataType equal to the number of child values")
}

polars_ensure!(
fields.len() == values.len(),
ComputeError:
"a StructArray must have a number of fields in its DataType equal to the number of child values"
);

fields
.iter().map(|a| &a.dtype)
Expand All @@ -81,29 +78,29 @@ impl StructArray {
}
})?;

let len = values[0].len();
values
.iter()
.map(|a| a.len())
.map(|f| f.len())
.enumerate()
.try_for_each(|(index, a_len)| {
if a_len != len {
polars_bail!(ComputeError: "The children must have an equal number of values.
However, the values at index {index} have a length of {a_len}, which is different from values at index 0, {len}.")
.try_for_each(|(index, f_length)| {
if f_length != length {
polars_bail!(ComputeError: "The children must have the given number of values.
However, the values at index {index} have a length of {f_length}, which is different from given length {length}.")
} else {
Ok(())
}
})?;

if validity
.as_ref()
.map_or(false, |validity| validity.len() != len)
.map_or(false, |validity| validity.len() != length)
{
polars_bail!(ComputeError:"The validity length of a StructArray must match its number of elements")
}

Ok(Self {
dtype,
length,
values,
validity,
})
Expand All @@ -120,10 +117,11 @@ impl StructArray {
/// * the validity's length is not equal to the given `length`
pub fn new(
dtype: ArrowDataType,
length: usize,
values: Vec<Box<dyn Array>>,
validity: Option<Bitmap>,
) -> Self {
Self::try_new(dtype, values, validity).unwrap()
Self::try_new(dtype, length, values, validity).unwrap()
}

/// Creates an empty [`StructArray`].
Expand All @@ -133,7 +131,7 @@ impl StructArray {
.iter()
.map(|field| new_empty_array(field.dtype().clone()))
.collect();
Self::new(dtype, values, None)
Self::new(dtype, 0, values, None)
} else {
panic!("StructArray must be initialized with DataType::Struct");
}
Expand All @@ -146,7 +144,7 @@ impl StructArray {
.iter()
.map(|field| new_null_array(field.dtype().clone(), length))
.collect();
Self::new(dtype, values, Some(Bitmap::new_zeroed(length)))
Self::new(dtype, length, values, Some(Bitmap::new_zeroed(length)))
} else {
panic!("StructArray must be initialized with DataType::Struct");
}
Expand All @@ -157,9 +155,10 @@ impl StructArray {
impl StructArray {
/// Deconstructs the [`StructArray`] into its individual components.
#[must_use]
pub fn into_data(self) -> (Vec<Field>, Vec<Box<dyn Array>>, Option<Bitmap>) {
pub fn into_data(self) -> (Vec<Field>, usize, Vec<Box<dyn Array>>, Option<Bitmap>) {
let Self {
dtype,
length,
values,
validity,
} = self;
Expand All @@ -168,7 +167,7 @@ impl StructArray {
} else {
unreachable!()
};
(fields, values, validity)
(fields, length, values, validity)
}

/// Slices this [`StructArray`].
Expand Down Expand Up @@ -199,6 +198,7 @@ impl StructArray {
self.values
.iter_mut()
.for_each(|x| x.slice_unchecked(offset, length));
self.length = length;
}

/// Set the outer nulls into the inner arrays.
Expand Down Expand Up @@ -227,18 +227,17 @@ impl StructArray {
impl StructArray {
#[inline]
fn len(&self) -> usize {
#[cfg(debug_assertions)]
if let Some(fst) = self.values.first() {
for arr in self.values.iter().skip(1) {
if cfg!(debug_assertions) {
for arr in self.values.iter() {
assert_eq!(
arr.len(),
fst.len(),
self.length,
"StructArray invariant: each array has same length"
);
}
}

self.values.first().map(|arr| arr.len()).unwrap_or(0)
self.length
}

/// The optional validity.
Expand Down Expand Up @@ -310,11 +309,13 @@ impl Splitable for StructArray {
(
Self {
dtype: self.dtype.clone(),
length: offset,
values: lhs_values,
validity: lhs_validity,
},
Self {
dtype: self.dtype.clone(),
length: self.length - offset,
values: rhs_values,
validity: rhs_validity,
},
Expand Down
Loading

0 comments on commit dbbd93f

Please sign in to comment.