Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: Zero-Field Structs and DataFrame with Height Property #19123

Merged
merged 17 commits into from
Oct 11, 2024
Merged
7 changes: 7 additions & 0 deletions crates/polars-arrow/src/array/growable/structure.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ use crate::bitmap::MutableBitmap;
/// Concrete [`Growable`] for the [`StructArray`].
pub struct GrowableStruct<'a> {
// Source arrays that rows are copied from; looked up by index when extending.
arrays: Vec<&'a StructArray>,
// Running count of rows appended so far. Starts at 0, grows with every
// extend / validity-extend call, and is passed to `StructArray::new` when
// the final array is built.
length: usize,
// Validity collected for the appended rows; `None` when no validity is tracked.
validity: Option<MutableBitmap>,
// Growables for the child arrays (presumably one per struct field -- confirm
// against the constructor, which is not fully visible here).
values: Vec<Box<dyn Growable<'a> + 'a>>,
}
Expand Down Expand Up @@ -48,6 +49,7 @@ impl<'a> GrowableStruct<'a> {

Self {
arrays,
length: 0,
values,
validity: prepare_validity(use_validity, capacity),
}
Expand All @@ -60,6 +62,7 @@ impl<'a> GrowableStruct<'a> {

StructArray::new(
self.arrays[0].dtype().clone(),
self.length,
values,
validity.map(|v| v.into()),
)
Expand All @@ -71,6 +74,8 @@ impl<'a> Growable<'a> for GrowableStruct<'a> {
let array = *self.arrays.get_unchecked_release(index);
extend_validity(&mut self.validity, array, start, len);

self.length += len;

if array.null_count() == 0 {
self.values
.iter_mut()
Expand All @@ -97,6 +102,7 @@ impl<'a> Growable<'a> for GrowableStruct<'a> {
if let Some(validity) = &mut self.validity {
validity.extend_constant(additional, false);
}
self.length += additional;
}

#[inline]
Expand All @@ -123,6 +129,7 @@ impl<'a> From<GrowableStruct<'a>> for StructArray {

StructArray::new(
val.arrays[0].dtype().clone(),
val.length,
values,
val.validity.map(|v| v.into()),
)
Expand Down
1 change: 1 addition & 0 deletions crates/polars-arrow/src/array/struct_/data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ impl Arrow2Arrow for StructArray {

Self {
dtype,
length: data.len(),
values: data.child_data().iter().map(from_data).collect(),
validity: data.nulls().map(|n| Bitmap::from_null_buffer(n.clone())),
}
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-arrow/src/array/struct_/ffi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,6 @@ impl<A: ffi::ArrowArrayRef> FromFfi<A> for StructArray {
})
.collect::<PolarsResult<Vec<Box<dyn Array>>>>()?;

Self::try_new(dtype, values, validity)
Self::try_new(dtype, len, values, validity)
}
}
63 changes: 32 additions & 31 deletions crates/polars-arrow/src/array/struct_/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ pub(super) mod fmt;
mod iterator;
mod mutable;
pub use mutable::*;
use polars_error::{polars_bail, PolarsResult};
use polars_error::{polars_bail, polars_ensure, PolarsResult};

use crate::compute::utils::combine_validities_and;

Expand All @@ -27,13 +27,15 @@ use crate::compute::utils::combine_validities_and;
/// Field::new("c".into(), ArrowDataType::Int32, false),
/// ];
///
/// let array = StructArray::new(ArrowDataType::Struct(fields), vec![boolean, int], None);
/// let array = StructArray::new(ArrowDataType::Struct(fields), 4, vec![boolean, int], None);
/// ```
#[derive(Clone)]
pub struct StructArray {
// Logical type of the array; the constructors require `ArrowDataType::Struct`.
dtype: ArrowDataType,
// Child arrays, one per field of `dtype`.
// invariant: each array has the same length
values: Vec<Box<dyn Array>>,
// Number of rows, stored explicitly instead of being derived from the
// first child, so the length does not depend on `values` being non-empty.
// invariant: for each v in values: length == v.len()
length: usize,
// Optional validity bitmap; when present its length must equal `length`.
validity: Option<Bitmap>,
}

Expand All @@ -49,22 +51,17 @@ impl StructArray {
/// * the validity's length is not equal to the given `length`
pub fn try_new(
dtype: ArrowDataType,
length: usize,
values: Vec<Box<dyn Array>>,
validity: Option<Bitmap>,
) -> PolarsResult<Self> {
let fields = Self::try_get_fields(&dtype)?;
if fields.is_empty() {
assert!(values.is_empty(), "invalid struct");
assert_eq!(validity.map(|v| v.len()).unwrap_or(0), 0, "invalid struct");
return Ok(Self {
dtype,
values,
validity: None,
});
}
if fields.len() != values.len() {
polars_bail!(ComputeError:"a StructArray must have a number of fields in its DataType equal to the number of child values")
}

polars_ensure!(
fields.len() == values.len(),
ComputeError:
"a StructArray must have a number of fields in its DataType equal to the number of child values"
);

fields
.iter().map(|a| &a.dtype)
Expand All @@ -81,29 +78,29 @@ impl StructArray {
}
})?;

let len = values[0].len();
values
.iter()
.map(|a| a.len())
.map(|f| f.len())
.enumerate()
.try_for_each(|(index, a_len)| {
if a_len != len {
polars_bail!(ComputeError: "The children must have an equal number of values.
However, the values at index {index} have a length of {a_len}, which is different from values at index 0, {len}.")
.try_for_each(|(index, f_length)| {
if f_length != length {
polars_bail!(ComputeError: "The children must have the given number of values.
However, the values at index {index} have a length of {f_length}, which is different from given length {length}.")
} else {
Ok(())
}
})?;

if validity
.as_ref()
.map_or(false, |validity| validity.len() != len)
.map_or(false, |validity| validity.len() != length)
{
polars_bail!(ComputeError:"The validity length of a StructArray must match its number of elements")
}

Ok(Self {
dtype,
length,
values,
validity,
})
Expand All @@ -120,10 +117,11 @@ impl StructArray {
/// * the validity's length is not equal to the given `length`
pub fn new(
dtype: ArrowDataType,
length: usize,
values: Vec<Box<dyn Array>>,
validity: Option<Bitmap>,
) -> Self {
Self::try_new(dtype, values, validity).unwrap()
Self::try_new(dtype, length, values, validity).unwrap()
}

/// Creates an empty [`StructArray`].
Expand All @@ -133,7 +131,7 @@ impl StructArray {
.iter()
.map(|field| new_empty_array(field.dtype().clone()))
.collect();
Self::new(dtype, values, None)
Self::new(dtype, 0, values, None)
} else {
panic!("StructArray must be initialized with DataType::Struct");
}
Expand All @@ -146,7 +144,7 @@ impl StructArray {
.iter()
.map(|field| new_null_array(field.dtype().clone(), length))
.collect();
Self::new(dtype, values, Some(Bitmap::new_zeroed(length)))
Self::new(dtype, length, values, Some(Bitmap::new_zeroed(length)))
} else {
panic!("StructArray must be initialized with DataType::Struct");
}
Expand All @@ -157,9 +155,10 @@ impl StructArray {
impl StructArray {
/// Deconstructs the [`StructArray`] into its individual components.
#[must_use]
pub fn into_data(self) -> (Vec<Field>, Vec<Box<dyn Array>>, Option<Bitmap>) {
pub fn into_data(self) -> (Vec<Field>, usize, Vec<Box<dyn Array>>, Option<Bitmap>) {
let Self {
dtype,
length,
values,
validity,
} = self;
Expand All @@ -168,7 +167,7 @@ impl StructArray {
} else {
unreachable!()
};
(fields, values, validity)
(fields, length, values, validity)
}

/// Slices this [`StructArray`].
Expand Down Expand Up @@ -199,6 +198,7 @@ impl StructArray {
self.values
.iter_mut()
.for_each(|x| x.slice_unchecked(offset, length));
self.length = length;
}

/// Set the outer nulls into the inner arrays.
Expand Down Expand Up @@ -227,18 +227,17 @@ impl StructArray {
impl StructArray {
#[inline]
fn len(&self) -> usize {
#[cfg(debug_assertions)]
if let Some(fst) = self.values.first() {
for arr in self.values.iter().skip(1) {
if cfg!(debug_assertions) {
for arr in self.values.iter() {
assert_eq!(
arr.len(),
fst.len(),
self.length,
"StructArray invariant: each array has same length"
);
}
}

self.values.first().map(|arr| arr.len()).unwrap_or(0)
self.length
}

/// The optional validity.
Expand Down Expand Up @@ -310,11 +309,13 @@ impl Splitable for StructArray {
(
Self {
dtype: self.dtype.clone(),
length: offset,
values: lhs_values,
validity: lhs_validity,
},
Self {
dtype: self.dtype.clone(),
length: self.length - offset,
values: rhs_values,
validity: rhs_validity,
},
Expand Down
Loading
Loading