Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Stats use ScalarValues and not Scalars #2069

Merged
merged 13 commits into from
Jan 28, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions encodings/bytebool/src/stats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,8 @@ mod tests {
assert!(!bool_arr.statistics().compute_is_strict_sorted().unwrap());
assert!(bool_arr.statistics().compute_is_sorted().unwrap());
assert!(bool_arr.statistics().compute_is_constant().unwrap());
assert_eq!(bool_arr.statistics().compute(Stat::Min), None);
assert_eq!(bool_arr.statistics().compute(Stat::Max), None);
assert!(bool_arr.statistics().compute(Stat::Min).is_none());
assert!(bool_arr.statistics().compute(Stat::Max).is_none());
assert_eq!(bool_arr.statistics().compute_run_count().unwrap(), 1);
assert_eq!(bool_arr.statistics().compute_true_count().unwrap(), 0);
}
Expand Down
4 changes: 2 additions & 2 deletions encodings/datetime-parts/src/stats.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
use vortex_array::stats::{Stat, StatisticsVTable, StatsSet};
use vortex_array::ArrayLen;
use vortex_error::VortexResult;
use vortex_scalar::Scalar;
use vortex_scalar::ScalarValue;

use crate::{DateTimePartsArray, DateTimePartsEncoding};

impl StatisticsVTable<DateTimePartsArray> for DateTimePartsEncoding {
fn compute_statistics(&self, array: &DateTimePartsArray, stat: Stat) -> VortexResult<StatsSet> {
let maybe_stat = match stat {
Stat::NullCount => Some(Scalar::from(array.validity().null_count(array.len())?)),
Stat::NullCount => Some(ScalarValue::from(array.validity().null_count(array.len())?)),
_ => None,
};

Expand Down
12 changes: 6 additions & 6 deletions encodings/fastlanes/src/for/compress.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use vortex_buffer::{Buffer, BufferMut};
use vortex_dtype::{
match_each_integer_ptype, match_each_unsigned_integer_ptype, DType, NativePType, Nullability,
};
use vortex_error::{vortex_bail, vortex_err, VortexExpect, VortexResult};
use vortex_error::{vortex_bail, vortex_err, VortexExpect, VortexResult, VortexUnwrap};
use vortex_scalar::Scalar;
use vortex_sparse::SparseArray;

Expand All @@ -21,10 +21,11 @@ pub fn for_compress(array: PrimitiveArray) -> VortexResult<FoRArray> {
.compute(Stat::Min)
.ok_or_else(|| vortex_err!("Min stat not found"))?;

let nullability = array.dtype().nullability();
let dtype = array.dtype().clone();
let nullability = dtype.nullability();
let encoded = match_each_integer_ptype!(array.ptype(), |$T| {
if shift == <$T>::PTYPE.bit_width() as u8 {
assert_eq!(min, Scalar::zero::<$T>(array.dtype().nullability()));
assert_eq!(usize::try_from(&min).vortex_unwrap(), 0);
encoded_zero::<$T>(array.validity().to_logical(array.len()), nullability)
.vortex_expect("Failed to encode all zeroes")
} else {
Expand All @@ -34,7 +35,7 @@ pub fn for_compress(array: PrimitiveArray) -> VortexResult<FoRArray> {
.into_array()
}
});
FoRArray::try_new(encoded, min, shift)
FoRArray::try_new(encoded, Scalar::new(dtype, min), shift)
}

fn encoded_zero<T: NativePType>(
Expand All @@ -48,8 +49,7 @@ fn encoded_zero<T: NativePType>(
}

let encoded_ptype = T::PTYPE.to_unsigned();
let zero =
match_each_unsigned_integer_ptype!(encoded_ptype, |$T| Scalar::zero::<$T>(nullability));
let zero = match_each_unsigned_integer_ptype!(encoded_ptype, |$T| Scalar::primitive($T::default(), nullability));

Ok(match logical_validity {
LogicalValidity::AllValid(len) => ConstantArray::new(zero, len).into_array(),
Expand Down
8 changes: 4 additions & 4 deletions encodings/runend/src/statistics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,15 @@ use vortex_array::variants::PrimitiveArrayTrait;
use vortex_array::{ArrayDType as _, ArrayLen as _, IntoArrayVariant as _};
use vortex_dtype::{match_each_unsigned_integer_ptype, DType, NativePType};
use vortex_error::VortexResult;
use vortex_scalar::Scalar;
use vortex_scalar::ScalarValue;

use crate::{RunEndArray, RunEndEncoding};

impl StatisticsVTable<RunEndArray> for RunEndEncoding {
fn compute_statistics(&self, array: &RunEndArray, stat: Stat) -> VortexResult<StatsSet> {
let maybe_stat = match stat {
Stat::Min | Stat::Max => array.values().statistics().compute(stat),
Stat::IsSorted => Some(Scalar::from(
Stat::IsSorted => Some(ScalarValue::from(
array
.values()
.statistics()
Expand All @@ -25,10 +25,10 @@ impl StatisticsVTable<RunEndArray> for RunEndEncoding {
&& array.logical_validity().all_valid(),
)),
Stat::TrueCount => match array.dtype() {
DType::Bool(_) => Some(Scalar::from(array.true_count()?)),
DType::Bool(_) => Some(ScalarValue::from(array.true_count()?)),
_ => None,
},
Stat::NullCount => Some(Scalar::from(array.null_count()?)),
Stat::NullCount => Some(ScalarValue::from(array.null_count()?)),
_ => None,
};

Expand Down
8 changes: 4 additions & 4 deletions encodings/sparse/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -143,13 +143,13 @@ impl SparseArray {

#[inline]
pub fn fill_scalar(&self) -> Scalar {
let fill_value = ScalarValue::from_flexbytes(
let sv = ScalarValue::from_flexbytes(
self.as_ref()
.byte_buffer(0)
.vortex_expect("Missing fill value buffer"),
)
.vortex_expect("Failed to deserialize fill value");
Scalar::new(self.dtype().clone(), fill_value)
Scalar::new(self.dtype().clone(), sv)
}
}

Expand All @@ -173,14 +173,14 @@ impl StatisticsVTable<SparseArray> for SparseEncoding {
let fill_stats = if array.fill_scalar().is_null() {
StatsSet::nulls(fill_len, array.dtype())
} else {
StatsSet::constant(&array.fill_scalar(), fill_len)
StatsSet::constant(array.fill_scalar(), fill_len)
};

if values.is_empty() {
return Ok(fill_stats);
}

Ok(stats.merge_unordered(&fill_stats))
Ok(stats.merge_unordered(&fill_stats, array.dtype()))
}
}

Expand Down
43 changes: 29 additions & 14 deletions encodings/zigzag/src/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ use vortex_array::{
};
use vortex_dtype::{DType, PType};
use vortex_error::{vortex_bail, vortex_err, vortex_panic, VortexExpect as _, VortexResult};
use vortex_scalar::Scalar;
use vortex_scalar::ScalarValue;
use zigzag::ZigZag as ExternalZigZag;

use crate::compress::zigzag_encode;
Expand Down Expand Up @@ -98,15 +98,12 @@ impl StatisticsVTable<ZigZagArray> for ZigZagEncoding {
stats.set(stat, val);
}
} else if matches!(stat, Stat::Min | Stat::Max) {
let encoded_max = array
.encoded()
.statistics()
.compute_as_cast::<u64>(Stat::Max);
let encoded_max = array.encoded().statistics().compute_as::<u64>(Stat::Max);
if let Some(val) = encoded_max {
// the max of the encoded array is the element with the highest absolute value (so either min if negative, or max if positive)
let decoded = <i64 as ExternalZigZag>::decode(val);
let decoded_stat = if decoded < 0 { Stat::Min } else { Stat::Max };
stats.set(decoded_stat, Scalar::from(decoded).cast(array.dtype())?);
stats.set(decoded_stat, ScalarValue::from(decoded));
}
}

Expand All @@ -125,6 +122,7 @@ mod test {
use vortex_array::compute::{scalar_at, slice};
use vortex_array::IntoArrayData;
use vortex_buffer::buffer;
use vortex_scalar::Scalar;

use super::*;

Expand All @@ -133,19 +131,36 @@ mod test {
let array = buffer![1i32, -5i32, 2, 3, 4, 5, 6, 7, 8, 9, 10].into_array();
let zigzag = ZigZagArray::encode(&array).unwrap();

for stat in [Stat::Max, Stat::NullCount, Stat::IsConstant] {
let value = zigzag.statistics().compute(stat);
assert_eq!(value, array.statistics().compute(stat));
}
assert_eq!(
zigzag.statistics().compute_max::<i32>(),
array.statistics().compute_max::<i32>()
);
assert_eq!(
zigzag.statistics().compute_null_count(),
array.statistics().compute_null_count()
);
assert_eq!(
zigzag.statistics().compute_is_constant(),
array.statistics().compute_is_constant()
);

let sliced = ZigZagArray::try_from(slice(zigzag, 0, 2).unwrap()).unwrap();
assert_eq!(
scalar_at(&sliced, sliced.len() - 1).unwrap(),
Scalar::from(-5i32)
);
for stat in [Stat::Min, Stat::NullCount, Stat::IsConstant] {
let value = sliced.statistics().compute(stat);
assert_eq!(value, array.statistics().compute(stat));
}

assert_eq!(
sliced.statistics().compute_min::<i32>(),
array.statistics().compute_min::<i32>()
);
assert_eq!(
sliced.statistics().compute_null_count(),
array.statistics().compute_null_count()
);
assert_eq!(
sliced.statistics().compute_is_constant(),
array.statistics().compute_is_constant()
);
}
}
6 changes: 3 additions & 3 deletions vortex-array/src/array/bool/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use arrow_array::BooleanArray;
use arrow_buffer::MutableBuffer;
use vortex_buffer::{Alignment, ByteBuffer};
use vortex_dtype::{DType, Nullability};
use vortex_error::{vortex_bail, VortexError, VortexExpect as _, VortexResult};
use vortex_error::{vortex_bail, VortexExpect as _, VortexResult};

use crate::encoding::ids;
use crate::stats::StatsSet;
Expand All @@ -13,8 +13,8 @@ use crate::validity::{LogicalValidity, Validity, ValidityMetadata, ValidityVTabl
use crate::variants::{BoolArrayTrait, VariantsVTable};
use crate::visitor::{ArrayVisitor, VisitorVTable};
use crate::{
impl_encoding, ArrayData, ArrayLen, Canonical, DeserializeMetadata, IntoArrayData,
IntoCanonical, RkyvMetadata,
impl_encoding, ArrayLen, Canonical, DeserializeMetadata, IntoArrayData, IntoCanonical,
RkyvMetadata,
};

pub mod compute;
Expand Down
4 changes: 2 additions & 2 deletions vortex-array/src/array/bool/stats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -276,8 +276,8 @@ mod test {
assert!(!bool_arr.statistics().compute_is_strict_sorted().unwrap());
assert!(bool_arr.statistics().compute_is_sorted().unwrap());
assert!(bool_arr.statistics().compute_is_constant().unwrap());
assert_eq!(bool_arr.statistics().compute(Stat::Min), None);
assert_eq!(bool_arr.statistics().compute(Stat::Max), None);
assert!(bool_arr.statistics().compute(Stat::Min).is_none());
assert!(bool_arr.statistics().compute(Stat::Max).is_none());
assert_eq!(bool_arr.statistics().compute_run_count().unwrap(), 1);
assert_eq!(bool_arr.statistics().compute_true_count().unwrap(), 0);
assert_eq!(bool_arr.statistics().compute_null_count().unwrap(), 5);
Expand Down
5 changes: 1 addition & 4 deletions vortex-array/src/array/chunked/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,10 @@
use std::fmt::{Debug, Display};

use futures_util::stream;
use rkyv::{access, to_bytes};
use serde::{Deserialize, Serialize};
use vortex_buffer::BufferMut;
use vortex_dtype::{DType, Nullability, PType};
use vortex_error::{
vortex_bail, vortex_panic, VortexError, VortexExpect as _, VortexResult, VortexUnwrap,
};
use vortex_error::{vortex_bail, vortex_panic, VortexExpect as _, VortexResult, VortexUnwrap};

use crate::array::primitive::PrimitiveArray;
use crate::compute::{scalar_at, search_sorted_usize, SearchSortedSide};
Expand Down
3 changes: 2 additions & 1 deletion vortex-array/src/array/chunked/stats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ use vortex_error::VortexResult;
use crate::array::chunked::ChunkedArray;
use crate::array::ChunkedEncoding;
use crate::stats::{ArrayStatistics, Stat, StatisticsVTable, StatsSet};
use crate::ArrayDType;

impl StatisticsVTable<ChunkedArray> for ChunkedEncoding {
fn compute_statistics(&self, array: &ChunkedArray, stat: Stat) -> VortexResult<StatsSet> {
Expand All @@ -20,7 +21,7 @@ impl StatisticsVTable<ChunkedArray> for ChunkedEncoding {
}
.unwrap_or_default()
})
.reduce(|acc, x| acc.merge_ordered(&x))
.reduce(|acc, x| acc.merge_ordered(&x, array.dtype()))
.unwrap_or_default())
}
}
24 changes: 20 additions & 4 deletions vortex-array/src/array/constant/canonical.rs
Original file line number Diff line number Diff line change
Expand Up @@ -113,15 +113,16 @@ fn canonical_byte_view(

#[cfg(test)]
mod tests {
use enum_iterator::all;
use vortex_dtype::half::f16;
use vortex_dtype::{DType, Nullability, PType};
use vortex_scalar::Scalar;

use crate::array::ConstantArray;
use crate::canonical::IntoArrayVariant;
use crate::compute::scalar_at;
use crate::stats::{ArrayStatistics as _, StatsSet};
use crate::{ArrayLen, IntoArrayData as _, IntoCanonical};
use crate::stats::{ArrayStatistics as _, Stat, StatsSet};
use crate::{ArrayDType, ArrayLen, IntoArrayData as _, IntoCanonical};

#[test]
fn test_canonicalize_null() {
Expand Down Expand Up @@ -154,8 +155,23 @@ mod tests {
let canonical = const_array.into_canonical().unwrap();
let canonical_stats = canonical.statistics().to_set();

assert_eq!(canonical_stats, StatsSet::constant(&scalar, 4));
assert_eq!(canonical_stats, stats);
let reference = StatsSet::constant(scalar, 4);
for stat in all::<Stat>() {
let canonical_stat = canonical_stats
.get(stat)
.cloned()
.map(|sv| Scalar::new(stat.dtype(canonical.dtype()), sv));
let reference_stat = reference
.get(stat)
.cloned()
.map(|sv| Scalar::new(stat.dtype(canonical.dtype()), sv));
let original_stat = stats
.get(stat)
.cloned()
.map(|sv| Scalar::new(stat.dtype(canonical.dtype()), sv));
assert_eq!(canonical_stat, reference_stat);
assert_eq!(canonical_stat, original_stat);
}
}

#[test]
Expand Down
9 changes: 4 additions & 5 deletions vortex-array/src/array/constant/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
use std::fmt::Display;
use std::num::IntErrorKind::Empty;

use serde::{Deserialize, Serialize};
use vortex_error::{VortexExpect, VortexResult};
Expand All @@ -25,7 +24,7 @@ impl ConstantArray {
S: Into<Scalar>,
{
let scalar = scalar.into();
let stats = StatsSet::constant(&scalar, length);
let stats = StatsSet::constant(scalar.clone(), length);
let (dtype, scalar_value) = scalar.into_parts();

// Serialize the scalar_value into a FlatBuffer
Expand All @@ -44,13 +43,13 @@ impl ConstantArray {

/// Returns the [`Scalar`] value of this constant array.
pub fn scalar(&self) -> Scalar {
let value = ScalarValue::from_flexbytes(
let sv = ScalarValue::from_flexbytes(
self.as_ref()
.byte_buffer(0)
.vortex_expect("Missing scalar value buffer"),
)
.vortex_expect("Failed to deserialize scalar value");
Scalar::new(self.dtype().clone(), value)
Scalar::new(self.dtype().clone(), sv)
}
}

Expand All @@ -71,7 +70,7 @@ impl ValidityVTable<ConstantArray> for ConstantEncoding {

impl StatisticsVTable<ConstantArray> for ConstantEncoding {
fn compute_statistics(&self, array: &ConstantArray, _stat: Stat) -> VortexResult<StatsSet> {
Ok(StatsSet::constant(&array.scalar(), array.len()))
Ok(StatsSet::constant(array.scalar(), array.len()))
}
}

Expand Down
Loading
Loading