diff --git a/crates/polars-core/src/chunked_array/cast.rs b/crates/polars-core/src/chunked_array/cast.rs index 708ce95bf401..7c9e9a523c05 100644 --- a/crates/polars-core/src/chunked_array/cast.rs +++ b/crates/polars-core/src/chunked_array/cast.rs @@ -4,7 +4,7 @@ use polars_compute::cast::CastOptionsImpl; #[cfg(feature = "serde-lazy")] use serde::{Deserialize, Serialize}; -use crate::chunked_array::metadata::MetadataProperties; +use super::flags::StatisticsFlags; #[cfg(feature = "timezones")] use crate::chunked_array::temporal::validate_time_zone; #[cfg(feature = "dtype-datetime")] @@ -380,15 +380,14 @@ impl BinaryChunked { pub unsafe fn to_string_unchecked(&self) -> StringChunked { let chunks = self .downcast_iter() - .map(|arr| arr.to_utf8view_unchecked().boxed()) + .map(|arr| unsafe { arr.to_utf8view_unchecked() }.boxed()) .collect(); let field = Arc::new(Field::new(self.name().clone(), DataType::String)); let mut ca = StringChunked::new_with_compute_len(field, chunks); - use MetadataProperties as P; - ca.copy_metadata_cast(self, P::SORTED | P::FAST_EXPLODE_LIST); - + use StatisticsFlags as F; + ca.retain_flags_from(self, F::IS_SORTED_ANY | F::CAN_FAST_EXPLODE_LIST); ca } } @@ -403,9 +402,8 @@ impl StringChunked { let mut ca = BinaryChunked::new_with_compute_len(field, chunks); - use MetadataProperties as P; - ca.copy_metadata_cast(self, P::SORTED | P::FAST_EXPLODE_LIST); - + use StatisticsFlags as F; + ca.retain_flags_from(self, F::IS_SORTED_ANY | F::CAN_FAST_EXPLODE_LIST); ca } } diff --git a/crates/polars-core/src/chunked_array/flags.rs b/crates/polars-core/src/chunked_array/flags.rs new file mode 100644 index 000000000000..73522fe3baff --- /dev/null +++ b/crates/polars-core/src/chunked_array/flags.rs @@ -0,0 +1,116 @@ +use std::sync::atomic::{AtomicU32, Ordering}; + +use crate::series::IsSorted; + +/// An interior mutable version of [`StatisticsFlags`] +pub struct StatisticsFlagsIM { + inner: AtomicU32, +} + +bitflags::bitflags! { + #[derive(Clone, Copy, Debug, PartialEq, Eq)] + #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] + pub struct StatisticsFlags: u32 { + const IS_SORTED_ANY = 0x03; + + const IS_SORTED_ASC = 0x01; + const IS_SORTED_DSC = 0x02; + const CAN_FAST_EXPLODE_LIST = 0x04; + } +} + +impl std::fmt::Debug for StatisticsFlagsIM { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_tuple("ChunkedArrayFlagsIM") + .field(&self.get()) + .finish() + } +} + +impl Clone for StatisticsFlagsIM { + fn clone(&self) -> Self { + Self::new(self.get()) + } +} + +impl PartialEq for StatisticsFlagsIM { + fn eq(&self, other: &Self) -> bool { + self.get() == other.get() + } +} +impl Eq for StatisticsFlagsIM {} + +impl From for StatisticsFlagsIM { + fn from(value: StatisticsFlags) -> Self { + Self { + inner: AtomicU32::new(value.bits()), + } + } +} + +impl StatisticsFlagsIM { + pub fn new(value: StatisticsFlags) -> Self { + Self { + inner: AtomicU32::new(value.bits()), + } + } + + pub fn empty() -> Self { + Self::new(StatisticsFlags::empty()) + } + + pub fn get_mut(&mut self) -> StatisticsFlags { + StatisticsFlags::from_bits(*self.inner.get_mut()).unwrap() + } + pub fn set_mut(&mut self, value: StatisticsFlags) { + *self.inner.get_mut() = value.bits(); + } + + pub fn get(&self) -> StatisticsFlags { + StatisticsFlags::from_bits(self.inner.load(Ordering::Relaxed)).unwrap() + } + pub fn set(&self, value: StatisticsFlags) { + self.inner.store(value.bits(), Ordering::Relaxed); + } +} + +impl StatisticsFlags { + pub fn is_sorted(&self) -> IsSorted { + let is_sorted_asc = self.contains(Self::IS_SORTED_ASC); + let is_sorted_dsc = self.contains(Self::IS_SORTED_DSC); + + assert!(!is_sorted_asc || !is_sorted_dsc); + + if is_sorted_asc { + IsSorted::Ascending + } else if is_sorted_dsc { + IsSorted::Descending + } else { + IsSorted::Not + } + } + + pub fn set_sorted(&mut self, is_sorted: IsSorted) { + let is_sorted = match is_sorted { + IsSorted::Not => Self::empty(), + IsSorted::Ascending => Self::IS_SORTED_ASC, + IsSorted::Descending => Self::IS_SORTED_DSC, + }; + self.remove(Self::IS_SORTED_ASC | Self::IS_SORTED_DSC); + self.insert(is_sorted); + } + + pub fn is_sorted_any(&self) -> bool { + self.contains(Self::IS_SORTED_ASC) | self.contains(Self::IS_SORTED_DSC) + } + pub fn is_sorted_ascending(&self) -> bool { + self.contains(Self::IS_SORTED_ASC) + } + pub fn is_sorted_descending(&self) -> bool { + self.contains(Self::IS_SORTED_DSC) + } + + pub fn can_fast_explode_list(&self) -> bool { + self.contains(Self::CAN_FAST_EXPLODE_LIST) + } +} diff --git a/crates/polars-core/src/chunked_array/logical/categorical/mod.rs b/crates/polars-core/src/chunked_array/logical/categorical/mod.rs index 94fef8aeffa7..f9de31cb353b 100644 --- a/crates/polars-core/src/chunked_array/logical/categorical/mod.rs +++ b/crates/polars-core/src/chunked_array/logical/categorical/mod.rs @@ -14,7 +14,7 @@ pub use revmap::*; use super::*; use crate::chunked_array::cast::CastOptions; -use crate::chunked_array::metadata::MetadataFlags; +use crate::chunked_array::flags::StatisticsFlags; use crate::prelude::*; use crate::series::IsSorted; use crate::using_string_cache; @@ -174,15 +174,15 @@ impl CategoricalChunked { } } - pub(crate) fn get_flags(&self) -> MetadataFlags { + pub(crate) fn get_flags(&self) -> StatisticsFlags { self.physical().get_flags() } /// Set flags for the Chunked Array - pub(crate) fn set_flags(&mut self, mut flags: MetadataFlags) { + pub(crate) fn set_flags(&mut self, mut flags: StatisticsFlags) { // We should not set the sorted flag if we are sorting in lexical order if self.uses_lexical_ordering() { - flags.set_sorted_flag(IsSorted::Not) + flags.set_sorted(IsSorted::Not) } self.physical_mut().set_flags(flags) } diff --git a/crates/polars-core/src/chunked_array/metadata/collect.rs b/crates/polars-core/src/chunked_array/metadata/collect.rs deleted file mode 100644 index fc07b54fce4c..000000000000 --- a/crates/polars-core/src/chunked_array/metadata/collect.rs +++ /dev/null @@ -1,46 +0,0 @@ -use super::{Metadata, MetadataEnv}; -use crate::chunked_array::{ChunkAgg, ChunkedArray, PolarsDataType, PolarsNumericType}; -use crate::series::IsSorted; - -pub trait MetadataCollectable: Sized { - fn collect_cheap_metadata(&mut self) {} - - #[inline(always)] - fn with_cheap_metadata(mut self) -> Self { - self.collect_cheap_metadata(); - self - } -} - -impl MetadataCollectable for ChunkedArray -where - T: PolarsDataType, - T: PolarsNumericType, - ChunkedArray: ChunkAgg, -{ - fn collect_cheap_metadata(&mut self) { - if !MetadataEnv::experimental_enabled() { - return; - } - - if self.len() < 32 { - let (min, max) = self - .min_max() - .map_or((None, None), |(l, r)| (Some(l), Some(r))); - - let has_one_value = self.len() - self.null_count() == 1; - - let md = Metadata::DEFAULT - .sorted_opt(has_one_value.then_some(IsSorted::Ascending)) - .min_value_opt(min) - .max_value_opt(max) - .distinct_count_opt(has_one_value.then_some(1)); - - if !md.is_empty() { - mdlog!("Initializing cheap metadata"); - } - - self.merge_metadata(md); - } - } -} diff --git a/crates/polars-core/src/chunked_array/metadata/env.rs b/crates/polars-core/src/chunked_array/metadata/env.rs deleted file mode 100644 index c34826ea6ab8..000000000000 --- a/crates/polars-core/src/chunked_array/metadata/env.rs +++ /dev/null @@ -1,122 +0,0 @@ -#[derive(Debug, Clone, Copy)] -pub struct MetadataEnv(u32); - -impl MetadataEnv { - const ENV_VAR: &'static str = "POLARS_METADATA_USE"; - - const ENABLED: u32 = 0x1; - const EXPERIMENTAL: u32 = 0x2; - const LOG: u32 = 0x4; - - #[inline] - fn get_cached() -> Self { - if cfg!(debug_assertions) { - let Ok(env) = std::env::var(Self::ENV_VAR) else { - return Self(Self::ENABLED); - }; - - // @NOTE - // We use a RwLock here so that we can mutate it for specific runs or sections of runs - // when we perform A/B tests. - static CACHED: std::sync::RwLock> = - std::sync::RwLock::new(None); - - if let Some((cached_str, cached_value)) = CACHED.read().unwrap().as_ref() { - if cached_str == &env[..] { - return *cached_value; - } - } - - let v = Self::get(); - *CACHED.write().unwrap() = Some((env.to_string(), v)); - v - } else { - static CACHED: std::sync::OnceLock = std::sync::OnceLock::new(); - *CACHED.get_or_init(Self::get) - } - } - - #[inline(never)] - fn get() -> Self { - let Ok(env) = std::env::var(Self::ENV_VAR) else { - return Self(Self::ENABLED); - }; - - match &env[..] { - "0" => Self(0), - "1" => Self(Self::ENABLED), - "experimental" => Self(Self::ENABLED | Self::EXPERIMENTAL), - "experimental,log" => Self(Self::ENABLED | Self::EXPERIMENTAL | Self::LOG), - "log" => Self(Self::ENABLED | Self::LOG), - _ => { - eprintln!("Invalid `{}` environment variable", Self::ENV_VAR); - eprintln!("Possible values:"); - eprintln!(" - 0 = Turn off all usage of metadata"); - eprintln!(" - 1 = Turn on usage of metadata (default)"); - eprintln!( - " - experimental = Turn on normal and experimental usage of metadata" - ); - eprintln!(" - experimental,log = Turn on normal, experimental usage and logging of metadata usage"); - eprintln!(" - log = Turn on normal and logging of metadata usage"); - eprintln!(); - panic!("Invalid environment variable") - }, - } - } - - #[inline(always)] - pub fn disabled() -> bool { - !Self::enabled() - } - - #[inline(always)] - pub fn enabled() -> bool { - if cfg!(debug_assertions) { - Self::get_cached().0 & Self::ENABLED != 0 - } else { - true - } - } - - #[inline(always)] - pub fn log() -> bool { - if cfg!(debug_assertions) { - Self::get_cached().0 & Self::LOG != 0 - } else { - false - } - } - - #[inline(always)] - pub fn experimental_enabled() -> bool { - Self::get_cached().0 & Self::EXPERIMENTAL != 0 - } - - #[cfg(debug_assertions)] - pub fn logfile() -> &'static std::sync::Mutex { - static CACHED: std::sync::OnceLock> = - std::sync::OnceLock::new(); - CACHED.get_or_init(|| { - std::sync::Mutex::new(std::fs::File::create(".polars-metadata.log").unwrap()) - }) - } -} - -macro_rules! mdlog { - ($s:literal$(, $arg:expr)* $(,)?) => { - #[cfg(debug_assertions)] - { - use std::io::Write; - let file = MetadataEnv::logfile(); - writeln!(file.lock().unwrap(), $s$(, $arg)*).unwrap(); - } - - #[cfg(not(debug_assertions))] - { - _ = $s; - $( - _ = $arg; - )* - } - }; -} diff --git a/crates/polars-core/src/chunked_array/metadata/guard.rs b/crates/polars-core/src/chunked_array/metadata/guard.rs deleted file mode 100644 index d558c0d12437..000000000000 --- a/crates/polars-core/src/chunked_array/metadata/guard.rs +++ /dev/null @@ -1,23 +0,0 @@ -use std::ops::Deref; -use std::sync::RwLockReadGuard; - -use super::Metadata; -use crate::chunked_array::PolarsDataType; - -/// A read guard for [`Metadata`] -pub enum MetadataReadGuard<'a, T: PolarsDataType + 'a> { - Unlocked(RwLockReadGuard<'a, Metadata>), - Locked(&'a Metadata), -} - -impl<'a, T: PolarsDataType + 'a> Deref for MetadataReadGuard<'a, T> { - type Target = Metadata; - - #[inline] - fn deref(&self) -> &Self::Target { - match self { - Self::Unlocked(v) => v.deref(), - Self::Locked(v) => v, - } - } -} diff --git a/crates/polars-core/src/chunked_array/metadata/interior_mutable.rs b/crates/polars-core/src/chunked_array/metadata/interior_mutable.rs deleted file mode 100644 index 2b55c22e89e4..000000000000 --- a/crates/polars-core/src/chunked_array/metadata/interior_mutable.rs +++ /dev/null @@ -1,80 +0,0 @@ -use std::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; - -use super::{Metadata, MetadataTrait}; -use crate::chunked_array::PolarsDataType; - -// I have attempted multiple times to move this interior mutability to a per metadata field basis. -// While this might allow the use of Atomics instead of RwLocks, it suffers several problems: -// -// 1. The amount of boilerplate explodes. For example, you want read, read_blocking, write, -// write_blocking, get_mut, set for each field. -// 2. It is also very difficult to combine with the dynamic dispatch. -// 3. It is difficult to combine with types that do not allow for atomics (e.g. Box<[u8]>). -// 4. You actually have 2 fields per field: the Option and the Value. You run into critical section -// problems if you try to separate these. - -/// An interiorally mutable [`Metadata`] -/// -/// This is essentially a more convenient API around `RwLock`. This also allows it to be -/// `Clone`. -pub struct IMMetadata(RwLock>); - -impl<'a, T: PolarsDataType + 'a> IMMetadata -where - Metadata: MetadataTrait + 'a, -{ - /// Cast the [`IMMetadata`] to a trait object of [`MetadataTrait`] - pub fn upcast(&'a self) -> &'a RwLock { - &self.0 as &RwLock - } - - /// Cast the [`IMMetadata`] to a boxed trait object of [`MetadataTrait`] - pub fn boxed_upcast(&'a self) -> Box { - Box::new(self.0.read().unwrap().clone()) as Box - } -} - -impl IMMetadata { - pub const fn new(md: Metadata) -> Self { - Self(RwLock::new(md)) - } - - /// Try to grab a read guard to the [`Metadata`], this fails if this blocks. - pub fn try_read(&self) -> Option>> { - self.0.try_read().ok() - } - /// Block to grab a read guard the [`Metadata`] - pub fn read(&self) -> RwLockReadGuard> { - self.0.read().unwrap() - } - - /// Try to grab a write guard to the [`Metadata`], this fails if this blocks. - pub fn try_write(&self) -> Option>> { - self.0.try_write().ok() - } - /// Block to grab a write guard the [`Metadata`] - pub fn write(&self) -> RwLockWriteGuard> { - self.0.write().unwrap() - } - - /// Take the internal [`Metadata`] - pub fn take(self) -> Metadata { - self.0.into_inner().unwrap() - } - /// Get the mutable to the internal [`Metadata`] - pub fn get_mut(&mut self) -> &mut Metadata { - self.0.get_mut().unwrap() - } -} - -impl Clone for IMMetadata { - fn clone(&self) -> Self { - Self::new(self.read().clone()) - } -} - -impl Default for IMMetadata { - fn default() -> Self { - Self::new(Default::default()) - } -} diff --git a/crates/polars-core/src/chunked_array/metadata/md_trait.rs b/crates/polars-core/src/chunked_array/metadata/md_trait.rs deleted file mode 100644 index d4a5660c89f3..000000000000 --- a/crates/polars-core/src/chunked_array/metadata/md_trait.rs +++ /dev/null @@ -1,36 +0,0 @@ -use polars_utils::IdxSize; - -use super::{Metadata, MetadataFlags}; -use crate::chunked_array::{IntoScalar, PolarsDataType, Scalar}; - -pub trait MetadataTrait { - fn get_flags(&self) -> MetadataFlags; - fn min_value(&self) -> Option; - fn max_value(&self) -> Option; - - /// Number of unique non-null values - fn distinct_count(&self) -> Option; -} - -impl MetadataTrait for Metadata -where - T::OwnedPhysical: IntoScalar + Clone, -{ - fn get_flags(&self) -> MetadataFlags { - self.get_flags() - } - - fn min_value(&self) -> Option { - self.get_min_value() - .map(|v| v.clone().into_scalar(T::get_dtype()).unwrap()) - } - - fn max_value(&self) -> Option { - self.get_max_value() - .map(|v| v.clone().into_scalar(T::get_dtype()).unwrap()) - } - - fn distinct_count(&self) -> Option { - self.get_distinct_count() - } -} diff --git a/crates/polars-core/src/chunked_array/metadata/mod.rs b/crates/polars-core/src/chunked_array/metadata/mod.rs deleted file mode 100644 index 57a48a2577c1..000000000000 --- a/crates/polars-core/src/chunked_array/metadata/mod.rs +++ /dev/null @@ -1,442 +0,0 @@ -use std::fmt; - -use bitflags::bitflags; -use polars_utils::IdxSize; -#[cfg(feature = "serde")] -use serde::{Deserialize, Serialize}; - -pub use self::collect::MetadataCollectable; -pub use self::env::MetadataEnv; -pub use self::guard::MetadataReadGuard; -pub use self::interior_mutable::IMMetadata; -pub use self::md_trait::MetadataTrait; -use super::PolarsDataType; -use crate::series::IsSorted; - -#[macro_use] -mod env; -mod collect; -mod guard; -mod interior_mutable; -mod md_trait; - -macro_rules! mdenv_may_bail { - (get: $field:literal, $value:expr $(=> $default:expr)?) => {{ - if MetadataEnv::disabled() { - return $($default)?; - } - if MetadataEnv::log() { - mdlog!("Get: '{}' <- {:?}", $field, $value); - } - $value - }}; - (set: $field:literal, $value:expr) => { - if MetadataEnv::disabled() { - return; - } - if MetadataEnv::log() { - mdlog!("Set: '{}' <- {:?}", $field, $value); - } - }; - (init: $field:literal, $value:expr ; $default:expr) => {{ - if MetadataEnv::enabled() { - if MetadataEnv::log() { - mdlog!("Ini: '{}' <- {:?}", $field, $value); - } - $value - } else { - $default - } - }}; -} - -bitflags! { - #[derive(Default, Debug, Clone, Copy, PartialEq)] - pub struct MetadataProperties: u32 { - const SORTED = 0x01; - const FAST_EXPLODE_LIST = 0x02; - const MIN_VALUE = 0x04; - const MAX_VALUE = 0x08; - const DISTINCT_COUNT = 0x10; - } -} - -pub struct Metadata { - flags: MetadataFlags, - - min_value: Option, - max_value: Option, - - /// Number of unique non-null values - distinct_count: Option, -} - -bitflags! { - #[derive(Default, Debug, Clone, Copy, PartialEq)] - #[cfg_attr(feature = "serde", derive(Serialize, Deserialize), serde(transparent))] - pub struct MetadataFlags: u8 { - const SORTED_ASC = 0x01; - const SORTED_DSC = 0x02; - const FAST_EXPLODE_LIST = 0x04; - } -} - -impl MetadataFlags { - pub fn set_sorted_flag(&mut self, sorted: IsSorted) { - mdenv_may_bail!(set: "sorted", sorted); - match sorted { - IsSorted::Not => { - self.remove(MetadataFlags::SORTED_ASC | MetadataFlags::SORTED_DSC); - }, - IsSorted::Ascending => { - self.remove(MetadataFlags::SORTED_DSC); - self.insert(MetadataFlags::SORTED_ASC) - }, - IsSorted::Descending => { - self.remove(MetadataFlags::SORTED_ASC); - self.insert(MetadataFlags::SORTED_DSC) - }, - } - } - - pub fn get_sorted_flag(&self) -> IsSorted { - let sorted = if self.contains(MetadataFlags::SORTED_ASC) { - IsSorted::Ascending - } else if self.contains(MetadataFlags::SORTED_DSC) { - IsSorted::Descending - } else { - IsSorted::Not - }; - - mdenv_may_bail!(get: "sorted", sorted => IsSorted::Not) - } - - pub fn set_fast_explode_list(&mut self, fast_explode_list: bool) { - mdenv_may_bail!(set: "fast_explode_list", fast_explode_list); - self.set(Self::FAST_EXPLODE_LIST, fast_explode_list) - } - - pub fn get_fast_explode_list(&self) -> bool { - let value = self.contains(MetadataFlags::FAST_EXPLODE_LIST); - mdenv_may_bail!(get: "fast_explode_list", value => false) - } -} - -impl Default for Metadata { - fn default() -> Self { - Self::DEFAULT - } -} - -impl Clone for Metadata { - fn clone(&self) -> Self { - Self { - flags: self.flags, - min_value: self.min_value.clone(), - max_value: self.max_value.clone(), - distinct_count: self.distinct_count, - } - } -} - -impl fmt::Debug for Metadata { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("Metadata") - .field("flags", &self.flags) - .field("min_value", &self.min_value) - .field("max_value", &self.max_value) - .field("distinct_count", &self.distinct_count) - .finish() - } -} - -pub enum MetadataMerge { - Keep, - Conflict, - New(Metadata), -} - -impl Metadata { - pub const DEFAULT: Metadata = Self { - flags: MetadataFlags::empty(), - - min_value: None, - max_value: None, - - distinct_count: None, - }; - - // Builder Pattern Methods - pub fn sorted(mut self, is_sorted: IsSorted) -> Self { - self.flags.set_sorted_flag(is_sorted); - self - } - pub fn fast_explode_list(mut self, fast_explode_list: bool) -> Self { - self.flags.set_fast_explode_list(fast_explode_list); - self - } - pub fn flags(mut self, flags: MetadataFlags) -> Self { - self.set_flags(flags); - self - } - pub fn min_value(mut self, min_value: T::OwnedPhysical) -> Self { - self.set_min_value(Some(min_value)); - self - } - pub fn max_value(mut self, max_value: T::OwnedPhysical) -> Self { - self.set_max_value(Some(max_value)); - self - } - pub fn distinct_count(mut self, distinct_count: IdxSize) -> Self { - self.set_distinct_count(Some(distinct_count)); - self - } - pub fn sorted_opt(self, is_sorted: Option) -> Self { - if let Some(is_sorted) = is_sorted { - self.sorted(is_sorted) - } else { - self - } - } - pub fn fast_explode_list_opt(self, fast_explode_list: Option) -> Self { - if let Some(fast_explode_list) = fast_explode_list { - self.fast_explode_list(fast_explode_list) - } else { - self - } - } - pub fn flags_opt(mut self, flags: Option) -> Self { - self.set_flags(flags.unwrap_or(MetadataFlags::empty())); - self - } - pub fn min_value_opt(mut self, min_value: Option) -> Self { - self.set_min_value(min_value); - self - } - pub fn max_value_opt(mut self, max_value: Option) -> Self { - self.set_max_value(max_value); - self - } - pub fn distinct_count_opt(mut self, distinct_count: Option) -> Self { - self.set_distinct_count(distinct_count); - self - } - - /// Create a [`Metadata`] with only the properties set in `props`. - pub fn filter_props_cast(&self, props: MetadataProperties) -> Metadata { - if props.is_empty() { - return Metadata::DEFAULT; - } - - debug_assert!(!props.contains(P::MIN_VALUE)); - debug_assert!(!props.contains(P::MAX_VALUE)); - - use {MetadataFlags as F, MetadataProperties as P}; - - let sorted = if props.contains(P::SORTED) { - self.flags & (F::SORTED_ASC | F::SORTED_DSC) - } else { - F::empty() - }; - let fast_explode_list = if props.contains(P::FAST_EXPLODE_LIST) { - self.flags & F::FAST_EXPLODE_LIST - } else { - F::empty() - }; - - Metadata { - flags: sorted | fast_explode_list, - min_value: None, - max_value: None, - distinct_count: self - .distinct_count - .as_ref() - .cloned() - .filter(|_| props.contains(P::DISTINCT_COUNT)), - } - } - - /// Create a [`Metadata`] with only the properties set in `props`. - pub fn filter_props(&self, props: MetadataProperties) -> Self { - if props.is_empty() { - return Metadata::DEFAULT; - } - - use {MetadataFlags as F, MetadataProperties as P}; - - let sorted = if props.contains(P::SORTED) { - self.flags & (F::SORTED_ASC | F::SORTED_DSC) - } else { - F::empty() - }; - let fast_explode_list = if props.contains(P::FAST_EXPLODE_LIST) { - self.flags & F::FAST_EXPLODE_LIST - } else { - F::empty() - }; - - let min_value = self - .min_value - .as_ref() - .cloned() - .filter(|_| props.contains(P::MIN_VALUE)); - let max_value = self - .max_value - .as_ref() - .cloned() - .filter(|_| props.contains(P::MAX_VALUE)); - let distinct_count = self - .distinct_count - .as_ref() - .cloned() - .filter(|_| props.contains(P::DISTINCT_COUNT)); - - Self { - flags: mdenv_may_bail!(init: "flags", sorted | fast_explode_list ; MetadataFlags::empty()), - min_value: mdenv_may_bail!(init: "min_value", min_value ; None), - max_value: mdenv_may_bail!(init: "max_value", max_value ; None), - distinct_count: mdenv_may_bail!(init: "distinct_count", distinct_count ; None), - } - } - - /// Merge the maximum information from both [`Metadata`]s into one [`Metadata`]. - /// - /// It returns - /// - [`MetadataMerge::Keep`] if the `self` already contains all the information - /// - [`MetadataMerge::New(md)`][MetadataMerge::New] if we have learned new information - /// - [`MetadataMerge::Conflict`] if the two structures contain conflicting metadata - pub fn merge(&self, other: Self) -> MetadataMerge { - if MetadataEnv::disabled() || other.is_empty() { - return MetadataMerge::Keep; - } - - let sorted_conflicts = matches!( - (self.is_sorted(), other.is_sorted()), - (IsSorted::Ascending, IsSorted::Descending) - | (IsSorted::Descending, IsSorted::Ascending) - ); - - let is_conflict = sorted_conflicts - || matches!((self.get_min_value(), other.get_min_value()), (Some(x), Some(y)) if x != y) - || matches!((self.get_max_value(), other.get_max_value()), (Some(x), Some(y)) if x != y) - || matches!((self.get_distinct_count(), other.get_distinct_count()), (Some(x), Some(y)) if x != y); - - if is_conflict { - return MetadataMerge::Conflict; - } - - let is_new = (!self.get_fast_explode_list() && other.get_fast_explode_list()) - || (self.is_sorted() == IsSorted::Not && other.is_sorted() != IsSorted::Not) - || matches!( - (self.get_min_value(), other.get_min_value()), - (None, Some(_)) - ) - || matches!( - (self.get_max_value(), other.get_max_value()), - (None, Some(_)) - ) - || matches!( - (self.get_distinct_count(), other.get_distinct_count()), - (None, Some(_)) - ); - - if !is_new { - return MetadataMerge::Keep; - } - - let min_value = self.min_value.as_ref().cloned().or(other.min_value); - let max_value = self.max_value.as_ref().cloned().or(other.max_value); - let distinct_count = self.distinct_count.or(other.distinct_count); - - MetadataMerge::New(Metadata { - flags: mdenv_may_bail!(init: "flags", self.flags | other.flags ; MetadataFlags::empty()), - min_value: mdenv_may_bail!(init: "min_value", min_value ; None), - max_value: mdenv_may_bail!(init: "max_value", max_value ; None), - distinct_count: mdenv_may_bail!(init: "distinct_count", distinct_count ; None), - }) - } - - pub fn is_empty(&self) -> bool { - self.flags.is_empty() - && self.min_value.is_none() - && self.max_value.is_none() - && self.distinct_count.is_none() - } - - pub fn is_sorted_ascending(&self) -> bool { - self.flags.get_sorted_flag() == IsSorted::Ascending - } - - pub fn set_sorted_ascending(&mut self, value: bool) { - self.flags.set_sorted_flag(if value { - IsSorted::Ascending - } else { - IsSorted::Not - }); - } - - pub fn is_sorted_descending(&self) -> bool { - self.flags.get_sorted_flag() == IsSorted::Descending - } - - pub fn set_sorted_descending(&mut self, value: bool) { - self.flags.set_sorted_flag(if value { - IsSorted::Descending - } else { - IsSorted::Not - }); - } - - pub fn get_fast_explode_list(&self) -> bool { - self.flags.get_fast_explode_list() - } - - pub fn set_fast_explode_list(&mut self, value: bool) { - self.flags.set_fast_explode_list(value); - } - - pub fn is_sorted_any(&self) -> bool { - self.flags.get_sorted_flag() != IsSorted::Not - } - pub fn is_sorted(&self) -> IsSorted { - self.flags.get_sorted_flag() - } - - pub fn set_sorted_flag(&mut self, is_sorted: IsSorted) { - self.flags.set_sorted_flag(is_sorted) - } - - pub fn set_flags(&mut self, flags: MetadataFlags) { - mdenv_may_bail!(set: "flags", flags); - self.flags = flags; - } - pub fn set_min_value(&mut self, min_value: Option) { - mdenv_may_bail!(set: "min_value", min_value); - self.min_value = min_value; - } - pub fn set_max_value(&mut self, max_value: Option) { - mdenv_may_bail!(set: "max_value", max_value); - self.max_value = max_value; - } - pub fn set_distinct_count(&mut self, distinct_count: Option) { - mdenv_may_bail!(set: "distinct_count", distinct_count); - self.distinct_count = distinct_count; - } - - pub fn get_flags(&self) -> MetadataFlags { - let flags = self.flags; - mdenv_may_bail!(get: "flags", flags => MetadataFlags::empty()) - } - pub fn get_min_value(&self) -> Option<&T::OwnedPhysical> { - let min_value = self.min_value.as_ref(); - mdenv_may_bail!(get: "min_value", min_value => None) - } - pub fn get_max_value(&self) -> Option<&T::OwnedPhysical> { - let max_value = self.max_value.as_ref(); - mdenv_may_bail!(get: "max_value", max_value => None) - } - pub fn get_distinct_count(&self) -> Option { - let distinct_count = self.distinct_count; - mdenv_may_bail!(get: "distinct_count", distinct_count => None) - } -} diff --git a/crates/polars-core/src/chunked_array/mod.rs b/crates/polars-core/src/chunked_array/mod.rs index ab2b73db05c3..3f09d6f37574 100644 --- a/crates/polars-core/src/chunked_array/mod.rs +++ b/crates/polars-core/src/chunked_array/mod.rs @@ -1,6 +1,6 @@ //! The typed heart of every Series column. use std::iter::Map; -use std::sync::{Arc, RwLockReadGuard, RwLockWriteGuard}; +use std::sync::Arc; use arrow::array::*; use arrow::bitmap::Bitmap; @@ -15,9 +15,9 @@ pub mod builder; pub mod cast; pub mod collect; pub mod comparison; +pub mod flags; pub mod float; pub mod iterator; -pub mod metadata; #[cfg(feature = "ndarray")] pub(crate) mod ndarray; @@ -55,10 +55,7 @@ use arrow::legacy::prelude::*; #[cfg(feature = "dtype-struct")] pub use struct_::StructChunked; -use self::metadata::{ - IMMetadata, Metadata, MetadataFlags, MetadataMerge, MetadataProperties, MetadataReadGuard, - MetadataTrait, -}; +use self::flags::{StatisticsFlags, StatisticsFlagsIM}; use crate::series::IsSorted; use crate::utils::{first_non_null, last_non_null}; @@ -145,31 +142,11 @@ pub struct ChunkedArray { pub(crate) field: Arc, pub(crate) chunks: Vec, - // While it might be temping to make Arc<...> into Option>, it is very difficult to - // combine with the interior mutability that IMMetadata provides. - pub(crate) md: Arc>, + pub(crate) flags: StatisticsFlagsIM, length: usize, null_count: usize, -} - -impl ChunkedArray -where - Metadata: MetadataTrait, -{ - /// Attempt to get a reference to the trait object containing the [`ChunkedArray`]'s [`Metadata`] - /// - /// This fails if there is a need to block. - pub fn metadata_dyn(&self) -> Option> { - self.md.as_ref().upcast().try_read().ok() - } - - /// Attempt to get a reference to the trait object containing the [`ChunkedArray`]'s [`Metadata`] - /// - /// This fails if there is a need to block. - pub fn boxed_metadata_dyn<'a>(&'a self) -> Box { - self.md.as_ref().boxed_upcast() - } + _pd: std::marker::PhantomData, } impl ChunkedArray { @@ -242,52 +219,25 @@ impl ChunkedArray { Self { field, chunks, - md: Arc::new(IMMetadata::default()), + flags: StatisticsFlagsIM::empty(), + _pd: Default::default(), length, null_count, } } - /// Get a guard to read the [`ChunkedArray`]'s [`Metadata`] - pub fn metadata(&self) -> MetadataReadGuard { - self.md.as_ref().try_read().map_or( - MetadataReadGuard::Locked(&Metadata::DEFAULT), - MetadataReadGuard::Unlocked, - ) - } - - /// Get a guard to read/write the [`ChunkedArray`]'s [`Metadata`] - pub fn interior_mut_metadata(&self) -> RwLockWriteGuard> { - self.md.as_ref().write() - } - - /// Get a reference to [`Arc`] that contains the [`ChunkedArray`]'s [`Metadata`] - pub fn metadata_arc(&self) -> &Arc> { - &self.md - } - - /// Get a [`Arc`] that contains the [`ChunkedArray`]'s [`Metadata`] - pub fn metadata_owned_arc(&self) -> Arc> { - self.md.clone() - } - - /// Get a mutable reference to the [`Arc`] that contains the [`ChunkedArray`]'s [`Metadata`] - pub fn metadata_mut(&mut self) -> &mut Arc> { - &mut self.md - } - pub(crate) fn is_sorted_ascending_flag(&self) -> bool { - self.metadata().is_sorted_ascending() + self.get_flags().is_sorted_ascending() } pub(crate) fn is_sorted_descending_flag(&self) -> bool { - self.metadata().is_sorted_descending() + self.get_flags().is_sorted_descending() } /// Whether `self` is sorted in any direction. pub(crate) fn is_sorted_any(&self) -> bool { - self.metadata().is_sorted_any() + self.get_flags().is_sorted_any() } pub fn unset_fast_explode_list(&mut self) { @@ -295,35 +245,45 @@ impl ChunkedArray { } pub fn set_fast_explode_list(&mut self, value: bool) { - Arc::make_mut(self.metadata_mut()) - .get_mut() - .set_fast_explode_list(value) + let mut flags = self.flags.get_mut(); + flags.set(StatisticsFlags::CAN_FAST_EXPLODE_LIST, value); + self.flags.set_mut(flags); } pub fn get_fast_explode_list(&self) -> bool { - self.get_flags().get_fast_explode_list() + self.get_flags().can_fast_explode_list() } - pub fn get_flags(&self) -> MetadataFlags { - self.metadata().get_flags() + pub fn get_flags(&self) -> StatisticsFlags { + self.flags.get() } /// Set flags for the [`ChunkedArray`] - pub(crate) fn set_flags(&mut self, flags: MetadataFlags) { - // @TODO: This should probably just not be here - let md = Arc::make_mut(self.metadata_mut()); - md.get_mut().set_flags(flags); + pub(crate) fn set_flags(&mut self, flags: StatisticsFlags) { + self.flags = StatisticsFlagsIM::new(flags); } pub fn is_sorted_flag(&self) -> IsSorted { - self.metadata().is_sorted() + self.get_flags().is_sorted() + } + + pub fn retain_flags_from( + &mut self, + from: &ChunkedArray, + retain_flags: StatisticsFlags, + ) { + let flags = from.flags.get(); + // Try to avoid write contention. + if !flags.is_empty() { + self.set_flags(flags & retain_flags) + } } /// Set the 'sorted' bit meta info. pub fn set_sorted_flag(&mut self, sorted: IsSorted) { - Arc::make_mut(self.metadata_mut()) - .get_mut() - .set_sorted_flag(sorted) + let mut flags = self.flags.get_mut(); + flags.set_sorted(sorted); + self.flags.set_mut(flags); } /// Set the 'sorted' bit meta info. @@ -333,116 +293,6 @@ impl ChunkedArray { out } - pub fn get_min_value(&self) -> Option { - self.metadata().get_min_value().cloned() - } - - pub fn get_max_value(&self) -> Option { - self.metadata().get_max_value().cloned() - } - - pub fn get_distinct_count(&self) -> Option { - self.metadata().get_distinct_count() - } - - pub fn merge_metadata(&mut self, md: Metadata) { - let self_md = self.metadata_mut(); - let self_md = self_md.as_ref(); - let self_md = self_md.read(); - - match self_md.merge(md) { - MetadataMerge::Keep => {}, - MetadataMerge::New(md) => { - let md = Arc::new(IMMetadata::new(md)); - drop(self_md); - self.md = md; - }, - MetadataMerge::Conflict => { - panic!("Trying to merge metadata, but got conflicting information") - }, - } - } - - /// Copies [`Metadata`] properties specified by `props` from `other` with different underlying [`PolarsDataType`] into - /// `self`. - /// - /// This does not copy the properties with a different type between the [`Metadata`]s (e.g. - /// `min_value` and `max_value`) and will panic on debug builds if that is attempted. - #[inline(always)] - pub fn copy_metadata_cast( - &mut self, - other: &ChunkedArray, - props: MetadataProperties, - ) { - use MetadataProperties as P; - - // If you add a property, add it here and below to ensure that metadata is copied - // properly. - debug_assert!( - { - props - - (P::SORTED - | P::FAST_EXPLODE_LIST - | P::MIN_VALUE - | P::MAX_VALUE - | P::DISTINCT_COUNT) - } - .is_empty(), - "A MetadataProperty was not added to the copy_metadata_cast check" - ); - - debug_assert!(!props.contains(P::MIN_VALUE)); - debug_assert!(!props.contains(P::MAX_VALUE)); - - // We add a fast path here for if both metadatas are empty, as this is quite a common case. - if props.is_empty() { - return; - } - - let other_md = other.metadata(); - - if other_md.is_empty() { - return; - } - - let other_md = other_md.filter_props_cast(props); - self.merge_metadata(other_md); - } - - /// Copies [`Metadata`] properties specified by `props` from `other` into `self`. - #[inline(always)] - pub fn copy_metadata(&mut self, other: &Self, props: MetadataProperties) { - use MetadataProperties as P; - - // If you add a property add it here and below to ensure that metadata is copied properly. - debug_assert!( - { - props - - (P::SORTED - | P::FAST_EXPLODE_LIST - | P::MIN_VALUE - | P::MAX_VALUE - | P::DISTINCT_COUNT) - } - .is_empty(), - "A MetadataProperty was not added to the copy_metadata check" - ); - - // We add a fast path here for if both metadatas are empty, as this is quite a common case. - if props.is_empty() { - return; - } - - let other_md = other.metadata(); - - if other_md.is_empty() { - return; - } - - let other_md = other_md.filter_props(props); - self.merge_metadata(other_md); - } - /// Get the index of the first non null value in this [`ChunkedArray`]. pub fn first_non_null(&self) -> Option { if self.null_count() == self.len() { @@ -555,9 +405,8 @@ impl ChunkedArray { )]) }; - use MetadataProperties as P; - ca.copy_metadata(self, P::SORTED | P::FAST_EXPLODE_LIST); - + use StatisticsFlags as F; + ca.retain_flags_from(self, F::IS_SORTED_ANY | F::CAN_FAST_EXPLODE_LIST); ca } @@ -644,7 +493,7 @@ impl ChunkedArray { /// Rename this [`ChunkedArray`]. pub fn rename(&mut self, name: PlSmallStr) { - self.field = Arc::new(Field::new(name, self.field.dtype().clone())) + self.field = Arc::new(Field::new(name, self.field.dtype().clone())); } /// Return this [`ChunkedArray`] with a new name. @@ -921,7 +770,9 @@ impl Clone for ChunkedArray { ChunkedArray { field: self.field.clone(), chunks: self.chunks.clone(), - md: self.md.clone(), + flags: self.flags.clone(), + + _pd: Default::default(), length: self.length, null_count: self.null_count, } @@ -992,7 +843,9 @@ impl Default for ChunkedArray { field: Arc::new(Field::new(PlSmallStr::EMPTY, dtype)), // Invariant: always has 1 chunk. chunks: vec![new_empty_array(arrow_dtype)], - md: Arc::new(IMMetadata::default()), + flags: StatisticsFlagsIM::empty(), + + _pd: Default::default(), length: 0, null_count: 0, } diff --git a/crates/polars-core/src/chunked_array/ops/aggregate/mod.rs b/crates/polars-core/src/chunked_array/ops/aggregate/mod.rs index 0a059eb54274..ef5fd105203d 100644 --- a/crates/polars-core/src/chunked_array/ops/aggregate/mod.rs +++ b/crates/polars-core/src/chunked_array/ops/aggregate/mod.rs @@ -18,7 +18,6 @@ pub use var::*; use super::float_sorted_arg_max::{ float_arg_max_sorted_ascending, float_arg_max_sorted_descending, }; -use crate::chunked_array::metadata::MetadataEnv; use crate::chunked_array::ChunkedArray; use crate::datatypes::{BooleanChunked, PolarsNumericType}; use crate::prelude::*; @@ -115,10 +114,6 @@ where .reduce(MinMax::min_ignore_nan), }; - if MetadataEnv::experimental_enabled() { - self.interior_mut_metadata().set_min_value(result); - } - result } @@ -153,10 +148,6 @@ where .reduce(MinMax::max_ignore_nan), }; - if MetadataEnv::experimental_enabled() { - self.interior_mut_metadata().set_max_value(result); - } - result } @@ -205,18 +196,6 @@ where }), }; - if MetadataEnv::experimental_enabled() { - let (min, max) = match result { - Some((min, max)) => (Some(min), Some(max)), - None => (None, None), - }; - - let mut md = self.interior_mut_metadata(); - - md.set_min_value(min); - md.set_max_value(max); - } - result } diff --git a/crates/polars-core/src/chunked_array/ops/arity.rs b/crates/polars-core/src/chunked_array/ops/arity.rs index 774b6fba6755..90c195493e3f 100644 --- a/crates/polars-core/src/chunked_array/ops/arity.rs +++ b/crates/polars-core/src/chunked_array/ops/arity.rs @@ -5,7 +5,7 @@ use arrow::compute::utils::combine_validities_and; use polars_error::PolarsResult; use polars_utils::pl_str::PlSmallStr; -use crate::chunked_array::metadata::MetadataProperties; +use crate::chunked_array::flags::StatisticsFlags; use crate::datatypes::{ArrayCollectIterExt, ArrayFromIter}; use crate::prelude::{ChunkedArray, CompatLevel, PolarsDataType, Series, StringChunked}; use crate::utils::{align_chunks_binary, align_chunks_binary_owned, align_chunks_ternary}; @@ -540,12 +540,11 @@ where let mut ca = lhs.copy_with_chunks(chunks); - use MetadataProperties as P; - - let mut properties = P::empty(); - properties.set(P::SORTED, keep_sorted); - properties.set(P::FAST_EXPLODE_LIST, keep_fast_explode); - ca.copy_metadata(&lhs, properties); + let mut retain_flags = StatisticsFlags::empty(); + use StatisticsFlags as F; + retain_flags.set(F::IS_SORTED_ANY, keep_sorted); + retain_flags.set(F::CAN_FAST_EXPLODE_LIST, keep_fast_explode); + ca.retain_flags_from(lhs.as_ref(), retain_flags); ca } @@ -596,11 +595,11 @@ where .collect::, E>>()?; let mut ca = lhs.copy_with_chunks(chunks); - use MetadataProperties as P; - let mut properties = P::empty(); - properties.set(P::SORTED, keep_sorted); - properties.set(P::FAST_EXPLODE_LIST, keep_fast_explode); - ca.copy_metadata(&lhs, properties); + let mut retain_flags = StatisticsFlags::empty(); + use StatisticsFlags as F; + retain_flags.set(F::IS_SORTED_ANY, keep_sorted); + retain_flags.set(F::CAN_FAST_EXPLODE_LIST, keep_fast_explode); + ca.retain_flags_from(lhs.as_ref(), retain_flags); Ok(ca) } diff --git a/crates/polars-core/src/chunked_array/ops/chunkops.rs b/crates/polars-core/src/chunked_array/ops/chunkops.rs index e8c42a681a28..c910fa40bf8d 100644 --- a/crates/polars-core/src/chunked_array/ops/chunkops.rs +++ b/crates/polars-core/src/chunked_array/ops/chunkops.rs @@ -5,7 +5,7 @@ use arrow::legacy::kernels::concatenate::concatenate_owned_unchecked; use polars_error::constants::LENGTH_LIMIT_MSG; use super::*; -use crate::chunked_array::metadata::MetadataProperties; +use crate::chunked_array::flags::StatisticsFlags; #[cfg(feature = "object")] use crate::chunked_array::object::builder::ObjectChunkedBuilder; use crate::utils::slice_offsets; @@ -176,16 +176,8 @@ impl ChunkedArray { let mut ca = unsafe { self.copy_with_chunks(chunks) }; - use MetadataProperties as P; - ca.copy_metadata( - self, - P::SORTED - | P::FAST_EXPLODE_LIST - | P::MIN_VALUE - | P::MAX_VALUE - | P::DISTINCT_COUNT, - ); - + use StatisticsFlags as F; + ca.retain_flags_from(self, F::IS_SORTED_ANY | F::CAN_FAST_EXPLODE_LIST); ca } }, @@ -223,49 +215,9 @@ impl ChunkedArray { let mut out_l = unsafe { self.copy_with_chunks(l) }; let mut out_r = unsafe { self.copy_with_chunks(r) }; - use MetadataProperties as P; - let mut properties_l = P::SORTED | P::FAST_EXPLODE_LIST; - let mut properties_r = P::SORTED | P::FAST_EXPLODE_LIST; - - let is_ascending = self.is_sorted_ascending_flag(); - let is_descending = self.is_sorted_descending_flag(); - - if is_ascending || is_descending { - let has_nulls_at_start = self.null_count() != 0 - && self - .chunks() - .first() - .unwrap() - .as_ref() - .validity() - .is_some_and(|bm| bm.get(0).unwrap()); - - if !has_nulls_at_start { - let can_copy_min_value = !has_nulls_at_start && is_ascending; - let can_copy_max_value = !has_nulls_at_start && is_descending; - - properties_l.set(P::MIN_VALUE, can_copy_min_value); - properties_l.set(P::MAX_VALUE, can_copy_max_value); - } - - let has_nulls_at_end = self.null_count() != 0 - && self - .chunks() - .last() - .unwrap() - .as_ref() - .validity() - .is_some_and(|bm| bm.get(bm.len() - 1).unwrap()); - - if !has_nulls_at_end { - let can_copy_min_value = !has_nulls_at_end && is_descending; - let can_copy_max_value = !has_nulls_at_end && is_ascending; - properties_r.set(P::MIN_VALUE, can_copy_min_value); - properties_r.set(P::MAX_VALUE, can_copy_max_value); - } - } - out_l.copy_metadata(self, properties_l); - out_r.copy_metadata(self, properties_r); + use StatisticsFlags as F; + out_l.retain_flags_from(self, F::IS_SORTED_ANY | F::CAN_FAST_EXPLODE_LIST); + out_r.retain_flags_from(self, F::IS_SORTED_ANY | F::CAN_FAST_EXPLODE_LIST); (out_l, out_r) } @@ -282,53 +234,8 @@ impl ChunkedArray { let (chunks, len) = slice(&self.chunks, offset, length, self.len()); let mut out = unsafe { self.copy_with_chunks(chunks) }; - use MetadataProperties as P; - let mut properties = P::SORTED | P::FAST_EXPLODE_LIST; - - let is_ascending = self.is_sorted_ascending_flag(); - let is_descending = self.is_sorted_descending_flag(); - - if length != 0 && (is_ascending || is_descending) { - let (raw_offset, slice_len) = slice_offsets(offset, length, self.len()); - - let mut can_copy_min_value = false; - let mut can_copy_max_value = false; - - let is_at_start = raw_offset == 0; - if is_at_start { - let has_nulls_at_start = self.null_count() != 0 - && self - .chunks() - .first() - .unwrap() - .as_ref() - .validity() - .is_some_and(|bm| bm.get(0).unwrap()); - - can_copy_min_value |= !has_nulls_at_start && is_ascending; - can_copy_max_value |= !has_nulls_at_start && is_descending; - } - - let is_until_end = raw_offset + slice_len == self.len(); - if is_until_end { - let has_nulls_at_end = self.null_count() != 0 - && self - .chunks() - .last() - .unwrap() - .as_ref() - .validity() - .is_some_and(|bm| bm.get(bm.len() - 1).unwrap()); - - can_copy_min_value |= !has_nulls_at_end && is_descending; - can_copy_max_value |= !has_nulls_at_end && is_ascending; - } - - properties.set(P::MIN_VALUE, can_copy_min_value); - properties.set(P::MAX_VALUE, can_copy_max_value); - } - - out.copy_metadata(self, properties); + use StatisticsFlags as F; + out.retain_flags_from(self, F::IS_SORTED_ANY | F::CAN_FAST_EXPLODE_LIST); out.length = len; out diff --git a/crates/polars-core/src/chunked_array/ops/nulls.rs b/crates/polars-core/src/chunked_array/ops/nulls.rs index 1d1640055a72..96a2d2e8cd96 100644 --- a/crates/polars-core/src/chunked_array/ops/nulls.rs +++ b/crates/polars-core/src/chunked_array/ops/nulls.rs @@ -1,7 +1,7 @@ use arrow::bitmap::Bitmap; use super::*; -use crate::chunked_array::metadata::MetadataProperties; +use crate::chunked_array::flags::StatisticsFlags; impl ChunkedArray { /// Get a mask of the null values. @@ -25,7 +25,8 @@ impl ChunkedArray { pub(crate) fn coalesce_nulls(&self, other: &[ArrayRef]) -> Self { let chunks = coalesce_nulls(&self.chunks, other); let mut ca = unsafe { self.copy_with_chunks(chunks) }; - ca.copy_metadata(self, MetadataProperties::SORTED); + use StatisticsFlags as F; + ca.retain_flags_from(self, F::IS_SORTED_ANY); ca } } diff --git a/crates/polars-core/src/frame/column/mod.rs b/crates/polars-core/src/frame/column/mod.rs index e83658efb2d1..c4484ddbfe5d 100644 --- a/crates/polars-core/src/frame/column/mod.rs +++ b/crates/polars-core/src/frame/column/mod.rs @@ -13,7 +13,7 @@ use self::gather::check_bounds_ca; use self::partitioned::PartitionedColumn; use self::series::SeriesColumn; use crate::chunked_array::cast::CastOptions; -use crate::chunked_array::metadata::{MetadataFlags, MetadataTrait}; +use crate::chunked_array::flags::StatisticsFlags; use crate::datatypes::ReshapeDimension; use crate::prelude::*; use crate::series::{BitRepr, IsSorted, SeriesPhysIter}; @@ -860,23 +860,14 @@ impl Column { } } - pub fn get_flags(&self) -> MetadataFlags { + pub fn get_flags(&self) -> StatisticsFlags { match self { Column::Series(s) => s.get_flags(), // @partition-opt - Column::Partitioned(_) => MetadataFlags::empty(), - // @scalar-opt - Column::Scalar(_) => MetadataFlags::empty(), - } - } - - pub fn get_metadata<'a>(&'a self) -> Option> { - match self { - Column::Series(s) => s.boxed_metadata(), - // @partition-opt - Column::Partitioned(_) => None, - // @scalar-opt - Column::Scalar(_) => None, + Column::Partitioned(_) => StatisticsFlags::empty(), + Column::Scalar(_) => { + StatisticsFlags::IS_SORTED_ASC | StatisticsFlags::CAN_FAST_EXPLODE_LIST + }, } } diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs index 2980976c81ac..a0cce52e65d1 100644 --- a/crates/polars-core/src/frame/mod.rs +++ b/crates/polars-core/src/frame/mod.rs @@ -8,7 +8,7 @@ use polars_schema::schema::debug_ensure_matching_schema_names; use polars_utils::itertools::Itertools; use rayon::prelude::*; -use crate::chunked_array::metadata::MetadataFlags; +use crate::chunked_array::flags::StatisticsFlags; #[cfg(feature = "algorithm_group_by")] use crate::chunked_array::ops::unique::is_unique_helper; use crate::prelude::*; @@ -2176,44 +2176,26 @@ impl DataFrame { BooleanChunkedBuilder::new(PlSmallStr::from_static("sorted_dsc"), num_columns); let mut fast_explode_list_ca = BooleanChunkedBuilder::new(PlSmallStr::from_static("fast_explode_list"), num_columns); - let mut min_value_ca = - StringChunkedBuilder::new(PlSmallStr::from_static("min_value"), num_columns); - let mut max_value_ca = - StringChunkedBuilder::new(PlSmallStr::from_static("max_value"), num_columns); - let mut distinct_count_ca: Vec> = Vec::with_capacity(num_columns); let mut materialized_at_ca = StringChunkedBuilder::new(PlSmallStr::from_static("materialized_at"), num_columns); for col in &self.columns { - let metadata = col.get_metadata(); - - let (flags, min_value, max_value, distinct_count) = - metadata.map_or((MetadataFlags::default(), None, None, None), |md| { - ( - md.get_flags(), - md.min_value(), - md.max_value(), - md.distinct_count(), - ) - }); + let flags = col.get_flags(); let (repr, materialized_at) = match col { Column::Series(s) => ("series", s.materialized_at()), Column::Partitioned(_) => ("partitioned", None), Column::Scalar(_) => ("scalar", None), }; - let sorted_asc = flags.contains(MetadataFlags::SORTED_ASC); - let sorted_dsc = flags.contains(MetadataFlags::SORTED_DSC); - let fast_explode_list = flags.contains(MetadataFlags::FAST_EXPLODE_LIST); + let sorted_asc = flags.contains(StatisticsFlags::IS_SORTED_ASC); + let sorted_dsc = flags.contains(StatisticsFlags::IS_SORTED_DSC); + let fast_explode_list = flags.contains(StatisticsFlags::CAN_FAST_EXPLODE_LIST); column_names.append_value(col.name().clone()); repr_ca.append_value(repr); sorted_asc_ca.append_value(sorted_asc); sorted_dsc_ca.append_value(sorted_dsc); fast_explode_list_ca.append_value(fast_explode_list); - min_value_ca.append_option(min_value.map(|v| v.as_any_value().to_string())); - max_value_ca.append_option(max_value.map(|v| v.as_any_value().to_string())); - distinct_count_ca.push(distinct_count); materialized_at_ca.append_option(materialized_at.map(|v| format!("{v:#?}"))); } @@ -2226,13 +2208,6 @@ impl DataFrame { sorted_asc_ca.finish().into_column(), sorted_dsc_ca.finish().into_column(), fast_explode_list_ca.finish().into_column(), - min_value_ca.finish().into_column(), - max_value_ca.finish().into_column(), - IdxCa::from_slice_options( - PlSmallStr::from_static("distinct_count"), - &distinct_count_ca[..], - ) - .into_column(), materialized_at_ca.finish().into_column(), ], ) diff --git a/crates/polars-core/src/serde/chunked_array.rs b/crates/polars-core/src/serde/chunked_array.rs index 7a643d17185e..0ff8b7b53f4b 100644 --- a/crates/polars-core/src/serde/chunked_array.rs +++ b/crates/polars-core/src/serde/chunked_array.rs @@ -3,7 +3,7 @@ use std::cell::RefCell; use serde::ser::{Error, SerializeMap}; use serde::{Serialize, Serializer}; -use crate::chunked_array::metadata::MetadataFlags; +use crate::chunked_array::flags::StatisticsFlags; use crate::prelude::*; use crate::series::implementations::null::NullChunked; @@ -48,7 +48,7 @@ fn serialize_impl( serializer: S, name: &PlSmallStr, dtype: &DataType, - bit_settings: MetadataFlags, + bit_settings: StatisticsFlags, ca: &ChunkedArray, ) -> std::result::Result<::Ok, ::Error> where diff --git a/crates/polars-core/src/serde/mod.rs b/crates/polars-core/src/serde/mod.rs index 997170210388..e7b96c1312d8 100644 --- a/crates/polars-core/src/serde/mod.rs +++ b/crates/polars-core/src/serde/mod.rs @@ -4,7 +4,7 @@ pub mod series; #[cfg(test)] mod test { - use crate::chunked_array::metadata::MetadataFlags; + use crate::chunked_array::flags::StatisticsFlags; use crate::prelude::*; use crate::series::IsSorted; @@ -56,7 +56,7 @@ mod test { let json = serde_json::to_string(&column).unwrap(); let out = serde_json::from_reader::<_, Column>(json.as_bytes()).unwrap(); let f = out.get_flags(); - assert_ne!(f, MetadataFlags::empty()); + assert_ne!(f, StatisticsFlags::empty()); assert_eq!(column.get_flags(), out.get_flags()); } } diff --git a/crates/polars-core/src/serde/series.rs b/crates/polars-core/src/serde/series.rs index 3b91c0048699..a14778d250da 100644 --- a/crates/polars-core/src/serde/series.rs +++ b/crates/polars-core/src/serde/series.rs @@ -6,7 +6,7 @@ use arrow::io::ipc::write::WriteOptions; use serde::de::{Error as DeError, Visitor}; use serde::{Deserialize, Deserializer, Serialize, Serializer}; -use crate::chunked_array::metadata::MetadataFlags; +use crate::chunked_array::flags::StatisticsFlags; use crate::config; use crate::prelude::*; use crate::utils::accumulate_dataframes_vertical; @@ -120,8 +120,8 @@ impl<'de> Deserialize<'de> for Series { if let Some(custom_metadata) = custom_metadata { if let Some(flags) = custom_metadata.get(&FLAGS_KEY) { - if let Ok(v) = flags.parse::() { - if let Some(flags) = MetadataFlags::from_bits(v) { + if let Ok(v) = flags.parse::() { + if let Some(flags) = StatisticsFlags::from_bits(v) { s.set_flags(flags); } } else if config::verbose() { diff --git a/crates/polars-core/src/series/any_value.rs b/crates/polars-core/src/series/any_value.rs index 81752428e9fb..00cdeb213ffa 100644 --- a/crates/polars-core/src/series/any_value.rs +++ b/crates/polars-core/src/series/any_value.rs @@ -90,51 +90,27 @@ impl Series { dtype: &DataType, strict: bool, ) -> PolarsResult { - use crate::chunked_array::metadata::MetadataCollectable; - if values.is_empty() { return Ok(Self::new_empty(name, dtype)); } let mut s = match dtype { #[cfg(feature = "dtype-i8")] - DataType::Int8 => any_values_to_integer::(values, strict)? - .with_cheap_metadata() - .into_series(), + DataType::Int8 => any_values_to_integer::(values, strict)?.into_series(), #[cfg(feature = "dtype-i16")] - DataType::Int16 => any_values_to_integer::(values, strict)? - .with_cheap_metadata() - .into_series(), - DataType::Int32 => any_values_to_integer::(values, strict)? - .with_cheap_metadata() - .into_series(), - DataType::Int64 => any_values_to_integer::(values, strict)? - .with_cheap_metadata() - .into_series(), + DataType::Int16 => any_values_to_integer::(values, strict)?.into_series(), + DataType::Int32 => any_values_to_integer::(values, strict)?.into_series(), + DataType::Int64 => any_values_to_integer::(values, strict)?.into_series(), #[cfg(feature = "dtype-i128")] - DataType::Int128 => any_values_to_integer::(values, strict)? - .with_cheap_metadata() - .into_series(), + DataType::Int128 => any_values_to_integer::(values, strict)?.into_series(), #[cfg(feature = "dtype-u8")] - DataType::UInt8 => any_values_to_integer::(values, strict)? - .with_cheap_metadata() - .into_series(), + DataType::UInt8 => any_values_to_integer::(values, strict)?.into_series(), #[cfg(feature = "dtype-u16")] - DataType::UInt16 => any_values_to_integer::(values, strict)? - .with_cheap_metadata() - .into_series(), - DataType::UInt32 => any_values_to_integer::(values, strict)? - .with_cheap_metadata() - .into_series(), - DataType::UInt64 => any_values_to_integer::(values, strict)? - .with_cheap_metadata() - .into_series(), - DataType::Float32 => any_values_to_f32(values, strict)? - .with_cheap_metadata() - .into_series(), - DataType::Float64 => any_values_to_f64(values, strict)? - .with_cheap_metadata() - .into_series(), + DataType::UInt16 => any_values_to_integer::(values, strict)?.into_series(), + DataType::UInt32 => any_values_to_integer::(values, strict)?.into_series(), + DataType::UInt64 => any_values_to_integer::(values, strict)?.into_series(), + DataType::Float32 => any_values_to_f32(values, strict)?.into_series(), + DataType::Float64 => any_values_to_f64(values, strict)?.into_series(), DataType::Boolean => any_values_to_bool(values, strict)?.into_series(), DataType::String => any_values_to_string(values, strict)?.into_series(), DataType::Binary => any_values_to_binary(values, strict)?.into_series(), diff --git a/crates/polars-core/src/series/implementations/array.rs b/crates/polars-core/src/series/implementations/array.rs index 1fa55a2e7a98..893ee2b6b0c8 100644 --- a/crates/polars-core/src/series/implementations/array.rs +++ b/crates/polars-core/src/series/implementations/array.rs @@ -3,7 +3,7 @@ use std::borrow::Cow; use self::compare_inner::{TotalEqInner, TotalOrdInner}; use self::sort::arg_sort_row_fmt; -use super::{private, MetadataFlags}; +use super::{private, StatisticsFlags}; use crate::chunked_array::cast::CastOptions; use crate::chunked_array::comparison::*; use crate::chunked_array::AsSinglePtr; @@ -23,11 +23,11 @@ impl private::PrivateSeries for SeriesWrap { self.0.ref_field().dtype() } - fn _get_flags(&self) -> MetadataFlags { + fn _get_flags(&self) -> StatisticsFlags { self.0.get_flags() } - fn _set_flags(&mut self, flags: MetadataFlags) { + fn _set_flags(&mut self, flags: StatisticsFlags) { self.0.set_flags(flags) } diff --git a/crates/polars-core/src/series/implementations/binary.rs b/crates/polars-core/src/series/implementations/binary.rs index fac7a5086c5d..8c011f5b8104 100644 --- a/crates/polars-core/src/series/implementations/binary.rs +++ b/crates/polars-core/src/series/implementations/binary.rs @@ -15,10 +15,10 @@ impl private::PrivateSeries for SeriesWrap { fn _dtype(&self) -> &DataType { self.0.ref_field().dtype() } - fn _get_flags(&self) -> MetadataFlags { + fn _get_flags(&self) -> StatisticsFlags { self.0.get_flags() } - fn _set_flags(&mut self, flags: MetadataFlags) { + fn _set_flags(&mut self, flags: StatisticsFlags) { self.0.set_flags(flags) } diff --git a/crates/polars-core/src/series/implementations/binary_offset.rs b/crates/polars-core/src/series/implementations/binary_offset.rs index 07844d96a994..574f6617252e 100644 --- a/crates/polars-core/src/series/implementations/binary_offset.rs +++ b/crates/polars-core/src/series/implementations/binary_offset.rs @@ -15,10 +15,10 @@ impl private::PrivateSeries for SeriesWrap { fn _dtype(&self) -> &DataType { self.0.ref_field().dtype() } - fn _get_flags(&self) -> MetadataFlags { + fn _get_flags(&self) -> StatisticsFlags { self.0.get_flags() } - fn _set_flags(&mut self, flags: MetadataFlags) { + fn _set_flags(&mut self, flags: StatisticsFlags) { self.0.set_flags(flags) } diff --git a/crates/polars-core/src/series/implementations/boolean.rs b/crates/polars-core/src/series/implementations/boolean.rs index 2c8becc2edab..eaa9bd9a641a 100644 --- a/crates/polars-core/src/series/implementations/boolean.rs +++ b/crates/polars-core/src/series/implementations/boolean.rs @@ -14,10 +14,10 @@ impl private::PrivateSeries for SeriesWrap { fn _dtype(&self) -> &DataType { self.0.ref_field().dtype() } - fn _get_flags(&self) -> MetadataFlags { + fn _get_flags(&self) -> StatisticsFlags { self.0.get_flags() } - fn _set_flags(&mut self, flags: MetadataFlags) { + fn _set_flags(&mut self, flags: StatisticsFlags) { self.0.set_flags(flags) } @@ -115,14 +115,6 @@ impl private::PrivateSeries for SeriesWrap { } impl SeriesTrait for SeriesWrap { - fn get_metadata(&self) -> Option> { - self.0.metadata_dyn() - } - - fn boxed_metadata<'a>(&'a self) -> Option> { - Some(self.0.boxed_metadata_dyn()) - } - fn rename(&mut self, name: PlSmallStr) { self.0.rename(name); } diff --git a/crates/polars-core/src/series/implementations/categorical.rs b/crates/polars-core/src/series/implementations/categorical.rs index 32ecd4b5962c..8bad6f18e1db 100644 --- a/crates/polars-core/src/series/implementations/categorical.rs +++ b/crates/polars-core/src/series/implementations/categorical.rs @@ -55,10 +55,10 @@ impl private::PrivateSeries for SeriesWrap { fn _dtype(&self) -> &DataType { self.0.dtype() } - fn _get_flags(&self) -> MetadataFlags { + fn _get_flags(&self) -> StatisticsFlags { self.0.get_flags() } - fn _set_flags(&mut self, flags: MetadataFlags) { + fn _set_flags(&mut self, flags: StatisticsFlags) { self.0.set_flags(flags) } diff --git a/crates/polars-core/src/series/implementations/date.rs b/crates/polars-core/src/series/implementations/date.rs index 480ecc0b1c18..da11edf51e70 100644 --- a/crates/polars-core/src/series/implementations/date.rs +++ b/crates/polars-core/src/series/implementations/date.rs @@ -31,11 +31,11 @@ impl private::PrivateSeries for SeriesWrap { self.0.dtype() } - fn _get_flags(&self) -> MetadataFlags { + fn _get_flags(&self) -> StatisticsFlags { self.0.get_flags() } - fn _set_flags(&mut self, flags: MetadataFlags) { + fn _set_flags(&mut self, flags: StatisticsFlags) { self.0.set_flags(flags) } @@ -151,14 +151,6 @@ impl SeriesTrait for SeriesWrap { self.0.rename(name); } - fn get_metadata(&self) -> Option> { - self.0.metadata_dyn() - } - - fn boxed_metadata<'a>(&'a self) -> Option> { - Some(self.0.boxed_metadata_dyn()) - } - fn chunk_lengths(&self) -> ChunkLenIter { self.0.chunk_lengths() } diff --git a/crates/polars-core/src/series/implementations/datetime.rs b/crates/polars-core/src/series/implementations/datetime.rs index 5f97fd1146d8..85772c82e545 100644 --- a/crates/polars-core/src/series/implementations/datetime.rs +++ b/crates/polars-core/src/series/implementations/datetime.rs @@ -25,10 +25,10 @@ impl private::PrivateSeries for SeriesWrap { fn _dtype(&self) -> &DataType { self.0.dtype() } - fn _get_flags(&self) -> MetadataFlags { + fn _get_flags(&self) -> StatisticsFlags { self.0.get_flags() } - fn _set_flags(&mut self, flags: MetadataFlags) { + fn _set_flags(&mut self, flags: StatisticsFlags) { self.0.set_flags(flags) } diff --git a/crates/polars-core/src/series/implementations/decimal.rs b/crates/polars-core/src/series/implementations/decimal.rs index 0079cd8bbec6..557fa9e46c66 100644 --- a/crates/polars-core/src/series/implementations/decimal.rs +++ b/crates/polars-core/src/series/implementations/decimal.rs @@ -100,10 +100,10 @@ impl private::PrivateSeries for SeriesWrap { fn _dtype(&self) -> &DataType { self.0.dtype() } - fn _get_flags(&self) -> MetadataFlags { + fn _get_flags(&self) -> StatisticsFlags { self.0.get_flags() } - fn _set_flags(&mut self, flags: MetadataFlags) { + fn _set_flags(&mut self, flags: StatisticsFlags) { self.0.set_flags(flags) } diff --git a/crates/polars-core/src/series/implementations/duration.rs b/crates/polars-core/src/series/implementations/duration.rs index 7bca8fc20b65..1ededbed7d16 100644 --- a/crates/polars-core/src/series/implementations/duration.rs +++ b/crates/polars-core/src/series/implementations/duration.rs @@ -29,10 +29,10 @@ impl private::PrivateSeries for SeriesWrap { self.0.dtype() } - fn _set_flags(&mut self, flags: MetadataFlags) { + fn _set_flags(&mut self, flags: StatisticsFlags) { self.0.deref_mut().set_flags(flags) } - fn _get_flags(&self) -> MetadataFlags { + fn _get_flags(&self) -> StatisticsFlags { self.0.deref().get_flags() } diff --git a/crates/polars-core/src/series/implementations/floats.rs b/crates/polars-core/src/series/implementations/floats.rs index 780f8130ed72..0a80f1a0474e 100644 --- a/crates/polars-core/src/series/implementations/floats.rs +++ b/crates/polars-core/src/series/implementations/floats.rs @@ -17,10 +17,10 @@ macro_rules! impl_dyn_series { self.0.ref_field().dtype() } - fn _set_flags(&mut self, flags: MetadataFlags) { + fn _set_flags(&mut self, flags: StatisticsFlags) { self.0.set_flags(flags) } - fn _get_flags(&self) -> MetadataFlags { + fn _get_flags(&self) -> StatisticsFlags { self.0.get_flags() } unsafe fn equal_element( @@ -148,14 +148,6 @@ macro_rules! impl_dyn_series { ChunkRollApply::rolling_map(&self.0, _f, _options).map(|ca| ca.into_series()) } - fn get_metadata(&self) -> Option> { - self.0.metadata_dyn() - } - - fn boxed_metadata<'a>(&'a self) -> Option> { - Some(self.0.boxed_metadata_dyn()) - } - fn rename(&mut self, name: PlSmallStr) { self.0.rename(name); } diff --git a/crates/polars-core/src/series/implementations/list.rs b/crates/polars-core/src/series/implementations/list.rs index e8abe187ec72..7b5d9c3f79f5 100644 --- a/crates/polars-core/src/series/implementations/list.rs +++ b/crates/polars-core/src/series/implementations/list.rs @@ -15,10 +15,10 @@ impl private::PrivateSeries for SeriesWrap { fn _dtype(&self) -> &DataType { self.0.ref_field().dtype() } - fn _get_flags(&self) -> MetadataFlags { + fn _get_flags(&self) -> StatisticsFlags { self.0.get_flags() } - fn _set_flags(&mut self, flags: MetadataFlags) { + fn _set_flags(&mut self, flags: StatisticsFlags) { self.0.set_flags(flags) } diff --git a/crates/polars-core/src/series/implementations/mod.rs b/crates/polars-core/src/series/implementations/mod.rs index 9bcf512c7df8..9e9a5bb96b14 100644 --- a/crates/polars-core/src/series/implementations/mod.rs +++ b/crates/polars-core/src/series/implementations/mod.rs @@ -26,11 +26,9 @@ mod time; use std::any::Any; use std::borrow::Cow; -use std::sync::RwLockReadGuard; use super::*; use crate::chunked_array::comparison::*; -use crate::chunked_array::metadata::MetadataTrait; use crate::chunked_array::ops::compare_inner::{ IntoTotalEqInner, IntoTotalOrdInner, TotalEqInner, TotalOrdInner, }; @@ -80,11 +78,11 @@ macro_rules! impl_dyn_series { self.0.ref_field().dtype() } - fn _get_flags(&self) -> MetadataFlags { + fn _get_flags(&self) -> StatisticsFlags { self.0.get_flags() } - fn _set_flags(&mut self, flags: MetadataFlags) { + fn _set_flags(&mut self, flags: StatisticsFlags) { self.0.set_flags(flags) } @@ -220,14 +218,6 @@ macro_rules! impl_dyn_series { ChunkRollApply::rolling_map(&self.0, _f, _options).map(|ca| ca.into_series()) } - fn get_metadata(&self) -> Option> { - self.0.metadata_dyn() - } - - fn boxed_metadata<'a>(&'a self) -> Option> { - Some(self.0.boxed_metadata_dyn()) - } - fn rename(&mut self, name: PlSmallStr) { self.0.rename(name); } diff --git a/crates/polars-core/src/series/implementations/null.rs b/crates/polars-core/src/series/implementations/null.rs index f171bce1319b..912a191ead43 100644 --- a/crates/polars-core/src/series/implementations/null.rs +++ b/crates/polars-core/src/series/implementations/null.rs @@ -60,7 +60,7 @@ impl PrivateSeries for NullChunked { } #[allow(unused)] - fn _set_flags(&mut self, flags: MetadataFlags) {} + fn _set_flags(&mut self, flags: StatisticsFlags) {} fn _dtype(&self) -> &DataType { &DataType::Null @@ -122,8 +122,8 @@ impl PrivateSeries for NullChunked { AggList::agg_list(self, groups) } - fn _get_flags(&self) -> MetadataFlags { - MetadataFlags::empty() + fn _get_flags(&self) -> StatisticsFlags { + StatisticsFlags::empty() } fn vec_hash(&self, random_state: PlRandomState, buf: &mut Vec) -> PolarsResult<()> { diff --git a/crates/polars-core/src/series/implementations/object.rs b/crates/polars-core/src/series/implementations/object.rs index d2d1c8e0dd2d..22afb23fec9e 100644 --- a/crates/polars-core/src/series/implementations/object.rs +++ b/crates/polars-core/src/series/implementations/object.rs @@ -2,7 +2,7 @@ use std::any::Any; use std::borrow::Cow; use self::compare_inner::TotalOrdInner; -use super::{BitRepr, MetadataFlags}; +use super::{BitRepr, StatisticsFlags}; use crate::chunked_array::cast::CastOptions; use crate::chunked_array::object::PolarsObjectSafe; use crate::chunked_array::ops::compare_inner::{IntoTotalEqInner, TotalEqInner}; @@ -41,10 +41,10 @@ where self.0.dtype() } - fn _set_flags(&mut self, flags: MetadataFlags) { + fn _set_flags(&mut self, flags: StatisticsFlags) { self.0.set_flags(flags) } - fn _get_flags(&self) -> MetadataFlags { + fn _get_flags(&self) -> StatisticsFlags { self.0.get_flags() } unsafe fn agg_list(&self, groups: &GroupsProxy) -> Series { diff --git a/crates/polars-core/src/series/implementations/string.rs b/crates/polars-core/src/series/implementations/string.rs index 1d68298681a5..44c8d5522491 100644 --- a/crates/polars-core/src/series/implementations/string.rs +++ b/crates/polars-core/src/series/implementations/string.rs @@ -15,10 +15,10 @@ impl private::PrivateSeries for SeriesWrap { self.0.ref_field().dtype() } - fn _set_flags(&mut self, flags: MetadataFlags) { + fn _set_flags(&mut self, flags: StatisticsFlags) { self.0.set_flags(flags) } - fn _get_flags(&self) -> MetadataFlags { + fn _get_flags(&self) -> StatisticsFlags { self.0.get_flags() } unsafe fn equal_element(&self, idx_self: usize, idx_other: usize, other: &Series) -> bool { diff --git a/crates/polars-core/src/series/implementations/struct_.rs b/crates/polars-core/src/series/implementations/struct_.rs index 4677a1494d05..7cd943771351 100644 --- a/crates/polars-core/src/series/implementations/struct_.rs +++ b/crates/polars-core/src/series/implementations/struct_.rs @@ -26,11 +26,11 @@ impl PrivateSeries for SeriesWrap { self.0.compute_len() } - fn _get_flags(&self) -> MetadataFlags { - MetadataFlags::empty() + fn _get_flags(&self) -> StatisticsFlags { + StatisticsFlags::empty() } - fn _set_flags(&mut self, _flags: MetadataFlags) {} + fn _set_flags(&mut self, _flags: StatisticsFlags) {} // TODO! remove this. Very slow. Asof join should use row-encoding. unsafe fn equal_element(&self, idx_self: usize, idx_other: usize, other: &Series) -> bool { diff --git a/crates/polars-core/src/series/implementations/time.rs b/crates/polars-core/src/series/implementations/time.rs index 5fe2c64bd542..75f64e651e1a 100644 --- a/crates/polars-core/src/series/implementations/time.rs +++ b/crates/polars-core/src/series/implementations/time.rs @@ -31,11 +31,11 @@ impl private::PrivateSeries for SeriesWrap { self.0.dtype() } - fn _get_flags(&self) -> MetadataFlags { + fn _get_flags(&self) -> StatisticsFlags { self.0.get_flags() } - fn _set_flags(&mut self, flags: MetadataFlags) { + fn _set_flags(&mut self, flags: StatisticsFlags) { self.0.set_flags(flags) } diff --git a/crates/polars-core/src/series/mod.rs b/crates/polars-core/src/series/mod.rs index 6767906dc4d2..afaf57f769f0 100644 --- a/crates/polars-core/src/series/mod.rs +++ b/crates/polars-core/src/series/mod.rs @@ -1,4 +1,5 @@ //! Type agnostic columnar data structure. +use crate::chunked_array::flags::StatisticsFlags; pub use crate::prelude::ChunkCompareEq; use crate::prelude::*; use crate::{HEAD_DEFAULT_LENGTH, TAIL_DEFAULT_LENGTH}; @@ -38,7 +39,6 @@ use polars_utils::itertools::Itertools; pub use series_trait::{IsSorted, *}; use crate::chunked_array::cast::CastOptions; -use crate::chunked_array::metadata::{IMMetadata, Metadata, MetadataFlags}; #[cfg(feature = "zip_with")] use crate::series::arithmetic::coerce_lhs_rhs; use crate::utils::{handle_casting_failures, materialize_dyn_int, Wrap}; @@ -215,21 +215,10 @@ impl Series { // TODO! this probably can now be removed, now we don't have special case for structs. pub fn select_chunk(&self, i: usize) -> Self { let mut new = self.clear(); - let flags = self.get_flags(); + let mut flags = self.get_flags(); - let mut new_flags = MetadataFlags::empty(); - new_flags.set( - MetadataFlags::SORTED_ASC, - flags.contains(MetadataFlags::SORTED_ASC), - ); - new_flags.set( - MetadataFlags::SORTED_DSC, - flags.contains(MetadataFlags::SORTED_DSC), - ); - new_flags.set( - MetadataFlags::FAST_EXPLODE_LIST, - flags.contains(MetadataFlags::FAST_EXPLODE_LIST), - ); + use StatisticsFlags as F; + flags &= F::IS_SORTED_ANY | F::CAN_FAST_EXPLODE_LIST; // Assign mut so we go through arc only once. let mut_new = new._get_inner_mut(); @@ -238,7 +227,7 @@ impl Series { chunks.clear(); chunks.push(chunk); mut_new.compute_len(); - mut_new._set_flags(new_flags); + mut_new._set_flags(flags); new } @@ -246,31 +235,23 @@ impl Series { if self.len() <= 1 { return IsSorted::Ascending; } - let flags = self.get_flags(); - if flags.contains(MetadataFlags::SORTED_DSC) { - IsSorted::Descending - } else if flags.contains(MetadataFlags::SORTED_ASC) { - IsSorted::Ascending - } else { - IsSorted::Not - } + self.get_flags().is_sorted() } pub fn set_sorted_flag(&mut self, sorted: IsSorted) { let mut flags = self.get_flags(); - flags.set_sorted_flag(sorted); + flags.set_sorted(sorted); self.set_flags(flags); } pub(crate) fn clear_flags(&mut self) { - self.set_flags(MetadataFlags::empty()); + self.set_flags(StatisticsFlags::empty()); } - #[allow(dead_code)] - pub fn get_flags(&self) -> MetadataFlags { + pub fn get_flags(&self) -> StatisticsFlags { self.0._get_flags() } - pub(crate) fn set_flags(&mut self, flags: MetadataFlags) { + pub(crate) fn set_flags(&mut self, flags: StatisticsFlags) { self._get_inner_mut()._set_flags(flags) } @@ -291,24 +272,6 @@ impl Series { self } - /// to set the [`Metadata`] for the underlying [`ChunkedArray`] - /// - /// This does not guarantee that the [`Metadata`] is always set. It returns whether it was - /// successful. - pub fn try_set_metadata(&mut self, metadata: Metadata) -> bool { - let inner = self._get_inner_mut(); - - // @NOTE: These types are not the same if they are logical for example. For now, we just - // say: do not set the metadata when you get into this situation. This can be a @TODO for - // later. - if &T::get_dtype() != inner.dtype() { - return false; - } - - inner.as_mut().md = Arc::new(IMMetadata::new(metadata)); - true - } - pub fn from_arrow_chunks(name: PlSmallStr, arrays: Vec) -> PolarsResult { Self::try_from((name, arrays)) } diff --git a/crates/polars-core/src/series/series_trait.rs b/crates/polars-core/src/series/series_trait.rs index 05760a533171..64f403e0393e 100644 --- a/crates/polars-core/src/series/series_trait.rs +++ b/crates/polars-core/src/series/series_trait.rs @@ -1,13 +1,11 @@ use std::any::Any; use std::borrow::Cow; -use std::sync::RwLockReadGuard; use arrow::bitmap::{Bitmap, MutableBitmap}; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::chunked_array::cast::CastOptions; -use crate::chunked_array::metadata::MetadataTrait; #[cfg(feature = "object")] use crate::chunked_array::object::PolarsObjectSafe; use crate::prelude::*; @@ -38,7 +36,7 @@ pub enum BitRepr { pub(crate) mod private { use super::*; - use crate::chunked_array::metadata::MetadataFlags; + use crate::chunked_array::flags::StatisticsFlags; use crate::chunked_array::ops::compare_inner::{TotalEqInner, TotalOrdInner}; pub trait PrivateSeriesNumeric { @@ -66,9 +64,9 @@ pub(crate) mod private { fn compute_len(&mut self); - fn _get_flags(&self) -> MetadataFlags; + fn _get_flags(&self) -> StatisticsFlags; - fn _set_flags(&mut self, flags: MetadataFlags); + fn _set_flags(&mut self, flags: StatisticsFlags); unsafe fn equal_element( &self, @@ -205,14 +203,6 @@ pub trait SeriesTrait: /// Rename the Series. fn rename(&mut self, name: PlSmallStr); - fn get_metadata(&self) -> Option> { - None - } - - fn boxed_metadata<'a>(&'a self) -> Option> { - None - } - /// Get the lengths of the underlying chunks fn chunk_lengths(&self) -> ChunkLenIter; diff --git a/crates/polars-expr/src/expressions/aggregation.rs b/crates/polars-expr/src/expressions/aggregation.rs index a2e868c59fa2..7ba0c26ab619 100644 --- a/crates/polars-expr/src/expressions/aggregation.rs +++ b/crates/polars-expr/src/expressions/aggregation.rs @@ -4,7 +4,6 @@ use arrow::array::*; use arrow::compute::concatenate::concatenate; use arrow::legacy::utils::CustomIterTools; use arrow::offset::Offsets; -use polars_core::chunked_array::metadata::MetadataEnv; use polars_core::prelude::*; use polars_core::series::IsSorted; use polars_core::utils::{NoNull, _split_offsets}; @@ -66,23 +65,15 @@ impl PhysicalExpr for AggregationExpr { }; match group_by { - GroupByMethod::Min => { - if MetadataEnv::experimental_enabled() { - if let Some(sc) = s.get_metadata().and_then(|v| v.min_value()) { - return Ok(sc.into_column(s.name().clone())); - } - } - - match s.is_sorted_flag() { - IsSorted::Ascending | IsSorted::Descending => { - s.min_reduce().map(|sc| sc.into_column(s.name().clone())) - }, - IsSorted::Not => parallel_op_columns( - |s| s.min_reduce().map(|sc| sc.into_column(s.name().clone())), - s, - allow_threading, - ), - } + GroupByMethod::Min => match s.is_sorted_flag() { + IsSorted::Ascending | IsSorted::Descending => { + s.min_reduce().map(|sc| sc.into_column(s.name().clone())) + }, + IsSorted::Not => parallel_op_columns( + |s| s.min_reduce().map(|sc| sc.into_column(s.name().clone())), + s, + allow_threading, + ), }, #[cfg(feature = "propagate_nans")] GroupByMethod::NanMin => parallel_op_columns( @@ -100,23 +91,15 @@ impl PhysicalExpr for AggregationExpr { GroupByMethod::NanMin => { panic!("activate 'propagate_nans' feature") }, - GroupByMethod::Max => { - if MetadataEnv::experimental_enabled() { - if let Some(sc) = s.get_metadata().and_then(|v| v.max_value()) { - return Ok(sc.into_column(s.name().clone())); - } - } - - match s.is_sorted_flag() { - IsSorted::Ascending | IsSorted::Descending => { - s.max_reduce().map(|sc| sc.into_column(s.name().clone())) - }, - IsSorted::Not => parallel_op_columns( - |s| s.max_reduce().map(|sc| sc.into_column(s.name().clone())), - s, - allow_threading, - ), - } + GroupByMethod::Max => match s.is_sorted_flag() { + IsSorted::Ascending | IsSorted::Descending => { + s.max_reduce().map(|sc| sc.into_column(s.name().clone())) + }, + IsSorted::Not => parallel_op_columns( + |s| s.max_reduce().map(|sc| sc.into_column(s.name().clone())), + s, + allow_threading, + ), }, #[cfg(feature = "propagate_nans")] GroupByMethod::NanMax => parallel_op_columns( @@ -152,18 +135,9 @@ impl PhysicalExpr for AggregationExpr { allow_threading, ), GroupByMethod::Groups => unreachable!(), - GroupByMethod::NUnique => { - if MetadataEnv::experimental_enabled() { - if let Some(count) = s.get_metadata().and_then(|v| v.distinct_count()) { - let count = count + IdxSize::from(s.null_count() > 0); - return Ok(IdxCa::from_slice(s.name().clone(), &[count]).into_column()); - } - } - - s.n_unique().map(|count| { - IdxCa::from_slice(s.name().clone(), &[count as IdxSize]).into_column() - }) - }, + GroupByMethod::NUnique => s.n_unique().map(|count| { + IdxCa::from_slice(s.name().clone(), &[count as IdxSize]).into_column() + }), GroupByMethod::Count { include_nulls } => { let count = s.len() - s.null_count() * !include_nulls as usize; diff --git a/crates/polars-io/src/parquet/read/mod.rs b/crates/polars-io/src/parquet/read/mod.rs index cc0020cc7857..02189a0dce85 100644 --- a/crates/polars-io/src/parquet/read/mod.rs +++ b/crates/polars-io/src/parquet/read/mod.rs @@ -21,7 +21,6 @@ mod options; mod predicates; mod read_impl; mod reader; -mod to_metadata; mod utils; const ROW_COUNT_OVERFLOW_ERR: PolarsError = PolarsError::ComputeError(ErrString::new_static( diff --git a/crates/polars-io/src/parquet/read/read_impl.rs b/crates/polars-io/src/parquet/read/read_impl.rs index 5f4f381a2e9b..a065d999e943 100644 --- a/crates/polars-io/src/parquet/read/read_impl.rs +++ b/crates/polars-io/src/parquet/read/read_impl.rs @@ -10,18 +10,13 @@ use polars_core::prelude::*; use polars_core::series::IsSorted; use polars_core::utils::{accumulate_dataframes_vertical, split_df}; use polars_core::{config, POOL}; -use polars_parquet::parquet::error::ParquetResult; -use polars_parquet::parquet::statistics::Statistics; -use polars_parquet::read::{ - self, ColumnChunkMetadata, FileMetadata, Filter, PhysicalType, RowGroupMetadata, -}; +use polars_parquet::read::{self, ColumnChunkMetadata, FileMetadata, Filter, RowGroupMetadata}; use rayon::prelude::*; #[cfg(feature = "cloud")] use super::async_impl::FetchRowGroupsFromObjectStore; use super::mmap::{mmap_columns, ColumnStore}; use super::predicates::read_this_row_group; -use super::to_metadata::ToMetadata; use super::utils::materialize_empty_df; use super::{mmap, ParallelStrategy}; use crate::hive::{self, materialize_hive_partitions}; @@ -122,8 +117,6 @@ fn column_idx_to_series( file_schema: &ArrowSchema, store: &mmap::ColumnStore, ) -> PolarsResult { - let did_filter = filter.is_some(); - let field = file_schema.get_at_index(column_i).unwrap().1; #[cfg(debug_assertions)] @@ -131,69 +124,8 @@ fn column_idx_to_series( assert_dtypes(field.dtype()) } let columns = mmap_columns(store, field_md); - let stats = columns - .iter() - .map(|(col_md, _)| col_md.statistics().transpose()) - .collect::>>>(); let array = mmap::to_deserializer(columns, field.clone(), filter)?; - let mut series = Series::try_from((field, array))?; - - // We cannot really handle nested metadata at the moment. Just skip it. - use ArrowDataType as AD; - match field.dtype() { - AD::List(_) | AD::LargeList(_) | AD::Struct(_) | AD::FixedSizeList(_, _) => { - return Ok(series) - }, - _ => {}, - } - - // We cannot trust the statistics if we filtered the parquet already. - if did_filter { - return Ok(series); - } - - // See if we can find some statistics for this series. If we cannot find anything just return - // the series as is. - let Ok(Some(stats)) = stats.map(|mut s| s.pop().flatten()) else { - return Ok(series); - }; - - let series_trait = series.as_ref(); - - macro_rules! match_dtypes_into_metadata { - ($(($dtype:pat, $phystype:pat) => ($stats:ident, $pldtype:ty),)+) => { - match (series_trait.dtype(), stats.physical_type()) { - $( - ($dtype, $phystype) => { - series.try_set_metadata( - ToMetadata::<$pldtype>::to_metadata(stats.$stats()) - ); - })+ - _ => {}, - } - }; - } - - // Match the data types used by the Series and by the Statistics. If we find a match, set some - // Metadata for the underlying ChunkedArray. - use {DataType as D, PhysicalType as P}; - match_dtypes_into_metadata! { - (D::Boolean, P::Boolean ) => (expect_as_boolean, BooleanType), - (D::UInt8, P::Int32 ) => (expect_as_int32, UInt8Type ), - (D::UInt16, P::Int32 ) => (expect_as_int32, UInt16Type ), - (D::UInt32, P::Int32 ) => (expect_as_int32, UInt32Type ), - (D::UInt64, P::Int64 ) => (expect_as_int64, UInt64Type ), - (D::Int8, P::Int32 ) => (expect_as_int32, Int8Type ), - (D::Int16, P::Int32 ) => (expect_as_int32, Int16Type ), - (D::Int32, P::Int32 ) => (expect_as_int32, Int32Type ), - (D::Int64, P::Int64 ) => (expect_as_int64, Int64Type ), - (D::Float32, P::Float ) => (expect_as_float, Float32Type), - (D::Float64, P::Double ) => (expect_as_double, Float64Type), - (D::String, P::ByteArray) => (expect_as_binary, StringType ), - (D::Binary, P::ByteArray) => (expect_as_binary, BinaryType ), - } - - Ok(series) + Series::try_from((field, array)) } #[allow(clippy::too_many_arguments)] diff --git a/crates/polars-io/src/parquet/read/to_metadata.rs b/crates/polars-io/src/parquet/read/to_metadata.rs deleted file mode 100644 index 23067f719ad0..000000000000 --- a/crates/polars-io/src/parquet/read/to_metadata.rs +++ /dev/null @@ -1,97 +0,0 @@ -use polars_core::chunked_array::metadata::Metadata; -use polars_core::datatypes::{ - BinaryType, BooleanType, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, - PolarsDataType, StringType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, -}; -use polars_parquet::parquet::statistics::{ - BinaryStatistics, BooleanStatistics, PrimitiveStatistics, -}; - -pub trait ToMetadata: Sized + 'static { - fn to_metadata(&self) -> Metadata; -} - -impl ToMetadata for BooleanStatistics { - fn to_metadata(&self) -> Metadata { - let mut md = Metadata::default(); - - md.set_distinct_count(self.distinct_count.and_then(|v| v.try_into().ok())); - md.set_min_value(self.min_value); - md.set_max_value(self.max_value); - - md - } -} - -impl ToMetadata for BinaryStatistics { - fn to_metadata(&self) -> Metadata { - let mut md = Metadata::default(); - - md.set_distinct_count(self.distinct_count.and_then(|v| v.try_into().ok())); - md.set_min_value( - self.min_value - .as_ref() - .map(|v| v.clone().into_boxed_slice()), - ); - md.set_max_value( - self.max_value - .as_ref() - .map(|v| v.clone().into_boxed_slice()), - ); - - md - } -} - -impl ToMetadata for BinaryStatistics { - fn to_metadata(&self) -> Metadata { - let mut md = Metadata::default(); - - md.set_distinct_count(self.distinct_count.and_then(|v| v.try_into().ok())); - md.set_min_value( - self.min_value - .as_ref() - .and_then(|s| String::from_utf8(s.clone()).ok()), - ); - md.set_max_value( - self.max_value - .as_ref() - .and_then(|s| String::from_utf8(s.clone()).ok()), - ); - - md - } -} - -macro_rules! prim_statistics { - ($(($bstore:ty, $pltype:ty),)+) => { - $( - impl ToMetadata<$pltype> for PrimitiveStatistics<$bstore> { - fn to_metadata(&self) -> Metadata<$pltype> { - let mut md = Metadata::default(); - - md.set_distinct_count(self.distinct_count.and_then(|v| v.try_into().ok())); - md.set_min_value(self.min_value.map(|v| v as <$pltype as PolarsDataType>::OwnedPhysical)); - md.set_max_value(self.max_value.map(|v| v as <$pltype as PolarsDataType>::OwnedPhysical)); - - md - } - } - )+ - } -} - -prim_statistics! { - (i32, Int8Type), - (i32, Int16Type), - (i32, Int32Type), - (i64, Int64Type), - - (i32, UInt8Type), - (i32, UInt16Type), - (i32, UInt32Type), - (i64, UInt64Type), - - (f32, Float32Type), - (f64, Float64Type), -}