From cbc0ea0d4d42d037710a2f156e69fa1b56f4d0c5 Mon Sep 17 00:00:00 2001 From: Gijs Burghoorn Date: Thu, 5 Dec 2024 08:33:21 +0100 Subject: [PATCH] perf: Utilize the RangedUniqueKernel for Enum/Categorical (#20150) --- .../polars-arrow/src/array/dictionary/mod.rs | 4 + crates/polars-arrow/src/bitmap/bitmask.rs | 4 + crates/polars-compute/src/unique/boolean.rs | 96 +++--- .../polars-compute/src/unique/dictionary.rs | 63 ++++ crates/polars-compute/src/unique/mod.rs | 9 +- crates/polars-compute/src/unique/primitive.rs | 291 +++++++++++------- .../logical/categorical/ops/unique.rs | 22 +- .../src/chunked_array/ops/unique/mod.rs | 35 +-- 8 files changed, 313 insertions(+), 211 deletions(-) create mode 100644 crates/polars-compute/src/unique/dictionary.rs diff --git a/crates/polars-arrow/src/array/dictionary/mod.rs b/crates/polars-arrow/src/array/dictionary/mod.rs index f23c409c48a9..3f44dd604980 100644 --- a/crates/polars-arrow/src/array/dictionary/mod.rs +++ b/crates/polars-arrow/src/array/dictionary/mod.rs @@ -401,6 +401,10 @@ impl DictionaryArray { }, }) } + + pub fn take(self) -> (ArrowDataType, PrimitiveArray, Box) { + (self.dtype, self.keys, self.values) + } } impl Array for DictionaryArray { diff --git a/crates/polars-arrow/src/bitmap/bitmask.rs b/crates/polars-arrow/src/bitmap/bitmask.rs index 637ae06a8e6b..b212b3fd7fed 100644 --- a/crates/polars-arrow/src/bitmap/bitmask.rs +++ b/crates/polars-arrow/src/bitmap/bitmask.rs @@ -91,6 +91,10 @@ impl std::fmt::Debug for BitMask<'_> { impl<'a> BitMask<'a> { pub fn from_bitmap(bitmap: &'a Bitmap) -> Self { let (bytes, offset, len) = bitmap.as_slice(); + Self::new(bytes, offset, len) + } + + pub fn new(bytes: &'a [u8], offset: usize, len: usize) -> Self { // Check length so we can use unsafe access in our get. assert!(bytes.len() * 8 >= len + offset); Self { bytes, offset, len } diff --git a/crates/polars-compute/src/unique/boolean.rs b/crates/polars-compute/src/unique/boolean.rs index bee639d2bacf..e72d6348dc43 100644 --- a/crates/polars-compute/src/unique/boolean.rs +++ b/crates/polars-compute/src/unique/boolean.rs @@ -4,31 +4,14 @@ use arrow::datatypes::ArrowDataType; use super::{GenericUniqueKernel, RangedUniqueKernel}; +#[derive(Default)] pub struct BooleanUniqueKernelState { seen: u32, - has_null: bool, - dtype: ArrowDataType, -} - -const fn to_value(scalar: Option) -> u8 { - match scalar { - None => 0, - Some(false) => 1, - Some(true) => 2, - } } impl BooleanUniqueKernelState { - pub fn new(has_null: bool, dtype: ArrowDataType) -> Self { - Self { - seen: 0, - has_null, - dtype, - } - } - - fn has_seen_null(&self) -> bool { - self.has_null && self.seen & (1 << to_value(None)) != 0 + pub fn new() -> Self { + Self::default() } } @@ -45,79 +28,70 @@ impl RangedUniqueKernel for BooleanUniqueKernelState { } let null_count = array.null_count(); - let values = array.values(); - - if !self.has_null || null_count == 0 { - let set_bits = values.set_bits(); - self.seen |= u32::from(set_bits != 0) << to_value(Some(true)); - self.seen |= u32::from(set_bits != values.len()) << to_value(Some(false)); - - return; - } - - self.seen |= u32::from(null_count > 0) << to_value(None); + self.seen |= u32::from(null_count > 0) << 2; + let set_bits = if null_count > 0 { + array + .values() + .num_intersections_with(array.validity().unwrap()) + } else { + array.values().set_bits() + }; - if array.len() != null_count { - let validity = array.validity().unwrap(); + self.seen |= u32::from(set_bits != array.len() - null_count); + self.seen |= u32::from(set_bits != 0) << 1; + } - let set_bits = values.num_intersections_with(validity); - self.seen |= u32::from(set_bits != 0) << to_value(Some(true)); - self.seen |= u32::from(set_bits != values.len() - null_count) << to_value(Some(false)); - } + fn append_state(&mut self, other: &Self) { + self.seen |= other.seen; } fn finalize_unique(self) -> Self::Array { - let mut values = MutableBitmap::with_capacity(3); - let validity = if self.has_seen_null() { - let mut validity = MutableBitmap::with_capacity(3); - - for i in 0..3 { - if self.seen & (1 << i) != 0 { - values.push(i > 1); - validity.push(i > 0); - } - } + let mut values = MutableBitmap::with_capacity(self.seen.count_ones() as usize); + if self.seen & 0b001 != 0 { + values.push(false); + } + if self.seen & 0b010 != 0 { + values.push(true); + } + let validity = if self.seen & 0b100 != 0 { + let mut validity = MutableBitmap::with_capacity(values.len() + 1); + validity.extend_constant(values.len(), true); + validity.push(false); + values.push(false); Some(validity.freeze()) } else { - for i in 1..3 { - if self.seen & (1 << i) != 0 { - values.push(i > 1); - } - } - None }; let values = values.freeze(); - - BooleanArray::new(self.dtype, values, validity) + BooleanArray::new(ArrowDataType::Boolean, values, validity) } - fn finalize_n_unique(self) -> usize { + fn finalize_n_unique(&self) -> usize { self.seen.count_ones() as usize } - fn finalize_n_unique_non_null(self) -> usize { - (self.seen & !1).count_ones() as usize + fn finalize_n_unique_non_null(&self) -> usize { + (self.seen & 0b011).count_ones() as usize } } impl GenericUniqueKernel for BooleanArray { fn unique(&self) -> Self { - let mut state = BooleanUniqueKernelState::new(self.null_count() > 0, self.dtype().clone()); + let mut state = BooleanUniqueKernelState::new(); state.append(self); state.finalize_unique() } fn n_unique(&self) -> usize { - let mut state = BooleanUniqueKernelState::new(self.null_count() > 0, self.dtype().clone()); + let mut state = BooleanUniqueKernelState::new(); state.append(self); state.finalize_n_unique() } fn n_unique_non_null(&self) -> usize { - let mut state = BooleanUniqueKernelState::new(self.null_count() > 0, self.dtype().clone()); + let mut state = BooleanUniqueKernelState::new(); state.append(self); state.finalize_n_unique_non_null() } diff --git a/crates/polars-compute/src/unique/dictionary.rs b/crates/polars-compute/src/unique/dictionary.rs new file mode 100644 index 000000000000..9ad967ed38c2 --- /dev/null +++ b/crates/polars-compute/src/unique/dictionary.rs @@ -0,0 +1,63 @@ +use arrow::array::{Array, DictionaryArray}; +use arrow::datatypes::ArrowDataType; + +use super::{PrimitiveRangedUniqueState, RangedUniqueKernel}; + +/// A specialized unique kernel for [`DictionaryArray`] for when all values are in a small known +/// range. +pub struct DictionaryRangedUniqueState { + key_state: PrimitiveRangedUniqueState, + values: Box, +} + +impl DictionaryRangedUniqueState { + pub fn new(values: Box) -> Self { + Self { + key_state: PrimitiveRangedUniqueState::new(0, values.len() as u32 + 1), + values, + } + } + + pub fn key_state(&mut self) -> &mut PrimitiveRangedUniqueState { + &mut self.key_state + } +} + +impl RangedUniqueKernel for DictionaryRangedUniqueState { + type Array = DictionaryArray; + + fn has_seen_all(&self) -> bool { + self.key_state.has_seen_all() + } + + fn append(&mut self, array: &Self::Array) { + self.key_state.append(array.keys()); + } + + fn append_state(&mut self, other: &Self) { + debug_assert_eq!(self.values, other.values); + self.key_state.append_state(&other.key_state); + } + + fn finalize_unique(self) -> Self::Array { + let keys = self.key_state.finalize_unique(); + DictionaryArray::::try_new( + ArrowDataType::Dictionary( + arrow::datatypes::IntegerType::UInt32, + Box::new(self.values.dtype().clone()), + false, + ), + keys, + self.values, + ) + .unwrap() + } + + fn finalize_n_unique(&self) -> usize { + self.key_state.finalize_n_unique() + } + + fn finalize_n_unique_non_null(&self) -> usize { + self.key_state.finalize_n_unique_non_null() + } +} diff --git a/crates/polars-compute/src/unique/mod.rs b/crates/polars-compute/src/unique/mod.rs index d64052325f01..645daea39126 100644 --- a/crates/polars-compute/src/unique/mod.rs +++ b/crates/polars-compute/src/unique/mod.rs @@ -39,12 +39,15 @@ pub trait RangedUniqueKernel { /// Append an `Array`'s values to the `State` fn append(&mut self, array: &Self::Array); + /// Append another state into the `State` + fn append_state(&mut self, other: &Self); + /// Consume the state to get the unique elements fn finalize_unique(self) -> Self::Array; /// Consume the state to get the number of unique elements including null - fn finalize_n_unique(self) -> usize; + fn finalize_n_unique(&self) -> usize; /// Consume the state to get the number of unique elements excluding null - fn finalize_n_unique_non_null(self) -> usize; + fn finalize_n_unique_non_null(&self) -> usize; } /// A generic unique kernel that selects the generally applicable unique kernel for an `Array`. @@ -58,7 +61,9 @@ pub trait GenericUniqueKernel { } mod boolean; +mod dictionary; mod primitive; pub use boolean::BooleanUniqueKernelState; +pub use dictionary::DictionaryRangedUniqueState; pub use primitive::PrimitiveRangedUniqueState; diff --git a/crates/polars-compute/src/unique/primitive.rs b/crates/polars-compute/src/unique/primitive.rs index c1e258f800fa..576d3832af4f 100644 --- a/crates/polars-compute/src/unique/primitive.rs +++ b/crates/polars-compute/src/unique/primitive.rs @@ -1,11 +1,11 @@ use std::ops::{Add, RangeInclusive, Sub}; use arrow::array::PrimitiveArray; +use arrow::bitmap::bitmask::BitMask; use arrow::bitmap::MutableBitmap; use arrow::datatypes::ArrowDataType; use arrow::types::NativeType; -use num_traits::FromPrimitive; -use polars_utils::float::IsFloat; +use num_traits::{FromPrimitive, ToPrimitive}; use polars_utils::total_ord::TotalOrd; use super::RangedUniqueKernel; @@ -13,159 +13,224 @@ use super::RangedUniqueKernel; /// A specialized unique kernel for [`PrimitiveArray`] for when all values are in a small known /// range. pub struct PrimitiveRangedUniqueState { - seen: u128, + seen: Seen, range: RangeInclusive, - has_null: bool, - dtype: ArrowDataType, } -impl PrimitiveRangedUniqueState -where - T: Add + Sub + FromPrimitive + IsFloat, -{ - pub fn new(min_value: T, max_value: T, has_null: bool, dtype: ArrowDataType) -> Option { - // We cannot really do this for floating point number as these are not as discrete as - // integers. - if T::is_float() { - return None; - } +enum Seen { + Small(u128), + Large(MutableBitmap), +} - if TotalOrd::tot_gt( - &(max_value - min_value), - &T::from_u8(128 - u8::from(has_null)).unwrap(), - ) { - return None; +impl Seen { + pub fn from_size(size: usize) -> Self { + if size <= 128 { + Self::Small(0) + } else { + Self::Large(MutableBitmap::from_len_zeroed(size)) } - - Some(Self { - seen: 0, - range: min_value..=max_value, - has_null, - dtype, - }) } - fn len(&self) -> u8 { - (*self.range.end() - *self.range.start()).to_le_bytes()[0] + fn num_seen(&self) -> usize { + match self { + Seen::Small(v) => v.count_ones() as usize, + Seen::Large(v) => v.set_bits(), + } } - fn has_seen_null(&self) -> bool { - self.has_null && self.seen & 1 != 0 + fn has_seen_null(&self, size: usize) -> bool { + match self { + Self::Small(v) => v >> (size - 1) != 0, + Self::Large(v) => v.get(size - 1), + } } +} - #[inline(always)] - fn to_value(&self, scalar: Option) -> u8 { - match scalar { - None => { - debug_assert!(self.has_null); - 0 - }, - Some(v) => { - debug_assert!(::tot_le(&v, self.range.end())); - debug_assert!(::tot_ge(&v, self.range.start())); - - (v - *self.range.start()).to_le_bytes()[0] + u8::from(self.has_null) - }, +impl PrimitiveRangedUniqueState +where + T: Add + Sub + ToPrimitive + FromPrimitive, +{ + pub fn new(min_value: T, max_value: T) -> Self { + let size = (max_value - min_value).to_usize().unwrap(); + // Range is inclusive + let size = size + 1; + // One value is left for null + let size = size + 1; + + Self { + seen: Seen::from_size(size), + range: min_value..=max_value, } } + + fn size(&self) -> usize { + (*self.range.end() - *self.range.start()) + .to_usize() + .unwrap() + + 1 + } } impl RangedUniqueKernel for PrimitiveRangedUniqueState where - T: Add + Sub + FromPrimitive + IsFloat, + T: Add + Sub + ToPrimitive + FromPrimitive, { type Array = PrimitiveArray; fn has_seen_all(&self) -> bool { - let len = self.len(); - let bit_length = len + u8::from(self.has_null); - - debug_assert!(bit_length > 0); - debug_assert!(bit_length <= 128); - - self.seen == (1u128 << len).wrapping_sub(1) + let size = self.size(); + match &self.seen { + Seen::Small(v) if size == 128 => !v == 0, + Seen::Small(v) => *v == ((1 << size) - 1), + Seen::Large(v) => BitMask::new(v.as_slice(), 0, size).unset_bits() == 0, + } } fn append(&mut self, array: &Self::Array) { - const STEP_SIZE: usize = 128; - - if !self.has_null { - let mut i = 0; - let values = array.values().as_slice(); - - while !self.has_seen_all() && i < values.len() { - for v in values[i..].iter().take(STEP_SIZE) { - self.seen |= 1 << self.to_value(Some(*v)); + let size = self.size(); + match array.validity().as_ref().filter(|v| v.unset_bits() > 0) { + None => { + const STEP_SIZE: usize = 512; + + let mut i = 0; + let values = array.values().as_slice(); + + match self.seen { + Seen::Small(ref mut seen) => { + // Check every so often whether we have already seen all the values. + while *seen != ((1 << (size - 1)) - 1) && i < values.len() { + for v in values[i..].iter().take(STEP_SIZE) { + if cfg!(debug_assertions) { + assert!(TotalOrd::tot_ge(v, self.range.start())); + assert!(TotalOrd::tot_le(v, self.range.end())); + } + + let v = *v - *self.range.start(); + let v = unsafe { v.to_usize().unwrap_unchecked() }; + *seen |= 1 << v; + } + + i += STEP_SIZE; + } + }, + Seen::Large(ref mut seen) => { + // Check every so often whether we have already seen all the values. + while BitMask::new(seen.as_slice(), 0, size - 1).unset_bits() > 0 + && i < values.len() + { + for v in values[i..].iter().take(STEP_SIZE) { + if cfg!(debug_assertions) { + assert!(TotalOrd::tot_ge(v, self.range.start())); + assert!(TotalOrd::tot_le(v, self.range.end())); + } + + let v = *v - *self.range.start(); + let v = unsafe { v.to_usize().unwrap_unchecked() }; + seen.set(v, true); + } + + i += STEP_SIZE; + } + }, } - - i += STEP_SIZE; - } - } else { - let mut i = 0; - let mut values = array.iter(); - - while !self.has_seen_all() && i < values.len() { - for _ in 0..STEP_SIZE { - let Some(v) = values.next() else { - break; - }; - self.seen |= 1 << self.to_value(v.copied()); + }, + Some(_) => { + let iter = array.non_null_values_iter(); + + match self.seen { + Seen::Small(ref mut seen) => { + *seen |= 1 << (size - 1); + + for v in iter { + if cfg!(debug_assertions) { + assert!(TotalOrd::tot_ge(&v, self.range.start())); + assert!(TotalOrd::tot_le(&v, self.range.end())); + } + + let v = v - *self.range.start(); + let v = unsafe { v.to_usize().unwrap_unchecked() }; + *seen |= 1 << v; + } + }, + Seen::Large(ref mut seen) => { + seen.set(size - 1, true); + + for v in iter { + if cfg!(debug_assertions) { + assert!(TotalOrd::tot_ge(&v, self.range.start())); + assert!(TotalOrd::tot_le(&v, self.range.end())); + } + + let v = v - *self.range.start(); + let v = unsafe { v.to_usize().unwrap_unchecked() }; + seen.set(v, true); + } + }, } + }, + } + } - i += STEP_SIZE; - } + fn append_state(&mut self, other: &Self) { + debug_assert_eq!(self.size(), other.size()); + match (&mut self.seen, &other.seen) { + (Seen::Small(lhs), Seen::Small(rhs)) => *lhs |= rhs, + (Seen::Large(lhs), Seen::Large(ref rhs)) => { + let mut lhs = lhs; + <&mut MutableBitmap as std::ops::BitOrAssign<&MutableBitmap>>::bitor_assign( + &mut lhs, rhs, + ) + }, + _ => unreachable!(), } } fn finalize_unique(self) -> Self::Array { - let mut seen = self.seen; + let size = self.size(); + let seen = self.seen; - let num_values = seen.count_ones() as usize; + let has_null = seen.has_seen_null(size); + let num_values = seen.num_seen(); let mut values = Vec::with_capacity(num_values); - let (values, validity) = if self.has_seen_null() { - let mut validity = MutableBitmap::with_capacity(num_values); + let mut offset = 0; + match seen { + Seen::Small(mut v) => { + while v != 0 { + let shift = v.trailing_zeros(); + offset += shift as u8; + values.push(*self.range.start() + T::from_u8(offset).unwrap()); - values.push(T::zeroed()); - validity.push(false); - seen >>= 1; - - let mut offset = 0u8; - while seen != 0 { - let shift = self.seen.trailing_zeros(); - offset += shift as u8; - values.push(*self.range.start() + T::from_u8(offset).unwrap()); - validity.push(true); - - seen >>= shift + 1; - offset += 1; - } + v >>= shift + 1; + offset += 1; + } + }, + Seen::Large(v) => { + for offset in v.freeze().true_idx_iter() { + values.push(*self.range.start() + T::from_usize(offset).unwrap()); + } + }, + } - (values, Some(validity.freeze())) + let validity = if has_null { + let mut validity = MutableBitmap::new(); + validity.extend_constant(values.len() - 1, true); + validity.push(false); + // The null has already been pushed. + *values.last_mut().unwrap() = T::zeroed(); + Some(validity.freeze()) } else { - seen >>= u8::from(self.has_null); - - let mut offset = 0u8; - while seen != 0 { - let shift = seen.trailing_zeros(); - offset += shift as u8; - values.push(*self.range.start() + T::from_u8(offset).unwrap()); - - seen >>= shift + 1; - offset += 1; - } - - (values, None) + None }; - PrimitiveArray::new(self.dtype, values.into(), validity) + PrimitiveArray::new(ArrowDataType::from(T::PRIMITIVE), values.into(), validity) } - fn finalize_n_unique(self) -> usize { - self.seen.count_ones() as usize + fn finalize_n_unique(&self) -> usize { + self.seen.num_seen() } - fn finalize_n_unique_non_null(self) -> usize { - (self.seen & !1).count_ones() as usize + fn finalize_n_unique_non_null(&self) -> usize { + self.seen.num_seen() - usize::from(self.seen.has_seen_null(self.size())) } } diff --git a/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs b/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs index 707fc79f0364..1a0356463907 100644 --- a/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs +++ b/crates/polars-core/src/chunked_array/logical/categorical/ops/unique.rs @@ -1,3 +1,5 @@ +use polars_compute::unique::{DictionaryRangedUniqueState, RangedUniqueKernel}; + use super::*; impl CategoricalChunked { @@ -27,7 +29,18 @@ impl CategoricalChunked { Ok(out) } } else { - let ca = self.physical().unique()?; + let mut state = DictionaryRangedUniqueState::new(cat_map.get_categories().to_boxed()); + for chunk in self.physical().downcast_iter() { + state.key_state().append(chunk); + } + let (_, unique, _) = state.finalize_unique().take(); + let ca = unsafe { + UInt32Chunked::from_chunks_and_dtype_unchecked( + self.physical().name().clone(), + vec![unique.to_boxed()], + DataType::UInt32, + ) + }; // SAFETY: // we only removed some indexes so we are still in bounds unsafe { @@ -45,7 +58,12 @@ impl CategoricalChunked { if self._can_fast_unique() { Ok(self.get_rev_map().len()) } else { - self.physical().n_unique() + let cat_map = self.get_rev_map(); + let mut state = DictionaryRangedUniqueState::new(cat_map.get_categories().to_boxed()); + for chunk in self.physical().downcast_iter() { + state.key_state().append(chunk); + } + Ok(state.finalize_n_unique()) } } diff --git a/crates/polars-core/src/chunked_array/ops/unique/mod.rs b/crates/polars-core/src/chunked_array/ops/unique/mod.rs index b073700867af..f8ae3d78cfc7 100644 --- a/crates/polars-core/src/chunked_array/ops/unique/mod.rs +++ b/crates/polars-core/src/chunked_array/ops/unique/mod.rs @@ -1,11 +1,9 @@ use std::hash::Hash; use arrow::bitmap::MutableBitmap; -use polars_compute::unique::{BooleanUniqueKernelState, PrimitiveRangedUniqueState}; -use polars_utils::float::IsFloat; +use polars_compute::unique::BooleanUniqueKernelState; use polars_utils::total_ord::{ToTotalOrd, TotalHash}; -use crate::chunked_array::metadata::MetadataEnv; use crate::hashing::_HASHMAP_INIT_SIZE; use crate::prelude::*; use crate::series::IsSorted; @@ -124,33 +122,6 @@ where } }, IsSorted::Not => { - if !T::Native::is_float() && MetadataEnv::experimental_enabled() { - let md = self.metadata(); - if let (Some(min), Some(max)) = (md.get_min_value(), md.get_max_value()) { - let dtype = self.field.as_ref().dtype().to_arrow(CompatLevel::oldest()); - if let Some(mut state) = PrimitiveRangedUniqueState::new( - *min, - *max, - self.null_count() > 0, - dtype, - ) { - use polars_compute::unique::RangedUniqueKernel; - - for chunk in self.downcast_iter() { - state.append(chunk); - - if state.has_seen_all() { - break; - } - } - - let unique = state.finalize_unique(); - - return Ok(Self::with_chunk(self.name().clone(), unique)); - } - } - } - let sorted = self.sort(false); sorted.unique() }, @@ -269,9 +240,7 @@ impl ChunkUnique for BooleanChunked { fn unique(&self) -> PolarsResult { use polars_compute::unique::RangedUniqueKernel; - let dtype = self.field.as_ref().dtype().to_arrow(CompatLevel::oldest()); - let has_null = self.null_count() > 0; - let mut state = BooleanUniqueKernelState::new(has_null, dtype); + let mut state = BooleanUniqueKernelState::new(); for arr in self.downcast_iter() { state.append(arr);