Skip to content

Commit

Permalink
perf: Utilize the RangedUniqueKernel for Enum/Categorical (#20150)
Browse files Browse the repository at this point in the history
  • Loading branch information
coastalwhite authored Dec 5, 2024
1 parent 9b491fd commit cbc0ea0
Show file tree
Hide file tree
Showing 8 changed files with 313 additions and 211 deletions.
4 changes: 4 additions & 0 deletions crates/polars-arrow/src/array/dictionary/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -401,6 +401,10 @@ impl<K: DictionaryKey> DictionaryArray<K> {
},
})
}

pub fn take(self) -> (ArrowDataType, PrimitiveArray<K>, Box<dyn Array>) {
(self.dtype, self.keys, self.values)
}
}

impl<K: DictionaryKey> Array for DictionaryArray<K> {
Expand Down
4 changes: 4 additions & 0 deletions crates/polars-arrow/src/bitmap/bitmask.rs
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,10 @@ impl std::fmt::Debug for BitMask<'_> {
impl<'a> BitMask<'a> {
/// Build a mask over the byte slice, bit offset and bit length reported by
/// `bitmap.as_slice()`.
pub fn from_bitmap(bitmap: &'a Bitmap) -> Self {
    let (storage, bit_offset, bit_len) = bitmap.as_slice();
    Self::new(storage, bit_offset, bit_len)
}

pub fn new(bytes: &'a [u8], offset: usize, len: usize) -> Self {
// Check length so we can use unsafe access in our get.
assert!(bytes.len() * 8 >= len + offset);
Self { bytes, offset, len }
Expand Down
96 changes: 35 additions & 61 deletions crates/polars-compute/src/unique/boolean.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,31 +4,14 @@ use arrow::datatypes::ArrowDataType;

use super::{GenericUniqueKernel, RangedUniqueKernel};

#[derive(Default)]
pub struct BooleanUniqueKernelState {
seen: u32,
has_null: bool,
dtype: ArrowDataType,
}

const fn to_value(scalar: Option<bool>) -> u8 {
match scalar {
None => 0,
Some(false) => 1,
Some(true) => 2,
}
}

impl BooleanUniqueKernelState {
pub fn new(has_null: bool, dtype: ArrowDataType) -> Self {
Self {
seen: 0,
has_null,
dtype,
}
}

fn has_seen_null(&self) -> bool {
self.has_null && self.seen & (1 << to_value(None)) != 0
pub fn new() -> Self {
Self::default()
}
}

Expand All @@ -45,79 +28,70 @@ impl RangedUniqueKernel for BooleanUniqueKernelState {
}

let null_count = array.null_count();
let values = array.values();

if !self.has_null || null_count == 0 {
let set_bits = values.set_bits();
self.seen |= u32::from(set_bits != 0) << to_value(Some(true));
self.seen |= u32::from(set_bits != values.len()) << to_value(Some(false));

return;
}

self.seen |= u32::from(null_count > 0) << to_value(None);
self.seen |= u32::from(null_count > 0) << 2;
let set_bits = if null_count > 0 {
array
.values()
.num_intersections_with(array.validity().unwrap())
} else {
array.values().set_bits()
};

if array.len() != null_count {
let validity = array.validity().unwrap();
self.seen |= u32::from(set_bits != array.len() - null_count);
self.seen |= u32::from(set_bits != 0) << 1;
}

let set_bits = values.num_intersections_with(validity);
self.seen |= u32::from(set_bits != 0) << to_value(Some(true));
self.seen |= u32::from(set_bits != values.len() - null_count) << to_value(Some(false));
}
/// Merge everything `other` has observed into this state by OR-ing the two
/// `seen` bitsets together.
fn append_state(&mut self, other: &Self) {
    let merged = self.seen | other.seen;
    self.seen = merged;
}

fn finalize_unique(self) -> Self::Array {
let mut values = MutableBitmap::with_capacity(3);
let validity = if self.has_seen_null() {
let mut validity = MutableBitmap::with_capacity(3);

for i in 0..3 {
if self.seen & (1 << i) != 0 {
values.push(i > 1);
validity.push(i > 0);
}
}
let mut values = MutableBitmap::with_capacity(self.seen.count_ones() as usize);

if self.seen & 0b001 != 0 {
values.push(false);
}
if self.seen & 0b010 != 0 {
values.push(true);
}
let validity = if self.seen & 0b100 != 0 {
let mut validity = MutableBitmap::with_capacity(values.len() + 1);
validity.extend_constant(values.len(), true);
validity.push(false);
values.push(false);
Some(validity.freeze())
} else {
for i in 1..3 {
if self.seen & (1 << i) != 0 {
values.push(i > 1);
}
}

None
};

let values = values.freeze();

BooleanArray::new(self.dtype, values, validity)
BooleanArray::new(ArrowDataType::Boolean, values, validity)
}

fn finalize_n_unique(self) -> usize {
fn finalize_n_unique(&self) -> usize {
self.seen.count_ones() as usize
}

fn finalize_n_unique_non_null(self) -> usize {
(self.seen & !1).count_ones() as usize
fn finalize_n_unique_non_null(&self) -> usize {
(self.seen & 0b011).count_ones() as usize
}
}

impl GenericUniqueKernel for BooleanArray {
fn unique(&self) -> Self {
let mut state = BooleanUniqueKernelState::new(self.null_count() > 0, self.dtype().clone());
let mut state = BooleanUniqueKernelState::new();
state.append(self);
state.finalize_unique()
}

fn n_unique(&self) -> usize {
let mut state = BooleanUniqueKernelState::new(self.null_count() > 0, self.dtype().clone());
let mut state = BooleanUniqueKernelState::new();
state.append(self);
state.finalize_n_unique()
}

fn n_unique_non_null(&self) -> usize {
let mut state = BooleanUniqueKernelState::new(self.null_count() > 0, self.dtype().clone());
let mut state = BooleanUniqueKernelState::new();
state.append(self);
state.finalize_n_unique_non_null()
}
Expand Down
63 changes: 63 additions & 0 deletions crates/polars-compute/src/unique/dictionary.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
use arrow::array::{Array, DictionaryArray};
use arrow::datatypes::ArrowDataType;

use super::{PrimitiveRangedUniqueState, RangedUniqueKernel};

/// A specialized unique kernel for [`DictionaryArray`] for when all values are in a small known
/// range.
///
/// Uniqueness is tracked on the `u32` keys only; the dictionary values are carried along
/// unchanged so they can be reattached when the unique array is built.
pub struct DictionaryRangedUniqueState {
/// Ranged unique state fed with the keys of every appended array.
key_state: PrimitiveRangedUniqueState<u32>,
/// Dictionary values the keys refer to; passed through as-is by `finalize_unique`.
values: Box<dyn Array>,
}

impl DictionaryRangedUniqueState {
    /// Create an empty state for dictionary arrays backed by `values`.
    ///
    /// The key state is created with the bounds `0` and `values.len() + 1`
    /// (after converting the length to `u32`).
    pub fn new(values: Box<dyn Array>) -> Self {
        // NOTE(review): `as u32` truncates silently when `values.len()` does not fit in a
        // `u32` — confirm callers never hand in a dictionary that large.
        let upper_bound = values.len() as u32 + 1;
        let key_state = PrimitiveRangedUniqueState::new(0, upper_bound);

        Self { key_state, values }
    }

    /// Mutable access to the ranged state that tracks the keys.
    pub fn key_state(&mut self) -> &mut PrimitiveRangedUniqueState<u32> {
        let Self { key_state, .. } = self;
        key_state
    }
}

impl RangedUniqueKernel for DictionaryRangedUniqueState {
    type Array = DictionaryArray<u32>;

    /// Forwards to the key state's `has_seen_all`.
    fn has_seen_all(&self) -> bool {
        let Self { key_state, .. } = self;
        key_state.has_seen_all()
    }

    /// Feed the keys of `array` into the key state; the values of `array` are not inspected.
    fn append(&mut self, array: &Self::Array) {
        let keys = array.keys();
        self.key_state.append(keys);
    }

    /// Merge the key state of `other` into this one.
    ///
    /// In debug builds this asserts that both states hold equal `values`.
    fn append_state(&mut self, other: &Self) {
        debug_assert_eq!(self.values, other.values);

        let incoming = &other.key_state;
        self.key_state.append_state(incoming);
    }

    /// Consume the state and build a dictionary array from the unique keys and the stored
    /// values.
    ///
    /// Panics if `DictionaryArray::try_new` rejects the assembled parts.
    fn finalize_unique(self) -> Self::Array {
        let Self { key_state, values } = self;

        let keys = key_state.finalize_unique();
        let value_dtype = Box::new(values.dtype().clone());
        // The trailing `false` is presumably the "is sorted" flag of the dictionary dtype —
        // TODO confirm against `ArrowDataType::Dictionary`.
        let dtype = ArrowDataType::Dictionary(
            arrow::datatypes::IntegerType::UInt32,
            value_dtype,
            false,
        );

        DictionaryArray::<u32>::try_new(dtype, keys, values).unwrap()
    }

    /// Forwards to the key state's `finalize_n_unique`.
    fn finalize_n_unique(&self) -> usize {
        let Self { key_state, .. } = self;
        key_state.finalize_n_unique()
    }

    /// Forwards to the key state's `finalize_n_unique_non_null`.
    fn finalize_n_unique_non_null(&self) -> usize {
        let Self { key_state, .. } = self;
        key_state.finalize_n_unique_non_null()
    }
}
9 changes: 7 additions & 2 deletions crates/polars-compute/src/unique/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,15 @@ pub trait RangedUniqueKernel {
/// Append an `Array`'s values to the `State`
fn append(&mut self, array: &Self::Array);

/// Append another state into the `State`
fn append_state(&mut self, other: &Self);

/// Consume the state to get the unique elements
fn finalize_unique(self) -> Self::Array;
/// Get the number of unique elements including null
fn finalize_n_unique(self) -> usize;
fn finalize_n_unique(&self) -> usize;
/// Get the number of unique elements excluding null
fn finalize_n_unique_non_null(self) -> usize;
fn finalize_n_unique_non_null(&self) -> usize;
}

/// A generic unique kernel that selects the generally applicable unique kernel for an `Array`.
Expand All @@ -58,7 +61,9 @@ pub trait GenericUniqueKernel {
}

mod boolean;
mod dictionary;
mod primitive;

pub use boolean::BooleanUniqueKernelState;
pub use dictionary::DictionaryRangedUniqueState;
pub use primitive::PrimitiveRangedUniqueState;
Loading

0 comments on commit cbc0ea0

Please sign in to comment.