diff --git a/crates/polars-compute/Cargo.toml b/crates/polars-compute/Cargo.toml index 4ade7134ec5e..b4ab185eec56 100644 --- a/crates/polars-compute/Cargo.toml +++ b/crates/polars-compute/Cargo.toml @@ -26,4 +26,5 @@ version_check = { workspace = true } [features] nightly = [] simd = ["arrow/simd"] +approx_unique = [] dtype-array = [] diff --git a/crates/polars-ops/src/series/ops/approx_algo/hyperloglogplus.rs b/crates/polars-compute/src/hyperloglogplus.rs similarity index 99% rename from crates/polars-ops/src/series/ops/approx_algo/hyperloglogplus.rs rename to crates/polars-compute/src/hyperloglogplus.rs index b341cab65b87..710af92f35ef 100644 --- a/crates/polars-ops/src/series/ops/approx_algo/hyperloglogplus.rs +++ b/crates/polars-compute/src/hyperloglogplus.rs @@ -9,7 +9,7 @@ //! # Examples //! //! ``` -//! # use polars_ops::prelude::*; +//! # use polars_compute::hyperloglogplus::*; //! let mut hllp = HyperLogLog::new(); //! hllp.add(&12345); //! hllp.add(&23456); diff --git a/crates/polars-compute/src/lib.rs b/crates/polars-compute/src/lib.rs index a89303ff8f7f..da56c65983db 100644 --- a/crates/polars-compute/src/lib.rs +++ b/crates/polars-compute/src/lib.rs @@ -13,6 +13,8 @@ pub mod bitwise; pub mod comparisons; pub mod filter; pub mod float_sum; +#[cfg(feature = "approx_unique")] +pub mod hyperloglogplus; pub mod if_then_else; pub mod min_max; pub mod size; diff --git a/crates/polars-core/Cargo.toml b/crates/polars-core/Cargo.toml index 95371084f40b..c9d68f0ee173 100644 --- a/crates/polars-core/Cargo.toml +++ b/crates/polars-core/Cargo.toml @@ -76,6 +76,7 @@ fmt_no_tty = ["comfy-table"] rows = [] # operations +approx_unique = ["polars-compute/approx_unique"] bitwise = ["algorithm_group_by"] zip_with = [] round_series = [] diff --git a/crates/polars-core/src/chunked_array/ops/approx_n_unique.rs b/crates/polars-core/src/chunked_array/ops/approx_n_unique.rs new file mode 100644 index 000000000000..213383ca4699 --- /dev/null +++ b/crates/polars-core/src/chunked_array/ops/approx_n_unique.rs @@ -0,0 +1,20 @@ +use std::hash::Hash; + +use polars_compute::hyperloglogplus::HyperLogLog; +use polars_utils::total_ord::{ToTotalOrd, TotalEq, TotalHash}; +use polars_utils::IdxSize; + +use super::{ChunkApproxNUnique, ChunkedArray, PolarsDataType}; + +impl ChunkApproxNUnique for ChunkedArray +where + T: PolarsDataType, + for<'a> T::Physical<'a>: TotalHash + TotalEq + Copy + ToTotalOrd, + for<'a> > as ToTotalOrd>::TotalOrdItem: Hash + Eq, +{ + fn approx_n_unique(&self) -> IdxSize { + let mut hllp = HyperLogLog::new(); + self.iter().for_each(|item| hllp.add(&item.to_total_ord())); + hllp.count() as IdxSize + } +} diff --git a/crates/polars-core/src/chunked_array/ops/mod.rs b/crates/polars-core/src/chunked_array/ops/mod.rs index 3de8df28d746..061278a22cc9 100644 --- a/crates/polars-core/src/chunked_array/ops/mod.rs +++ b/crates/polars-core/src/chunked_array/ops/mod.rs @@ -7,6 +7,8 @@ pub(crate) mod aggregate; pub(crate) mod any_value; pub(crate) mod append; mod apply; +#[cfg(feature = "approx_unique")] +mod approx_n_unique; pub mod arity; mod bit_repr; #[cfg(feature = "bitwise")] @@ -375,6 +377,11 @@ pub trait ChunkUnique { } } +#[cfg(feature = "approx_unique")] +pub trait ChunkApproxNUnique { + fn approx_n_unique(&self) -> IdxSize; +} + /// Sort operations on `ChunkedArray`. pub trait ChunkSort { #[allow(unused_variables)] diff --git a/crates/polars-core/src/frame/column/mod.rs b/crates/polars-core/src/frame/column/mod.rs index e1a447d437bb..313a4ec89df2 100644 --- a/crates/polars-core/src/frame/column/mod.rs +++ b/crates/polars-core/src/frame/column/mod.rs @@ -1031,6 +1031,18 @@ impl Column { .into()), } } + + #[cfg(feature = "approx_unique")] + pub fn approx_n_unique(&self) -> PolarsResult { + match self { + Column::Series(s) => s.approx_n_unique(), + Column::Scalar(s) => { + // @NOTE: We do this for the error handling. + s.as_single_value_series().approx_n_unique()?; + Ok(1) + }, + } + } } impl Default for Column { diff --git a/crates/polars-core/src/series/implementations/binary.rs b/crates/polars-core/src/series/implementations/binary.rs index 7c5af2b9ccc7..b0fe79845a0d 100644 --- a/crates/polars-core/src/series/implementations/binary.rs +++ b/crates/polars-core/src/series/implementations/binary.rs @@ -214,6 +214,11 @@ impl SeriesTrait for SeriesWrap { ChunkUnique::arg_unique(&self.0) } + #[cfg(feature = "approx_unique")] + fn approx_n_unique(&self) -> PolarsResult { + Ok(ChunkApproxNUnique::approx_n_unique(&self.0)) + } + fn is_null(&self) -> BooleanChunked { self.0.is_null() } diff --git a/crates/polars-core/src/series/implementations/boolean.rs b/crates/polars-core/src/series/implementations/boolean.rs index 14409fbdb91c..83bbacd12c00 100644 --- a/crates/polars-core/src/series/implementations/boolean.rs +++ b/crates/polars-core/src/series/implementations/boolean.rs @@ -370,6 +370,11 @@ impl SeriesTrait for SeriesWrap { )) } + #[cfg(feature = "approx_unique")] + fn approx_n_unique(&self) -> PolarsResult { + Ok(ChunkApproxNUnique::approx_n_unique(&self.0)) + } + fn clone_inner(&self) -> Arc { Arc::new(SeriesWrap(Clone::clone(&self.0))) } diff --git a/crates/polars-core/src/series/implementations/floats.rs b/crates/polars-core/src/series/implementations/floats.rs index eeba7c9b0a6b..24be56671d69 100644 --- a/crates/polars-core/src/series/implementations/floats.rs +++ b/crates/polars-core/src/series/implementations/floats.rs @@ -391,6 +391,11 @@ macro_rules! impl_dyn_series { Ok(Scalar::new(dt, av)) } + #[cfg(feature = "approx_unique")] + fn approx_n_unique(&self) -> PolarsResult { + Ok(ChunkApproxNUnique::approx_n_unique(&self.0)) + } + fn clone_inner(&self) -> Arc { Arc::new(SeriesWrap(Clone::clone(&self.0))) } diff --git a/crates/polars-core/src/series/implementations/mod.rs b/crates/polars-core/src/series/implementations/mod.rs index 8d4d2caa3ddf..9d8357a905bc 100644 --- a/crates/polars-core/src/series/implementations/mod.rs +++ b/crates/polars-core/src/series/implementations/mod.rs @@ -497,6 +497,11 @@ macro_rules! impl_dyn_series { Ok(Scalar::new(dt, av)) } + #[cfg(feature = "approx_unique")] + fn approx_n_unique(&self) -> PolarsResult { + Ok(ChunkApproxNUnique::approx_n_unique(&self.0)) + } + fn clone_inner(&self) -> Arc { Arc::new(SeriesWrap(Clone::clone(&self.0))) } diff --git a/crates/polars-core/src/series/implementations/string.rs b/crates/polars-core/src/series/implementations/string.rs index 8b64afcd9895..2547d9bd237c 100644 --- a/crates/polars-core/src/series/implementations/string.rs +++ b/crates/polars-core/src/series/implementations/string.rs @@ -252,6 +252,12 @@ impl SeriesTrait for SeriesWrap { fn min_reduce(&self) -> PolarsResult { Ok(ChunkAggSeries::min_reduce(&self.0)) } + + #[cfg(feature = "approx_unique")] + fn approx_n_unique(&self) -> PolarsResult { + Ok(ChunkApproxNUnique::approx_n_unique(&self.0)) + } + fn clone_inner(&self) -> Arc { Arc::new(SeriesWrap(Clone::clone(&self.0))) } diff --git a/crates/polars-core/src/series/series_trait.rs b/crates/polars-core/src/series/series_trait.rs index 46b45633b74e..2dc8de00dcd7 100644 --- a/crates/polars-core/src/series/series_trait.rs +++ b/crates/polars-core/src/series/series_trait.rs @@ -147,6 +147,7 @@ pub(crate) mod private { unsafe fn agg_list(&self, groups: &GroupsProxy) -> Series { Series::full_null(self._field().name().clone(), groups.len(), self._dtype()) } + /// # Safety /// /// Does no bounds checks, groups must be correct. @@ -154,6 +155,7 @@ pub(crate) mod private { unsafe fn agg_and(&self, groups: &GroupsProxy) -> Series { Series::full_null(self._field().name().clone(), groups.len(), self._dtype()) } + /// # Safety /// /// Does no bounds checks, groups must be correct. @@ -161,6 +163,7 @@ pub(crate) mod private { unsafe fn agg_or(&self, groups: &GroupsProxy) -> Series { Series::full_null(self._field().name().clone(), groups.len(), self._dtype()) } + /// # Safety /// /// Does no bounds checks, groups must be correct. @@ -504,15 +507,45 @@ pub trait SeriesTrait: } /// Get the bitwise AND of the Series as a new Series of length 1, fn and_reduce(&self) -> PolarsResult { - polars_bail!(opq = sum, self._dtype()); + polars_bail!(opq = and_reduce, self._dtype()); } /// Get the bitwise OR of the Series as a new Series of length 1, fn or_reduce(&self) -> PolarsResult { - polars_bail!(opq = sum, self._dtype()); + polars_bail!(opq = or_reduce, self._dtype()); } /// Get the bitwise XOR of the Series as a new Series of length 1, fn xor_reduce(&self) -> PolarsResult { - polars_bail!(opq = sum, self._dtype()); + polars_bail!(opq = xor_reduce, self._dtype()); + } + + /// Get the first element of the [`Series`] as a [`Scalar`] + /// + /// If the [`Series`] is empty, a [`Scalar`] with a [`AnyValue::Null`] is returned. + fn first(&self) -> Scalar { + let dt = self.dtype(); + let av = self.get(0).map_or(AnyValue::Null, AnyValue::into_static); + + Scalar::new(dt.clone(), av) + } + + /// Get the last element of the [`Series`] as a [`Scalar`] + /// + /// If the [`Series`] is empty, a [`Scalar`] with a [`AnyValue::Null`] is returned. + fn last(&self) -> Scalar { + let dt = self.dtype(); + let av = if self.len() == 0 { + AnyValue::Null + } else { + // SAFETY: len-1 < len if len != 0 + unsafe { self.get_unchecked(self.len() - 1) }.into_static() + }; + + Scalar::new(dt.clone(), av) + } + + #[cfg(feature = "approx_unique")] + fn approx_n_unique(&self) -> PolarsResult { + polars_bail!(opq = approx_n_unique, self._dtype()); } /// Clone inner ChunkedArray and wrap in a new Arc diff --git a/crates/polars-ops/src/series/ops/approx_algo/mod.rs b/crates/polars-ops/src/series/ops/approx_algo/mod.rs deleted file mode 100644 index 9dbb65bbad0f..000000000000 --- a/crates/polars-ops/src/series/ops/approx_algo/mod.rs +++ /dev/null @@ -1,5 +0,0 @@ -#[cfg(feature = "approx_unique")] -mod hyperloglogplus; - -#[cfg(feature = "approx_unique")] -pub use hyperloglogplus::*; diff --git a/crates/polars-ops/src/series/ops/approx_unique.rs b/crates/polars-ops/src/series/ops/approx_unique.rs deleted file mode 100644 index ab0ea5db8966..000000000000 --- a/crates/polars-ops/src/series/ops/approx_unique.rs +++ /dev/null @@ -1,78 +0,0 @@ -use std::hash::Hash; - -use polars_core::prelude::*; -use polars_core::with_match_physical_integer_polars_type; -use polars_utils::total_ord::{ToTotalOrd, TotalEq, TotalHash}; - -#[cfg(feature = "approx_unique")] -use crate::series::ops::approx_algo::HyperLogLog; - -fn approx_n_unique_ca<'a, T>(ca: &'a ChunkedArray) -> PolarsResult -where - T: PolarsDataType, - T::Physical<'a>: TotalHash + TotalEq + Copy + ToTotalOrd, - > as ToTotalOrd>::TotalOrdItem: Hash + Eq, -{ - let mut hllp = HyperLogLog::new(); - ca.iter().for_each(|item| hllp.add(&item.to_total_ord())); - let c = hllp.count() as IdxSize; - - Ok(Series::new(ca.name().clone(), &[c])) -} - -fn dispatcher(s: &Series) -> PolarsResult { - let s = s.to_physical_repr(); - use DataType::*; - match s.dtype() { - Boolean => s.bool().and_then(approx_n_unique_ca), - Binary => s.binary().and_then(approx_n_unique_ca), - String => { - let ca = s.str().unwrap().as_binary(); - approx_n_unique_ca(&ca) - }, - Float32 => approx_n_unique_ca(AsRef::>::as_ref( - s.as_ref().as_ref(), - )), - Float64 => approx_n_unique_ca(AsRef::>::as_ref( - s.as_ref().as_ref(), - )), - dt if dt.is_numeric() => { - with_match_physical_integer_polars_type!(s.dtype(), |$T| { - let ca: &ChunkedArray<$T> = s.as_ref().as_ref().as_ref(); - approx_n_unique_ca(ca) - }) - }, - dt => polars_bail!(opq = approx_n_unique, dt), - } -} - -/// Approx count unique values. -/// -/// This is done using the HyperLogLog++ algorithm for cardinality estimation. -/// -/// # Example -/// -/// ```ignore -/// -/// # #[macro_use] extern crate polars_core; -/// # fn main() { -/// -/// use polars_core::prelude::*; -/// -/// let s = Series::new("s".into(), [1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3]); -/// -/// let approx_count = approx_n_unique(&s).unwrap(); -/// println!("{}", approx_count); -/// # } -/// ``` -/// Outputs: -/// ```text -/// approx_count = shape: (1,) -/// Series: 's' [u32] -/// [ -/// 3 -/// ] -/// ``` -pub fn approx_n_unique(s: &Series) -> PolarsResult { - dispatcher(s) -} diff --git a/crates/polars-ops/src/series/ops/mod.rs b/crates/polars-ops/src/series/ops/mod.rs index 88b509ff6450..b684815238f7 100644 --- a/crates/polars-ops/src/series/ops/mod.rs +++ b/crates/polars-ops/src/series/ops/mod.rs @@ -1,9 +1,5 @@ #[cfg(feature = "abs")] mod abs; -#[cfg(feature = "approx_unique")] -mod approx_algo; -#[cfg(feature = "approx_unique")] -mod approx_unique; mod arg_min_max; mod bitwise; #[cfg(feature = "business")] @@ -67,10 +63,6 @@ mod various; #[cfg(feature = "abs")] pub use abs::*; -#[cfg(feature = "approx_unique")] -pub use approx_algo::*; -#[cfg(feature = "approx_unique")] -pub use approx_unique::*; pub use arg_min_max::ArgAgg; pub use bitwise::*; #[cfg(feature = "business")] diff --git a/crates/polars-plan/Cargo.toml b/crates/polars-plan/Cargo.toml index d41be032bcf0..499bb115396a 100644 --- a/crates/polars-plan/Cargo.toml +++ b/crates/polars-plan/Cargo.toml @@ -107,7 +107,7 @@ extract_jsonpath = ["polars-ops/extract_jsonpath"] # operations bitwise = ["polars-core/bitwise", "polars-ops/bitwise"] -approx_unique = ["polars-ops/approx_unique"] +approx_unique = ["polars-ops/approx_unique", "polars-core/approx_unique"] is_in = ["polars-ops/is_in"] repeat_by = ["polars-ops/repeat_by"] round_series = ["polars-ops/round_series"] diff --git a/crates/polars-plan/src/dsl/function_expr/dispatch.rs b/crates/polars-plan/src/dsl/function_expr/dispatch.rs index 7d0669626253..895803cbcc63 100644 --- a/crates/polars-plan/src/dsl/function_expr/dispatch.rs +++ b/crates/polars-plan/src/dsl/function_expr/dispatch.rs @@ -6,7 +6,8 @@ pub(super) fn reverse(s: &Column) -> PolarsResult { #[cfg(feature = "approx_unique")] pub(super) fn approx_n_unique(s: &Column) -> PolarsResult { - polars_ops::prelude::approx_n_unique(s.as_materialized_series()).map(Column::from) + s.approx_n_unique() + .map(|v| Column::new_scalar(s.name().clone(), Scalar::new(IDX_DTYPE, v.into()), 1)) } #[cfg(feature = "diff")] diff --git a/crates/polars-python/Cargo.toml b/crates/polars-python/Cargo.toml index da4b992bfbec..943515e79f5c 100644 --- a/crates/polars-python/Cargo.toml +++ b/crates/polars-python/Cargo.toml @@ -163,6 +163,7 @@ hist = ["polars/hist"] find_many = ["polars/find_many"] new_streaming = ["polars-lazy/new_streaming"] bitwise = ["polars/bitwise"] +approx_unique = ["polars/approx_unique"] dtype-i8 = [] dtype-i16 = [] @@ -181,6 +182,7 @@ dtypes = [ ] operations = [ + "approx_unique", "array_any_all", "array_count", "bitwise", diff --git a/crates/polars-python/src/expr/general.rs b/crates/polars-python/src/expr/general.rs index e215a6ba2fe4..cf0c50cd2210 100644 --- a/crates/polars-python/src/expr/general.rs +++ b/crates/polars-python/src/expr/general.rs @@ -418,6 +418,7 @@ impl PyExpr { .into() } + #[cfg(feature = "approx_unique")] fn approx_n_unique(&self) -> Self { self.inner.clone().approx_n_unique().into() } diff --git a/crates/polars-python/src/series/aggregation.rs b/crates/polars-python/src/series/aggregation.rs index 65fda49bb71e..dbcbad59ddac 100644 --- a/crates/polars-python/src/series/aggregation.rs +++ b/crates/polars-python/src/series/aggregation.rs @@ -146,6 +146,23 @@ impl PySeries { .into_py(py)) } + fn first(&self, py: Python) -> PyObject { + Wrap(self.series.first().as_any_value()).into_py(py) + } + + fn last(&self, py: Python) -> PyObject { + Wrap(self.series.last().as_any_value()).into_py(py) + } + + #[cfg(feature = "approx_unique")] + fn approx_n_unique(&self, py: Python) -> PyResult { + Ok(self + .series + .approx_n_unique() + .map_err(PyPolarsErr::from)? + .into_py(py)) + } + #[cfg(feature = "bitwise")] fn bitwise_and(&self, py: Python) -> PyResult { Ok(Wrap( diff --git a/crates/polars/Cargo.toml b/crates/polars/Cargo.toml index a23d23c7c1ae..31bae99c6146 100644 --- a/crates/polars/Cargo.toml +++ b/crates/polars/Cargo.toml @@ -126,7 +126,7 @@ fmt_no_tty = ["polars-core/fmt_no_tty"] # extra operations abs = ["polars-ops/abs", "polars-lazy?/abs"] -approx_unique = ["polars-lazy?/approx_unique", "polars-ops/approx_unique"] +approx_unique = ["polars-lazy?/approx_unique", "polars-ops/approx_unique", "polars-core/approx_unique"] arg_where = ["polars-lazy?/arg_where"] array_any_all = ["polars-lazy?/array_any_all", "dtype-array"] asof_join = ["polars-lazy?/asof_join", "polars-ops/asof_join"] diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py index d75bed5a3b56..b5a02c8756c2 100644 --- a/py-polars/polars/series/series.py +++ b/py-polars/polars/series/series.py @@ -7382,18 +7382,42 @@ def bitwise_trailing_ones(self) -> Self: def bitwise_trailing_zeros(self) -> Self: """Evaluate the number least-significant unset bits before seeing a set bit.""" - def bitwise_and(self) -> Self: + def bitwise_and(self) -> PythonLiteral | None: """Perform an aggregation of bitwise ANDs.""" return self._s.bitwise_and() - def bitwise_or(self) -> Self: + def bitwise_or(self) -> PythonLiteral | None: """Perform an aggregation of bitwise ORs.""" return self._s.bitwise_or() - def bitwise_xor(self) -> Self: + def bitwise_xor(self) -> PythonLiteral | None: """Perform an aggregation of bitwise XORs.""" return self._s.bitwise_xor() + def first(self) -> PythonLiteral | None: + """ + Get the first element of the Series. + + Returns `None` if the Series is empty. + """ + return self._s.first() + + def last(self) -> PythonLiteral | None: + """ + Get the last element of the Series. + + Returns `None` if the Series is empty. + """ + return self._s.last() + + def approx_n_unique(self) -> PythonLiteral | None: + """ + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + """ + return self._s.approx_n_unique() + # Keep the `list` and `str` properties below at the end of the definition of Series, # as to not confuse mypy with the type annotation `str` and `list`