Skip to content

Commit

Permalink
feat: Add Series::{first, last, approx_n_unique} (#19093)
Browse files Browse the repository at this point in the history
  • Loading branch information
coastalwhite authored Oct 4, 2024
1 parent f55658b commit baa65b8
Show file tree
Hide file tree
Showing 23 changed files with 157 additions and 101 deletions.
1 change: 1 addition & 0 deletions crates/polars-compute/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,5 @@ version_check = { workspace = true }
[features]
nightly = []
simd = ["arrow/simd"]
approx_unique = []
dtype-array = []
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
//! # Examples
//!
//! ```
//! # use polars_ops::prelude::*;
//! # use polars_compute::hyperloglogplus::*;
//! let mut hllp = HyperLogLog::new();
//! hllp.add(&12345);
//! hllp.add(&23456);
Expand Down
2 changes: 2 additions & 0 deletions crates/polars-compute/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ pub mod bitwise;
pub mod comparisons;
pub mod filter;
pub mod float_sum;
#[cfg(feature = "approx_unique")]
pub mod hyperloglogplus;
pub mod if_then_else;
pub mod min_max;
pub mod size;
Expand Down
1 change: 1 addition & 0 deletions crates/polars-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ fmt_no_tty = ["comfy-table"]
rows = []

# operations
approx_unique = ["polars-compute/approx_unique"]
bitwise = ["algorithm_group_by"]
zip_with = []
round_series = []
Expand Down
20 changes: 20 additions & 0 deletions crates/polars-core/src/chunked_array/ops/approx_n_unique.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
use std::hash::Hash;

use polars_compute::hyperloglogplus::HyperLogLog;
use polars_utils::total_ord::{ToTotalOrd, TotalEq, TotalHash};
use polars_utils::IdxSize;

use super::{ChunkApproxNUnique, ChunkedArray, PolarsDataType};

/// Approximate distinct counting for any [`ChunkedArray`] whose physical values
/// can be hashed through their total-order representation.
impl<T> ChunkApproxNUnique for ChunkedArray<T>
where
    T: PolarsDataType,
    for<'a> T::Physical<'a>: TotalHash + TotalEq + Copy + ToTotalOrd,
    for<'a> <Option<T::Physical<'a>> as ToTotalOrd>::TotalOrdItem: Hash + Eq,
{
    /// Feed every item of the array into a fresh [`HyperLogLog`] sketch and
    /// return the sketch's count, cast to [`IdxSize`].
    fn approx_n_unique(&self) -> IdxSize {
        let mut sketch = HyperLogLog::new();
        for value in self.iter() {
            // Items are hashed via their total-order form (see the trait bounds).
            sketch.add(&value.to_total_ord());
        }
        sketch.count() as IdxSize
    }
}
7 changes: 7 additions & 0 deletions crates/polars-core/src/chunked_array/ops/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ pub(crate) mod aggregate;
pub(crate) mod any_value;
pub(crate) mod append;
mod apply;
#[cfg(feature = "approx_unique")]
mod approx_n_unique;
pub mod arity;
mod bit_repr;
#[cfg(feature = "bitwise")]
Expand Down Expand Up @@ -375,6 +377,11 @@ pub trait ChunkUnique {
}
}

/// Approximate counting of the unique values held by an array.
///
/// Only compiled when the `approx_unique` feature is enabled.
#[cfg(feature = "approx_unique")]
pub trait ChunkApproxNUnique {
/// Return an approximation of the number of unique values as an [`IdxSize`].
///
/// The signature is infallible: implementors return the estimate directly.
fn approx_n_unique(&self) -> IdxSize;
}

/// Sort operations on `ChunkedArray`.
pub trait ChunkSort<T: PolarsDataType> {
#[allow(unused_variables)]
Expand Down
12 changes: 12 additions & 0 deletions crates/polars-core/src/frame/column/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1031,6 +1031,18 @@ impl Column {
.into()),
}
}

/// Get an approximate count of the unique values in this [`Column`].
///
/// * `Column::Series` delegates to the series' own `approx_n_unique`.
/// * `Column::Scalar` repeats a single value, so the answer is `1` when the
///   column has at least one row and `0` when it is empty.
///
/// # Errors
///
/// Propagates any error returned by the underlying series' `approx_n_unique`.
#[cfg(feature = "approx_unique")]
pub fn approx_n_unique(&self) -> PolarsResult<IdxSize> {
    match self {
        Column::Series(s) => s.approx_n_unique(),
        Column::Scalar(s) => {
            // @NOTE: We do this for the error handling.
            s.as_single_value_series().approx_n_unique()?;
            // A zero-row scalar column holds no values at all, so it must not
            // report one distinct value.
            Ok(if self.is_empty() { 0 } else { 1 })
        },
    }
}
}

impl Default for Column {
Expand Down
5 changes: 5 additions & 0 deletions crates/polars-core/src/series/implementations/binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,11 @@ impl SeriesTrait for SeriesWrap<BinaryChunked> {
ChunkUnique::arg_unique(&self.0)
}

#[cfg(feature = "approx_unique")]
fn approx_n_unique(&self) -> PolarsResult<IdxSize> {
    // Delegate to the `ChunkApproxNUnique` impl of the wrapped `self.0`; that
    // call returns a plain `IdxSize`, so the result is always `Ok`.
    let estimate = ChunkApproxNUnique::approx_n_unique(&self.0);
    Ok(estimate)
}

fn is_null(&self) -> BooleanChunked {
self.0.is_null()
}
Expand Down
5 changes: 5 additions & 0 deletions crates/polars-core/src/series/implementations/boolean.rs
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,11 @@ impl SeriesTrait for SeriesWrap<BooleanChunked> {
))
}

#[cfg(feature = "approx_unique")]
fn approx_n_unique(&self) -> PolarsResult<IdxSize> {
    // Delegate to the `ChunkApproxNUnique` impl of the wrapped `self.0`; that
    // call returns a plain `IdxSize`, so the result is always `Ok`.
    let estimate = ChunkApproxNUnique::approx_n_unique(&self.0);
    Ok(estimate)
}

fn clone_inner(&self) -> Arc<dyn SeriesTrait> {
Arc::new(SeriesWrap(Clone::clone(&self.0)))
}
Expand Down
5 changes: 5 additions & 0 deletions crates/polars-core/src/series/implementations/floats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,11 @@ macro_rules! impl_dyn_series {
Ok(Scalar::new(dt, av))
}

#[cfg(feature = "approx_unique")]
fn approx_n_unique(&self) -> PolarsResult<IdxSize> {
    // Delegate to the `ChunkApproxNUnique` impl of the wrapped `self.0`; that
    // call returns a plain `IdxSize`, so the result is always `Ok`.
    let estimate = ChunkApproxNUnique::approx_n_unique(&self.0);
    Ok(estimate)
}

fn clone_inner(&self) -> Arc<dyn SeriesTrait> {
Arc::new(SeriesWrap(Clone::clone(&self.0)))
}
Expand Down
5 changes: 5 additions & 0 deletions crates/polars-core/src/series/implementations/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -497,6 +497,11 @@ macro_rules! impl_dyn_series {
Ok(Scalar::new(dt, av))
}

#[cfg(feature = "approx_unique")]
fn approx_n_unique(&self) -> PolarsResult<IdxSize> {
    // Delegate to the `ChunkApproxNUnique` impl of the wrapped `self.0`; that
    // call returns a plain `IdxSize`, so the result is always `Ok`.
    let estimate = ChunkApproxNUnique::approx_n_unique(&self.0);
    Ok(estimate)
}

fn clone_inner(&self) -> Arc<dyn SeriesTrait> {
Arc::new(SeriesWrap(Clone::clone(&self.0)))
}
Expand Down
6 changes: 6 additions & 0 deletions crates/polars-core/src/series/implementations/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,12 @@ impl SeriesTrait for SeriesWrap<StringChunked> {
fn min_reduce(&self) -> PolarsResult<Scalar> {
Ok(ChunkAggSeries::min_reduce(&self.0))
}

#[cfg(feature = "approx_unique")]
fn approx_n_unique(&self) -> PolarsResult<IdxSize> {
    // Delegate to the `ChunkApproxNUnique` impl of the wrapped `self.0`; that
    // call returns a plain `IdxSize`, so the result is always `Ok`.
    let estimate = ChunkApproxNUnique::approx_n_unique(&self.0);
    Ok(estimate)
}

fn clone_inner(&self) -> Arc<dyn SeriesTrait> {
Arc::new(SeriesWrap(Clone::clone(&self.0)))
}
Expand Down
39 changes: 36 additions & 3 deletions crates/polars-core/src/series/series_trait.rs
Original file line number Diff line number Diff line change
Expand Up @@ -147,20 +147,23 @@ pub(crate) mod private {
unsafe fn agg_list(&self, groups: &GroupsProxy) -> Series {
    // Default implementation: build the result with `Series::full_null`, reusing
    // this series' name and dtype and passing `groups.len()`.
    let name = self._field().name().clone();
    Series::full_null(name, groups.len(), self._dtype())
}

/// # Safety
///
/// Does no bounds checks, groups must be correct.
#[cfg(feature = "bitwise")]
unsafe fn agg_and(&self, groups: &GroupsProxy) -> Series {
    // Default implementation: build the result with `Series::full_null`, reusing
    // this series' name and dtype and passing `groups.len()`.
    let name = self._field().name().clone();
    Series::full_null(name, groups.len(), self._dtype())
}

/// # Safety
///
/// Does no bounds checks, groups must be correct.
#[cfg(feature = "bitwise")]
unsafe fn agg_or(&self, groups: &GroupsProxy) -> Series {
    // Default implementation: build the result with `Series::full_null`, reusing
    // this series' name and dtype and passing `groups.len()`.
    let name = self._field().name().clone();
    Series::full_null(name, groups.len(), self._dtype())
}

/// # Safety
///
/// Does no bounds checks, groups must be correct.
Expand Down Expand Up @@ -504,15 +507,45 @@ pub trait SeriesTrait:
}
/// Get the bitwise AND of the Series as a new Series of length 1,
fn and_reduce(&self) -> PolarsResult<Scalar> {
polars_bail!(opq = sum, self._dtype());
polars_bail!(opq = and_reduce, self._dtype());
}
/// Get the bitwise OR of the Series as a new Series of length 1,
fn or_reduce(&self) -> PolarsResult<Scalar> {
polars_bail!(opq = sum, self._dtype());
polars_bail!(opq = or_reduce, self._dtype());
}
/// Get the bitwise XOR of the Series as a new Series of length 1,
fn xor_reduce(&self) -> PolarsResult<Scalar> {
polars_bail!(opq = sum, self._dtype());
polars_bail!(opq = xor_reduce, self._dtype());
}

/// Get the first element of the [`Series`] as a [`Scalar`].
///
/// If the [`Series`] is empty, a [`Scalar`] holding [`AnyValue::Null`] is returned.
fn first(&self) -> Scalar {
    // Whenever index 0 cannot be fetched, fall back to `AnyValue::Null`.
    let value = self
        .get(0)
        .map_or(AnyValue::Null, |av| av.into_static());

    Scalar::new(self.dtype().clone(), value)
}

/// Get the last element of the [`Series`] as a [`Scalar`].
///
/// If the [`Series`] is empty, a [`Scalar`] holding [`AnyValue::Null`] is returned.
fn last(&self) -> Scalar {
    // `checked_sub` yields `None` exactly when the length is zero.
    let value = match self.len().checked_sub(1) {
        None => AnyValue::Null,
        // SAFETY: `idx == len - 1` and `len >= 1`, so `idx < len`.
        Some(idx) => unsafe { self.get_unchecked(idx) }.into_static(),
    };

    Scalar::new(self.dtype().clone(), value)
}

/// Get an approximate count of the unique values in the [`Series`].
///
/// This default implementation always bails with an `opq` error for
/// `approx_n_unique` that carries the series' dtype; implementations for
/// specific dtypes (e.g. binary, boolean, string) override it.
#[cfg(feature = "approx_unique")]
fn approx_n_unique(&self) -> PolarsResult<IdxSize> {
polars_bail!(opq = approx_n_unique, self._dtype());
}

/// Clone inner ChunkedArray and wrap in a new Arc
Expand Down
5 changes: 0 additions & 5 deletions crates/polars-ops/src/series/ops/approx_algo/mod.rs

This file was deleted.

78 changes: 0 additions & 78 deletions crates/polars-ops/src/series/ops/approx_unique.rs

This file was deleted.

8 changes: 0 additions & 8 deletions crates/polars-ops/src/series/ops/mod.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,5 @@
#[cfg(feature = "abs")]
mod abs;
#[cfg(feature = "approx_unique")]
mod approx_algo;
#[cfg(feature = "approx_unique")]
mod approx_unique;
mod arg_min_max;
mod bitwise;
#[cfg(feature = "business")]
Expand Down Expand Up @@ -67,10 +63,6 @@ mod various;

#[cfg(feature = "abs")]
pub use abs::*;
#[cfg(feature = "approx_unique")]
pub use approx_algo::*;
#[cfg(feature = "approx_unique")]
pub use approx_unique::*;
pub use arg_min_max::ArgAgg;
pub use bitwise::*;
#[cfg(feature = "business")]
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-plan/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ extract_jsonpath = ["polars-ops/extract_jsonpath"]

# operations
bitwise = ["polars-core/bitwise", "polars-ops/bitwise"]
approx_unique = ["polars-ops/approx_unique"]
approx_unique = ["polars-ops/approx_unique", "polars-core/approx_unique"]
is_in = ["polars-ops/is_in"]
repeat_by = ["polars-ops/repeat_by"]
round_series = ["polars-ops/round_series"]
Expand Down
3 changes: 2 additions & 1 deletion crates/polars-plan/src/dsl/function_expr/dispatch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@ pub(super) fn reverse(s: &Column) -> PolarsResult<Column> {

#[cfg(feature = "approx_unique")]
pub(super) fn approx_n_unique(s: &Column) -> PolarsResult<Column> {
polars_ops::prelude::approx_n_unique(s.as_materialized_series()).map(Column::from)
s.approx_n_unique()
.map(|v| Column::new_scalar(s.name().clone(), Scalar::new(IDX_DTYPE, v.into()), 1))
}

#[cfg(feature = "diff")]
Expand Down
2 changes: 2 additions & 0 deletions crates/polars-python/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ hist = ["polars/hist"]
find_many = ["polars/find_many"]
new_streaming = ["polars-lazy/new_streaming"]
bitwise = ["polars/bitwise"]
approx_unique = ["polars/approx_unique"]

dtype-i8 = []
dtype-i16 = []
Expand All @@ -181,6 +182,7 @@ dtypes = [
]

operations = [
"approx_unique",
"array_any_all",
"array_count",
"bitwise",
Expand Down
1 change: 1 addition & 0 deletions crates/polars-python/src/expr/general.rs
Original file line number Diff line number Diff line change
Expand Up @@ -418,6 +418,7 @@ impl PyExpr {
.into()
}

#[cfg(feature = "approx_unique")]
fn approx_n_unique(&self) -> Self {
    // Build the new expression from a clone so `self.inner` is left untouched,
    // then convert it back into the Python-facing wrapper.
    let expr = self.inner.clone().approx_n_unique();
    expr.into()
}
Expand Down
17 changes: 17 additions & 0 deletions crates/polars-python/src/series/aggregation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,23 @@ impl PySeries {
.into_py(py))
}

fn first(&self, py: Python) -> PyObject {
    // Fetch the first element as a scalar, then hand its value to Python.
    let scalar = self.series.first();
    Wrap(scalar.as_any_value()).into_py(py)
}

fn last(&self, py: Python) -> PyObject {
    // Fetch the last element as a scalar, then hand its value to Python.
    let scalar = self.series.last();
    Wrap(scalar.as_any_value()).into_py(py)
}

#[cfg(feature = "approx_unique")]
fn approx_n_unique(&self, py: Python) -> PyResult<PyObject> {
    // Convert a polars error into the Python-facing error type before
    // propagating it; on success pass the estimate to Python.
    let estimate = self.series.approx_n_unique().map_err(PyPolarsErr::from)?;
    Ok(estimate.into_py(py))
}

#[cfg(feature = "bitwise")]
fn bitwise_and(&self, py: Python) -> PyResult<PyObject> {
Ok(Wrap(
Expand Down
Loading

0 comments on commit baa65b8

Please sign in to comment.