Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add Series::{first, last, approx_n_unique} #19093

Merged
merged 4 commits into from
Oct 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions crates/polars-compute/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,5 @@ version_check = { workspace = true }
[features]
nightly = []
simd = ["arrow/simd"]
approx_unique = []
dtype-array = []
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
//! # Examples
//!
//! ```
//! # use polars_compute::hyperloglogplus::*;
//! let mut hllp = HyperLogLog::new();
//! hllp.add(&12345);
//! hllp.add(&23456);
Expand Down
2 changes: 2 additions & 0 deletions crates/polars-compute/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ pub mod bitwise;
pub mod comparisons;
pub mod filter;
pub mod float_sum;
#[cfg(feature = "approx_unique")]
pub mod hyperloglogplus;
pub mod if_then_else;
pub mod min_max;
pub mod size;
Expand Down
1 change: 1 addition & 0 deletions crates/polars-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ fmt_no_tty = ["comfy-table"]
rows = []

# operations
approx_unique = ["polars-compute/approx_unique"]
bitwise = ["algorithm_group_by"]
zip_with = []
round_series = []
Expand Down
20 changes: 20 additions & 0 deletions crates/polars-core/src/chunked_array/ops/approx_n_unique.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
use std::hash::Hash;

use polars_compute::hyperloglogplus::HyperLogLog;
use polars_utils::total_ord::{ToTotalOrd, TotalEq, TotalHash};
use polars_utils::IdxSize;

use super::{ChunkApproxNUnique, ChunkedArray, PolarsDataType};

/// Approximate distinct-count support for every [`ChunkedArray`] whose physical
/// values satisfy the hashing bounds listed in the `where` clause.
impl<T> ChunkApproxNUnique for ChunkedArray<T>
where
    T: PolarsDataType,
    for<'a> T::Physical<'a>: TotalHash + TotalEq + Copy + ToTotalOrd,
    for<'a> <Option<T::Physical<'a>> as ToTotalOrd>::TotalOrdItem: Hash + Eq,
{
    /// Estimates the number of distinct items in the array with a [`HyperLogLog`] sketch.
    ///
    /// Every item produced by `self.iter()` is converted with `to_total_ord()` and
    /// added to the sketch; the sketch's count is then cast to [`IdxSize`].
    fn approx_n_unique(&self) -> IdxSize {
        let mut sketch = HyperLogLog::new();
        for item in self.iter() {
            sketch.add(&item.to_total_ord());
        }
        sketch.count() as IdxSize
    }
}
7 changes: 7 additions & 0 deletions crates/polars-core/src/chunked_array/ops/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ pub(crate) mod aggregate;
pub(crate) mod any_value;
pub(crate) mod append;
mod apply;
#[cfg(feature = "approx_unique")]
mod approx_n_unique;
pub mod arity;
mod bit_repr;
#[cfg(feature = "bitwise")]
Expand Down Expand Up @@ -375,6 +377,11 @@ pub trait ChunkUnique {
}
}

/// Approximate distinct-count operation, only compiled with the `approx_unique` feature.
#[cfg(feature = "approx_unique")]
pub trait ChunkApproxNUnique {
/// Returns an approximation of the number of unique values as an [`IdxSize`].
fn approx_n_unique(&self) -> IdxSize;
}

/// Sort operations on `ChunkedArray`.
pub trait ChunkSort<T: PolarsDataType> {
#[allow(unused_variables)]
Expand Down
12 changes: 12 additions & 0 deletions crates/polars-core/src/frame/column/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1031,6 +1031,18 @@ impl Column {
.into()),
}
}

/// Approximate number of unique values in this column.
///
/// A `Series` column forwards to the series' own `approx_n_unique`. A `Scalar`
/// column always reports `1`, provided the operation succeeds on its
/// single-value series.
#[cfg(feature = "approx_unique")]
pub fn approx_n_unique(&self) -> PolarsResult<IdxSize> {
    match self {
        Column::Series(series) => series.approx_n_unique(),
        // @NOTE: The single-value series is evaluated only so that its error (if any)
        // is propagated; the computed estimate itself is discarded.
        Column::Scalar(scalar) => scalar
            .as_single_value_series()
            .approx_n_unique()
            .map(|_| 1),
    }
}
}

impl Default for Column {
Expand Down
5 changes: 5 additions & 0 deletions crates/polars-core/src/series/implementations/binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,11 @@ impl SeriesTrait for SeriesWrap<BinaryChunked> {
ChunkUnique::arg_unique(&self.0)
}

/// Approximate unique count of the wrapped `BinaryChunked`.
///
/// Delegates to [`ChunkApproxNUnique`]; the result is always wrapped in `Ok`.
#[cfg(feature = "approx_unique")]
fn approx_n_unique(&self) -> PolarsResult<IdxSize> {
    let estimate = ChunkApproxNUnique::approx_n_unique(&self.0);
    Ok(estimate)
}

/// Returns the `BooleanChunked` produced by calling `is_null` on the wrapped array.
fn is_null(&self) -> BooleanChunked {
self.0.is_null()
}
Expand Down
5 changes: 5 additions & 0 deletions crates/polars-core/src/series/implementations/boolean.rs
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,11 @@ impl SeriesTrait for SeriesWrap<BooleanChunked> {
))
}

/// Approximate unique count of the wrapped `BooleanChunked`.
///
/// Delegates to [`ChunkApproxNUnique`]; the result is always wrapped in `Ok`.
#[cfg(feature = "approx_unique")]
fn approx_n_unique(&self) -> PolarsResult<IdxSize> {
    let estimate = ChunkApproxNUnique::approx_n_unique(&self.0);
    Ok(estimate)
}

/// Clones the wrapped array and returns it behind a fresh `Arc<dyn SeriesTrait>`.
fn clone_inner(&self) -> Arc<dyn SeriesTrait> {
    let inner = Clone::clone(&self.0);
    Arc::new(SeriesWrap(inner))
}
Expand Down
5 changes: 5 additions & 0 deletions crates/polars-core/src/series/implementations/floats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,11 @@ macro_rules! impl_dyn_series {
Ok(Scalar::new(dt, av))
}

/// Approximate unique count of the wrapped value `self.0`.
///
/// Delegates to [`ChunkApproxNUnique`]; the result is always wrapped in `Ok`.
#[cfg(feature = "approx_unique")]
fn approx_n_unique(&self) -> PolarsResult<IdxSize> {
    let estimate = ChunkApproxNUnique::approx_n_unique(&self.0);
    Ok(estimate)
}

/// Clones the wrapped value and returns it behind a fresh `Arc<dyn SeriesTrait>`.
fn clone_inner(&self) -> Arc<dyn SeriesTrait> {
    let inner = Clone::clone(&self.0);
    Arc::new(SeriesWrap(inner))
}
Expand Down
5 changes: 5 additions & 0 deletions crates/polars-core/src/series/implementations/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -497,6 +497,11 @@ macro_rules! impl_dyn_series {
Ok(Scalar::new(dt, av))
}

/// Approximate unique count of the wrapped value `self.0`.
///
/// Delegates to [`ChunkApproxNUnique`]; the result is always wrapped in `Ok`.
#[cfg(feature = "approx_unique")]
fn approx_n_unique(&self) -> PolarsResult<IdxSize> {
    let estimate = ChunkApproxNUnique::approx_n_unique(&self.0);
    Ok(estimate)
}

/// Clones the wrapped value and returns it behind a fresh `Arc<dyn SeriesTrait>`.
fn clone_inner(&self) -> Arc<dyn SeriesTrait> {
    let inner = Clone::clone(&self.0);
    Arc::new(SeriesWrap(inner))
}
Expand Down
6 changes: 6 additions & 0 deletions crates/polars-core/src/series/implementations/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,12 @@ impl SeriesTrait for SeriesWrap<StringChunked> {
/// Reduces the wrapped `StringChunked` via `ChunkAggSeries::min_reduce`; always `Ok`.
fn min_reduce(&self) -> PolarsResult<Scalar> {
    let reduced = ChunkAggSeries::min_reduce(&self.0);
    Ok(reduced)
}

/// Approximate unique count of the wrapped `StringChunked`.
///
/// Delegates to [`ChunkApproxNUnique`]; the result is always wrapped in `Ok`.
#[cfg(feature = "approx_unique")]
fn approx_n_unique(&self) -> PolarsResult<IdxSize> {
    let estimate = ChunkApproxNUnique::approx_n_unique(&self.0);
    Ok(estimate)
}

/// Clones the wrapped array and returns it behind a fresh `Arc<dyn SeriesTrait>`.
fn clone_inner(&self) -> Arc<dyn SeriesTrait> {
    let inner = Clone::clone(&self.0);
    Arc::new(SeriesWrap(inner))
}
Expand Down
39 changes: 36 additions & 3 deletions crates/polars-core/src/series/series_trait.rs
Original file line number Diff line number Diff line change
Expand Up @@ -147,20 +147,23 @@ pub(crate) mod private {
unsafe fn agg_list(&self, groups: &GroupsProxy) -> Series {
    // Default implementation: `Series::full_null` with this field's name, one entry
    // per group, and this series' dtype.
    let name = self._field().name().clone();
    Series::full_null(name, groups.len(), self._dtype())
}

/// Default bitwise-AND aggregation: builds `Series::full_null` with this field's
/// name, one entry per group, and this series' dtype.
///
/// # Safety
///
/// Does no bounds checks, groups must be correct.
#[cfg(feature = "bitwise")]
unsafe fn agg_and(&self, groups: &GroupsProxy) -> Series {
    let name = self._field().name().clone();
    Series::full_null(name, groups.len(), self._dtype())
}

/// Default bitwise-OR aggregation: builds `Series::full_null` with this field's
/// name, one entry per group, and this series' dtype.
///
/// # Safety
///
/// Does no bounds checks, groups must be correct.
#[cfg(feature = "bitwise")]
unsafe fn agg_or(&self, groups: &GroupsProxy) -> Series {
    let name = self._field().name().clone();
    Series::full_null(name, groups.len(), self._dtype())
}

/// # Safety
///
/// Does no bounds checks, groups must be correct.
Expand Down Expand Up @@ -504,15 +507,45 @@ pub trait SeriesTrait:
}
/// Get the bitwise AND of the Series as a [`Scalar`].
///
/// # Errors
///
/// This default implementation always fails, reporting `and_reduce` as the
/// operation that is unsupported for this series' dtype.
fn and_reduce(&self) -> PolarsResult<Scalar> {
    // Report the actual operation name; a stale `sum` bail used to shadow this one.
    polars_bail!(opq = and_reduce, self._dtype());
}
/// Get the bitwise OR of the Series as a [`Scalar`].
///
/// # Errors
///
/// This default implementation always fails, reporting `or_reduce` as the
/// operation that is unsupported for this series' dtype.
fn or_reduce(&self) -> PolarsResult<Scalar> {
    // Report the actual operation name; a stale `sum` bail used to shadow this one.
    polars_bail!(opq = or_reduce, self._dtype());
}
/// Get the bitwise XOR of the Series as a [`Scalar`].
///
/// # Errors
///
/// This default implementation always fails, reporting `xor_reduce` as the
/// operation that is unsupported for this series' dtype.
fn xor_reduce(&self) -> PolarsResult<Scalar> {
    // Report the actual operation name; a stale `sum` bail used to shadow this one.
    polars_bail!(opq = xor_reduce, self._dtype());
}

/// Return the element at index 0 of the [`Series`], wrapped in a [`Scalar`] that
/// carries the series' dtype.
///
/// When `get(0)` fails (e.g. the [`Series`] is empty), the [`Scalar`] holds
/// [`AnyValue::Null`] instead.
fn first(&self) -> Scalar {
    let av = match self.get(0) {
        Ok(value) => value.into_static(),
        Err(_) => AnyValue::Null,
    };

    Scalar::new(self.dtype().clone(), av)
}

/// Return the final element of the [`Series`], wrapped in a [`Scalar`] that
/// carries the series' dtype.
///
/// An empty [`Series`] yields a [`Scalar`] holding [`AnyValue::Null`].
fn last(&self) -> Scalar {
    let av = match self.len() {
        0 => AnyValue::Null,
        n => {
            // SAFETY: n != 0 in this arm, so n - 1 < n is a valid index.
            let value = unsafe { self.get_unchecked(n - 1) };
            value.into_static()
        },
    };

    Scalar::new(self.dtype().clone(), av)
}

/// Get an approximation of the number of unique values in the Series.
///
/// This default implementation always bails, reporting `approx_n_unique` as
/// unsupported for this series' dtype.
#[cfg(feature = "approx_unique")]
fn approx_n_unique(&self) -> PolarsResult<IdxSize> {
polars_bail!(opq = approx_n_unique, self._dtype());
}

/// Clone inner ChunkedArray and wrap in a new Arc
Expand Down
5 changes: 0 additions & 5 deletions crates/polars-ops/src/series/ops/approx_algo/mod.rs

This file was deleted.

78 changes: 0 additions & 78 deletions crates/polars-ops/src/series/ops/approx_unique.rs

This file was deleted.

8 changes: 0 additions & 8 deletions crates/polars-ops/src/series/ops/mod.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,5 @@
#[cfg(feature = "abs")]
mod abs;
#[cfg(feature = "approx_unique")]
mod approx_algo;
#[cfg(feature = "approx_unique")]
mod approx_unique;
mod arg_min_max;
mod bitwise;
#[cfg(feature = "business")]
Expand Down Expand Up @@ -67,10 +63,6 @@ mod various;

#[cfg(feature = "abs")]
pub use abs::*;
#[cfg(feature = "approx_unique")]
pub use approx_algo::*;
#[cfg(feature = "approx_unique")]
pub use approx_unique::*;
pub use arg_min_max::ArgAgg;
pub use bitwise::*;
#[cfg(feature = "business")]
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-plan/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ extract_jsonpath = ["polars-ops/extract_jsonpath"]

# operations
bitwise = ["polars-core/bitwise", "polars-ops/bitwise"]
approx_unique = ["polars-ops/approx_unique", "polars-core/approx_unique"]
is_in = ["polars-ops/is_in"]
repeat_by = ["polars-ops/repeat_by"]
round_series = ["polars-ops/round_series"]
Expand Down
3 changes: 2 additions & 1 deletion crates/polars-plan/src/dsl/function_expr/dispatch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@ pub(super) fn reverse(s: &Column) -> PolarsResult<Column> {

/// Computes the approximate unique count of `s` via `Column::approx_n_unique`.
///
/// On success the count is wrapped in an `IDX_DTYPE` [`Scalar`] and returned as a
/// scalar column that reuses the name of `s` (built with a length argument of `1`).
/// Any error from the column is propagated unchanged.
#[cfg(feature = "approx_unique")]
pub(super) fn approx_n_unique(s: &Column) -> PolarsResult<Column> {
    s.approx_n_unique()
        .map(|v| Column::new_scalar(s.name().clone(), Scalar::new(IDX_DTYPE, v.into()), 1))
}

#[cfg(feature = "diff")]
Expand Down
2 changes: 2 additions & 0 deletions crates/polars-python/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ hist = ["polars/hist"]
find_many = ["polars/find_many"]
new_streaming = ["polars-lazy/new_streaming"]
bitwise = ["polars/bitwise"]
approx_unique = ["polars/approx_unique"]

dtype-i8 = []
dtype-i16 = []
Expand All @@ -181,6 +182,7 @@ dtypes = [
]

operations = [
"approx_unique",
"array_any_all",
"array_count",
"bitwise",
Expand Down
1 change: 1 addition & 0 deletions crates/polars-python/src/expr/general.rs
Original file line number Diff line number Diff line change
Expand Up @@ -418,6 +418,7 @@ impl PyExpr {
.into()
}

/// Builds a new expression by applying `approx_n_unique` to a clone of the inner expression.
#[cfg(feature = "approx_unique")]
fn approx_n_unique(&self) -> Self {
    let expr = self.inner.clone();
    expr.approx_n_unique().into()
}
Expand Down
17 changes: 17 additions & 0 deletions crates/polars-python/src/series/aggregation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,23 @@ impl PySeries {
.into_py(py))
}

/// Converts the value of the series' `first()` scalar into a Python object.
fn first(&self, py: Python) -> PyObject {
    let scalar = self.series.first();
    Wrap(scalar.as_any_value()).into_py(py)
}

/// Converts the value of the series' `last()` scalar into a Python object.
fn last(&self, py: Python) -> PyObject {
    let scalar = self.series.last();
    Wrap(scalar.as_any_value()).into_py(py)
}

/// Returns the series' approximate unique count as a Python object.
///
/// A failure from the series is converted through `PyPolarsErr` before being returned.
#[cfg(feature = "approx_unique")]
fn approx_n_unique(&self, py: Python) -> PyResult<PyObject> {
    let estimate = self
        .series
        .approx_n_unique()
        .map_err(PyPolarsErr::from)?;
    Ok(estimate.into_py(py))
}

#[cfg(feature = "bitwise")]
fn bitwise_and(&self, py: Python) -> PyResult<PyObject> {
Ok(Wrap(
Expand Down
Loading
Loading