From 4d148c8550b6c3b9cbe1620fe68eb9b44b8ead05 Mon Sep 17 00:00:00 2001 From: Marshall Crumiller Date: Fri, 15 Mar 2024 20:35:22 -0400 Subject: [PATCH] Add to_local expr --- crates/polars-plan/src/dsl/cat.rs | 4 + .../polars-plan/src/dsl/function_expr/cat.rs | 8 ++ .../src/dsl/function_expr/schema.rs | 17 ++++ py-polars/polars/expr/categorical.py | 41 +++++++++ py-polars/src/expr/categorical.rs | 4 + .../operations/namespaces/test_categorical.py | 86 ++++++++++++++++++- 6 files changed, 156 insertions(+), 4 deletions(-) diff --git a/crates/polars-plan/src/dsl/cat.rs b/crates/polars-plan/src/dsl/cat.rs index 1bb3150347b4..aa2826643228 100644 --- a/crates/polars-plan/src/dsl/cat.rs +++ b/crates/polars-plan/src/dsl/cat.rs @@ -8,4 +8,8 @@ impl CategoricalNameSpace { self.0 .apply_private(CategoricalFunction::GetCategories.into()) } + + pub fn to_local(self) -> Expr { + self.0.map_private(CategoricalFunction::ToLocal.into()) + } } diff --git a/crates/polars-plan/src/dsl/function_expr/cat.rs b/crates/polars-plan/src/dsl/function_expr/cat.rs index db50f4ef4429..7df22592339d 100644 --- a/crates/polars-plan/src/dsl/function_expr/cat.rs +++ b/crates/polars-plan/src/dsl/function_expr/cat.rs @@ -5,6 +5,7 @@ use crate::map; #[derive(Clone, PartialEq, Debug, Eq, Hash)] pub enum CategoricalFunction { GetCategories, + ToLocal, } impl CategoricalFunction { @@ -12,6 +13,7 @@ impl CategoricalFunction { use CategoricalFunction::*; match self { GetCategories => mapper.with_dtype(DataType::String), + ToLocal => mapper.map_global_cat_to_local(), } } } @@ -21,6 +23,7 @@ impl Display for CategoricalFunction { use CategoricalFunction::*; let s = match self { GetCategories => "get_categories", + ToLocal => "to_local", }; write!(f, "cat.{s}") } @@ -31,6 +34,7 @@ impl From for SpecialEq> { use CategoricalFunction::*; match func { GetCategories => map!(get_categories), + ToLocal => map!(to_local), } } } @@ -48,3 +52,7 @@ fn get_categories(s: &Series) -> PolarsResult { let arr = rev_map.get_categories().clone().boxed(); Series::try_from((ca.name(), arr)) } + +fn to_local(s: &Series) -> PolarsResult { + Ok(s.categorical()?.to_local().into_series()) +} diff --git a/crates/polars-plan/src/dsl/function_expr/schema.rs b/crates/polars-plan/src/dsl/function_expr/schema.rs index a385c27820d6..46b5db5cdc98 100644 --- a/crates/polars-plan/src/dsl/function_expr/schema.rs +++ b/crates/polars-plan/src/dsl/function_expr/schema.rs @@ -411,6 +411,23 @@ impl<'a> FieldsMapper<'a> { self.map_dtype(|dtype| dtype.to_physical()) } + #[cfg(feature = "dtype-categorical")] + /// Map a global categorical to local + pub fn map_global_cat_to_local(&self) -> PolarsResult { + self.map_dtype(|dtype| match dtype { + DataType::Categorical(rev_map, ordering) => DataType::Categorical( + rev_map.clone().map(|rm| match &*rm { + RevMapping::Global(..) => { + Arc::new(RevMapping::build_local(rm.get_categories().clone())) + }, + _ => rm, + }), + *ordering, + ), + _ => dtype.clone(), + }) + } + /// Map a single dtype with a potentially failing mapper function. pub fn try_map_dtype( &self, diff --git a/py-polars/polars/expr/categorical.py b/py-polars/polars/expr/categorical.py index 2d6ce7646443..ba2c1537e685 100644 --- a/py-polars/polars/expr/categorical.py +++ b/py-polars/polars/expr/categorical.py @@ -38,3 +38,44 @@ def get_categories(self) -> Expr: └──────┘ """ return wrap_expr(self._pyexpr.cat_get_categories()) + + def to_local(self) -> Expr: + """ + Convert a categorical column to its local representation. + + This may change the underlying physical representation of the column. + + See the documentation of :func:`StringCache` for more information on the + difference between local and global categoricals. + + Examples + -------- + Compare the global and local representations of a categorical. + + >>> with pl.StringCache(): + ... _ = pl.Series("x", ["a", "b", "a"], dtype=pl.Categorical) + ... df = pl.Series("y", ["c", "b", "d"], dtype=pl.Categorical).to_frame() + >>> df.select(pl.col("y").to_physical()) + shape: (3, 1) + ┌─────┐ + │ y │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + >>> df.select(pl.col("y").cat.to_local().to_physical()) + shape: (3, 1) + ┌─────┐ + │ y │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + """ + return wrap_expr(self._pyexpr.cat_to_local()) diff --git a/py-polars/src/expr/categorical.rs b/py-polars/src/expr/categorical.rs index 5921b97313bf..df458fd4ae83 100644 --- a/py-polars/src/expr/categorical.rs +++ b/py-polars/src/expr/categorical.rs @@ -7,4 +7,8 @@ impl PyExpr { fn cat_get_categories(&self) -> Self { self.inner.clone().cat().get_categories().into() } + + fn cat_to_local(&self) -> Self { + self.inner.clone().cat().to_local().into() + } } diff --git a/py-polars/tests/unit/operations/namespaces/test_categorical.py b/py-polars/tests/unit/operations/namespaces/test_categorical.py index 708abf7eed4d..1738900b1fa2 100644 --- a/py-polars/tests/unit/operations/namespaces/test_categorical.py +++ b/py-polars/tests/unit/operations/namespaces/test_categorical.py @@ -1,4 +1,7 @@ +from collections import OrderedDict + import polars as pl +from polars import StringCache from polars.testing import assert_frame_equal @@ -81,7 +84,7 @@ def test_categorical_get_categories() -> None: ).cat.get_categories().to_list() == ["foo", "bar", "ham"] -def test_cat_to_local() -> None: +def test_cat_series_to_local() -> None: with pl.StringCache(): s1 = pl.Series(["a", "b", "a"], dtype=pl.Categorical) s2 = pl.Series(["c", "b", "d"], dtype=pl.Categorical) @@ -103,7 +106,36 @@ def test_cat_to_local() -> None: assert s2.to_list() == ["c", "b", "d"] -def test_cat_to_local_missing_values() -> None: +def test_cat_expr_to_local() -> None: + with pl.StringCache(): + df = pl.DataFrame( + { + "s1": pl.Series(["a", "b", "a"], dtype=pl.Categorical), + "s2": pl.Series(["c", "b", "d"], dtype=pl.Categorical), + } + ) + + # s2 physical starts after s1 + assert df["s1"].to_physical().to_list() == [0, 1, 0] + assert df["s2"].to_physical().to_list() == [2, 1, 3] + + out = df.select(pl.col("s1", "s2").cat.to_local()) + + # Physical has changed and now starts at 0, string values are the same + assert out["s1"].cat.is_local() + assert out["s2"].cat.is_local() + assert out["s1"].to_physical().to_list() == [0, 1, 0] + assert out["s2"].to_physical().to_list() == [0, 1, 2] + assert out["s1"].to_list() == ["a", "b", "a"] + assert out["s2"].to_list() == ["c", "b", "d"] + + # s2 should be unchanged after the operation + assert not df["s2"].cat.is_local() + assert df["s2"].to_physical().to_list() == [2, 1, 3] + assert df["s2"].to_list() == ["c", "b", "d"] + + +def test_cat_series_to_local_missing_values() -> None: with pl.StringCache(): _ = pl.Series(["a", "b"], dtype=pl.Categorical) s = pl.Series(["c", "b", None, "d"], dtype=pl.Categorical) @@ -112,7 +144,20 @@ def test_cat_to_local_missing_values() -> None: assert out.to_physical().to_list() == [0, 1, None, 2] -def test_cat_to_local_already_local() -> None: +def test_cat_expr_to_local_missing_values() -> None: + with pl.StringCache(): + _ = pl.Series(["a", "b"], dtype=pl.Categorical) + df = pl.DataFrame( + { + "s": pl.Series(["c", "b", None, "d"], dtype=pl.Categorical), + } + ) + + out = df.select(pl.col("s").cat.to_local()) + assert out["s"].to_physical().to_list() == [0, 1, None, 2] + + +def test_cat_series_to_local_already_local() -> None: s = pl.Series(["a", "c", "a", "b"], dtype=pl.Categorical) assert s.cat.is_local() @@ -122,7 +167,29 @@ def test_cat_to_local_already_local() -> None: assert out.to_list() == ["a", "c", "a", "b"] -def test_cat_is_local() -> None: +def test_cat_expr_to_local_already_local() -> None: + df = pl.DataFrame({"s": pl.Series(["a", "c", "a", "b"], dtype=pl.Categorical)}) + + assert df["s"].cat.is_local() + out = df.select(pl.col("s").cat.to_local()) + + assert out["s"].to_physical().to_list() == [0, 1, 0, 2] + assert out["s"].to_list() == ["a", "c", "a", "b"] + + +@StringCache() +def test_cat_global_to_local_schema() -> None: + _ = pl.Series(["a", "b", "c"], dtype=pl.Categorical) + schema = ( + pl.LazyFrame({"s": pl.Series(["c", "b", "d"], dtype=pl.Categorical)}) + .select(pl.col("s").cat.to_local()) + .collect_schema() + ) + + assert schema == OrderedDict([("s", pl.Categorical(ordering="physical"))]) + + +def test_cat_series_is_local() -> None: s = pl.Series(["a", "c", "a", "b"], dtype=pl.Categorical) assert s.cat.is_local() @@ -131,6 +198,17 @@ def test_cat_is_local() -> None: assert not s2.cat.is_local() +def test_cat_expr_is_local() -> None: + df = pl.DataFrame({"s": pl.Series(["a", "c", "a", "b"], dtype=pl.Categorical)}) + assert df["s"].cat.is_local() + + with pl.StringCache(): + df = df.with_columns( + pl.Series(["a", "b", "a", "c"], dtype=pl.Categorical).alias("s2") + ) + assert not df["s2"].cat.is_local() + + def test_cat_uses_lexical_ordering() -> None: s = pl.Series(["a", "b", None, "b"]).cast(pl.Categorical) assert s.cat.uses_lexical_ordering() is False