From d1be0b6dc06fddd0b69fb69731281b16894cb132 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 29 Jul 2024 15:12:38 -1000 Subject: [PATCH] Align CategoricalIndex APIs with pandas 2.x (#16369) Mostly exposing methods that were available on the CategoricalColumn Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16369 --- python/cudf/cudf/core/column/categorical.py | 130 +++++++++++--------- python/cudf/cudf/core/index.py | 116 +++++++++++++++++ python/cudf/cudf/tests/test_categorical.py | 56 +++++++++ 3 files changed, 247 insertions(+), 55 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 9aaccca349d..9433a91b9c6 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -262,37 +262,10 @@ def add_categories(self, new_categories: Any) -> SeriesOrIndex | None: dtype: category Categories (2, int64): [1, 2] """ - old_categories = self._column.categories - new_categories = column.as_column( - new_categories, - dtype=old_categories.dtype if len(new_categories) == 0 else None, - ) - - if is_mixed_with_object_dtype(old_categories, new_categories): - raise TypeError( - f"cudf does not support adding categories with existing " - f"categories of dtype `{old_categories.dtype}` and new " - f"categories of dtype `{new_categories.dtype}`, please " - f"type-cast new_categories to the same type as " - f"existing categories." 
- ) - common_dtype = find_common_type( - [old_categories.dtype, new_categories.dtype] + return self._return_or_inplace( + self._column.add_categories(new_categories=new_categories) ) - new_categories = new_categories.astype(common_dtype) - old_categories = old_categories.astype(common_dtype) - - if old_categories.isin(new_categories).any(): - raise ValueError("new categories must not include old categories") - - new_categories = old_categories.append(new_categories) - out_col = self._column - if not out_col._categories_equal(new_categories): - out_col = out_col._set_categories(new_categories) - - return self._return_or_inplace(out_col) - def remove_categories( self, removals: Any, @@ -349,23 +322,9 @@ def remove_categories( dtype: category Categories (3, int64): [1, 2, 10] """ - - cats = self.categories.to_series() - removals = cudf.Series(removals, dtype=cats.dtype) - removals_mask = removals.isin(cats) - - # ensure all the removals are in the current categories - # list. If not, raise an error to match Pandas behavior - if not removals_mask.all(): - vals = removals[~removals_mask].to_numpy() - raise ValueError(f"removals must all be in old categories: {vals}") - - new_categories = cats[~cats.isin(removals)]._column - out_col = self._column - if not out_col._categories_equal(new_categories): - out_col = out_col._set_categories(new_categories) - - return self._return_or_inplace(out_col) + return self._return_or_inplace( + self._column.remove_categories(removals=removals) + ) def set_categories( self, @@ -1319,7 +1278,7 @@ def _set_categories( new_categories: Any, is_unique: bool = False, ordered: bool = False, - ) -> CategoricalColumn: + ) -> Self: """Returns a new CategoricalColumn with the categories set to the specified *new_categories*. 
@@ -1376,17 +1335,68 @@ def _set_categories( new_codes = df._data["new_codes"] # codes can't have masks, so take mask out before moving in - return column.build_categorical_column( - categories=new_cats, - codes=column.build_column( - new_codes.base_data, dtype=new_codes.dtype + return cast( + Self, + column.build_categorical_column( + categories=new_cats, + codes=column.build_column( + new_codes.base_data, dtype=new_codes.dtype + ), + mask=new_codes.base_mask, + size=new_codes.size, + offset=new_codes.offset, + ordered=ordered, ), - mask=new_codes.base_mask, - size=new_codes.size, - offset=new_codes.offset, - ordered=ordered, ) + def add_categories(self, new_categories: Any) -> Self: + old_categories = self.categories + new_categories = column.as_column( + new_categories, + dtype=old_categories.dtype if len(new_categories) == 0 else None, + ) + if is_mixed_with_object_dtype(old_categories, new_categories): + raise TypeError( + f"cudf does not support adding categories with existing " + f"categories of dtype `{old_categories.dtype}` and new " + f"categories of dtype `{new_categories.dtype}`, please " + f"type-cast new_categories to the same type as " + f"existing categories." + ) + common_dtype = find_common_type( + [old_categories.dtype, new_categories.dtype] + ) + + new_categories = new_categories.astype(common_dtype) + old_categories = old_categories.astype(common_dtype) + + if old_categories.isin(new_categories).any(): + raise ValueError("new categories must not include old categories") + + new_categories = old_categories.append(new_categories) + if not self._categories_equal(new_categories): + return self._set_categories(new_categories) + return self + + def remove_categories( + self, + removals: Any, + ) -> Self: + removals = column.as_column(removals).astype(self.categories.dtype) + removals_mask = removals.isin(self.categories) + + # ensure all the removals are in the current categories + # list. 
If not, raise an error to match Pandas behavior + if not removals_mask.all(): + raise ValueError("removals must all be in old categories") + + new_categories = self.categories.apply_boolean_mask( + self.categories.isin(removals).unary_operator("not") + ) + if not self._categories_equal(new_categories): + return self._set_categories(new_categories) + return self + def reorder_categories( self, new_categories: Any, @@ -1404,6 +1414,16 @@ def reorder_categories( ) return self._set_categories(new_categories, ordered=ordered) + def rename_categories(self, new_categories) -> CategoricalColumn: + raise NotImplementedError( + "rename_categories is currently not supported." + ) + + def remove_unused_categories(self) -> Self: + raise NotImplementedError( + "remove_unused_categories is currently not supported." + ) + def as_ordered(self, ordered: bool): if self.dtype.ordered == ordered: return self diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 156cb973a9a..8c3b091abec 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2721,6 +2721,10 @@ def __init__( data = data.as_ordered(ordered=False) super().__init__(data, name=name) + @property + def ordered(self) -> bool: + return self._column.ordered + @property # type: ignore @_performance_tracking def codes(self): @@ -2743,6 +2747,118 @@ def _is_boolean(self): def _is_categorical(self): return True + def add_categories(self, new_categories) -> Self: + """ + Add new categories. + + `new_categories` will be included at the last/highest place in the + categories and will be unused directly after this call. + """ + return type(self)._from_data( + {self.name: self._column.add_categories(new_categories)} + ) + + def as_ordered(self) -> Self: + """ + Set the Categorical to be ordered. + """ + return type(self)._from_data( + {self.name: self._column.as_ordered(ordered=True)} + ) + + def as_unordered(self) -> Self: + """ + Set the Categorical to be unordered. 
+ """ + return type(self)._from_data( + {self.name: self._column.as_ordered(ordered=False)} + ) + + def remove_categories(self, removals) -> Self: + """ + Remove the specified categories. + + `removals` must be included in the old categories. + + Parameters + ---------- + removals : category or list of categories + The categories which should be removed. + """ + return type(self)._from_data( + {self.name: self._column.remove_categories(removals)} + ) + + def remove_unused_categories(self) -> Self: + """ + Remove categories which are not used. + + This method is currently not supported. + """ + return type(self)._from_data( + {self.name: self._column.remove_unused_categories()} + ) + + def rename_categories(self, new_categories) -> Self: + """ + Rename categories. + + This method is currently not supported. + """ + return type(self)._from_data( + {self.name: self._column.rename_categories(new_categories)} + ) + + def reorder_categories(self, new_categories, ordered=None) -> Self: + """ + Reorder categories as specified in new_categories. + + ``new_categories`` need to include all old categories and no new category + items. + + Parameters + ---------- + new_categories : Index-like + The categories in new order. + ordered : bool, optional + Whether or not the categorical is treated as an ordered categorical. + If not given, do not change the ordered information. + """ + return type(self)._from_data( + { + self.name: self._column.reorder_categories( + new_categories, ordered=ordered + ) + } + ) + + def set_categories( + self, new_categories, ordered=None, rename: bool = False + ) -> Self: + """ + Set the categories to the specified new_categories. + + Parameters + ---------- + new_categories : list-like + The categories in new order. + ordered : bool, default None + Whether or not the categorical is treated as + an ordered categorical. If not given, do + not change the ordered information. 
+ rename : bool, default False + Whether or not the `new_categories` should be + considered as a rename of the old categories + or as reordered categories. + """ + return type(self)._from_data( + { + self.name: self._column.set_categories( + new_categories, ordered=ordered, rename=rename + ) + } + ) + @_performance_tracking def interval_range( diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index 9b6029582ce..ae58af8ebce 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -891,3 +891,59 @@ def test_categorical_maxima(op): result = getattr(ser.cat.as_ordered(), op)() result_pd = getattr(ser_pd.cat.as_ordered(), op)() assert_eq(result, result_pd) + + +@pytest.mark.parametrize("ordered", [True, False]) +def test_index_ordered(ordered): + pd_ci = pd.CategoricalIndex([1, 2, 3], ordered=ordered) + cudf_ci = cudf.from_pandas(pd_ci) + assert pd_ci.ordered == cudf_ci.ordered + + +@pytest.mark.parametrize("method", ["as_ordered", "as_unordered"]) +@pytest.mark.parametrize("ordered", [True, False]) +def test_index_as_ordered(method, ordered): + pd_ci = pd.CategoricalIndex([1, 2, 3], ordered=ordered) + cudf_ci = cudf.from_pandas(pd_ci) + + expected = getattr(pd_ci, method)() + result = getattr(cudf_ci, method)() + assert_eq(result, expected) + + +def test_index_add_categories(): + pd_ci = pd.CategoricalIndex([1, 2, 3]) + cudf_ci = cudf.from_pandas(pd_ci) + + expected = pd_ci.add_categories([4]) + result = cudf_ci.add_categories([4]) + assert_eq(result, expected) + + +def test_index_remove_categories(): + pd_ci = pd.CategoricalIndex([1, 2, 3], categories=[1, 2, 3, 4]) + cudf_ci = cudf.from_pandas(pd_ci) + + expected = pd_ci.remove_categories([4]) + result = cudf_ci.remove_categories([4]) + assert_eq(result, expected) + + +@pytest.mark.parametrize("ordered", [True, False]) +def test_index_reorder_categories(ordered): + pd_ci = pd.CategoricalIndex([1, 2, 3], categories=[1, 
3, 2, 4]) + cudf_ci = cudf.from_pandas(pd_ci) + + expected = pd_ci.reorder_categories([1, 2, 3, 4], ordered=ordered) + result = cudf_ci.reorder_categories([1, 2, 3, 4], ordered=ordered) + assert_eq(result, expected) + + +@pytest.mark.parametrize("ordered", [True, False]) +def test_index_set_categories(ordered): + pd_ci = pd.CategoricalIndex([1, 2, 3]) + cudf_ci = cudf.from_pandas(pd_ci) + + expected = pd_ci.set_categories([1, 2, 3, 4], ordered=ordered) + result = cudf_ci.set_categories([1, 2, 3, 4], ordered=ordered) + assert_eq(result, expected)