Skip to content

Commit

Permalink
Align CategoricalIndex APIs with pandas 2.x (#16369)
Browse files Browse the repository at this point in the history
Mostly exposing methods that were available on the CategoricalColumn

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: #16369
  • Loading branch information
mroeschke authored Jul 30, 2024
1 parent 368a34c commit d1be0b6
Show file tree
Hide file tree
Showing 3 changed files with 247 additions and 55 deletions.
130 changes: 75 additions & 55 deletions python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,37 +262,10 @@ def add_categories(self, new_categories: Any) -> SeriesOrIndex | None:
dtype: category
Categories (2, int64): [1, 2]
"""
old_categories = self._column.categories
new_categories = column.as_column(
new_categories,
dtype=old_categories.dtype if len(new_categories) == 0 else None,
)

if is_mixed_with_object_dtype(old_categories, new_categories):
raise TypeError(
f"cudf does not support adding categories with existing "
f"categories of dtype `{old_categories.dtype}` and new "
f"categories of dtype `{new_categories.dtype}`, please "
f"type-cast new_categories to the same type as "
f"existing categories."
)
common_dtype = find_common_type(
[old_categories.dtype, new_categories.dtype]
return self._return_or_inplace(
self._column.add_categories(new_categories=new_categories)
)

new_categories = new_categories.astype(common_dtype)
old_categories = old_categories.astype(common_dtype)

if old_categories.isin(new_categories).any():
raise ValueError("new categories must not include old categories")

new_categories = old_categories.append(new_categories)
out_col = self._column
if not out_col._categories_equal(new_categories):
out_col = out_col._set_categories(new_categories)

return self._return_or_inplace(out_col)

def remove_categories(
self,
removals: Any,
Expand Down Expand Up @@ -349,23 +322,9 @@ def remove_categories(
dtype: category
Categories (3, int64): [1, 2, 10]
"""

cats = self.categories.to_series()
removals = cudf.Series(removals, dtype=cats.dtype)
removals_mask = removals.isin(cats)

# ensure all the removals are in the current categories
# list. If not, raise an error to match Pandas behavior
if not removals_mask.all():
vals = removals[~removals_mask].to_numpy()
raise ValueError(f"removals must all be in old categories: {vals}")

new_categories = cats[~cats.isin(removals)]._column
out_col = self._column
if not out_col._categories_equal(new_categories):
out_col = out_col._set_categories(new_categories)

return self._return_or_inplace(out_col)
return self._return_or_inplace(
self._column.remove_categories(removals=removals)
)

def set_categories(
self,
Expand Down Expand Up @@ -1319,7 +1278,7 @@ def _set_categories(
new_categories: Any,
is_unique: bool = False,
ordered: bool = False,
) -> CategoricalColumn:
) -> Self:
"""Returns a new CategoricalColumn with the categories set to the
specified *new_categories*.
Expand Down Expand Up @@ -1376,17 +1335,68 @@ def _set_categories(
new_codes = df._data["new_codes"]

# codes can't have masks, so take mask out before moving in
return column.build_categorical_column(
categories=new_cats,
codes=column.build_column(
new_codes.base_data, dtype=new_codes.dtype
return cast(
Self,
column.build_categorical_column(
categories=new_cats,
codes=column.build_column(
new_codes.base_data, dtype=new_codes.dtype
),
mask=new_codes.base_mask,
size=new_codes.size,
offset=new_codes.offset,
ordered=ordered,
),
mask=new_codes.base_mask,
size=new_codes.size,
offset=new_codes.offset,
ordered=ordered,
)

def add_categories(self, new_categories: Any) -> Self:
    """
    Return a column whose categories are extended by *new_categories*.

    Parameters
    ----------
    new_categories : Any
        Values to append after the existing categories. They are
        converted with ``column.as_column``.

    Returns
    -------
    Self
        ``self`` when the combined categories compare equal to the
        current ones, otherwise the result of ``_set_categories``.

    Raises
    ------
    TypeError
        If the existing and new categories mix object and non-object
        dtypes.
    ValueError
        If any existing category also appears in *new_categories*.
    """
    current = self.categories
    # NOTE(review): the existing dtype is only forced for empty input,
    # presumably because an empty input has no dtype to infer -- confirm.
    forced_dtype = current.dtype if len(new_categories) == 0 else None
    incoming = column.as_column(new_categories, dtype=forced_dtype)

    if is_mixed_with_object_dtype(current, incoming):
        raise TypeError(
            f"cudf does not support adding categories with existing "
            f"categories of dtype `{current.dtype}` and new "
            f"categories of dtype `{incoming.dtype}`, please "
            f"type-cast new_categories to the same type as "
            f"existing categories."
        )

    # Bring both sides to one dtype before comparing and appending.
    target_dtype = find_common_type([current.dtype, incoming.dtype])
    incoming = incoming.astype(target_dtype)
    current = current.astype(target_dtype)

    if current.isin(incoming).any():
        raise ValueError("new categories must not include old categories")

    combined = current.append(incoming)
    if self._categories_equal(combined):
        return self
    return self._set_categories(combined)

def remove_categories(self, removals: Any) -> Self:
    """
    Return a column with the categories listed in *removals* dropped.

    Parameters
    ----------
    removals : Any
        Categories to drop. They are converted with ``column.as_column``
        and cast to the dtype of the current categories.

    Returns
    -------
    Self
        ``self`` when the remaining categories compare equal to the
        current ones, otherwise the result of ``_set_categories``.

    Raises
    ------
    ValueError
        If any requested removal is not an existing category.
    """
    to_drop = column.as_column(removals).astype(self.categories.dtype)
    existing = self.categories

    # Every requested removal has to be a current category; otherwise
    # raise to match pandas behaviour.
    all_known = to_drop.isin(existing).all()
    if not all_known:
        raise ValueError("removals must all be in old categories")

    keep_mask = existing.isin(to_drop).unary_operator("not")
    remaining = existing.apply_boolean_mask(keep_mask)
    if self._categories_equal(remaining):
        return self
    return self._set_categories(remaining)

def reorder_categories(
self,
new_categories: Any,
Expand All @@ -1404,6 +1414,16 @@ def reorder_categories(
)
return self._set_categories(new_categories, ordered=ordered)

def rename_categories(self, new_categories: Any) -> Self:
    """
    Rename the categories of this column (currently unsupported).

    Parameters
    ----------
    new_categories : Any
        Unused; the method raises before looking at it.

    Raises
    ------
    NotImplementedError
        Always.
    """
    raise NotImplementedError(
        "rename_categories is currently not supported."
    )

def remove_unused_categories(self) -> Self:
    """
    Drop categories that are not used (currently unsupported).

    Raises
    ------
    NotImplementedError
        Always.
    """
    message = "remove_unused_categories is currently not supported."
    raise NotImplementedError(message)

def as_ordered(self, ordered: bool):
if self.dtype.ordered == ordered:
return self
Expand Down
116 changes: 116 additions & 0 deletions python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -2721,6 +2721,10 @@ def __init__(
data = data.as_ordered(ordered=False)
super().__init__(data, name=name)

@property
def ordered(self) -> bool:
    """Return the ``ordered`` flag of the underlying column."""
    backing_column = self._column
    return backing_column.ordered

@property # type: ignore
@_performance_tracking
def codes(self):
Expand All @@ -2743,6 +2747,118 @@ def _is_boolean(self):
def _is_categorical(self):
    """Report that this index is categorical; always returns ``True``."""
    return True

def add_categories(self, new_categories) -> Self:
    """
    Add new categories.

    `new_categories` will be included at the last/highest place in the
    categories and will be unused directly after this call.

    Returns
    -------
    Self
        A new index of the same type, keeping this index's name.
    """
    extended = self._column.add_categories(new_categories)
    return type(self)._from_data({self.name: extended})

def as_ordered(self) -> Self:
    """
    Set the Categorical to be ordered.

    Returns
    -------
    Self
        A new index of the same type, keeping this index's name.
    """
    ordered_column = self._column.as_ordered(ordered=True)
    return type(self)._from_data({self.name: ordered_column})

def as_unordered(self) -> Self:
    """
    Set the Categorical to be unordered.

    Returns
    -------
    Self
        A new index of the same type, keeping this index's name.
    """
    unordered_column = self._column.as_ordered(ordered=False)
    return type(self)._from_data({self.name: unordered_column})

def remove_categories(self, removals) -> Self:
    """
    Remove the specified categories.

    `removals` must be included in the old categories.

    Parameters
    ----------
    removals : category or list of categories
        The categories which should be removed.

    Returns
    -------
    Self
        A new index of the same type, keeping this index's name.
    """
    trimmed = self._column.remove_categories(removals)
    return type(self)._from_data({self.name: trimmed})

def remove_unused_categories(self) -> Self:
    """
    Remove categories which are not used.

    This method is currently not supported.

    Returns
    -------
    Self
        A new index of the same type, keeping this index's name.
    """
    pruned = self._column.remove_unused_categories()
    return type(self)._from_data({self.name: pruned})

def rename_categories(self, new_categories) -> Self:
    """
    Rename categories.

    This method is currently not supported.

    Returns
    -------
    Self
        A new index of the same type, keeping this index's name.
    """
    renamed = self._column.rename_categories(new_categories)
    return type(self)._from_data({self.name: renamed})

def reorder_categories(self, new_categories, ordered=None) -> Self:
    """
    Reorder categories as specified in new_categories.

    ``new_categories`` need to include all old categories and no new
    category items.

    Parameters
    ----------
    new_categories : Index-like
        The categories in new order.
    ordered : bool, optional
        Whether or not the categorical is treated as an ordered
        categorical. If not given, do not change the ordered information.

    Returns
    -------
    Self
        A new index of the same type, keeping this index's name.
    """
    if ordered is None:
        # The column-level call forwards ``ordered`` unchanged, so
        # ``None`` is resolved to the current flag here to honour the
        # documented "do not change" default.
        ordered = self._column.ordered
    reordered = self._column.reorder_categories(
        new_categories, ordered=ordered
    )
    return type(self)._from_data({self.name: reordered})

def set_categories(
    self, new_categories, ordered=None, rename: bool = False
) -> Self:
    """
    Set the categories to the specified new_categories.

    Parameters
    ----------
    new_categories : list-like
        The categories in new order.
    ordered : bool, default None
        Whether or not the categorical is treated as
        an ordered categorical. If not given, do
        not change the ordered information.
    rename : bool, default False
        Whether or not the `new_categories` should be
        considered as a rename of the old categories
        or as reordered categories.

    Returns
    -------
    Self
        A new index of the same type, keeping this index's name.
    """
    updated = self._column.set_categories(
        new_categories, ordered=ordered, rename=rename
    )
    return type(self)._from_data({self.name: updated})


@_performance_tracking
def interval_range(
Expand Down
56 changes: 56 additions & 0 deletions python/cudf/cudf/tests/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -891,3 +891,59 @@ def test_categorical_maxima(op):
result = getattr(ser.cat.as_ordered(), op)()
result_pd = getattr(ser_pd.cat.as_ordered(), op)()
assert_eq(result, result_pd)


@pytest.mark.parametrize("ordered", [True, False])
def test_index_ordered(ordered):
    """``CategoricalIndex.ordered`` agrees between pandas and cudf."""
    pandas_index = pd.CategoricalIndex([1, 2, 3], ordered=ordered)
    gpu_index = cudf.from_pandas(pandas_index)
    assert pandas_index.ordered == gpu_index.ordered


@pytest.mark.parametrize("method", ["as_ordered", "as_unordered"])
@pytest.mark.parametrize("ordered", [True, False])
def test_index_as_ordered(method, ordered):
    """``as_ordered``/``as_unordered`` on the index agree with pandas."""
    pandas_index = pd.CategoricalIndex([1, 2, 3], ordered=ordered)
    gpu_index = cudf.from_pandas(pandas_index)

    want = getattr(pandas_index, method)()
    got = getattr(gpu_index, method)()
    assert_eq(got, want)


def test_index_add_categories():
    """Adding a category to the index agrees with pandas."""
    pandas_index = pd.CategoricalIndex([1, 2, 3])
    gpu_index = cudf.from_pandas(pandas_index)

    want = pandas_index.add_categories([4])
    got = gpu_index.add_categories([4])
    assert_eq(got, want)


def test_index_remove_categories():
    """Removing an unused category from the index agrees with pandas."""
    pandas_index = pd.CategoricalIndex([1, 2, 3], categories=[1, 2, 3, 4])
    gpu_index = cudf.from_pandas(pandas_index)

    want = pandas_index.remove_categories([4])
    got = gpu_index.remove_categories([4])
    assert_eq(got, want)


@pytest.mark.parametrize("ordered", [True, False])
def test_index_reorder_categories(ordered):
    """Reordering the index categories agrees with pandas."""
    pandas_index = pd.CategoricalIndex([1, 2, 3], categories=[1, 3, 2, 4])
    gpu_index = cudf.from_pandas(pandas_index)

    want = pandas_index.reorder_categories([1, 2, 3, 4], ordered=ordered)
    got = gpu_index.reorder_categories([1, 2, 3, 4], ordered=ordered)
    assert_eq(got, want)


@pytest.mark.parametrize("ordered", [True, False])
def test_index_set_categories(ordered):
    """Setting the index categories agrees with pandas."""
    pandas_index = pd.CategoricalIndex([1, 2, 3])
    gpu_index = cudf.from_pandas(pandas_index)

    want = pandas_index.set_categories([1, 2, 3, 4], ordered=ordered)
    got = gpu_index.set_categories([1, 2, 3, 4], ordered=ordered)
    assert_eq(got, want)

0 comments on commit d1be0b6

Please sign in to comment.