diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index c219d0b63870f..73667afcbb4e0 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -803,7 +803,7 @@ cdef class BaseMultiIndexCodesEngine: int_keys : 1-dimensional array of dtype uint64 or object Integers representing one combination each """ - level_codes = list(target._recode_for_new_levels(self.levels)) + level_codes = list(target._recode_for_new_levels(self.levels, copy=True)) for i, codes in enumerate(level_codes): if self.levels[i].hasnans: na_index = self.levels[i].isna().nonzero()[0][0] diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index d57856115d276..f2a401bd3687a 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -670,13 +670,15 @@ def _from_inferred_categories( if known_categories: # Recode from observation order to dtype.categories order. categories = dtype.categories - codes = recode_for_categories(inferred_codes, cats, categories) + codes = recode_for_categories(inferred_codes, cats, categories, copy=False) elif not cats.is_monotonic_increasing: # Sort categories and recode for unknown categories. unsorted = cats.copy() categories = cats.sort_values() - codes = recode_for_categories(inferred_codes, unsorted, categories) + codes = recode_for_categories( + inferred_codes, unsorted, categories, copy=False + ) dtype = CategoricalDtype(categories, ordered=False) else: dtype = CategoricalDtype(cats, ordered=False) @@ -945,7 +947,7 @@ def _set_categories(self, categories, fastpath: bool = False) -> None: super().__init__(self._ndarray, new_dtype) - def _set_dtype(self, dtype: CategoricalDtype, copy: bool = True) -> Self: + def _set_dtype(self, dtype: CategoricalDtype, *, copy: bool) -> Self: """ Internal method for directly updating the CategoricalDtype @@ -959,7 +961,7 @@ def _set_dtype(self, dtype: CategoricalDtype, copy: bool = True) -> Self: a (valid) instance of `CategoricalDtype`. """ codes = recode_for_categories( - self.codes, self.categories, dtype.categories, copy + self.codes, self.categories, dtype.categories, copy=copy ) return type(self)._simple_new(codes, dtype=dtype) @@ -1154,7 +1156,7 @@ def set_categories( codes = cat._codes else: codes = recode_for_categories( - cat.codes, cat.categories, new_dtype.categories + cat.codes, cat.categories, new_dtype.categories, copy=False ) NDArrayBacked.__init__(cat, codes, new_dtype) return cat @@ -3006,7 +3008,7 @@ def _get_codes_for_values( def recode_for_categories( - codes: np.ndarray, old_categories, new_categories, copy: bool = True + codes: np.ndarray, old_categories, new_categories, *, copy: bool ) -> np.ndarray: """ Convert a set of codes for to a new set of categories @@ -3027,7 +3029,7 @@ def recode_for_categories( >>> old_cat = pd.Index(["b", "a", "c"]) >>> new_cat = pd.Index(["a", "b"]) >>> codes = np.array([0, 1, 1, 2]) - >>> recode_for_categories(codes, old_cat, new_cat) + >>> recode_for_categories(codes, old_cat, new_cat, copy=True) array([ 1, 0, 0, -1], dtype=int8) """ if len(old_categories) == 0: diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 0d94df65a1d6c..78833568025a3 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -318,7 +318,8 @@ def _maybe_unwrap(x): categories = categories.sort_values() new_codes = [ - recode_for_categories(c.codes, c.categories, categories) for c in to_union + recode_for_categories(c.codes, c.categories, categories, copy=False) + for c in to_union ] new_codes = np.concatenate(new_codes) else: diff --git a/pandas/core/groupby/categorical.py b/pandas/core/groupby/categorical.py index 90cd8e3ffa1c7..806f34975b8a7 100644 --- a/pandas/core/groupby/categorical.py +++ b/pandas/core/groupby/categorical.py @@ -53,7 +53,7 @@ def recode_for_groupby(c: Categorical, sort: bool, observed: bool) -> Categorica # we recode according to the uniques categories = c.categories.take(take_codes) - codes = recode_for_categories(c.codes, c.categories, categories) + codes = recode_for_categories(c.codes, c.categories, categories, copy=False) # return a new categorical that maps our new codes # and categories diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 29b34f560ab2e..f16ae9483b186 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2675,7 +2675,7 @@ def _reorder_ilevels(self, order) -> MultiIndex: ) def _recode_for_new_levels( - self, new_levels, copy: bool = True + self, new_levels, *, copy: bool ) -> Generator[np.ndarray]: if len(new_levels) > self.nlevels: raise AssertionError( diff --git a/pandas/tests/arrays/categorical/test_api.py b/pandas/tests/arrays/categorical/test_api.py index 2791fd55f54d7..6ac2e74288359 100644 --- a/pandas/tests/arrays/categorical/test_api.py +++ b/pandas/tests/arrays/categorical/test_api.py @@ -480,7 +480,7 @@ def test_recode_to_categories(self, codes, old, new, expected): expected = np.asanyarray(expected, dtype=np.int8) old = Index(old) new = Index(new) - result = recode_for_categories(codes, old, new) + result = recode_for_categories(codes, old, new, copy=True) tm.assert_numpy_array_equal(result, expected) def test_recode_to_categories_large(self): @@ -489,5 +489,5 @@ def test_recode_to_categories_large(self): old = Index(codes) expected = np.arange(N - 1, -1, -1, dtype=np.int16) new = Index(expected) - result = recode_for_categories(codes, old, new) + result = recode_for_categories(codes, old, new, copy=True) tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/arrays/categorical/test_dtypes.py b/pandas/tests/arrays/categorical/test_dtypes.py index ec1d501ddba16..daacf4c69a8a9 100644 --- a/pandas/tests/arrays/categorical/test_dtypes.py +++ b/pandas/tests/arrays/categorical/test_dtypes.py @@ -49,12 +49,12 @@ def test_categories_match_up_to_permutation(self): def test_set_dtype_same(self): c = Categorical(["a", "b", "c"]) - result = c._set_dtype(CategoricalDtype(["a", "b", "c"])) + result = c._set_dtype(CategoricalDtype(["a", "b", "c"]), copy=True) tm.assert_categorical_equal(result, c) def test_set_dtype_new_categories(self): c = Categorical(["a", "b", "c"]) - result = c._set_dtype(CategoricalDtype(list("abcd"))) + result = c._set_dtype(CategoricalDtype(list("abcd")), copy=True) tm.assert_numpy_array_equal(result.codes, c.codes) tm.assert_index_equal(result.dtype.categories, Index(list("abcd"))) @@ -86,12 +86,12 @@ def test_set_dtype_new_categories(self): def test_set_dtype_many(self, values, categories, new_categories, ordered): c = Categorical(values, categories) expected = Categorical(values, new_categories, ordered) - result = c._set_dtype(expected.dtype) + result = c._set_dtype(expected.dtype, copy=True) tm.assert_categorical_equal(result, expected) def test_set_dtype_no_overlap(self): c = Categorical(["a", "b", "c"], ["d", "e"]) - result = c._set_dtype(CategoricalDtype(["a", "b"])) + result = c._set_dtype(CategoricalDtype(["a", "b"]), copy=True) expected = Categorical([None, None, None], categories=["a", "b"]) tm.assert_categorical_equal(result, expected) diff --git a/pandas/tests/arrays/categorical/test_missing.py b/pandas/tests/arrays/categorical/test_missing.py index e3cb9664e19f2..541b271098152 100644 --- a/pandas/tests/arrays/categorical/test_missing.py +++ b/pandas/tests/arrays/categorical/test_missing.py @@ -52,7 +52,7 @@ def test_nan_handling(self): def test_set_dtype_nans(self): c = Categorical(["a", "b", np.nan]) - result = c._set_dtype(CategoricalDtype(["a", "c"])) + result = c._set_dtype(CategoricalDtype(["a", "c"]), copy=True) tm.assert_numpy_array_equal(result.codes, np.array([0, -1, -1], dtype="int8")) def test_set_item_nan(self):