From c5cf3f9b8dfdaa6487a04c49efdc03581950565c Mon Sep 17 00:00:00 2001 From: Siddharth Vishwakarma <153494533+siddharth-vi@users.noreply.github.com> Date: Tue, 31 Dec 2024 18:13:17 +0530 Subject: [PATCH] fix: Fix incorrectly added sorted flag after append for lexically ordered categorical series (#20414) --- .../src/chunked_array/logical/categorical/merge.rs | 8 +++++++- py-polars/tests/unit/datatypes/test_categorical.py | 9 +++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/crates/polars-core/src/chunked_array/logical/categorical/merge.rs b/crates/polars-core/src/chunked_array/logical/categorical/merge.rs index 0e72de7a903f..8117c424215f 100644 --- a/crates/polars-core/src/chunked_array/logical/categorical/merge.rs +++ b/crates/polars-core/src/chunked_array/logical/categorical/merge.rs @@ -143,7 +143,7 @@ pub fn call_categorical_merge_operation( ) -> PolarsResult { let rev_map_left = cat_left.get_rev_map(); let rev_map_right = cat_right.get_rev_map(); - let (new_physical, new_rev_map) = match (&**rev_map_left, &**rev_map_right) { + let (mut new_physical, new_rev_map) = match (&**rev_map_left, &**rev_map_right) { (RevMapping::Global(_, _, idl), RevMapping::Global(_, _, idr)) if idl == idr => { let mut rev_map_merger = GlobalRevMapMerger::new(rev_map_left.clone()); rev_map_merger.merge_map(rev_map_right)?; @@ -176,6 +176,12 @@ pub fn call_categorical_merge_operation( }, _ => polars_bail!(string_cache_mismatch), }; + // During merge operation, the sorted flag might get set on the underlying physical. + // Ensure that the sorted flag is not set if we use lexical order + if cat_left.uses_lexical_ordering() { + new_physical.set_sorted_flag(IsSorted::Not) + } + // SAFETY: physical and rev map are correctly constructed above unsafe { Ok(CategoricalChunked::from_cats_and_rev_map_unchecked( diff --git a/py-polars/tests/unit/datatypes/test_categorical.py b/py-polars/tests/unit/datatypes/test_categorical.py index b28dd269c19c..505986422c55 100644 --- a/py-polars/tests/unit/datatypes/test_categorical.py +++ b/py-polars/tests/unit/datatypes/test_categorical.py @@ -835,6 +835,15 @@ def test_cat_append_lexical_sorted_flag() -> None: assert not (df2["y"].is_sorted()) + s = pl.Series("a", ["z", "k", "a"], pl.Categorical("lexical")) + s1 = s[[0]] + s2 = s[[1]] + s3 = s[[2]] + s1.append(s2) + s1.append(s3) + + assert not (s1.is_sorted()) + def test_get_cat_categories_multiple_chunks() -> None: df = pl.DataFrame(