Merge pull request #205 from lincc-frameworks/sort_values_nested

dougbrn · web-flow · commit 495b047a5aa5 · 2025-02-28T09:07:53.000-08:00
Wrapper for DataFrame.sort_values
diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py
@@ -916,6 +916,118 @@ def dropna(
             return None
         return new_df
 
+    def sort_values(
+        self,
+        by,
+        *,
+        axis=0,
+        ascending=True,
+        inplace=False,
+        kind="quicksort",
+        na_position="last",
+        ignore_index=False,
+        key=None,
+    ):
+        """
+        Sort by the values along either axis.
+
+        Parameters:
+        -----------
+        by : str or list of str
+            Name or list of names to sort by.
+
+            Access nested columns using `nested_df.nested_col` (where
+            `nested_df` refers to a particular nested dataframe and
+            `nested_col` is a column of that nested dataframe).
+        axis : {0 or 'index', 1 or 'columns'}, default 0
+            Axis to be sorted.
+        ascending : bool or list of bool, default True
+            Sort ascending vs. descending. Specify list for multiple sort
+            orders. If this is a list of bools, must match the length of the
+            by.
+        inplace : bool, default False
+            If True, perform operation in-place.
+        kind : {'quicksort', 'mergesort', 'heapsort'}, default 'quicksort'
+            Choice of sorting algorithm. See also ndarray.np.sort for more
+            information. mergesort is the only stable algorithm. For DataFrames,
+            this option is only applied when sorting on a single column or label.
+        na_position : {'first', 'last'}, default 'last'
+            Puts NaNs at the beginning if first; last puts NaNs at the end.
+        ignore_index : bool, default False
+            If True, the resulting axis will be labeled 0, 1, …, n - 1.
+            Always False when applied to nested layers.
+        key : callable, optional
+            Apply the key function to the values before sorting.
+
+        Returns:
+        --------
+        DataFrame or None
+            DataFrame with sorted values if inplace=False, None otherwise.
+        """
+
+        # Resolve target layer
+        target = []
+        if isinstance(by, str):
+            by = [by]
+        # Check "by" columns for hierarchical references
+        for col in by:
+            if self._is_known_hierarchical_column(col):
+                target.append(col.split(".")[0])
+            else:
+                target.append("base")
+
+        # Ensure one target layer, preventing multi-layer operations
+        target = np.unique(target)
+        if len(target) > 1:
+            raise ValueError("Queries cannot target multiple structs/layers, write a separate query for each")
+        target = str(target[0])
+
+        # Apply pandas sort_values
+        if target == "base":
+            return super().sort_values(
+                by=by,
+                axis=axis,
+                ascending=ascending,
+                inplace=inplace,
+                kind=kind,
+                na_position=na_position,
+                ignore_index=ignore_index,
+                key=key,
+            )
+        else:  # target is a nested column
+            target_flat = self[target].nest.to_flat()
+            target_flat = target_flat.set_index(self[target].array.get_list_index())
+
+            if target_flat.index.name is None:  # set name if not present
+                target_flat.index.name = "index"
+            # Index must always be the first sort key for nested columns
+            nested_by = [target_flat.index.name] + [col.split(".")[-1] for col in by]
+
+            # Augment the ascending kwarg to include the index
+            if isinstance(ascending, bool):
+                ascending = [True] + [ascending] * len(by)
+            elif isinstance(ascending, list):
+                ascending = [True] + ascending
+
+            target_flat = target_flat.sort_values(
+                by=nested_by,
+                axis=axis,
+                ascending=ascending,
+                kind=kind,
+                na_position=na_position,
+                ignore_index=False,
+                key=key,
+                inplace=False,
+            )
+
+            #  Could be optimized, as number of rows doesn't change
+            new_df = self._set_filtered_flat_df(nest_name=target, flat_df=target_flat)
+
+            if inplace:
+                self._update_inplace(new_df)
+                return None
+            return new_df
+
     def reduce(self, func, *args, **kwargs) -> NestedFrame:  # type: ignore[override]
         """
         Takes a function and applies it to each top-level row of the NestedFrame.
diff --git a/tests/nested_pandas/nestedframe/test_nestedframe.py b/tests/nested_pandas/nestedframe/test_nestedframe.py
@@ -807,6 +807,60 @@ def test_dropna_errors():
         base.dropna(on_nested="nested", subset=["b"])
 
 
+def test_sort_values():
+    """Test that sort_values works on all layers"""
+
+    base = NestedFrame(data={"a": [1, 2, 3], "b": [2, 3, 6]}, index=[0, 1, 2])
+
+    nested = pd.DataFrame(
+        data={"c": [0, 2, 4, 1, 4, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
+        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
+    )
+
+    base = base.add_nested(nested, "nested")
+
+    # Test basic functionality
+    sv_base = base.sort_values("b")
+    assert list(sv_base.index) == [0, 1, 2]
+
+    # Test on nested column
+    sv_base = base.sort_values(["nested.d"])
+    assert list(sv_base.iloc[0]["nested"]["d"]) == [4, 5, 7]
+
+    # Test multi-layer error trigger
+    with pytest.raises(ValueError):
+        base.sort_values(["a", "nested.c"])
+
+    # Test inplace=True
+    base.sort_values("nested.d", inplace=True)
+    assert list(base.iloc[0]["nested"]["d"]) == [4, 5, 7]
+
+
+def test_sort_values_ascension():
+    """Test that sort_values works with various ascending settings"""
+
+    base = NestedFrame(data={"a": [1, 2, 3], "b": [2, 3, 6]}, index=[0, 1, 2])
+
+    nested = pd.DataFrame(
+        data={"c": [0, 2, 4, 1, 4, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
+        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
+    )
+
+    base = base.add_nested(nested, "nested")
+
+    # Test ascending=False
+    sv_base = base.sort_values("nested.d", ascending=False)
+    assert list(sv_base.iloc[0]["nested"]["d"]) == [7, 5, 4]
+
+    # Test list ascending
+    sv_base = base.sort_values("nested.d", ascending=[False])
+    assert list(sv_base.iloc[0]["nested"]["d"]) == [7, 5, 4]
+
+    # Test multi-by multi-ascending
+    sv_base = base.sort_values(["nested.d", "nested.c"], ascending=[False, True])
+    assert list(sv_base.iloc[0]["nested"]["d"]) == [7, 5, 4]
+
+
 def test_reduce():
     """Tests that we can call reduce on a NestedFrame with a custom function."""
     nf = NestedFrame(