diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 41cb76d88957e..472c505700327 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4367,9 +4367,9 @@ def sort_values( If True, perform operation in-place. kind : {'quicksort', 'mergesort', 'heapsort'}, default 'quicksort' Choice of sorting algorithm. See also ndarray.np.sort for more - information. `mergesort` is the only stable algorithm. For - DataFrames, this option is only applied when sorting on a single - column or label. + information. `mergesort` is the only stable algorithm. For + DataFrames, if sorting by multiple columns or labels, this + argument is ignored, defaulting to a stable sorting algorithm. na_position : {'first', 'last'}, default 'last' Puts NaNs at the beginning if `first`; `last` puts NaNs at the end. diff --git a/pandas/tests/frame/methods/test_sort_values.py b/pandas/tests/frame/methods/test_sort_values.py index 1bb969956e074..987848ec697d1 100644 --- a/pandas/tests/frame/methods/test_sort_values.py +++ b/pandas/tests/frame/methods/test_sort_values.py @@ -217,26 +217,48 @@ def test_sort_values_stable_descending_sort(self): sorted_df = df.sort_values(by="sort_col", kind="mergesort", ascending=False) tm.assert_frame_equal(df, sorted_df) - def test_sort_values_stable_descending_multicolumn_sort(self): + @pytest.mark.parametrize( + "expected_idx_non_na, ascending", + [ + [ + [3, 4, 5, 0, 1, 8, 6, 9, 7, 10, 13, 14], + [True, True], + ], + [ + [0, 3, 4, 5, 1, 8, 6, 7, 10, 13, 14, 9], + [True, False], + ], + [ + [9, 7, 10, 13, 14, 6, 8, 1, 3, 4, 5, 0], + [False, True], + ], + [ + [7, 10, 13, 14, 9, 6, 8, 1, 0, 3, 4, 5], + [False, False], + ], + ], + ) + @pytest.mark.parametrize("na_position", ["first", "last"]) + def test_sort_values_stable_multicolumn_sort( + self, expected_idx_non_na, ascending, na_position + ): + # GH#38426 Clarify that sort_values with multiple
columns / labels is stable df = DataFrame( - {"A": [1, 2, np.nan, 1, 6, 8, 4], "B": [9, np.nan, 5, 2, 5, 4, 5]} - ) - # test stable mergesort - expected = DataFrame( - {"A": [np.nan, 8, 6, 4, 2, 1, 1], "B": [5, 4, 5, 5, np.nan, 2, 9]}, - index=[2, 5, 4, 6, 1, 3, 0], - ) - sorted_df = df.sort_values( - ["A", "B"], ascending=[0, 1], na_position="first", kind="mergesort" + { + "A": [1, 2, np.nan, 1, 1, 1, 6, 8, 4, 8, 8, np.nan, np.nan, 8, 8], + "B": [9, np.nan, 5, 2, 2, 2, 5, 4, 5, 3, 4, np.nan, np.nan, 4, 4], + } ) - tm.assert_frame_equal(sorted_df, expected) - - expected = DataFrame( - {"A": [np.nan, 8, 6, 4, 2, 1, 1], "B": [5, 4, 5, 5, np.nan, 9, 2]}, - index=[2, 5, 4, 6, 1, 0, 3], + # Each row with NaN in col "B" has either a unique value or NaN in "A", so + # only the rows with NaNs in "A" have to be treated individually: + expected_idx = ( + [11, 12, 2] + expected_idx_non_na + if na_position == "first" + else expected_idx_non_na + [2, 11, 12] ) + expected = df.take(expected_idx) sorted_df = df.sort_values( - ["A", "B"], ascending=[0, 0], na_position="first", kind="mergesort" + ["A", "B"], ascending=ascending, na_position=na_position ) tm.assert_frame_equal(sorted_df, expected)