From f1171a8ad0d5f5735fa7134f95227abe91295c42 Mon Sep 17 00:00:00 2001 From: Jonathan Tammo Siebert Date: Sat, 12 Dec 2020 17:47:06 +0100 Subject: [PATCH 1/6] DOC: Clarify that DataFrame.sort_values is stable for len(by)>1 --- pandas/core/generic.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 41cb76d88957e..472c505700327 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4367,9 +4367,9 @@ def sort_values( If True, perform operation in-place. kind : {'quicksort', 'mergesort', 'heapsort'}, default 'quicksort' Choice of sorting algorithm. See also ndarray.np.sort for more - information. `mergesort` is the only stable algorithm. For - DataFrames, this option is only applied when sorting on a single - column or label. + information. `mergesort` is the only stable algorithm. For + DataFrames, if sorting by multiple columns or labels, this + argument is ignored, defaulting to a stable sorting algorithm. na_position : {'first', 'last'}, default 'last' Puts NaNs at the beginning if `first`; `last` puts NaNs at the end. From 2e24f694dc5a157d0b552aee46ee1b7214b29f60 Mon Sep 17 00:00:00 2001 From: Jonathan Tammo Siebert Date: Tue, 22 Dec 2020 16:22:32 +0100 Subject: [PATCH 2/6] TST: Update test case to show multilabel sort is stable --- .../tests/frame/methods/test_sort_values.py | 29 +++++++++++-------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/pandas/tests/frame/methods/test_sort_values.py b/pandas/tests/frame/methods/test_sort_values.py index 1bb969956e074..904ab2a9c354f 100644 --- a/pandas/tests/frame/methods/test_sort_values.py +++ b/pandas/tests/frame/methods/test_sort_values.py @@ -219,25 +219,30 @@ def test_sort_values_stable_descending_sort(self): def test_sort_values_stable_descending_multicolumn_sort(self): df = DataFrame( - {"A": [1, 2, np.nan, 1, 6, 8, 4], "B": [9, np.nan, 5, 2, 5, 4, 5]} + { + "A": [1, 2, np.nan, 1, 1, 1, 6, 8, 4, 8, 8, np.nan, np.nan, 8, 8], + "B": [9, np.nan, 5, 2, 2, 2, 5, 4, 5, 3, 4, np.nan, np.nan, 4, 4], + } ) - # test stable mergesort + # test sorting is stable expected = DataFrame( - {"A": [np.nan, 8, 6, 4, 2, 1, 1], "B": [5, 4, 5, 5, np.nan, 2, 9]}, - index=[2, 5, 4, 6, 1, 3, 0], - ) - sorted_df = df.sort_values( - ["A", "B"], ascending=[0, 1], na_position="first", kind="mergesort" + { + "A": [np.nan, np.nan, np.nan, 8, 8, 8, 8, 8, 6, 4, 2, 1, 1, 1, 1], + "B": [np.nan, np.nan, 5, 3, 4, 4, 4, 4, 5, 5, np.nan, 2, 2, 2, 9], + }, + index=[11, 12, 2, 9, 7, 10, 13, 14, 6, 8, 1, 3, 4, 5, 0], ) + sorted_df = df.sort_values(["A", "B"], ascending=[0, 1], na_position="first") tm.assert_frame_equal(sorted_df, expected) expected = DataFrame( - {"A": [np.nan, 8, 6, 4, 2, 1, 1], "B": [5, 4, 5, 5, np.nan, 9, 2]}, - index=[2, 5, 4, 6, 1, 0, 3], - ) - sorted_df = df.sort_values( - ["A", "B"], ascending=[0, 0], na_position="first", kind="mergesort" + { + "A": [8, 8, 8, 8, 8, 6, 4, 2, 1, 1, 1, 1, np.nan, np.nan, np.nan], + "B": [4, 4, 4, 4, 3, 5, 5, np.nan, 9, 2, 2, 2, 5, np.nan, np.nan], + }, + index=[7, 10, 13, 14, 9, 6, 8, 1, 0, 3, 4, 5, 2, 11, 12], ) + sorted_df = df.sort_values(["A", "B"], ascending=[0, 0], na_position="last") tm.assert_frame_equal(sorted_df, expected) def test_sort_values_stable_categorial(self): From 2addf1dd3191c9c652c502a8a5471b6c87379c01 Mon Sep 17 00:00:00 2001 From: Jonathan Tammo Siebert Date: Tue, 22 Dec 2020 17:21:09 +0100 Subject: [PATCH 3/6] TST: Parametrize sort options in multicol sort_values test --- .../tests/frame/methods/test_sort_values.py | 78 ++++++++++++++----- 1 file changed, 59 insertions(+), 19 deletions(-) diff --git a/pandas/tests/frame/methods/test_sort_values.py b/pandas/tests/frame/methods/test_sort_values.py index 904ab2a9c354f..c59f545c45d11 100644 --- a/pandas/tests/frame/methods/test_sort_values.py +++ b/pandas/tests/frame/methods/test_sort_values.py @@ -217,32 +217,72 @@ def test_sort_values_stable_descending_sort(self): sorted_df = df.sort_values(by="sort_col", kind="mergesort", ascending=False) tm.assert_frame_equal(df, sorted_df) - def test_sort_values_stable_descending_multicolumn_sort(self): + @pytest.mark.parametrize( + "expected_idx, ascending, na_position", + [ + pytest.param( + [11, 12, 2, 3, 4, 5, 0, 1, 8, 6, 9, 7, 10, 13, 14], + [True, True], + "first", + id="both_ascending_na_first", + ), + pytest.param( + [11, 12, 2, 0, 3, 4, 5, 1, 8, 6, 7, 10, 13, 14, 9], + [True, False], + "first", + id="first_ascending_na_first", + ), + pytest.param( + [11, 12, 2, 9, 7, 10, 13, 14, 6, 8, 1, 3, 4, 5, 0], + [False, True], + "first", + id="second_ascending_na_first", + ), + pytest.param( + [11, 12, 2, 7, 10, 13, 14, 9, 6, 8, 1, 0, 3, 4, 5], + [False, False], + "first", + id="both_descending_na_first", + ), + pytest.param( + [3, 4, 5, 0, 1, 8, 6, 9, 7, 10, 13, 14, 2, 11, 12], + [True, True], + "last", + id="both_ascending_na_last", + ), + pytest.param( + [0, 3, 4, 5, 1, 8, 6, 7, 10, 13, 14, 9, 2, 11, 12], + [True, False], + "last", + id="first_ascending_na_last", + ), + pytest.param( + [9, 7, 10, 13, 14, 6, 8, 1, 3, 4, 5, 0, 2, 11, 12], + [False, True], + "last", + id="second_ascending_na_last", + ), + pytest.param( + [7, 10, 13, 14, 9, 6, 8, 1, 0, 3, 4, 5, 2, 11, 12], + [False, False], + "last", + id="both_descending_na_last", + ), + ], + ) + def test_sort_values_stable_multicolumn_sort( + self, expected_idx, ascending, na_position + ): df = DataFrame( { "A": [1, 2, np.nan, 1, 1, 1, 6, 8, 4, 8, 8, np.nan, np.nan, 8, 8], "B": [9, np.nan, 5, 2, 2, 2, 5, 4, 5, 3, 4, np.nan, np.nan, 4, 4], } ) - # test sorting is stable - expected = DataFrame( - { - "A": [np.nan, np.nan, np.nan, 8, 8, 8, 8, 8, 6, 4, 2, 1, 1, 1, 1], - "B": [np.nan, np.nan, 5, 3, 4, 4, 4, 4, 5, 5, np.nan, 2, 2, 2, 9], - }, - index=[11, 12, 2, 9, 7, 10, 13, 14, 6, 8, 1, 3, 4, 5, 0], - ) - sorted_df = df.sort_values(["A", "B"], ascending=[0, 1], na_position="first") - tm.assert_frame_equal(sorted_df, expected) - - expected = DataFrame( - { - "A": [8, 8, 8, 8, 8, 6, 4, 2, 1, 1, 1, 1, np.nan, np.nan, np.nan], - "B": [4, 4, 4, 4, 3, 5, 5, np.nan, 9, 2, 2, 2, 5, np.nan, np.nan], - }, - index=[7, 10, 13, 14, 9, 6, 8, 1, 0, 3, 4, 5, 2, 11, 12], + expected = df.take(expected_idx) + sorted_df = df.sort_values( + ["A", "B"], ascending=ascending, na_position=na_position ) - sorted_df = df.sort_values(["A", "B"], ascending=[0, 0], na_position="last") tm.assert_frame_equal(sorted_df, expected) def test_sort_values_stable_categorial(self): From da6b97a523084f4ee5dd579328ba5b32d13ab4b0 Mon Sep 17 00:00:00 2001 From: Jonathan Tammo Siebert Date: Tue, 22 Dec 2020 18:31:05 +0100 Subject: [PATCH 4/6] TST: Simplified parametrization --- .../tests/frame/methods/test_sort_values.py | 62 +++++++------------ 1 file changed, 24 insertions(+), 38 deletions(-) diff --git a/pandas/tests/frame/methods/test_sort_values.py b/pandas/tests/frame/methods/test_sort_values.py index c59f545c45d11..554b54178c9de 100644 --- a/pandas/tests/frame/methods/test_sort_values.py +++ b/pandas/tests/frame/methods/test_sort_values.py @@ -218,60 +218,39 @@ def test_sort_values_stable_descending_sort(self): tm.assert_frame_equal(df, sorted_df) @pytest.mark.parametrize( - "expected_idx, ascending, na_position", + "expected_idx_non_na, ascending", [ pytest.param( - [11, 12, 2, 3, 4, 5, 0, 1, 8, 6, 9, 7, 10, 13, 14], + [3, 4, 5, 0, 1, 8, 6, 9, 7, 10, 13, 14], [True, True], - "first", - id="both_ascending_na_first", + id="both_ascending", ), pytest.param( - [11, 12, 2, 0, 3, 4, 5, 1, 8, 6, 7, 10, 13, 14, 9], + [0, 3, 4, 5, 1, 8, 6, 7, 10, 13, 14, 9], [True, False], - "first", - id="first_ascending_na_first", + id="first_ascending", ), pytest.param( - [11, 12, 2, 9, 7, 10, 13, 14, 6, 8, 1, 3, 4, 5, 0], + [9, 7, 10, 13, 14, 6, 8, 1, 3, 4, 5, 0], [False, True], - "first", - id="second_ascending_na_first", + id="second_ascending", ), pytest.param( - [11, 12, 2, 7, 10, 13, 14, 9, 6, 8, 1, 0, 3, 4, 5], + [7, 10, 13, 14, 9, 6, 8, 1, 0, 3, 4, 5], [False, False], - "first", - id="both_descending_na_first", - ), - pytest.param( - [3, 4, 5, 0, 1, 8, 6, 9, 7, 10, 13, 14, 2, 11, 12], - [True, True], - "last", - id="both_ascending_na_last", - ), - pytest.param( - [0, 3, 4, 5, 1, 8, 6, 7, 10, 13, 14, 9, 2, 11, 12], - [True, False], - "last", - id="first_ascending_na_last", - ), - pytest.param( - [9, 7, 10, 13, 14, 6, 8, 1, 3, 4, 5, 0, 2, 11, 12], - [False, True], - "last", - id="second_ascending_na_last", - ), - pytest.param( - [7, 10, 13, 14, 9, 6, 8, 1, 0, 3, 4, 5, 2, 11, 12], - [False, False], - "last", - id="both_descending_na_last", + id="both_descending", ), ], ) + @pytest.mark.parametrize( + "na_position", + [ + pytest.param("first", id="na_first"), + pytest.param("last", id="na_last"), + ], + ) def test_sort_values_stable_multicolumn_sort( - self, expected_idx, ascending, na_position + self, expected_idx_non_na, ascending, na_position ): df = DataFrame( { @@ -279,6 +258,13 @@ def test_sort_values_stable_multicolumn_sort( "B": [9, np.nan, 5, 2, 2, 2, 5, 4, 5, 3, 4, np.nan, np.nan, 4, 4], } ) + # All rows with NaN in col "B" only have unique value in "A", therefore, + # only the rows with NaNs in "A" have to be treated individually: + expected_idx = ( + [11, 12, 2] + expected_idx_non_na + if na_position == "first" + else expected_idx_non_na + [2, 11, 12] + ) expected = df.take(expected_idx) sorted_df = df.sort_values( ["A", "B"], ascending=ascending, na_position=na_position From 33c1a93b8ac6a2979c3b2308ec274c74bbea97d4 Mon Sep 17 00:00:00 2001 From: Jonathan Tammo Siebert Date: Tue, 22 Dec 2020 19:01:11 +0100 Subject: [PATCH 5/6] TST: Removed parametrization IDs --- .../tests/frame/methods/test_sort_values.py | 28 ++++++------------- 1 file changed, 9 insertions(+), 19 deletions(-) diff --git a/pandas/tests/frame/methods/test_sort_values.py b/pandas/tests/frame/methods/test_sort_values.py index 554b54178c9de..3916ef5697812 100644 --- a/pandas/tests/frame/methods/test_sort_values.py +++ b/pandas/tests/frame/methods/test_sort_values.py @@ -220,35 +220,25 @@ def test_sort_values_stable_descending_sort(self): @pytest.mark.parametrize( "expected_idx_non_na, ascending", [ - pytest.param( + [ [3, 4, 5, 0, 1, 8, 6, 9, 7, 10, 13, 14], [True, True], - id="both_ascending", - ), - pytest.param( + ], + [ [0, 3, 4, 5, 1, 8, 6, 7, 10, 13, 14, 9], [True, False], - id="first_ascending", - ), - pytest.param( + ], + [ [9, 7, 10, 13, 14, 6, 8, 1, 3, 4, 5, 0], [False, True], - id="second_ascending", - ), - pytest.param( + ], + [ [7, 10, 13, 14, 9, 6, 8, 1, 0, 3, 4, 5], [False, False], - id="both_descending", - ), - ], - ) - @pytest.mark.parametrize( - "na_position", - [ - pytest.param("first", id="na_first"), - pytest.param("last", id="na_last"), + ], ], ) + @pytest.mark.parametrize("na_position", ["first", "last"]) def test_sort_values_stable_multicolumn_sort( self, expected_idx_non_na, ascending, na_position ): From 911745a3bdef862be54eb539d711306432f53d9c Mon Sep 17 00:00:00 2001 From: Jonathan Tammo Siebert Date: Tue, 22 Dec 2020 19:12:39 +0100 Subject: [PATCH 6/6] TST: Added reference to github PR --- pandas/tests/frame/methods/test_sort_values.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/frame/methods/test_sort_values.py b/pandas/tests/frame/methods/test_sort_values.py index 3916ef5697812..987848ec697d1 100644 --- a/pandas/tests/frame/methods/test_sort_values.py +++ b/pandas/tests/frame/methods/test_sort_values.py @@ -242,13 +242,14 @@ def test_sort_values_stable_descending_sort(self): def test_sort_values_stable_multicolumn_sort( self, expected_idx_non_na, ascending, na_position ): + # GH#38426 Clarify sort_values with mult. columns / labels is stable df = DataFrame( { "A": [1, 2, np.nan, 1, 1, 1, 6, 8, 4, 8, 8, np.nan, np.nan, 8, 8], "B": [9, np.nan, 5, 2, 2, 2, 5, 4, 5, 3, 4, np.nan, np.nan, 4, 4], } ) - # All rows with NaN in col "B" only have unique value in "A", therefore, + # All rows with NaN in col "B" only have unique values in "A", therefore, # only the rows with NaNs in "A" have to be treated individually: expected_idx = ( [11, 12, 2] + expected_idx_non_na