From 43f817723dd338e1db9d96be670443dcf75171fb Mon Sep 17 00:00:00 2001 From: "Allahyar, Amin" Date: Thu, 29 Feb 2024 11:18:32 +0100 Subject: [PATCH 1/7] DOC:extended the documentation for `pandas.DataFrame.sort_values`; further explain the single-column vs. multi-column sorting; added further explanation and simplification for customized sorting, e.g, using `natsort` package --- pandas/core/frame.py | 46 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 39 insertions(+), 7 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f530466c0fc30..5abe2a2720ef6 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6819,7 +6819,9 @@ def sort_values( 4 D 7 2 e 5 C 4 3 F - Sort by col1 + **Sort by a single column** + + In this case, we are soring the rows according to values in ``col1``: >>> df.sort_values(by=["col1"]) col1 col2 col3 col4 @@ -6830,7 +6832,11 @@ def sort_values( 4 D 7 2 e 3 NaN 8 4 D - Sort by multiple columns + **Sort by multiple columns** + + You can also provide multiple columns to ``by`` argument, as shown below. In this example, the rows are + first sorted according to ``col1``, and then the rows that have an identical value in ``col1`` are sorted according + to ``col2``. >>> df.sort_values(by=["col1", "col2"]) col1 col2 col3 col4 @@ -6841,7 +6847,9 @@ def sort_values( 4 D 7 2 e 3 NaN 8 4 D - Sort Descending + **Sort in a descending order** + + The sort order can be reversed using ``ascending`` argument, as shown below: >>> df.sort_values(by="col1", ascending=False) col1 col2 col3 col4 @@ -6852,7 +6860,10 @@ def sort_values( 1 A 1 1 B 3 NaN 8 4 D - Putting NAs first + **Placing any** ``NA`` **first** + + Note that in the above example, the rows that contain an ``NA`` value in their ``col1`` are placed + at the end of the dataframe. 
This behavior can be modified via ``na_position`` argument, as shown below: >>> df.sort_values(by="col1", ascending=False, na_position="first") col1 col2 col3 col4 @@ -6863,7 +6874,10 @@ def sort_values( 0 A 2 0 a 1 A 1 1 B - Sorting with a key function + **Customized sort order** + + The ``key`` argument allows for a further customization of sorting behaviour. For example, you may want + to ignore the `letter's case <https://en.wikipedia.org/wiki/Letter_case>`__ when sorting strings: >>> df.sort_values(by="col4", key=lambda col: col.str.lower()) col1 col2 col3 col4 @@ -6874,8 +6888,10 @@ def sort_values( 4 D 7 2 e 5 C 4 3 F - Natural sort with the key argument, - using the `natsort <https://github.com/SethMMorton/natsort>` package. + Another typical example is `natural sorting <https://en.wikipedia.org/wiki/Natural_sort_order>`__. + This can be done using + ``natsort`` `package <https://github.com/SethMMorton/natsort>`__, which provides sorted indices according + to their natural order, as shown below: >>> df = pd.DataFrame( ... { @@ -6891,6 +6907,8 @@ def sort_values( 3 48hr 40 4 96hr 50 >>> from natsort import index_natsorted + >>> index_natsorted(df["time"]) + [0, 3, 2, 4, 1] >>> df.sort_values( ... by="time", key=lambda x: np.argsort(index_natsorted(df["time"])) ... ) @@ -6900,6 +6918,20 @@ def sort_values( 2 72hr 30 4 96hr 50 1 128hr 20 + + If sorted indices are not needed, you may simplify the procedure as follows: + + >>> from natsort import natsort_keygen + ... df.sort_values( + ... by="time", + ... key=natsort_keygen(), + ... 
) + time value + 0 0hr 10 + 3 48hr 40 + 2 72hr 30 + 4 96hr 50 + 1 128hr 20 """ inplace = validate_bool_kwarg(inplace, "inplace") axis = self._get_axis_number(axis) From 8e968160f8c0abcddc8307faf5d90014df483391 Mon Sep 17 00:00:00 2001 From: "Allahyar, Amin" Date: Thu, 29 Feb 2024 13:10:25 +0100 Subject: [PATCH 2/7] shortened the added docstrings to 80 columns; fixed a typo --- pandas/core/frame.py | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5abe2a2720ef6..70b88701695ae 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6821,7 +6821,7 @@ def sort_values( **Sort by a single column** - In this case, we are soring the rows according to values in ``col1``: + In this case, we are sorting the rows according to values in ``col1``: >>> df.sort_values(by=["col1"]) col1 col2 col3 col4 @@ -6834,8 +6834,9 @@ def sort_values( **Sort by multiple columns** - You can also provide multiple columns to ``by`` argument, as shown below. In this example, the rows are - first sorted according to ``col1``, and then the rows that have an identical value in ``col1`` are sorted according + You can also provide multiple columns to ``by`` argument, as shown below. + In this example, the rows are first sorted according to ``col1``, and then + the rows that have an identical value in ``col1`` are sorted according to ``col2``. >>> df.sort_values(by=["col1", "col2"]) @@ -6862,8 +6863,9 @@ def sort_values( **Placing any** ``NA`` **first** - Note that in the above example, the rows that contain an ``NA`` value in their ``col1`` are placed - at the end of the dataframe. This behavior can be modified via ``na_position`` argument, as shown below: + Note that in the above example, the rows that contain an ``NA`` value in their + ``col1`` are placed at the end of the dataframe. 
This behavior can be modified + via ``na_position`` argument, as shown below: >>> df.sort_values(by="col1", ascending=False, na_position="first") col1 col2 col3 col4 @@ -6876,8 +6878,10 @@ def sort_values( **Customized sort order** - The ``key`` argument allows for a further customization of sorting behaviour. For example, you may want - to ignore the `letter's case <https://en.wikipedia.org/wiki/Letter_case>`__ when sorting strings: + The ``key`` argument allows for a further customization of sorting behaviour. + For example, you may want + to ignore the `letter's case <https://en.wikipedia.org/wiki/Letter_case>`__ + when sorting strings: >>> df.sort_values(by="col4", key=lambda col: col.str.lower()) col1 col2 col3 col4 @@ -6888,9 +6892,11 @@ def sort_values( 4 D 7 2 e 5 C 4 3 F - Another typical example is `natural sorting <https://en.wikipedia.org/wiki/Natural_sort_order>`__. + Another typical example is + `natural sorting <https://en.wikipedia.org/wiki/Natural_sort_order>`__. This can be done using - ``natsort`` `package <https://github.com/SethMMorton/natsort>`__, which provides sorted indices according + ``natsort`` `package <https://github.com/SethMMorton/natsort>`__, + which provides sorted indices according to their natural order, as shown below: >>> df = pd.DataFrame( @@ -6922,9 +6928,10 @@ def sort_values( If sorted indices are not needed, you may simplify the procedure as follows: >>> from natsort import natsort_keygen + ... ... df.sort_values( - ... by="time", - ... key=natsort_keygen(), + ... by="time", + ... key=natsort_keygen(), ... ) time value 0 0hr 10 From 8e83eee0cbf655a3b13358cbd94ee8fd8d395aee Mon Sep 17 00:00:00 2001 From: "Allahyar, Amin" Date: Thu, 29 Feb 2024 14:12:46 +0100 Subject: [PATCH 3/7] added another `shell` line to avoid `micromamba` test failure --- pandas/core/frame.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 70b88701695ae..1fb1461f5f31c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6928,8 +6928,7 @@ def sort_values( If sorted indices are not needed, you may simplify the procedure as follows: >>> from natsort import natsort_keygen - ... - ... df.sort_values( + >>> df.sort_values( ... 
by="time", ... key=natsort_keygen(), ... ) From 3e198bb3bb0447e9941c319516de71f8206e7cc5 Mon Sep 17 00:00:00 2001 From: "Allahyar, Amin" Date: Fri, 1 Mar 2024 07:01:15 +0100 Subject: [PATCH 4/7] fixed a typo in a `DataFrame.sort_values()` example --- pandas/core/frame.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1fb1461f5f31c..37562ad9d6696 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6916,7 +6916,8 @@ def sort_values( >>> index_natsorted(df["time"]) [0, 3, 2, 4, 1] >>> df.sort_values( - ... by="time", key=lambda x: np.argsort(index_natsorted(df["time"])) + ... by="time", + ... key=lambda x: np.argsort(index_natsorted(x)), ... ) time value 0 0hr 10 From 6749bb47534d5ace53cde5dc3314577c1b08a951 Mon Sep 17 00:00:00 2001 From: "Allahyar, Amin" Date: Fri, 1 Mar 2024 10:26:56 +0100 Subject: [PATCH 5/7] added a warning to raise awareness about a potential issue with `natsort` --- pandas/core/frame.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 37562ad9d6696..28876bcfbb3ca 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6939,6 +6939,13 @@ def sort_values( 2 72hr 30 4 96hr 50 1 128hr 20 + + .. warning:: + At the time of writing (1st March, 2024), ``natsort`` (v8.4.0) may fail + without error or warning in the case of multi-column sorting of + indices (e.g., ``DataFrame.sort_index()``, see + `here `__ for further + details). 
""" inplace = validate_bool_kwarg(inplace, "inplace") axis = self._get_axis_number(axis) From 22aa5880ec2c8364d285df3bd7d8dc8a39b33aa4 Mon Sep 17 00:00:00 2001 From: "Allahyar, Amin" Date: Sat, 2 Mar 2024 10:34:08 +0100 Subject: [PATCH 6/7] simplified the examples --- pandas/core/frame.py | 51 +------------------------------------------- 1 file changed, 1 insertion(+), 50 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 28876bcfbb3ca..7905e3593e429 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6896,56 +6896,7 @@ def sort_values( `natural sorting `__. This can be done using ``natsort`` `package `__, - which provides sorted indices according - to their natural order, as shown below: - - >>> df = pd.DataFrame( - ... { - ... "time": ["0hr", "128hr", "72hr", "48hr", "96hr"], - ... "value": [10, 20, 30, 40, 50], - ... } - ... ) - >>> df - time value - 0 0hr 10 - 1 128hr 20 - 2 72hr 30 - 3 48hr 40 - 4 96hr 50 - >>> from natsort import index_natsorted - >>> index_natsorted(df["time"]) - [0, 3, 2, 4, 1] - >>> df.sort_values( - ... by="time", - ... key=lambda x: np.argsort(index_natsorted(x)), - ... ) - time value - 0 0hr 10 - 3 48hr 40 - 2 72hr 30 - 4 96hr 50 - 1 128hr 20 - - If sorted indices are not needed, you may simplify the procedure as follows: - - >>> from natsort import natsort_keygen - >>> df.sort_values( - ... by="time", - ... key=natsort_keygen(), - ... ) - time value - 0 0hr 10 - 3 48hr 40 - 2 72hr 30 - 4 96hr 50 - 1 128hr 20 - - .. warning:: - At the time of writing (1st March, 2024), ``natsort`` (v8.4.0) may fail - without error or warning in the case of multi-column sorting of - indices (e.g., ``DataFrame.sort_index()``, see - `here `__ for further - details). + which sorts according to natural order of elements. 
""" inplace = validate_bool_kwarg(inplace, "inplace") axis = self._get_axis_number(axis) From e8769d39251333f053c3aa9a68c7c74542ed0d2c Mon Sep 17 00:00:00 2001 From: "Allahyar, Amin" Date: Sat, 2 Mar 2024 15:58:54 +0100 Subject: [PATCH 7/7] added a single example about `natsort` --- pandas/core/frame.py | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7905e3593e429..b64aaf8537ad9 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6896,7 +6896,35 @@ def sort_values( `natural sorting <https://en.wikipedia.org/wiki/Natural_sort_order>`__. This can be done using ``natsort`` `package <https://github.com/SethMMorton/natsort>`__, - which sorts according to natural order of elements. + which provides sorted indices according + to their natural order, as shown below: + + >>> df = pd.DataFrame( + ... { + ... "time": ["0hr", "128hr", "72hr", "48hr", "96hr"], + ... "value": [10, 20, 30, 40, 50], + ... } + ... ) + >>> df + time value + 0 0hr 10 + 1 128hr 20 + 2 72hr 30 + 3 48hr 40 + 4 96hr 50 + >>> from natsort import index_natsorted + >>> index_natsorted(df["time"]) + [0, 3, 2, 4, 1] + >>> df.sort_values( + ... by="time", + ... key=lambda x: np.argsort(index_natsorted(x)), + ... ) + time value + 0 0hr 10 + 3 48hr 40 + 2 72hr 30 + 4 96hr 50 + 1 128hr 20 """ inplace = validate_bool_kwarg(inplace, "inplace") axis = self._get_axis_number(axis)