From 714f4f0c753e02d848fb6603257d017397e5c6a9 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 9 Jan 2023 22:50:18 +0100 Subject: [PATCH 1/9] ENH: Add lazy copy for sort_values --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/core/frame.py | 12 ++++++- pandas/tests/copy_view/test_methods.py | 35 +++++++++++++++++++ .../tests/frame/methods/test_sort_values.py | 8 +++++ 4 files changed, 55 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index f91f694c6e772..5acec19112bb3 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -890,6 +890,7 @@ Indexing - Bug in :meth:`DataFrame.__setitem__` raising when indexer is a :class:`DataFrame` with ``boolean`` dtype (:issue:`47125`) - Bug in :meth:`DataFrame.reindex` filling with wrong values when indexing columns and index for ``uint`` dtypes (:issue:`48184`) - Bug in :meth:`DataFrame.loc` when setting :class:`DataFrame` with different dtypes coercing values to single dtype (:issue:`50467`) +- Bug in :meth:`DataFrame.sort_values` incorrectly returning object when ``by`` is empty list and ``inplace=True`` (:issue:`1`) - Bug in :meth:`DataFrame.loc` coercing dtypes when setting values with a list indexer (:issue:`49159`) - Bug in :meth:`Series.loc` raising error for out of bounds end of slice indexer (:issue:`50161`) - Bug in :meth:`DataFrame.loc` raising ``ValueError`` with ``bool`` indexer and :class:`MultiIndex` (:issue:`47687`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6491081c54592..9b8dcb5f14c3b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -45,6 +45,7 @@ from pandas._libs.hashtable import duplicated from pandas._libs.lib import ( NoDefault, + array_equal_fast, no_default, ) from pandas._typing import ( @@ -6688,7 +6689,16 @@ def sort_values( k, kind=kind, ascending=ascending, na_position=na_position, key=key ) else: - return self.copy() + if inplace: + return self._update_inplace(self) + else: + return self.copy(deep=None) + + if array_equal_fast(indexer, np.arange(0, len(indexer), dtype=indexer.dtype)): + if inplace: + return self._update_inplace(self) + else: + return self.copy(deep=None) new_data = self._mgr.take( indexer, axis=self._get_block_manager_axis(axis), verify=False diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index af900fd5268ef..623b8212be0fe 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -537,6 +537,41 @@ def test_sort_index(using_copy_on_write): tm.assert_series_equal(ser, ser_orig) +def test_sort_values(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3]}) + df_orig = df.copy() + df2 = df.sort_values(by="a") + + if using_copy_on_write: + assert np.shares_memory(df.values, df2.values) + else: + assert not np.shares_memory(df.values, df2.values) + + # mutating df triggers a copy-on-write for the column / block + df2.iloc[0] = 0 + assert not np.shares_memory(df2.values, df.values) + tm.assert_frame_equal(df, df_orig) + + +def test_sort_values_inplace(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3]}) + df_orig = df.copy() + view = df[:] + df.sort_values(by="a", inplace=True) + + assert np.shares_memory(df.values, view.values) + + # mutating df triggers a copy-on-write for the column / block + df.iloc[0] = 0 + if using_copy_on_write: + assert not np.shares_memory(view.values, df.values) + tm.assert_frame_equal(view, df_orig) + else: + assert np.shares_memory(view.values, df.values) + expected = DataFrame({"a": [0, 2, 3]}) + tm.assert_frame_equal(view, expected) + + def test_reorder_levels(using_copy_on_write): index = MultiIndex.from_tuples( [(1, 1), (1, 2), (2, 1), (2, 2)], names=["one", "two"] diff --git a/pandas/tests/frame/methods/test_sort_values.py b/pandas/tests/frame/methods/test_sort_values.py index d7f1d900db052..88248ddc77d28 100644 --- a/pandas/tests/frame/methods/test_sort_values.py +++ b/pandas/tests/frame/methods/test_sort_values.py @@ -609,6 +609,14 @@ def test_sort_values_reshaping(self): tm.assert_frame_equal(df, expected) + def test_sort_values_no_by_inplace(self): + # GH# + df = DataFrame({"a": [1, 2, 3]}) + expected = df.copy() + result = df.sort_values(by=[], inplace=True) + tm.assert_frame_equal(df, expected) + assert result is None + class TestDataFrameSortKey: # test key sorting (issue 27237) def test_sort_values_inplace_key(self, sort_by_key): From e5a8237ab7d896458ad8bfb2e6b16f3dc5ede887 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 9 Jan 2023 22:55:45 +0100 Subject: [PATCH 2/9] Add for Series --- pandas/core/series.py | 12 ++++++- pandas/tests/copy_view/test_methods.py | 46 ++++++++++++++------------ 2 files changed, 35 insertions(+), 23 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index b5d73373f061e..5332fb0e4c366 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -29,7 +29,10 @@ properties, reshape, ) -from pandas._libs.lib import no_default +from pandas._libs.lib import ( + array_equal_fast, + no_default, +) from pandas._typing import ( AggFuncType, AlignJoin, @@ -3548,6 +3551,13 @@ def sort_values( values_to_sort = ensure_key_mapped(self, key)._values if key else self._values sorted_index = nargsort(values_to_sort, kind, bool(ascending), na_position) + if array_equal_fast( + sorted_index, np.arange(0, len(sorted_index), dtype=sorted_index.dtype) + ): + if inplace: + return self._update_inplace(self) + return self.copy(deep=None) + result = self._constructor( self._values[sorted_index], index=self.index[sorted_index] ) diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 623b8212be0fe..6164f53578440 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -537,39 +537,41 @@ def test_sort_index(using_copy_on_write): tm.assert_series_equal(ser, ser_orig) -def test_sort_values(using_copy_on_write): - df = DataFrame({"a": [1, 2, 3]}) - df_orig = df.copy() - df2 = df.sort_values(by="a") +@pytest.mark.parametrize( + "obj, kwargs", [(Series([1, 2, 3]), {}), (DataFrame({"a": [1, 2, 3]}), {"by": "a"})] +) +def test_sort_values(using_copy_on_write, obj, kwargs): + obj_orig = obj.copy() + obj2 = obj.sort_values(**kwargs) if using_copy_on_write: - assert np.shares_memory(df.values, df2.values) + assert np.shares_memory(obj.values, obj2.values) else: - assert not np.shares_memory(df.values, df2.values) + assert not np.shares_memory(obj.values, obj2.values) # mutating df triggers a copy-on-write for the column / block - df2.iloc[0] = 0 - assert not np.shares_memory(df2.values, df.values) - tm.assert_frame_equal(df, df_orig) + obj2.iloc[0] = 0 + assert not np.shares_memory(obj2.values, obj.values) + tm.assert_equal(obj, obj_orig) -def test_sort_values_inplace(using_copy_on_write): - df = DataFrame({"a": [1, 2, 3]}) - df_orig = df.copy() - view = df[:] - df.sort_values(by="a", inplace=True) +@pytest.mark.parametrize( + "obj, kwargs", [(Series([1, 2, 3]), {}), (DataFrame({"a": [1, 2, 3]}), {"by": "a"})] +) +def test_sort_values_inplace(using_copy_on_write, obj, kwargs): + obj_orig = obj.copy() + view = obj[:] + obj.sort_values(inplace=True, **kwargs) - assert np.shares_memory(df.values, view.values) + assert np.shares_memory(obj.values, view.values) - # mutating df triggers a copy-on-write for the column / block - df.iloc[0] = 0 + # mutating obj triggers a copy-on-write for the column / block + obj.iloc[0] = 0 if using_copy_on_write: - assert not np.shares_memory(view.values, df.values) - tm.assert_frame_equal(view, df_orig) + assert not np.shares_memory(view.values, obj.values) + tm.assert_equal(view, obj_orig) else: - assert np.shares_memory(view.values, df.values) - expected = DataFrame({"a": [0, 2, 3]}) - tm.assert_frame_equal(view, expected) + assert np.shares_memory(view.values, obj.values) def test_reorder_levels(using_copy_on_write): From 73850994cea8b2f72fe6ebaa6525ce4c3f16307c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 10 Jan 2023 09:21:41 +0100 Subject: [PATCH 3/9] Fix array manager --- pandas/tests/copy_view/test_methods.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 6164f53578440..8ec6d24b2ea0e 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -558,7 +558,7 @@ def test_sort_values(using_copy_on_write, obj, kwargs): @pytest.mark.parametrize( "obj, kwargs", [(Series([1, 2, 3]), {}), (DataFrame({"a": [1, 2, 3]}), {"by": "a"})] ) -def test_sort_values_inplace(using_copy_on_write, obj, kwargs): +def test_sort_values_inplace(using_copy_on_write, obj, kwargs, using_array_manager): obj_orig = obj.copy() view = obj[:] obj.sort_values(inplace=True, **kwargs) @@ -567,7 +567,7 @@ def test_sort_values_inplace(using_copy_on_write, obj, kwargs): # mutating obj triggers a copy-on-write for the column / block obj.iloc[0] = 0 - if using_copy_on_write: + if using_copy_on_write or using_array_manager and isinstance(obj, DataFrame): assert not np.shares_memory(view.values, obj.values) tm.assert_equal(view, obj_orig) else: From 57ece3b22eae91f1e63fa4eada5b3bd914541691 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 10 Jan 2023 16:53:21 +0100 Subject: [PATCH 4/9] Fix test --- pandas/tests/copy_view/test_methods.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 8ec6d24b2ea0e..8549f07387320 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -1,6 +1,7 @@ import numpy as np import pytest +import pandas as pd from pandas import ( DataFrame, Index, @@ -567,7 +568,12 @@ def test_sort_values_inplace(using_copy_on_write, obj, kwargs, using_array_manag # mutating obj triggers a copy-on-write for the column / block obj.iloc[0] = 0 - if using_copy_on_write or using_array_manager and isinstance(obj, DataFrame): + if ( + using_copy_on_write + or using_array_manager + and isinstance(obj, DataFrame) + and pd.options.mode.copy_on_write + ): assert not np.shares_memory(view.values, obj.values) tm.assert_equal(view, obj_orig) else: From 06df224afa3d238e99289362a9908f5f4d4d0353 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 12 Jan 2023 22:16:35 +0100 Subject: [PATCH 5/9] Update doc/source/whatsnew/v2.0.0.rst Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v2.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 3d45774fe02eb..9ba1588b8bbd2 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -938,7 +938,7 @@ Indexing - Bug in :meth:`DataFrame.__setitem__` raising when indexer is a :class:`DataFrame` with ``boolean`` dtype (:issue:`47125`) - Bug in :meth:`DataFrame.reindex` filling with wrong values when indexing columns and index for ``uint`` dtypes (:issue:`48184`) - Bug in :meth:`DataFrame.loc` when setting :class:`DataFrame` with different dtypes coercing values to single dtype (:issue:`50467`) -- Bug in :meth:`DataFrame.sort_values` incorrectly returning object when ``by`` is empty list and ``inplace=True`` (:issue:`1`) +- Bug in :meth:`DataFrame.sort_values` where ``None`` was no returned when ``by`` is empty list and ``inplace=True`` (:issue:`50643`) - Bug in :meth:`DataFrame.loc` coercing dtypes when setting values with a list indexer (:issue:`49159`) - Bug in :meth:`Series.loc` raising error for out of bounds end of slice indexer (:issue:`50161`) - Bug in :meth:`DataFrame.loc` raising ``ValueError`` with ``bool`` indexer and :class:`MultiIndex` (:issue:`47687`) From b03750a7ea4153e9330e78fae0167e6f97863b42 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 12 Jan 2023 22:17:14 +0100 Subject: [PATCH 6/9] Update v2.0.0.rst --- doc/source/whatsnew/v2.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 9ba1588b8bbd2..5d70c97d503f0 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -938,7 +938,7 @@ Indexing - Bug in :meth:`DataFrame.__setitem__` raising when indexer is a :class:`DataFrame` with ``boolean`` dtype (:issue:`47125`) - Bug in :meth:`DataFrame.reindex` filling with wrong values when indexing columns and index for ``uint`` dtypes (:issue:`48184`) - Bug in :meth:`DataFrame.loc` when setting :class:`DataFrame` with different dtypes coercing values to single dtype (:issue:`50467`) -- Bug in :meth:`DataFrame.sort_values` where ``None`` was no returned when ``by`` is empty list and ``inplace=True`` (:issue:`50643`) +- Bug in :meth:`DataFrame.sort_values` where ``None`` was not returned when ``by`` is empty list and ``inplace=True`` (:issue:`50643`) - Bug in :meth:`DataFrame.loc` coercing dtypes when setting values with a list indexer (:issue:`49159`) - Bug in :meth:`Series.loc` raising error for out of bounds end of slice indexer (:issue:`50161`) - Bug in :meth:`DataFrame.loc` raising ``ValueError`` with ``bool`` indexer and :class:`MultiIndex` (:issue:`47687`) From 7382ac543268aa9182fd993761a35c8ccffbc94f Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 13 Jan 2023 11:27:25 +0100 Subject: [PATCH 7/9] fix test --- pandas/tests/copy_view/test_methods.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index ce0a34be7226a..886d443af27be 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -1,7 +1,6 @@ import numpy as np import pytest -import pandas as pd from pandas import ( DataFrame, Index, @@ -595,27 +594,23 @@ def test_sort_values(using_copy_on_write, obj, kwargs): @pytest.mark.parametrize( - "obj, kwargs", [(Series([1, 2, 3]), {}), (DataFrame({"a": [1, 2, 3]}), {"by": "a"})] + "obj, kwargs", + [(Series([1, 2, 3], name="a"), {}), (DataFrame({"a": [1, 2, 3]}), {"by": "a"})], ) def test_sort_values_inplace(using_copy_on_write, obj, kwargs, using_array_manager): obj_orig = obj.copy() view = obj[:] obj.sort_values(inplace=True, **kwargs) - assert np.shares_memory(obj.values, view.values) + assert np.shares_memory(get_array(obj, "a"), get_array(view, "a")) # mutating obj triggers a copy-on-write for the column / block obj.iloc[0] = 0 - if ( - using_copy_on_write - or using_array_manager - and isinstance(obj, DataFrame) - and pd.options.mode.copy_on_write - ): - assert not np.shares_memory(view.values, obj.values) + if using_copy_on_write: + assert not np.shares_memory(get_array(obj, "a"), get_array(view, "a")) tm.assert_equal(view, obj_orig) else: - assert np.shares_memory(view.values, obj.values) + assert np.shares_memory(get_array(obj, "a"), get_array(view, "a")) def test_reorder_levels(using_copy_on_write): From f950076aa971c40119f37769aaae251e5e6db538 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 13 Jan 2023 11:31:44 +0100 Subject: [PATCH 8/9] Update test_sort_values.py --- pandas/tests/frame/methods/test_sort_values.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/methods/test_sort_values.py b/pandas/tests/frame/methods/test_sort_values.py index 88248ddc77d28..c9d62790d64de 100644 --- a/pandas/tests/frame/methods/test_sort_values.py +++ b/pandas/tests/frame/methods/test_sort_values.py @@ -610,7 +610,7 @@ def test_sort_values_reshaping(self): tm.assert_frame_equal(df, expected) def test_sort_values_no_by_inplace(self): - # GH# + # GH#50643 df = DataFrame({"a": [1, 2, 3]}) expected = df.copy() result = df.sort_values(by=[], inplace=True) From e2c17717259a8412ef25951932b4c7fb777bddc8 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 13 Jan 2023 11:36:06 +0100 Subject: [PATCH 9/9] apply same changes to other test --- pandas/tests/copy_view/test_methods.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 886d443af27be..99626772659e9 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -576,20 +576,21 @@ def test_sort_index(using_copy_on_write): @pytest.mark.parametrize( - "obj, kwargs", [(Series([1, 2, 3]), {}), (DataFrame({"a": [1, 2, 3]}), {"by": "a"})] + "obj, kwargs", + [(Series([1, 2, 3], name="a"), {}), (DataFrame({"a": [1, 2, 3]}), {"by": "a"})], ) def test_sort_values(using_copy_on_write, obj, kwargs): obj_orig = obj.copy() obj2 = obj.sort_values(**kwargs) if using_copy_on_write: - assert np.shares_memory(obj.values, obj2.values) + assert np.shares_memory(get_array(obj2, "a"), get_array(obj, "a")) else: - assert not np.shares_memory(obj.values, obj2.values) + assert not np.shares_memory(get_array(obj2, "a"), get_array(obj, "a")) # mutating df triggers a copy-on-write for the column / block obj2.iloc[0] = 0 - assert not np.shares_memory(obj2.values, obj.values) + assert not np.shares_memory(get_array(obj2, "a"), get_array(obj, "a")) tm.assert_equal(obj, obj_orig)