From cf445fd94ff83204afe1a8542e1010b6a0e69025 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Mon, 7 Jun 2021 22:09:59 +0100 Subject: [PATCH 1/6] #41556 passing fill_value into take_nd() allows cythonized path to be used with fill_value!=None. The existing test was extracting only the values column and ignoring that the index columns were also returned, which masked the bug reported in the issue --- pandas/core/groupby/groupby.py | 8 ++++++-- pandas/tests/groupby/test_groupby_shift_diff.py | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 6deb5bb1a76f0..d85842c7a49c3 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2775,6 +2775,7 @@ def _get_cythonized_result( result_is_index: bool = False, pre_processing=None, post_processing=None, + fill_value=None, **kwargs, ): """ @@ -2825,6 +2826,8 @@ def _get_cythonized_result( second argument, i.e. the signature should be (ndarray, Type). If `needs_nullable=True`, a third argument should be `nullable`, to allow for processing specific to nullable values. + fill_value : any, default None + The scalar value to use for newly introduced missing values. **kwargs : dict Extra arguments to be passed back to Cython funcs @@ -2923,7 +2926,7 @@ def _get_cythonized_result( result = result.reshape(-1) if result_is_index: - result = algorithms.take_nd(values, result) + result = algorithms.take_nd(values, result, fill_value=fill_value) if post_processing: result = post_processing(result, inferences) @@ -2972,7 +2975,7 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): tshift : Shift the time index, using the index’s frequency if available. """ - if freq is not None or axis != 0 or not isna(fill_value): + if freq is not None or axis != 0: return self.apply(lambda x: x.shift(periods, freq, axis, fill_value)) return self._get_cythonized_result( @@ -2982,6 +2985,7 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): needs_ngroups=True, result_is_index=True, periods=periods, + fill_value=fill_value ) @final diff --git a/pandas/tests/groupby/test_groupby_shift_diff.py b/pandas/tests/groupby/test_groupby_shift_diff.py index c6f3e7618e3f7..e9517b4544f0b 100644 --- a/pandas/tests/groupby/test_groupby_shift_diff.py +++ b/pandas/tests/groupby/test_groupby_shift_diff.py @@ -55,7 +55,7 @@ def test_group_shift_with_fill_value(): columns=["Z"], index=None, ) - result = g.shift(-1, fill_value=0)[["Z"]] + result = g.shift(-1, fill_value=0) tm.assert_frame_equal(result, expected) From 232e3171223c34c5804388123944c0c18bcb43f9 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Mon, 7 Jun 2021 22:17:40 +0100 Subject: [PATCH 2/6] whatsnew --- doc/source/whatsnew/v1.2.5.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.5.rst b/doc/source/whatsnew/v1.2.5.rst index d0af23b48b1f7..b7ebbec199c4a 100644 --- a/doc/source/whatsnew/v1.2.5.rst +++ b/doc/source/whatsnew/v1.2.5.rst @@ -28,7 +28,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- +- Bug in :meth:`GroupBy.shift` that would return different columns if ``fill_value`` was not None (:issue:`41556`) - .. --------------------------------------------------------------------------- From dfc758da0833059d4c6dcfa7dc72bfe787f8c544 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Mon, 7 Jun 2021 22:21:08 +0100 Subject: [PATCH 3/6] black --- pandas/core/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index d85842c7a49c3..ca2f6c159cc52 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2985,7 +2985,7 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): needs_ngroups=True, result_is_index=True, periods=periods, - fill_value=fill_value + fill_value=fill_value, ) @final From e7236ac551671dd0d2825d3fc092d6339a36cc79 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Mon, 14 Jun 2021 14:27:51 +0100 Subject: [PATCH 4/6] whatsnew 1.4 --- doc/source/whatsnew/v1.2.5.rst | 2 +- doc/source/whatsnew/v1.4.0.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.2.5.rst b/doc/source/whatsnew/v1.2.5.rst index b7ebbec199c4a..d0af23b48b1f7 100644 --- a/doc/source/whatsnew/v1.2.5.rst +++ b/doc/source/whatsnew/v1.2.5.rst @@ -28,7 +28,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- Bug in :meth:`GroupBy.shift` that would return different columns if ``fill_value`` was not None (:issue:`41556`) +- - .. --------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 166ea2f0d4164..1fc5baa7bcb5c 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -187,7 +187,7 @@ Plotting Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ -- +- Bug in :meth:`GroupBy.shift` that would return different columns if ``fill_value`` was not None (:issue:`41556`) - Reshaping From 5f9478f121aad55a9cbca62e321f0768cb39acbc Mon Sep 17 00:00:00 2001 From: smithto1 Date: Fri, 25 Jun 2021 23:49:15 +0100 Subject: [PATCH 5/6] add ASV --- asv_bench/benchmarks/groupby.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 1648985a56b91..b90b69e9891e8 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -369,6 +369,21 @@ def time_category_size(self): self.draws.groupby(self.cats).size() +class Shift: + def setup(self): + N = 18 + self.df = DataFrame({ + 'g' : ['a', 'b'] * 9, + 'v' : list(range(N)) + }) + + def time_defaults(self): + self.df.groupby('g').shift() + + def time_fill_value(self): + self.df.groupby('g').shift(fill_value=99) + + class FillNA: def setup(self): N = 100 From 674b3cbbfbe3ded69808a408ec939369125348ee Mon Sep 17 00:00:00 2001 From: smithto1 Date: Sat, 26 Jun 2021 00:03:01 +0100 Subject: [PATCH 6/6] black --- asv_bench/benchmarks/groupby.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index b90b69e9891e8..dd7a2b3b29e7a 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -372,16 +372,13 @@ def time_category_size(self): class Shift: def setup(self): N = 18 - self.df = DataFrame({ - 'g' : ['a', 'b'] * 9, - 'v' : list(range(N)) - }) - + self.df = DataFrame({"g": ["a", "b"] * 9, "v": list(range(N))}) + def time_defaults(self): - self.df.groupby('g').shift() - + self.df.groupby("g").shift() + def time_fill_value(self): - self.df.groupby('g').shift(fill_value=99) + self.df.groupby("g").shift(fill_value=99) class FillNA: