From 60b10216865752328b71d5e9f60b28fc63df3a99 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Fri, 14 Feb 2020 11:03:31 +0000 Subject: [PATCH 1/5] Don't modify values directly --- doc/source/whatsnew/v1.0.2.rst | 2 +- pandas/core/groupby/generic.py | 24 ++++++++++++++---------- pandas/tests/groupby/test_function.py | 2 ++ 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v1.0.2.rst b/doc/source/whatsnew/v1.0.2.rst index 0216007ea5ba8..e7c0502d3b992 100644 --- a/doc/source/whatsnew/v1.0.2.rst +++ b/doc/source/whatsnew/v1.0.2.rst @@ -19,7 +19,7 @@ Fixed regressions - Fixed regression in :meth:`Series.align` when ``other`` is a DataFrame and ``method`` is not None (:issue:`31785`) - Fixed regression in :meth:`pandas.core.groupby.RollingGroupby.apply` where the ``raw`` parameter was ignored (:issue:`31754`) - Fixed regression in :meth:`rolling(..).corr() <pandas.core.window.rolling.Rolling.corr>` when using a time offset (:issue:`31789`) -- +- Fixed regression in :meth:`DataFrameGroupBy.nunique` which was modifying the original values if ``NaN``s were present (:issue:`31950`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 37b6429167646..ed3856bd58ed5 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -591,20 +591,24 @@ def nunique(self, dropna: bool = True) -> Series: val = self.obj._internal_get_values() - # GH 27951 - # temporary fix while we wait for NumPy bug 12629 to be fixed - val[isna(val)] = np.datetime64("NaT") - - try: - sorter = np.lexsort((val, ids)) - except TypeError: # catches object dtypes - msg = f"val.dtype must be object, got {val.dtype}" - assert val.dtype == object, msg + def _object_sorter(val, ids): val, _ = algorithms.factorize(val, sort=False) sorter = np.lexsort((val, ids)) _isna = lambda a: a == -1 + return val, sorter, _isna + + if isna(val).any() and val.dtype == object: + # Deal with pandas.NaT + val, sorter, _isna = _object_sorter(val, ids) else: - _isna = isna + try: + sorter = np.lexsort((val, ids)) + except TypeError: # catches object dtypes + msg = f"val.dtype must be object, got {val.dtype}" + assert val.dtype == object, msg + val, sorter, _isna = _object_sorter(val, ids) + else: + _isna = isna ids, val = ids[sorter], val[sorter] diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 73e36cb5e6c84..245ed5bf9900b 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -966,6 +966,7 @@ def test_frame_describe_unstacked_format(): @pytest.mark.parametrize("dropna", [False, True]) def test_series_groupby_nunique(n, m, sort, dropna): def check_nunique(df, keys, as_index=True): + original_df = df.copy() gr = df.groupby(keys, as_index=as_index, sort=sort) left = gr["julie"].nunique(dropna=dropna) @@ -975,6 +976,7 @@ def check_nunique(df, keys, as_index=True): right = right.reset_index(drop=True) tm.assert_series_equal(left, right, check_names=False) + tm.assert_frame_equal(df, 
original_df) days = date_range("2015-08-23", periods=10) From 86b9bf6e54127d5f502258149779224f45d3a594 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Fri, 14 Feb 2020 11:31:24 +0000 Subject: [PATCH 2/5] fix whatsnew --- doc/source/whatsnew/v1.0.2.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.2.rst b/doc/source/whatsnew/v1.0.2.rst index e7c0502d3b992..77a8e7dc6e39d 100644 --- a/doc/source/whatsnew/v1.0.2.rst +++ b/doc/source/whatsnew/v1.0.2.rst @@ -19,7 +19,7 @@ Fixed regressions - Fixed regression in :meth:`Series.align` when ``other`` is a DataFrame and ``method`` is not None (:issue:`31785`) - Fixed regression in :meth:`pandas.core.groupby.RollingGroupby.apply` where the ``raw`` parameter was ignored (:issue:`31754`) - Fixed regression in :meth:`rolling(..).corr() <pandas.core.window.rolling.Rolling.corr>` when using a time offset (:issue:`31789`) -- Fixed regression in :meth:`DataFrameGroupBy.nunique` which was modifying the original values if ``NaN``s were present (:issue:`31950`) +- Fixed regression in :meth:`DataFrameGroupBy.nunique` which was modifying the original values if ``NaN`` values were present (:issue:`31950`) .. 
--------------------------------------------------------------------------- From ef59ac6ba0ce6d565fb76bcb73351d6106a7cf3f Mon Sep 17 00:00:00 2001 From: MarcoGorelli Date: Sat, 22 Feb 2020 11:26:36 +0000 Subject: [PATCH 3/5] no need to special case nat --- pandas/core/groupby/generic.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index ed3856bd58ed5..61d3a3d5a4a77 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -597,8 +597,7 @@ def _object_sorter(val, ids): _isna = lambda a: a == -1 return val, sorter, _isna - if isna(val).any() and val.dtype == object: - # Deal with pandas.NaT + if isna(val).any(): val, sorter, _isna = _object_sorter(val, ids) else: try: From 02326fb552d0eec3d7d6fcbbb1f62c8c274991d0 Mon Sep 17 00:00:00 2001 From: MarcoGorelli Date: Sat, 22 Feb 2020 11:29:05 +0000 Subject: [PATCH 4/5] remove file --- pandas/tests/mytest.py | 9 --------- 1 file changed, 9 deletions(-) delete mode 100644 pandas/tests/mytest.py diff --git a/pandas/tests/mytest.py b/pandas/tests/mytest.py deleted file mode 100644 index 23bc27f271dd2..0000000000000 --- a/pandas/tests/mytest.py +++ /dev/null @@ -1,9 +0,0 @@ -import pandas as pd -import pytest - -@pytest.mark.xfail(strict=True) -def test(): - index = pd.period_range(start='2018-01', periods=24, freq='M') - periodSerie = pd.Series(range(24),index=index) - periodSerie.index.name = 'Month' - periodSerie.groupby(periodSerie.index.month).sum() From 88a7eaed20940b26d5dd48f066f46b1bedbb4c43 Mon Sep 17 00:00:00 2001 From: MarcoGorelli Date: Sat, 22 Feb 2020 17:20:14 +0000 Subject: [PATCH 5/5] simplify according to Jeff's review --- pandas/core/groupby/generic.py | 27 ++++++--------------------- 1 file changed, 6 insertions(+), 21 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 61d3a3d5a4a77..1bb512aee39e2 100644 --- a/pandas/core/groupby/generic.py +++ 
b/pandas/core/groupby/generic.py @@ -591,33 +591,18 @@ def nunique(self, dropna: bool = True) -> Series: val = self.obj._internal_get_values() - def _object_sorter(val, ids): - val, _ = algorithms.factorize(val, sort=False) - sorter = np.lexsort((val, ids)) - _isna = lambda a: a == -1 - return val, sorter, _isna - - if isna(val).any(): - val, sorter, _isna = _object_sorter(val, ids) - else: - try: - sorter = np.lexsort((val, ids)) - except TypeError: # catches object dtypes - msg = f"val.dtype must be object, got {val.dtype}" - assert val.dtype == object, msg - val, sorter, _isna = _object_sorter(val, ids) - else: - _isna = isna - - ids, val = ids[sorter], val[sorter] + codes, _ = algorithms.factorize(val, sort=False) + sorter = np.lexsort((codes, ids)) + codes = codes[sorter] + ids = ids[sorter] # group boundaries are where group ids change # unique observations are where sorted values change idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]] - inc = np.r_[1, val[1:] != val[:-1]] + inc = np.r_[1, codes[1:] != codes[:-1]] # 1st item of each group is a new unique observation - mask = _isna(val) + mask = codes == -1 if dropna: inc[idx] = 1 inc[mask] = 0