From 4bfe322abc04120f7d64dc33e4f1a5a670aa0f3e Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 14 Nov 2020 19:54:00 +0100 Subject: [PATCH 1/7] Add min count to groupby functions --- pandas/_libs/groupby.pyx | 30 ++++++++++++++---------------- pandas/core/groupby/groupby.py | 8 ++++---- pandas/core/groupby/ops.py | 2 +- pandas/core/resample.py | 4 ++-- 4 files changed, 21 insertions(+), 23 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 438d9fa625737..5d43f3b04cf4e 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -892,7 +892,7 @@ def group_last(rank_t[:, :] out, int64_t[:] counts, ndarray[rank_t, ndim=2] values, const int64_t[:] labels, - Py_ssize_t min_count=-1): + Py_ssize_t min_count=0): """ Only aggregates on axis=0 """ @@ -903,8 +903,6 @@ def group_last(rank_t[:, :] out, ndarray[int64_t, ndim=2] nobs bint runtime_error = False - assert min_count == -1, "'min_count' only used in add and prod" - # TODO(cython 3.0): # Instead of `labels.shape[0]` use `len(labels)` if not len(values) == labels.shape[0]: @@ -939,7 +937,7 @@ def group_last(rank_t[:, :] out, for i in range(ncounts): for j in range(K): - if nobs[i, j] == 0: + if nobs[i, j] < min_count: out[i, j] = NAN else: out[i, j] = resx[i, j] @@ -961,7 +959,7 @@ def group_last(rank_t[:, :] out, for i in range(ncounts): for j in range(K): - if nobs[i, j] == 0: + if nobs[i, j] < min_count: if rank_t is int64_t: out[i, j] = NPY_NAT elif rank_t is uint64_t: @@ -986,7 +984,8 @@ def group_last(rank_t[:, :] out, def group_nth(rank_t[:, :] out, int64_t[:] counts, ndarray[rank_t, ndim=2] values, - const int64_t[:] labels, int64_t rank=1 + const int64_t[:] labels, + int64_t min_count=0, int64_t rank=1 ): """ Only aggregates on axis=0 @@ -1033,7 +1032,7 @@ def group_nth(rank_t[:, :] out, for i in range(ncounts): for j in range(K): - if nobs[i, j] == 0: + if nobs[i, j] < min_count: out[i, j] = NAN else: out[i, j] = resx[i, j] @@ -1057,7 +1056,7 @@ def group_nth(rank_t[:, :] out, for i in range(ncounts): for j in range(K): - if nobs[i, j] == 0: + if nobs[i, j] < min_count: if rank_t is int64_t: out[i, j] = NPY_NAT elif rank_t is uint64_t: @@ -1283,7 +1282,7 @@ def group_max(groupby_t[:, :] out, int64_t[:] counts, ndarray[groupby_t, ndim=2] values, const int64_t[:] labels, - Py_ssize_t min_count=-1): + Py_ssize_t min_count=0): """ Only aggregates on axis=0 """ @@ -1294,8 +1293,6 @@ def group_max(groupby_t[:, :] out, bint runtime_error = False int64_t[:, :] nobs - assert min_count == -1, "'min_count' only used in add and prod" - # TODO(cython 3.0): # Instead of `labels.shape[0]` use `len(labels)` if not len(values) == labels.shape[0]: @@ -1337,11 +1334,14 @@ def group_max(groupby_t[:, :] out, for i in range(ncounts): for j in range(K): - if nobs[i, j] == 0: + if nobs[i, j] < min_count: if groupby_t is uint64_t: + with gil: + print("test") runtime_error = True break else: + out[i, j] = nan_val else: out[i, j] = maxx[i, j] @@ -1358,7 +1358,7 @@ def group_min(groupby_t[:, :] out, int64_t[:] counts, ndarray[groupby_t, ndim=2] values, const int64_t[:] labels, - Py_ssize_t min_count=-1): + Py_ssize_t min_count=0): """ Only aggregates on axis=0 """ @@ -1369,8 +1369,6 @@ def group_min(groupby_t[:, :] out, bint runtime_error = False int64_t[:, :] nobs - assert min_count == -1, "'min_count' only used in add and prod" - # TODO(cython 3.0): # Instead of `labels.shape[0]` use `len(labels)` if not len(values) == labels.shape[0]: @@ -1411,7 +1409,7 @@ def group_min(groupby_t[:, :] out, for i in range(ncounts): for j in range(K): - if nobs[i, j] == 0: + if nobs[i, j] < min_count: if groupby_t is uint64_t: runtime_error = True break diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index e3fceb9bf0a06..38c2c9bd07723 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1655,19 +1655,19 @@ def prod(self, numeric_only: bool = True, min_count: int = 0): ) @doc(_groupby_agg_method_template, fname="min", no=False, mc=-1) - def min(self, numeric_only: bool = False, min_count: int = -1): + def min(self, numeric_only: bool = False, min_count: int = 1): return self._agg_general( numeric_only=numeric_only, min_count=min_count, alias="min", npfunc=np.min ) @doc(_groupby_agg_method_template, fname="max", no=False, mc=-1) - def max(self, numeric_only: bool = False, min_count: int = -1): + def max(self, numeric_only: bool = False, min_count: int = 1): return self._agg_general( numeric_only=numeric_only, min_count=min_count, alias="max", npfunc=np.max ) @doc(_groupby_agg_method_template, fname="first", no=False, mc=-1) - def first(self, numeric_only: bool = False, min_count: int = -1): + def first(self, numeric_only: bool = False, min_count: int = 1): def first_compat(obj: FrameOrSeries, axis: int = 0): def first(x: Series): """Helper function for first item that isn't NA.""" @@ -1691,7 +1691,7 @@ def first(x: Series): ) @doc(_groupby_agg_method_template, fname="last", no=False, mc=-1) - def last(self, numeric_only: bool = False, min_count: int = -1): + def last(self, numeric_only: bool = False, min_count: int = 1): def last_compat(obj: FrameOrSeries, axis: int = 0): def last(x: Series): """Helper function for last item that isn't NA.""" diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index fc80852f00c95..3b33b7e5ecd00 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -603,7 +603,7 @@ def _aggregate( ): if agg_func is libgroupby.group_nth: # different signature from the others - agg_func(result, counts, values, comp_ids, rank=1) + agg_func(result, counts, values, comp_ids, min_count, rank=1) else: agg_func(result, counts, values, comp_ids, min_count) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index fccedd75c4531..e5589b0dae837 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -950,7 +950,7 @@ def quantile(self, q=0.5, **kwargs): # downsample methods -for method in ["sum", "prod"]: +for method in ["sum", "prod", "min", "max", "first", "last"]: def f(self, _method=method, min_count=0, *args, **kwargs): nv.validate_resampler_func(_method, args, kwargs) @@ -961,7 +961,7 @@ def f(self, _method=method, min_count=0, *args, **kwargs): # downsample methods -for method in ["min", "max", "first", "last", "mean", "sem", "median", "ohlc"]: +for method in ["mean", "sem", "median", "ohlc"]: def g(self, _method=method, *args, **kwargs): nv.validate_resampler_func(_method, args, kwargs) From 0ccda51e71f707a161194ad32dd90f4daa80e3a5 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 15 Nov 2020 18:20:36 +0100 Subject: [PATCH 2/7] Add test --- pandas/_libs/groupby.pyx | 2 +- pandas/core/groupby/groupby.py | 8 ++++---- pandas/tests/groupby/aggregate/test_cython.py | 16 ++++++++-------- pandas/tests/groupby/aggregate/test_other.py | 9 +++++++++ 4 files changed, 22 insertions(+), 13 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 5d43f3b04cf4e..ab1fb235fdacc 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -985,7 +985,7 @@ def group_nth(rank_t[:, :] out, int64_t[:] counts, ndarray[rank_t, ndim=2] values, const int64_t[:] labels, - int64_t min_count=0, int64_t rank=1 + int64_t min_count=1, int64_t rank=1 ): """ Only aggregates on axis=0 diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 38c2c9bd07723..338d8cdc5d282 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1654,19 +1654,19 @@ def prod(self, numeric_only: bool = True, min_count: int = 0): numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod ) - @doc(_groupby_agg_method_template, fname="min", no=False, mc=-1) + @doc(_groupby_agg_method_template, fname="min", no=False, mc=1) def min(self, numeric_only: bool = False, min_count: int = 1): return self._agg_general( numeric_only=numeric_only, min_count=min_count, alias="min", npfunc=np.min ) - @doc(_groupby_agg_method_template, fname="max", no=False, mc=-1) + @doc(_groupby_agg_method_template, fname="max", no=False, mc=1) def max(self, numeric_only: bool = False, min_count: int = 1): return self._agg_general( numeric_only=numeric_only, min_count=min_count, alias="max", npfunc=np.max ) - @doc(_groupby_agg_method_template, fname="first", no=False, mc=-1) + @doc(_groupby_agg_method_template, fname="first", no=False, mc=1) def first(self, numeric_only: bool = False, min_count: int = 1): def first_compat(obj: FrameOrSeries, axis: int = 0): def first(x: Series): @@ -1690,7 +1690,7 @@ def first(x: Series): npfunc=first_compat, ) - @doc(_groupby_agg_method_template, fname="last", no=False, mc=-1) + @doc(_groupby_agg_method_template, fname="last", no=False, mc=1) def last(self, numeric_only: bool = False, min_count: int = 1): def last_compat(obj: FrameOrSeries, axis: int = 0): def last(x: Series): diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index c907391917ca8..d4daf6f5d52eb 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -166,23 +166,23 @@ def test__cython_agg_general(op, targop): @pytest.mark.parametrize( - "op, targop", + "op, targop, kwargs", [ - ("mean", np.mean), - ("median", lambda x: np.median(x) if len(x) > 0 else np.nan), - ("var", lambda x: np.var(x, ddof=1)), - ("min", np.min), - ("max", np.max), + ("mean", np.mean, {}), + ("median", lambda x: np.median(x) if len(x) > 0 else np.nan, {}), + ("var", lambda x: np.var(x, ddof=1), {}), + ("min", np.min, {"min_count": 1}), + ("max", np.max, {"min_count": 1}), ], ) -def test_cython_agg_empty_buckets(op, targop, observed): +def test_cython_agg_empty_buckets(op, targop, kwargs, observed): df = DataFrame([11, 12, 13]) grps = range(0, 55, 5) # calling _cython_agg_general directly, instead of via the user API # which sets different values for min_count, so do that here. g = df.groupby(pd.cut(df[0], grps), observed=observed) - result = g._cython_agg_general(op) + result = g._cython_agg_general(op, **kwargs) g = df.groupby(pd.cut(df[0], grps), observed=observed) expected = g.agg(lambda x: targop(x)) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 5d0f6d6262899..8911a4077d2f2 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -638,3 +638,12 @@ def weird_func(x): result = df["decimals"].groupby(df["id1"]).agg(weird_func) tm.assert_series_equal(result, expected, check_names=False) + + +@pytest.mark.parametrize("func", ["first", "last", "max", "min"]) +def test_min_count_implementation_min_max_first_last(func): + # GH#37821 + df = DataFrame({"a": [1] * 3, "b": [np.nan] * 3}) + result = getattr(df.groupby("a"), func)(min_count=2) + expected = DataFrame({"b": [np.nan]}, index=Index([1], name="a")) + tm.assert_frame_equal(result, expected) From 1aff6298e5b86dcdef3dbd7394957c3754a209e4 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 15 Nov 2020 20:44:16 +0100 Subject: [PATCH 3/7] Add tests and whatsnew --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/tests/groupby/aggregate/test_other.py | 2 +- pandas/tests/resample/test_datetime_index.py | 13 +++++++++++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index e623b76fed0a9..aaa20ff7bf1fc 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -235,6 +235,7 @@ Other enhancements - Improve error reporting for :meth:`DataFrame.merge()` when invalid merge column definitions were given (:issue:`16228`) - Improve numerical stability for :meth:`Rolling.skew()`, :meth:`Rolling.kurt()`, :meth:`Expanding.skew()` and :meth:`Expanding.kurt()` through implementation of Kahan summation (:issue:`6929`) - Improved error reporting for subsetting columns of a :class:`DataFrameGroupBy` with ``axis=1`` (:issue:`37725`) +- Add support for ``min_count`` keyword for :meth:`DataFrame.groupby` and :meth:`DataFrame.resample` for functions ``min``, ``max``, ``first`` and ``last`` (:issue:`37821`, :issue:`37768`) .. --------------------------------------------------------------------------- diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 8911a4077d2f2..0ee9854d8045e 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -643,7 +643,7 @@ def weird_func(x): @pytest.mark.parametrize("func", ["first", "last", "max", "min"]) def test_min_count_implementation_min_max_first_last(func): # GH#37821 - df = DataFrame({"a": [1] * 3, "b": [np.nan] * 3}) + df = DataFrame({"a": [1] * 3, "b": [1, np.nan, np.nan]}) result = getattr(df.groupby("a"), func)(min_count=2) expected = DataFrame({"b": [np.nan]}, index=Index([1], name="a")) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index d3d33d6fe847e..7cd8d628a18e3 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -1785,3 +1785,16 @@ def test_resample_calendar_day_with_dst( 1.0, pd.date_range(first, exp_last, freq=freq_out, tz="Europe/Amsterdam") ) tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("func", ["min", "max", "first", "last"]) +def test_resample_aggregate_functions_min_count(func): + # GH#37768 + index = date_range(start="2020", freq="M", periods=3) + ser = Series([1, np.nan, np.nan], index) + result = getattr(ser.resample("Q"), func)(min_count=2) + expected = Series( + [np.nan], + index=DatetimeIndex(["2020-03-31"], dtype="datetime64[ns]", freq="Q-DEC"), + ) + tm.assert_series_equal(result, expected) From f5f333d361ff2eac73d2328cfa1331bf11f230b8 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 22 Nov 2020 02:10:15 +0100 Subject: [PATCH 4/7] Roll tests back with new behavior --- pandas/_libs/groupby.pyx | 22 +++++++++---------- pandas/core/groupby/groupby.py | 8 +++---- pandas/tests/groupby/aggregate/test_cython.py | 16 +++++++------- 3 files changed, 22 insertions(+), 24 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index ab1fb235fdacc..802c1f974c240 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -892,7 +892,7 @@ def group_last(rank_t[:, :] out, int64_t[:] counts, ndarray[rank_t, ndim=2] values, const int64_t[:] labels, - Py_ssize_t min_count=0): + Py_ssize_t min_count=-1): """ Only aggregates on axis=0 """ @@ -937,7 +937,7 @@ def group_last(rank_t[:, :] out, for i in range(ncounts): for j in range(K): - if nobs[i, j] < min_count: + if nobs[i, j] < min_count or nobs[i, j] == 0: out[i, j] = NAN else: out[i, j] = resx[i, j] @@ -959,7 +959,7 @@ def group_last(rank_t[:, :] out, for i in range(ncounts): for j in range(K): - if nobs[i, j] < min_count: + if nobs[i, j] < min_count or nobs[i, j] == 0: if rank_t is int64_t: out[i, j] = NPY_NAT elif rank_t is uint64_t: @@ -985,7 +985,7 @@ def group_nth(rank_t[:, :] out, int64_t[:] counts, ndarray[rank_t, ndim=2] values, const int64_t[:] labels, - int64_t min_count=1, int64_t rank=1 + int64_t min_count=-1, int64_t rank=1 ): """ Only aggregates on axis=0 @@ -1032,7 +1032,7 @@ def group_nth(rank_t[:, :] out, for i in range(ncounts): for j in range(K): - if nobs[i, j] < min_count: + if nobs[i, j] < min_count or nobs[i, j] == 0: out[i, j] = NAN else: out[i, j] = resx[i, j] @@ -1056,7 +1056,7 @@ def group_nth(rank_t[:, :] out, for i in range(ncounts): for j in range(K): - if nobs[i, j] < min_count: + if nobs[i, j] < min_count or nobs[i, j] == 0: if rank_t is int64_t: out[i, j] = NPY_NAT elif rank_t is uint64_t: @@ -1282,7 +1282,7 @@ def group_max(groupby_t[:, :] out, int64_t[:] counts, ndarray[groupby_t, ndim=2] values, const int64_t[:] labels, - Py_ssize_t min_count=0): + Py_ssize_t min_count=-1): """ Only aggregates on axis=0 """ @@ -1334,10 +1334,8 @@ def group_max(groupby_t[:, :] out, for i in range(ncounts): for j in range(K): - if nobs[i, j] < min_count: + if nobs[i, j] < min_count or nobs[i, j] == 0: if groupby_t is uint64_t: - with gil: - print("test") runtime_error = True break else: @@ -1358,7 +1356,7 @@ def group_min(groupby_t[:, :] out, int64_t[:] counts, ndarray[groupby_t, ndim=2] values, const int64_t[:] labels, - Py_ssize_t min_count=0): + Py_ssize_t min_count=-1): """ Only aggregates on axis=0 """ @@ -1409,7 +1407,7 @@ def group_min(groupby_t[:, :] out, for i in range(ncounts): for j in range(K): - if nobs[i, j] < min_count: + if nobs[i, j] < min_count or nobs[i, j] == 0: if groupby_t is uint64_t: runtime_error = True break diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 338d8cdc5d282..38c2c9bd07723 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1654,19 +1654,19 @@ def prod(self, numeric_only: bool = True, min_count: int = 0): numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod ) - @doc(_groupby_agg_method_template, fname="min", no=False, mc=1) + @doc(_groupby_agg_method_template, fname="min", no=False, mc=-1) def min(self, numeric_only: bool = False, min_count: int = 1): return self._agg_general( numeric_only=numeric_only, min_count=min_count, alias="min", npfunc=np.min ) - @doc(_groupby_agg_method_template, fname="max", no=False, mc=1) + @doc(_groupby_agg_method_template, fname="max", no=False, mc=-1) def max(self, numeric_only: bool = False, min_count: int = 1): return self._agg_general( numeric_only=numeric_only, min_count=min_count, alias="max", npfunc=np.max ) - @doc(_groupby_agg_method_template, fname="first", no=False, mc=1) + @doc(_groupby_agg_method_template, fname="first", no=False, mc=-1) def first(self, numeric_only: bool = False, min_count: int = 1): def first_compat(obj: FrameOrSeries, axis: int = 0): def first(x: Series): @@ -1690,7 +1690,7 @@ def first(x: Series): npfunc=first_compat, ) - @doc(_groupby_agg_method_template, fname="last", no=False, mc=1) + @doc(_groupby_agg_method_template, fname="last", no=False, mc=-1) def last(self, numeric_only: bool = False, min_count: int = 1): def last_compat(obj: FrameOrSeries, axis: int = 0): def last(x: Series): diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index d4daf6f5d52eb..c907391917ca8 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -166,23 +166,23 @@ def test__cython_agg_general(op, targop): @pytest.mark.parametrize( - "op, targop, kwargs", + "op, targop", [ - ("mean", np.mean, {}), - ("median", lambda x: np.median(x) if len(x) > 0 else np.nan, {}), - ("var", lambda x: np.var(x, ddof=1), {}), - ("min", np.min, {"min_count": 1}), - ("max", np.max, {"min_count": 1}), + ("mean", np.mean), + ("median", lambda x: np.median(x) if len(x) > 0 else np.nan), + ("var", lambda x: np.var(x, ddof=1)), + ("min", np.min), + ("max", np.max), ], ) -def test_cython_agg_empty_buckets(op, targop, kwargs, observed): +def test_cython_agg_empty_buckets(op, targop, observed): df = DataFrame([11, 12, 13]) grps = range(0, 55, 5) # calling _cython_agg_general directly, instead of via the user API # which sets different values for min_count, so do that here. g = df.groupby(pd.cut(df[0], grps), observed=observed) - result = g._cython_agg_general(op, **kwargs) + result = g._cython_agg_general(op) g = df.groupby(pd.cut(df[0], grps), observed=observed) expected = g.agg(lambda x: targop(x)) From b8264134065c2c81a80245bd4c814952888e5e71 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 22 Nov 2020 02:25:49 +0100 Subject: [PATCH 5/7] Fix default value --- pandas/core/groupby/groupby.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 38c2c9bd07723..e3fceb9bf0a06 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1655,19 +1655,19 @@ def prod(self, numeric_only: bool = True, min_count: int = 0): ) @doc(_groupby_agg_method_template, fname="min", no=False, mc=-1) - def min(self, numeric_only: bool = False, min_count: int = 1): + def min(self, numeric_only: bool = False, min_count: int = -1): return self._agg_general( numeric_only=numeric_only, min_count=min_count, alias="min", npfunc=np.min ) @doc(_groupby_agg_method_template, fname="max", no=False, mc=-1) - def max(self, numeric_only: bool = False, min_count: int = 1): + def max(self, numeric_only: bool = False, min_count: int = -1): return self._agg_general( numeric_only=numeric_only, min_count=min_count, alias="max", npfunc=np.max ) @doc(_groupby_agg_method_template, fname="first", no=False, mc=-1) - def first(self, numeric_only: bool = False, min_count: int = 1): + def first(self, numeric_only: bool = False, min_count: int = -1): def first_compat(obj: FrameOrSeries, axis: int = 0): def first(x: Series): """Helper function for first item that isn't NA.""" @@ -1691,7 +1691,7 @@ def first(x: Series): ) @doc(_groupby_agg_method_template, fname="last", no=False, mc=-1) - def last(self, numeric_only: bool = False, min_count: int = 1): + def last(self, numeric_only: bool = False, min_count: int = -1): def last_compat(obj: FrameOrSeries, axis: int = 0): def last(x: Series): """Helper function for last item that isn't NA.""" From 9111fc91ad7dc43311e2186e4f8091a6c18b9da1 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 22 Nov 2020 21:20:07 +0100 Subject: [PATCH 6/7] Move and improve test --- pandas/tests/groupby/aggregate/test_other.py | 9 --------- pandas/tests/groupby/test_missing.py | 10 ++++++++++ 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 0ee9854d8045e..5d0f6d6262899 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -638,12 +638,3 @@ def weird_func(x): result = df["decimals"].groupby(df["id1"]).agg(weird_func) tm.assert_series_equal(result, expected, check_names=False) - - -@pytest.mark.parametrize("func", ["first", "last", "max", "min"]) -def test_min_count_implementation_min_max_first_last(func): - # GH#37821 - df = DataFrame({"a": [1] * 3, "b": [1, np.nan, np.nan]}) - result = getattr(df.groupby("a"), func)(min_count=2) - expected = DataFrame({"b": [np.nan]}, index=Index([1], name="a")) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_missing.py b/pandas/tests/groupby/test_missing.py index 580148cb2a3a3..56cf400258f0f 100644 --- a/pandas/tests/groupby/test_missing.py +++ b/pandas/tests/groupby/test_missing.py @@ -116,3 +116,13 @@ def test_ffill_handles_nan_groups(dropna, method, has_nan_group): expected = df_without_nan_rows.reindex(ridx).reset_index(drop=True) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("min_count, value", [(2, np.nan), (-1, 1.0)]) +@pytest.mark.parametrize("func", ["first", "last", "max", "min"]) +def test_min_count(func, min_count, value): + # GH#37821 + df = DataFrame({"a": [1] * 3, "b": [1, np.nan, np.nan], "c": [np.nan] * 3}) + result = getattr(df.groupby("a"), func)(min_count=min_count) + expected = DataFrame({"b": [value], "c": [np.nan]}, index=Index([1], name="a")) + tm.assert_frame_equal(result, expected) From 8654aaab0e3e647779b11a51a10523134f769604 Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 23 Nov 2020 22:25:44 +0100 Subject: [PATCH 7/7] Improve if condition --- pandas/_libs/groupby.pyx | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 802c1f974c240..24156c88f0d76 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -908,6 +908,7 @@ def group_last(rank_t[:, :] out, if not len(values) == labels.shape[0]: raise AssertionError("len(index) != len(labels)") + min_count = max(min_count, 1) nobs = np.zeros((out).shape, dtype=np.int64) if rank_t is object: resx = np.empty((out).shape, dtype=object) @@ -937,7 +938,7 @@ def group_last(rank_t[:, :] out, for i in range(ncounts): for j in range(K): - if nobs[i, j] < min_count or nobs[i, j] == 0: + if nobs[i, j] < min_count: out[i, j] = NAN else: out[i, j] = resx[i, j] @@ -959,7 +960,7 @@ def group_last(rank_t[:, :] out, for i in range(ncounts): for j in range(K): - if nobs[i, j] < min_count or nobs[i, j] == 0: + if nobs[i, j] < min_count: if rank_t is int64_t: out[i, j] = NPY_NAT elif rank_t is uint64_t: @@ -1002,6 +1003,7 @@ def group_nth(rank_t[:, :] out, if not len(values) == labels.shape[0]: raise AssertionError("len(index) != len(labels)") + min_count = max(min_count, 1) nobs = np.zeros((out).shape, dtype=np.int64) if rank_t is object: resx = np.empty((out).shape, dtype=object) @@ -1032,7 +1034,7 @@ def group_nth(rank_t[:, :] out, for i in range(ncounts): for j in range(K): - if nobs[i, j] < min_count or nobs[i, j] == 0: + if nobs[i, j] < min_count: out[i, j] = NAN else: out[i, j] = resx[i, j] @@ -1056,7 +1058,7 @@ def group_nth(rank_t[:, :] out, for i in range(ncounts): for j in range(K): - if nobs[i, j] < min_count or nobs[i, j] == 0: + if nobs[i, j] < min_count: if rank_t is int64_t: out[i, j] = NPY_NAT elif rank_t is uint64_t: @@ -1298,6 +1300,7 @@ def group_max(groupby_t[:, :] out, if not len(values) == labels.shape[0]: raise AssertionError("len(index) != len(labels)") + min_count = max(min_count, 1) nobs = np.zeros((out).shape, dtype=np.int64) maxx = np.empty_like(out) @@ -1334,7 +1337,7 @@ def group_max(groupby_t[:, :] out, for i in range(ncounts): for j in range(K): - if nobs[i, j] < min_count or nobs[i, j] == 0: + if nobs[i, j] < min_count: if groupby_t is uint64_t: runtime_error = True break @@ -1372,6 +1375,7 @@ def group_min(groupby_t[:, :] out, if not len(values) == labels.shape[0]: raise AssertionError("len(index) != len(labels)") + min_count = max(min_count, 1) nobs = np.zeros((out).shape, dtype=np.int64) minx = np.empty_like(out) @@ -1407,7 +1411,7 @@ def group_min(groupby_t[:, :] out, for i in range(ncounts): for j in range(K): - if nobs[i, j] < min_count or nobs[i, j] == 0: + if nobs[i, j] < min_count: if groupby_t is uint64_t: runtime_error = True break