ENH: Add support for min_count keyword for Resample and Groupby functions (#37870)

phofl · web-flow · commit 8b0af583089a · 2020-11-26T10:36:25.000-05:00
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
@@ -252,6 +252,7 @@ Other enhancements
 - :class:`Window` now supports all Scipy window types in ``win_type`` with flexible keyword argument support (:issue:`34556`)
 - :meth:`testing.assert_index_equal` now has a ``check_order`` parameter that allows indexes to be checked in an order-insensitive manner (:issue:`37478`)
 - :func:`read_csv` supports memory-mapping for compressed files (:issue:`37621`)
+- Add support for ``min_count`` keyword for :meth:`DataFrame.groupby` and :meth:`DataFrame.resample` for functions ``min``, ``max``, ``first`` and ``last`` (:issue:`37821`, :issue:`37768`)
 - Improve error reporting for :meth:`DataFrame.merge` when invalid merge column definitions were given (:issue:`16228`)
 - Improve numerical stability for :meth:`.Rolling.skew`, :meth:`.Rolling.kurt`, :meth:`Expanding.skew` and :meth:`Expanding.kurt` through implementation of Kahan summation (:issue:`6929`)
 - Improved error reporting for subsetting columns of a :class:`.DataFrameGroupBy` with ``axis=1`` (:issue:`37725`)
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
@@ -903,13 +903,12 @@ def group_last(rank_t[:, :] out,
         ndarray[int64_t, ndim=2] nobs
         bint runtime_error = False
 
-    assert min_count == -1, "'min_count' only used in add and prod"
-
     # TODO(cython 3.0):
     # Instead of `labels.shape[0]` use `len(labels)`
     if not len(values) == labels.shape[0]:
         raise AssertionError("len(index) != len(labels)")
 
+    min_count = max(min_count, 1)
     nobs = np.zeros((<object>out).shape, dtype=np.int64)
     if rank_t is object:
         resx = np.empty((<object>out).shape, dtype=object)
@@ -939,7 +938,7 @@ def group_last(rank_t[:, :] out,
 
         for i in range(ncounts):
             for j in range(K):
-                if nobs[i, j] == 0:
+                if nobs[i, j] < min_count:
                     out[i, j] = NAN
                 else:
                     out[i, j] = resx[i, j]
@@ -961,7 +960,7 @@ def group_last(rank_t[:, :] out,
 
             for i in range(ncounts):
                 for j in range(K):
-                    if nobs[i, j] == 0:
+                    if nobs[i, j] < min_count:
                         if rank_t is int64_t:
                             out[i, j] = NPY_NAT
                         elif rank_t is uint64_t:
@@ -986,7 +985,8 @@ def group_last(rank_t[:, :] out,
 def group_nth(rank_t[:, :] out,
               int64_t[:] counts,
               ndarray[rank_t, ndim=2] values,
-              const int64_t[:] labels, int64_t rank=1
+              const int64_t[:] labels,
+              int64_t min_count=-1, int64_t rank=1
               ):
     """
     Only aggregates on axis=0
@@ -1003,6 +1003,7 @@ def group_nth(rank_t[:, :] out,
     if not len(values) == labels.shape[0]:
         raise AssertionError("len(index) != len(labels)")
 
+    min_count = max(min_count, 1)
     nobs = np.zeros((<object>out).shape, dtype=np.int64)
     if rank_t is object:
         resx = np.empty((<object>out).shape, dtype=object)
@@ -1033,7 +1034,7 @@ def group_nth(rank_t[:, :] out,
 
         for i in range(ncounts):
             for j in range(K):
-                if nobs[i, j] == 0:
+                if nobs[i, j] < min_count:
                     out[i, j] = NAN
                 else:
                     out[i, j] = resx[i, j]
@@ -1057,7 +1058,7 @@ def group_nth(rank_t[:, :] out,
 
             for i in range(ncounts):
                 for j in range(K):
-                    if nobs[i, j] == 0:
+                    if nobs[i, j] < min_count:
                         if rank_t is int64_t:
                             out[i, j] = NPY_NAT
                         elif rank_t is uint64_t:
@@ -1294,13 +1295,12 @@ def group_max(groupby_t[:, :] out,
         bint runtime_error = False
         int64_t[:, :] nobs
 
-    assert min_count == -1, "'min_count' only used in add and prod"
-
     # TODO(cython 3.0):
     # Instead of `labels.shape[0]` use `len(labels)`
     if not len(values) == labels.shape[0]:
         raise AssertionError("len(index) != len(labels)")
 
+    min_count = max(min_count, 1)
     nobs = np.zeros((<object>out).shape, dtype=np.int64)
 
     maxx = np.empty_like(out)
@@ -1337,11 +1337,12 @@ def group_max(groupby_t[:, :] out,
 
         for i in range(ncounts):
             for j in range(K):
-                if nobs[i, j] == 0:
+                if nobs[i, j] < min_count:
                     if groupby_t is uint64_t:
                         runtime_error = True
                         break
                     else:
+
                         out[i, j] = nan_val
                 else:
                     out[i, j] = maxx[i, j]
@@ -1369,13 +1370,12 @@ def group_min(groupby_t[:, :] out,
         bint runtime_error = False
         int64_t[:, :] nobs
 
-    assert min_count == -1, "'min_count' only used in add and prod"
-
     # TODO(cython 3.0):
     # Instead of `labels.shape[0]` use `len(labels)`
     if not len(values) == labels.shape[0]:
         raise AssertionError("len(index) != len(labels)")
 
+    min_count = max(min_count, 1)
     nobs = np.zeros((<object>out).shape, dtype=np.int64)
 
     minx = np.empty_like(out)
@@ -1411,7 +1411,7 @@ def group_min(groupby_t[:, :] out,
 
         for i in range(ncounts):
             for j in range(K):
-                if nobs[i, j] == 0:
+                if nobs[i, j] < min_count:
                     if groupby_t is uint64_t:
                         runtime_error = True
                         break
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
@@ -603,7 +603,7 @@ def _aggregate(
     ):
         if agg_func is libgroupby.group_nth:
             # different signature from the others
-            agg_func(result, counts, values, comp_ids, rank=1)
+            agg_func(result, counts, values, comp_ids, min_count, rank=1)
         else:
             agg_func(result, counts, values, comp_ids, min_count)
 
diff --git a/pandas/core/resample.py b/pandas/core/resample.py
@@ -950,7 +950,7 @@ def quantile(self, q=0.5, **kwargs):
 
 
 # downsample methods
-for method in ["sum", "prod"]:
+for method in ["sum", "prod", "min", "max", "first", "last"]:
 
     def f(self, _method=method, min_count=0, *args, **kwargs):
         nv.validate_resampler_func(_method, args, kwargs)
@@ -961,7 +961,7 @@ def f(self, _method=method, min_count=0, *args, **kwargs):
 
 
 # downsample methods
-for method in ["min", "max", "first", "last", "mean", "sem", "median", "ohlc"]:
+for method in ["mean", "sem", "median", "ohlc"]:
 
     def g(self, _method=method, *args, **kwargs):
         nv.validate_resampler_func(_method, args, kwargs)
diff --git a/pandas/tests/groupby/test_missing.py b/pandas/tests/groupby/test_missing.py
@@ -116,3 +116,13 @@ def test_ffill_handles_nan_groups(dropna, method, has_nan_group):
     expected = df_without_nan_rows.reindex(ridx).reset_index(drop=True)
 
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("min_count, value", [(2, np.nan), (-1, 1.0)])
+@pytest.mark.parametrize("func", ["first", "last", "max", "min"])
+def test_min_count(func, min_count, value):
+    # GH#37821
+    df = DataFrame({"a": [1] * 3, "b": [1, np.nan, np.nan], "c": [np.nan] * 3})
+    result = getattr(df.groupby("a"), func)(min_count=min_count)
+    expected = DataFrame({"b": [value], "c": [np.nan]}, index=Index([1], name="a"))
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py
@@ -1785,3 +1785,16 @@ def test_resample_calendar_day_with_dst(
         1.0, pd.date_range(first, exp_last, freq=freq_out, tz="Europe/Amsterdam")
     )
     tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("func", ["min", "max", "first", "last"])
+def test_resample_aggregate_functions_min_count(func):
+    # GH#37768
+    index = date_range(start="2020", freq="M", periods=3)
+    ser = Series([1, np.nan, np.nan], index)
+    result = getattr(ser.resample("Q"), func)(min_count=2)
+    expected = Series(
+        [np.nan],
+        index=DatetimeIndex(["2020-03-31"], dtype="datetime64[ns]", freq="Q-DEC"),
+    )
+    tm.assert_series_equal(result, expected)