# ENH: Add support for min_count keyword for Resample and Groupby functions #37870

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged · 8 commits · Nov 26, 2020
### doc/source/whatsnew/v1.2.0.rst (1 addition, 0 deletions)

```diff
@@ -252,6 +252,7 @@ Other enhancements
 - :class:`Window` now supports all Scipy window types in ``win_type`` with flexible keyword argument support (:issue:`34556`)
 - :meth:`testing.assert_index_equal` now has a ``check_order`` parameter that allows indexes to be checked in an order-insensitive manner (:issue:`37478`)
 - :func:`read_csv` supports memory-mapping for compressed files (:issue:`37621`)
+- Add support for ``min_count`` keyword for :meth:`DataFrame.groupby` and :meth:`DataFrame.resample` for functions ``min``, ``max``, ``first`` and ``last`` (:issue:`37821`, :issue:`37768`)
 - Improve error reporting for :meth:`DataFrame.merge` when invalid merge column definitions were given (:issue:`16228`)
 - Improve numerical stability for :meth:`.Rolling.skew`, :meth:`.Rolling.kurt`, :meth:`Expanding.skew` and :meth:`Expanding.kurt` through implementation of Kahan summation (:issue:`6929`)
 - Improved error reporting for subsetting columns of a :class:`.DataFrameGroupBy` with ``axis=1`` (:issue:`37725`)
```
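From the user's side, the new keyword behaves like the existing one on `sum`/`prod`: a group must contain at least `min_count` non-NA values, otherwise its result is NA. A minimal sketch (the data and names are illustrative, not from the PR):

```python
import numpy as np
import pandas as pd

# Group "a" has two valid values, group "b" only one.
df = pd.DataFrame({"key": ["a", "a", "b", "b"], "val": [1.0, 2.0, 3.0, np.nan]})

# With min_count=2, group "b" falls short of the threshold and becomes NaN.
print(df.groupby("key")["val"].max(min_count=2))
# key
# a    2.0
# b    NaN
# Name: val, dtype: float64
```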
### pandas/_libs/groupby.pyx (13 additions, 13 deletions)

```diff
@@ -903,13 +903,12 @@ def group_last(rank_t[:, :] out,
         ndarray[int64_t, ndim=2] nobs
         bint runtime_error = False
 
-    assert min_count == -1, "'min_count' only used in add and prod"
-
     # TODO(cython 3.0):
     # Instead of `labels.shape[0]` use `len(labels)`
     if not len(values) == labels.shape[0]:
         raise AssertionError("len(index) != len(labels)")
 
+    min_count = max(min_count, 1)
     nobs = np.zeros((<object>out).shape, dtype=np.int64)
     if rank_t is object:
         resx = np.empty((<object>out).shape, dtype=object)
@@ -939,7 +938,7 @@
 
         for i in range(ncounts):
             for j in range(K):
-                if nobs[i, j] == 0:
+                if nobs[i, j] < min_count:
                     out[i, j] = NAN
                 else:
                     out[i, j] = resx[i, j]
@@ -961,7 +960,7 @@
 
             for i in range(ncounts):
                 for j in range(K):
-                    if nobs[i, j] == 0:
+                    if nobs[i, j] < min_count:
                         if rank_t is int64_t:
                             out[i, j] = NPY_NAT
                         elif rank_t is uint64_t:
```
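Stripped of the fused-type branches, `group_last` reduces to a simple pattern: count the non-missing values per group while keeping the last one seen, then mask every group whose count falls short of the threshold. A minimal NumPy sketch of that pattern for 1-D float data (illustrative names, not the real Cython kernel):

```python
import numpy as np

def group_last_py(values, labels, ngroups, min_count=-1):
    # The default -1 is clamped to 1, matching `min_count = max(min_count, 1)`
    # above, so the old "at least one observation" behavior is preserved.
    min_count = max(min_count, 1)
    nobs = np.zeros(ngroups, dtype=np.int64)
    resx = np.zeros(ngroups, dtype=np.float64)
    for val, lab in zip(values, labels):
        if lab < 0 or np.isnan(val):  # skip unassigned rows and missing values
            continue
        nobs[lab] += 1
        resx[lab] = val  # always overwrite: the last valid value wins
    # The changed condition: nobs < min_count instead of nobs == 0.
    return np.where(nobs < min_count, np.nan, resx)

vals, labs = np.array([1.0, np.nan, 5.0]), np.array([0, 0, 1])
print(group_last_py(vals, labs, 2))               # [1. 5.]
print(group_last_py(vals, labs, 2, min_count=2))  # [nan nan]
```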
```diff
@@ -986,7 +985,8 @@
 def group_nth(rank_t[:, :] out,
               int64_t[:] counts,
               ndarray[rank_t, ndim=2] values,
-              const int64_t[:] labels, int64_t rank=1
+              const int64_t[:] labels,
+              int64_t min_count=-1, int64_t rank=1
               ):
     """
     Only aggregates on axis=0
@@ -1003,6 +1003,7 @@ def group_nth(rank_t[:, :] out,
     if not len(values) == labels.shape[0]:
         raise AssertionError("len(index) != len(labels)")
 
+    min_count = max(min_count, 1)
     nobs = np.zeros((<object>out).shape, dtype=np.int64)
     if rank_t is object:
         resx = np.empty((<object>out).shape, dtype=object)
@@ -1033,7 +1034,7 @@
 
         for i in range(ncounts):
             for j in range(K):
-                if nobs[i, j] == 0:
+                if nobs[i, j] < min_count:
                     out[i, j] = NAN
                 else:
                     out[i, j] = resx[i, j]
@@ -1057,7 +1058,7 @@
 
             for i in range(ncounts):
                 for j in range(K):
-                    if nobs[i, j] == 0:
+                    if nobs[i, j] < min_count:
                         if rank_t is int64_t:
                             out[i, j] = NPY_NAT
                         elif rank_t is uint64_t:
```
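`group_nth` follows the same pattern except that it keeps the `rank`-th valid value per group (`rank=1` backs `first`), which is why the new `min_count` parameter slots in ahead of `rank` in the signature above. A rough Python model under the same illustrative assumptions as the previous sketch:

```python
import numpy as np

def group_nth_py(values, labels, ngroups, min_count=-1, rank=1):
    min_count = max(min_count, 1)
    nobs = np.zeros(ngroups, dtype=np.int64)
    resx = np.zeros(ngroups, dtype=np.float64)
    for val, lab in zip(values, labels):
        if lab < 0 or np.isnan(val):
            continue
        nobs[lab] += 1
        if nobs[lab] == rank:  # keep only the rank-th valid value
            resx[lab] = val
    return np.where(nobs < min_count, np.nan, resx)

# A single valid value passes the default threshold but not min_count=2.
vals, labs = np.array([1.0, np.nan, np.nan]), np.array([0, 0, 0])
print(group_nth_py(vals, labs, 1))               # [1.]
print(group_nth_py(vals, labs, 1, min_count=2))  # [nan]
```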
```diff
@@ -1294,13 +1295,12 @@ def group_max(groupby_t[:, :] out,
         bint runtime_error = False
         int64_t[:, :] nobs
 
-    assert min_count == -1, "'min_count' only used in add and prod"
-
     # TODO(cython 3.0):
     # Instead of `labels.shape[0]` use `len(labels)`
     if not len(values) == labels.shape[0]:
         raise AssertionError("len(index) != len(labels)")
 
+    min_count = max(min_count, 1)
     nobs = np.zeros((<object>out).shape, dtype=np.int64)
 
     maxx = np.empty_like(out)
@@ -1337,11 +1337,12 @@
 
         for i in range(ncounts):
             for j in range(K):
-                if nobs[i, j] == 0:
+                if nobs[i, j] < min_count:
                     if groupby_t is uint64_t:
                         runtime_error = True
                         break
                     else:
+
                         out[i, j] = nan_val
                 else:
                     out[i, j] = maxx[i, j]
@@ -1369,13 +1370,12 @@ def group_min(groupby_t[:, :] out,
         bint runtime_error = False
         int64_t[:, :] nobs
 
-    assert min_count == -1, "'min_count' only used in add and prod"
-
     # TODO(cython 3.0):
     # Instead of `labels.shape[0]` use `len(labels)`
     if not len(values) == labels.shape[0]:
         raise AssertionError("len(index) != len(labels)")
 
+    min_count = max(min_count, 1)
     nobs = np.zeros((<object>out).shape, dtype=np.int64)
 
     minx = np.empty_like(out)
@@ -1411,7 +1411,7 @@
 
         for i in range(ncounts):
             for j in range(K):
-                if nobs[i, j] == 0:
+                if nobs[i, j] < min_count:
                     if groupby_t is uint64_t:
                         runtime_error = True
                         break
```
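One subtlety in `group_max`/`group_min`: a float group that misses the threshold becomes `NAN` and an `int64` one becomes `NPY_NAT`, but `uint64` has no missing-value sentinel at all, so the `nogil` loop can only set `runtime_error` and let the caller raise afterwards. A hedged illustration (the exact exception type and message are assumed from the kernel code above, not verified output):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"k": [1, 1], "v": np.array([1, 2], dtype=np.uint64)})

# No uint64 value can encode "too few observations", so instead of
# returning a sentinel this path is expected to raise at runtime.
try:
    print(df.groupby("k")["v"].min(min_count=3))
except RuntimeError as err:
    print(err)  # e.g. "empty group with uint64_t"
```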
### pandas/core/groupby/ops.py (1 addition, 1 deletion)

```diff
@@ -603,7 +603,7 @@ def _aggregate(
     ):
         if agg_func is libgroupby.group_nth:
             # different signature from the others
-            agg_func(result, counts, values, comp_ids, rank=1)
+            agg_func(result, counts, values, comp_ids, min_count, rank=1)
         else:
             agg_func(result, counts, values, comp_ids, min_count)
 
```
### pandas/core/resample.py (2 additions, 2 deletions)

```diff
@@ -950,7 +950,7 @@ def quantile(self, q=0.5, **kwargs):
 
 
 # downsample methods
-for method in ["sum", "prod"]:
+for method in ["sum", "prod", "min", "max", "first", "last"]:
 
     def f(self, _method=method, min_count=0, *args, **kwargs):
         nv.validate_resampler_func(_method, args, kwargs)
@@ -961,7 +961,7 @@ def f(self, _method=method, min_count=0, *args, **kwargs):
 
 
 # downsample methods
-for method in ["min", "max", "first", "last", "mean", "sem", "median", "ohlc"]:
+for method in ["mean", "sem", "median", "ohlc"]:
 
     def g(self, _method=method, *args, **kwargs):
         nv.validate_resampler_func(_method, args, kwargs)
```
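With the factory loops regrouped, `min`/`max`/`first`/`last` are now generated by `f`, which accepts and forwards `min_count`, while `mean`/`sem`/`median`/`ohlc` keep the plain wrapper `g`. A usage sketch mirroring the new resample test below (data illustrative):

```python
import numpy as np
import pandas as pd

ser = pd.Series(
    [1.0, np.nan, np.nan],
    index=pd.date_range("2020-01-31", periods=3, freq="M"),
)

# The single Q-DEC bin holds only one valid value, so min_count=2 masks it.
print(ser.resample("Q").max(min_count=2))
# 2020-03-31   NaN
# Freq: Q-DEC, dtype: float64

# Methods still generated by `g` (e.g. .mean()) do not gain the keyword and
# are expected to reject min_count.
```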
### pandas/tests/groupby/test_missing.py (10 additions, 0 deletions)

```diff
@@ -116,3 +116,13 @@ def test_ffill_handles_nan_groups(dropna, method, has_nan_group):
     expected = df_without_nan_rows.reindex(ridx).reset_index(drop=True)
 
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("min_count, value", [(2, np.nan), (-1, 1.0)])
+@pytest.mark.parametrize("func", ["first", "last", "max", "min"])
+def test_min_count(func, min_count, value):
+    # GH#37821
+    df = DataFrame({"a": [1] * 3, "b": [1, np.nan, np.nan], "c": [np.nan] * 3})
+    result = getattr(df.groupby("a"), func)(min_count=min_count)
+    expected = DataFrame({"b": [value], "c": [np.nan]}, index=Index([1], name="a"))
+    tm.assert_frame_equal(result, expected)
```
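The `(2, np.nan)` / `(-1, 1.0)` pairing pins down the clamping added in the kernels: `min_count=-1` is normalized to 1 by `min_count = max(min_count, 1)`, so the pre-existing "at least one observation" behavior is preserved and column `"b"` keeps its single valid value, while the all-NA column `"c"` is NaN either way.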
### pandas/tests/resample/test_datetime_index.py (13 additions, 0 deletions)

```diff
@@ -1785,3 +1785,16 @@ def test_resample_calendar_day_with_dst(
         1.0, pd.date_range(first, exp_last, freq=freq_out, tz="Europe/Amsterdam")
     )
     tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("func", ["min", "max", "first", "last"])
+def test_resample_aggregate_functions_min_count(func):
+    # GH#37768
+    index = date_range(start="2020", freq="M", periods=3)
+    ser = Series([1, np.nan, np.nan], index)
+    result = getattr(ser.resample("Q"), func)(min_count=2)
+    expected = Series(
+        [np.nan],
+        index=DatetimeIndex(["2020-03-31"], dtype="datetime64[ns]", freq="Q-DEC"),
+    )
+    tm.assert_series_equal(result, expected)
```