From 54830d924fcd20bdc35e2f64c28079468619f4e8 Mon Sep 17 00:00:00 2001 From: richard Date: Fri, 26 Jan 2024 23:45:11 -0500 Subject: [PATCH 1/6] ENH: Add skipna to groupby.first and groupby.last --- doc/source/whatsnew/v2.2.1.rst | 2 +- pandas/_libs/groupby.pyi | 2 ++ pandas/_libs/groupby.pyx | 41 ++++++++++++++++--------- pandas/_testing/__init__.py | 7 +++++ pandas/conftest.py | 32 +++++++++++++++++++ pandas/core/groupby/groupby.py | 32 ++++++++++++++----- pandas/tests/groupby/test_reductions.py | 32 +++++++++++++++++++ 7 files changed, 124 insertions(+), 24 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index b9b2821ebc468..1100a3b3972e4 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -32,7 +32,7 @@ Bug fixes Other ~~~~~ -- +- Added the argument ``skipna`` to :meth:`DataFrameGroupBy.first`, :meth:`DataFrameGroupBy.last`, :meth:`SeriesGroupBy.first`, and :meth:`SeriesGroupBy.last`; achieving ``skipna=False`` used to be available via :meth:`DataFrameGroupBy.nth`, but the behavior was changed in pandas 2.0.0 (:issue:`57019`) .. --------------------------------------------------------------------------- .. _whatsnew_221.contributors: diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index b7130ee35dc57..95ac555303221 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -136,6 +136,7 @@ def group_last( result_mask: npt.NDArray[np.bool_] | None = ..., min_count: int = ..., # Py_ssize_t is_datetimelike: bool = ..., + skipna: bool = ..., ) -> None: ... def group_nth( out: np.ndarray, # rank_t[:, ::1] @@ -147,6 +148,7 @@ def group_nth( min_count: int = ..., # int64_t rank: int = ..., # int64_t is_datetimelike: bool = ..., + skipna: bool = ..., ) -> None: ... def group_rank( out: np.ndarray, # float64_t[:, ::1] diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 45e02c3dd420f..391bb4a3a3fd3 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1428,6 +1428,7 @@ def group_last( uint8_t[:, ::1] result_mask=None, Py_ssize_t min_count=-1, bint is_datetimelike=False, + bint skipna=True, ) -> None: """ Only aggregates on axis=0 @@ -1462,14 +1463,19 @@ def group_last( for j in range(K): val = values[i, j] - if uses_mask: - isna_entry = mask[i, j] - else: - isna_entry = _treat_as_na(val, is_datetimelike) + if skipna: + if uses_mask: + isna_entry = mask[i, j] + else: + isna_entry = _treat_as_na(val, is_datetimelike) + if isna_entry: + continue - if not isna_entry: - nobs[lab, j] += 1 - resx[lab, j] = val + nobs[lab, j] += 1 + resx[lab, j] = val + + if uses_mask and not skipna: + result_mask[lab, j] = mask[i, j] _check_below_mincount( out, uses_mask, result_mask, ncounts, K, nobs, min_count, resx @@ -1490,6 +1496,7 @@ def group_nth( int64_t min_count=-1, int64_t rank=1, bint is_datetimelike=False, + bint skipna=True, ) -> None: """ Only aggregates on axis=0 @@ -1524,15 +1531,19 @@ def group_nth( for j in range(K): val = values[i, j] - if uses_mask: - isna_entry = mask[i, j] - else: - isna_entry = _treat_as_na(val, is_datetimelike) + if skipna: + if uses_mask: + isna_entry = mask[i, j] + else: + isna_entry = _treat_as_na(val, is_datetimelike) + if isna_entry: + continue - if not isna_entry: - nobs[lab, j] += 1 - if nobs[lab, j] == rank: - resx[lab, j] = val + nobs[lab, j] += 1 + if nobs[lab, j] == rank: + resx[lab, j] = val + if uses_mask and not skipna: + result_mask[lab, j] = mask[i, j] _check_below_mincount( out, uses_mask, result_mask, ncounts, K, nobs, min_count, resx diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 3f5fd2e61b0cb..d187d018840fe 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -235,11 +235,18 @@ + TIMEDELTA_PYARROW_DTYPES + BOOL_PYARROW_DTYPES ) + ALL_REAL_PYARROW_DTYPES_STR_REPR = ( + ALL_INT_PYARROW_DTYPES_STR_REPR + FLOAT_PYARROW_DTYPES_STR_REPR + ) else: FLOAT_PYARROW_DTYPES_STR_REPR = [] ALL_INT_PYARROW_DTYPES_STR_REPR = [] ALL_PYARROW_DTYPES = [] + ALL_REAL_PYARROW_DTYPES_STR_REPR = [] +ALL_REAL_NULLABLE_DTYPES = ( + FLOAT_NUMPY_DTYPES + ALL_REAL_EXTENSION_DTYPES + ALL_REAL_PYARROW_DTYPES_STR_REPR +) arithmetic_dunder_methods = [ "__add__", diff --git a/pandas/conftest.py b/pandas/conftest.py index 94805313ccfc1..4f62404733979 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1703,6 +1703,38 @@ def any_numpy_dtype(request): return request.param +@pytest.fixture(params=tm.ALL_REAL_NULLABLE_DTYPES) +def any_real_nullable_dtype(request): + """ + Parameterized fixture for all numpy dtypes. + + * float + * 'float32' + * 'float64' + * 'Float32' + * 'Float64' + * 'UInt8' + * 'UInt16' + * 'UInt32' + * 'UInt64' + * 'Int8' + * 'Int16' + * 'Int32' + * 'Int64' + * 'uint8[pyarrow]' + * 'uint16[pyarrow]' + * 'uint32[pyarrow]' + * 'uint64[pyarrow]' + * 'int8[pyarrow]' + * 'int16[pyarrow]' + * 'int32[pyarrow]' + * 'int64[pyarrow]' + * 'float[pyarrow]' + * 'double[pyarrow]' + """ + return request.param + + @pytest.fixture(params=tm.ALL_NUMERIC_DTYPES) def any_numeric_dtype(request): """ diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index b2afaffc267fa..45af5c20ed780 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3364,9 +3364,13 @@ def max( ) @final - def first(self, numeric_only: bool = False, min_count: int = -1) -> NDFrameT: + def first( + self, numeric_only: bool = False, min_count: int = -1, skipna: bool = True + ) -> NDFrameT: """ - Compute the first non-null entry of each column. + Compute the first entry of each column within each group. + + Defaults to skipping NA elements. Parameters ---------- @@ -3374,12 +3378,15 @@ def first(self, numeric_only: bool = False, min_count: int = -1) -> NDFrameT: Include only float, int, boolean columns. min_count : int, default -1 The required number of valid values to perform the operation. If fewer - than ``min_count`` non-NA values are present the result will be NA. + than ``min_count`` valid values are present the result will be NA. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. Returns ------- Series or DataFrame - First non-null of values within each group. + First values within each group. See Also -------- @@ -3431,12 +3438,17 @@ def first(x: Series): min_count=min_count, alias="first", npfunc=first_compat, + skipna=skipna, ) @final - def last(self, numeric_only: bool = False, min_count: int = -1) -> NDFrameT: + def last( + self, numeric_only: bool = False, min_count: int = -1, skipna: bool = True + ) -> NDFrameT: """ - Compute the last non-null entry of each column. + Compute the last entry of each column within each group. + + Defaults to skipping NA elements. Parameters ---------- @@ -3445,12 +3457,15 @@ def last(self, numeric_only: bool = False, min_count: int = -1) -> NDFrameT: everything, then use only numeric data. min_count : int, default -1 The required number of valid values to perform the operation. If fewer - than ``min_count`` non-NA values are present the result will be NA. + than ``min_count`` valid values are present the result will be NA. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. Returns ------- Series or DataFrame - Last non-null of values within each group. + Last of values within each group. See Also -------- @@ -3490,6 +3505,7 @@ def last(x: Series): min_count=min_count, alias="last", npfunc=last_compat, + skipna=skipna, ) @final diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index d24a2a26bba81..f330101c1d3a8 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -7,6 +7,8 @@ from pandas._libs.tslibs import iNaT +from pandas.core.dtypes.common import is_extension_array_dtype + import pandas as pd from pandas import ( DataFrame, @@ -389,6 +391,36 @@ def test_groupby_non_arithmetic_agg_int_like_precision(method, data): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("how", ["first", "last"]) +def test_first_last_skipna(any_real_nullable_dtype, sort, skipna, how): + if is_extension_array_dtype(any_real_nullable_dtype): + na_value = Series(dtype=any_real_nullable_dtype).dtype.na_value + else: + na_value = np.nan + df = DataFrame( + { + "a": [2, 1, 1, 2], + "b": [na_value, 3.0, na_value, 4.0], + "c": [na_value, 3.0, na_value, 4.0], + }, + dtype=any_real_nullable_dtype, + ) + gb = df.groupby("a", sort=sort) + method = getattr(gb, how) + result = method(skipna=skipna) + + ilocs = { + ("first", True): [3, 1], + ("first", False): [0, 1], + ("last", True): [3, 1], + ("last", False): [3, 2], + }[how, skipna] + expected = df.iloc[ilocs].set_index("a") + if sort: + expected = expected.sort_index() + tm.assert_frame_equal(result, expected) + + def test_idxmin_idxmax_axis1(): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), columns=["A", "B", "C", "D"] From b12541bdf59799c49c2e4974a7cae4a05e3509a3 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 27 Jan 2024 12:35:46 -0500 Subject: [PATCH 2/6] resample & tests --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/resample.py | 10 +++++++-- pandas/tests/groupby/test_reductions.py | 1 + pandas/tests/resample/test_base.py | 29 +++++++++++++++++++++++++ 4 files changed, 39 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 1100a3b3972e4..2704d9b9a9a6b 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -33,6 +33,7 @@ Bug fixes Other ~~~~~ - Added the argument ``skipna`` to :meth:`DataFrameGroupBy.first`, :meth:`DataFrameGroupBy.last`, :meth:`SeriesGroupBy.first`, and :meth:`SeriesGroupBy.last`; achieving ``skipna=False`` used to be available via :meth:`DataFrameGroupBy.nth`, but the behavior was changed in pandas 2.0.0 (:issue:`57019`) +- Added the argument ``skipna`` to :meth:`Resampler.first`, :meth:`Resampler.last` (:issue:`57019`) .. --------------------------------------------------------------------------- .. _whatsnew_221.contributors: diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 082196abc17c2..4d6507d89ec90 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1329,12 +1329,15 @@ def first( self, numeric_only: bool = False, min_count: int = 0, + skipna: bool = True, *args, **kwargs, ): maybe_warn_args_and_kwargs(type(self), "first", args, kwargs) nv.validate_resampler_func("first", args, kwargs) - return self._downsample("first", numeric_only=numeric_only, min_count=min_count) + return self._downsample( + "first", numeric_only=numeric_only, min_count=min_count, skipna=skipna + ) @final @doc(GroupBy.last) @@ -1342,12 +1345,15 @@ def last( self, numeric_only: bool = False, min_count: int = 0, + skipna: bool = True, *args, **kwargs, ): maybe_warn_args_and_kwargs(type(self), "last", args, kwargs) nv.validate_resampler_func("last", args, kwargs) - return self._downsample("last", numeric_only=numeric_only, min_count=min_count) + return self._downsample( + "last", numeric_only=numeric_only, min_count=min_count, skipna=skipna + ) @final @doc(GroupBy.median) diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index f330101c1d3a8..06c2edf31b334 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -393,6 +393,7 @@ def test_groupby_non_arithmetic_agg_int_like_precision(method, data): @pytest.mark.parametrize("how", ["first", "last"]) def test_first_last_skipna(any_real_nullable_dtype, sort, skipna, how): + # GH#57019 if is_extension_array_dtype(any_real_nullable_dtype): na_value = Series(dtype=any_real_nullable_dtype).dtype.na_value else: diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index ab75dd7469b73..9cd51b95d6efd 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -3,6 +3,9 @@ import numpy as np import pytest +from pandas.core.dtypes.common import is_extension_array_dtype + +import pandas as pd from pandas import ( DataFrame, DatetimeIndex, @@ -459,3 +462,29 @@ def test_resample_quantile(index): result = ser.resample(freq).quantile(q) expected = ser.resample(freq).agg(lambda x: x.quantile(q)).rename(ser.name) tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("how", ["first", "last"]) +def test_first_last_skipna(any_real_nullable_dtype, skipna, how): + # GH#57019 + if is_extension_array_dtype(any_real_nullable_dtype): + na_value = Series(dtype=any_real_nullable_dtype).dtype.na_value + else: + na_value = np.nan + df = DataFrame( + { + "a": [2, 1, 1, 2], + "b": [na_value, 3.0, na_value, 4.0], + "c": [na_value, 3.0, na_value, 4.0], + }, + index=date_range("2020-01-01", periods=4, freq="D"), + dtype=any_real_nullable_dtype, + ) + rs = df.resample("ME") + method = getattr(rs, how) + result = method(skipna=skipna) + + gb = df.groupby(df.shape[0] * [pd.to_datetime("2020-01-31")]) + expected = getattr(gb, how)(skipna=skipna) + expected.index.freq = "ME" + tm.assert_frame_equal(result, expected) From 207be1276ac88754bd7eb259e2eccc3307559b05 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 27 Jan 2024 12:39:55 -0500 Subject: [PATCH 3/6] Improve test --- pandas/tests/groupby/test_reductions.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index 06c2edf31b334..c458b6ab9d96d 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -400,9 +400,9 @@ def test_first_last_skipna(any_real_nullable_dtype, sort, skipna, how): na_value = np.nan df = DataFrame( { - "a": [2, 1, 1, 2], - "b": [na_value, 3.0, na_value, 4.0], - "c": [na_value, 3.0, na_value, 4.0], + "a": [2, 1, 1, 2, 3, 3], + "b": [na_value, 3.0, na_value, 4.0, np.nan, np.nan], + "c": [na_value, 3.0, na_value, 4.0, np.nan, np.nan], }, dtype=any_real_nullable_dtype, ) @@ -411,10 +411,10 @@ def test_first_last_skipna(any_real_nullable_dtype, sort, skipna, how): result = method(skipna=skipna) ilocs = { - ("first", True): [3, 1], - ("first", False): [0, 1], - ("last", True): [3, 1], - ("last", False): [3, 2], + ("first", True): [3, 1, 4], + ("first", False): [0, 1, 4], + ("last", True): [3, 1, 5], + ("last", False): [3, 2, 5], }[how, skipna] expected = df.iloc[ilocs].set_index("a") if sort: From b20733720056d71c625c8fb45622b07fd8dd87d4 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 27 Jan 2024 12:47:08 -0500 Subject: [PATCH 4/6] Fixups --- pandas/conftest.py | 2 +- pandas/core/groupby/groupby.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 4f62404733979..c5dc48b9ed096 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1706,7 +1706,7 @@ def any_numpy_dtype(request): @pytest.fixture(params=tm.ALL_REAL_NULLABLE_DTYPES) def any_real_nullable_dtype(request): """ - Parameterized fixture for all numpy dtypes. + Parameterized fixture for all real dtypes that can hold NA. * float * 'float32' diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 45af5c20ed780..7227d5c727994 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3383,6 +3383,8 @@ def first( Exclude NA/null values. If an entire row/column is NA, the result will be NA. + .. versionadded:: 2.2.1 + Returns ------- Series or DataFrame @@ -3462,6 +3464,8 @@ def last( Exclude NA/null values. If an entire row/column is NA, the result will be NA. + .. versionadded:: 2.2.1 + Returns ------- Series or DataFrame From a634d44a967f8b0091505a0c73b9b24c258eec86 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 28 Jan 2024 09:17:57 -0500 Subject: [PATCH 5/6] fixup test --- pandas/tests/resample/test_resample_api.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index d3e906827b754..12abd1c98784b 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -1040,11 +1040,11 @@ def test_args_kwargs_depr(method, raises): if raises: with tm.assert_produces_warning(FutureWarning, match=warn_msg): with pytest.raises(UnsupportedFunctionCall, match=error_msg): - func(*args, 1, 2, 3) + func(*args, 1, 2, 3, 4) else: with tm.assert_produces_warning(FutureWarning, match=warn_msg): with pytest.raises(TypeError, match=error_msg_type): - func(*args, 1, 2, 3) + func(*args, 1, 2, 3, 4) def test_df_axis_param_depr(): From b3bd9bb5806159220eb8c6d9eca2082f22162d5f Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 29 Jan 2024 17:08:11 -0500 Subject: [PATCH 6/6] Rework na_value determination --- pandas/tests/groupby/test_reductions.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index c458b6ab9d96d..bd188c729846c 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -7,7 +7,8 @@ from pandas._libs.tslibs import iNaT -from pandas.core.dtypes.common import is_extension_array_dtype +from pandas.core.dtypes.common import pandas_dtype +from pandas.core.dtypes.missing import na_value_for_dtype import pandas as pd from pandas import ( @@ -394,10 +395,7 @@ def test_groupby_non_arithmetic_agg_int_like_precision(method, data): @pytest.mark.parametrize("how", ["first", "last"]) def test_first_last_skipna(any_real_nullable_dtype, sort, skipna, how): # GH#57019 - if is_extension_array_dtype(any_real_nullable_dtype): - na_value = Series(dtype=any_real_nullable_dtype).dtype.na_value - else: - na_value = np.nan + na_value = na_value_for_dtype(pandas_dtype(any_real_nullable_dtype)) df = DataFrame( { "a": [2, 1, 1, 2, 3, 3],