diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index d237ee9921356..28abfebce1ffc 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -275,6 +275,7 @@ Other enhancements - :class:`.DataError`, :class:`.SpecificationError`, :class:`.SettingWithCopyError`, :class:`.SettingWithCopyWarning`, :class:`.NumExprClobberingError`, :class:`.UndefinedVariableError`, and :class:`.IndexingError` are now exposed in ``pandas.errors`` (:issue:`27656`) - Added ``check_like`` argument to :func:`testing.assert_series_equal` (:issue:`47247`) - Allow reading compressed SAS files with :func:`read_sas` (e.g., ``.sas7bdat.gz`` files) +- :class:`Series` reducers (e.g. ``min``, ``max``, ``sum``, ``mean``) will now successfully operate when the dtype is numeric and ``numeric_only=True`` is provided; previously this would raise a ``NotImplementedError`` (:issue:`47500`) .. --------------------------------------------------------------------------- .. _whatsnew_150.notable_bug_fixes: @@ -766,7 +767,8 @@ Other Deprecations - Deprecated the argument ``na_sentinel`` in :func:`factorize`, :meth:`Index.factorize`, and :meth:`.ExtensionArray.factorize`; pass ``use_na_sentinel=True`` instead to use the sentinel ``-1`` for NaN values and ``use_na_sentinel=False`` instead of ``na_sentinel=None`` to encode NaN values (:issue:`46910`) - Deprecated :meth:`DataFrameGroupBy.transform` not aligning the result when the UDF returned DataFrame (:issue:`45648`) - Clarified warning from :func:`to_datetime` when delimited dates can't be parsed in accordance to specified ``dayfirst`` argument (:issue:`46210`) - +- Deprecated :class:`Series` and :class:`Resampler` reducers (e.g. 
``min``, ``max``, ``sum``, ``mean``) raising a ``NotImplementedError`` when the dtype is non-numeric and ``numeric_only=True`` is provided; this will raise a ``TypeError`` in a future version (:issue:`47500`) +- Deprecated :meth:`Series.rank` returning an empty result when the dtype is non-numeric and ``numeric_only=True`` is provided; this will raise a ``TypeError`` in a future version (:issue:`47500`) .. --------------------------------------------------------------------------- .. _whatsnew_150.performance: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index f896169d0ae44..1264d139df299 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8689,6 +8689,15 @@ def ranker(data): ) if numeric_only: + if self.ndim == 1 and not is_numeric_dtype(self.dtype): + # GH#47500 + warnings.warn( + f"Calling Series.rank with numeric_only={numeric_only} and dtype " + f"{self.dtype} is deprecated and will raise a TypeError in a " + "future version of pandas", + category=FutureWarning, + stacklevel=find_stack_level(), + ) data = self._get_numeric_data() else: data = self diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index a4462a292f4f9..63c861e084eda 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1145,7 +1145,7 @@ def _cython_transform( ) -> DataFrame: assert axis == 0 # handled by caller # TODO: no tests with self.ndim == 1 for DataFrameGroupBy - numeric_only_bool = self._resolve_numeric_only(numeric_only, axis) + numeric_only_bool = self._resolve_numeric_only(how, numeric_only, axis) # With self.axis == 0, we have multi-block tests # e.g. 
test_rank_min_int, test_cython_transform_frame diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index c2098fbe93a56..7e8a732a2e30d 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1291,7 +1291,7 @@ def _wrap_applied_output( raise AbstractMethodError(self) def _resolve_numeric_only( - self, numeric_only: bool | lib.NoDefault, axis: int + self, how: str, numeric_only: bool | lib.NoDefault, axis: int ) -> bool: """ Determine subclass-specific default value for 'numeric_only'. @@ -1328,6 +1328,20 @@ def _resolve_numeric_only( else: numeric_only = False + if numeric_only and self.obj.ndim == 1 and not is_numeric_dtype(self.obj.dtype): + # GH#47500 + how = "sum" if how == "add" else how + warnings.warn( + f"{type(self).__name__}.{how} called with " + f"numeric_only={numeric_only} and dtype {self.obj.dtype}. This will " + "raise a TypeError in a future version of pandas", + category=FutureWarning, + stacklevel=find_stack_level(), + ) + raise NotImplementedError( + f"{type(self).__name__}.{how} does not implement numeric_only" + ) + return numeric_only def _maybe_warn_numeric_only_depr( @@ -1704,7 +1718,7 @@ def _cython_agg_general( ): # Note: we never get here with how="ohlc" for DataFrameGroupBy; # that goes through SeriesGroupBy - numeric_only_bool = self._resolve_numeric_only(numeric_only, axis=0) + numeric_only_bool = self._resolve_numeric_only(how, numeric_only, axis=0) data = self._get_data_to_aggregate() is_ser = data.ndim == 1 @@ -2100,7 +2114,7 @@ def mean( 2 4.0 Name: B, dtype: float64 """ - numeric_only_bool = self._resolve_numeric_only(numeric_only, axis=0) + numeric_only_bool = self._resolve_numeric_only("mean", numeric_only, axis=0) if maybe_use_numba(engine): from pandas.core._numba.kernels import sliding_mean @@ -2134,7 +2148,7 @@ def median(self, numeric_only: bool | lib.NoDefault = lib.no_default): Series or DataFrame Median of values within each group. 
""" - numeric_only_bool = self._resolve_numeric_only(numeric_only, axis=0) + numeric_only_bool = self._resolve_numeric_only("median", numeric_only, axis=0) result = self._cython_agg_general( "median", @@ -2196,10 +2210,15 @@ def std( return np.sqrt(self._numba_agg_general(sliding_var, engine_kwargs, ddof)) else: # Resolve numeric_only so that var doesn't warn - numeric_only_bool = self._resolve_numeric_only(numeric_only, axis=0) - if numeric_only_bool and self.obj.ndim == 1: - raise NotImplementedError( - f"{type(self).__name__}.std does not implement numeric_only." + numeric_only_bool = self._resolve_numeric_only("std", numeric_only, axis=0) + if ( + numeric_only_bool + and self.obj.ndim == 1 + and not is_numeric_dtype(self.obj.dtype) + ): + raise TypeError( + f"{type(self).__name__}.std called with " + f"numeric_only={numeric_only} and dtype {self.obj.dtype}" ) result = self._get_cythonized_result( libgroupby.group_var, @@ -2264,7 +2283,7 @@ def var( return self._numba_agg_general(sliding_var, engine_kwargs, ddof) else: - numeric_only_bool = self._resolve_numeric_only(numeric_only, axis=0) + numeric_only_bool = self._resolve_numeric_only("var", numeric_only, axis=0) if ddof == 1: return self._cython_agg_general( "var", @@ -2304,10 +2323,15 @@ def sem(self, ddof: int = 1, numeric_only: bool | lib.NoDefault = lib.no_default Standard error of the mean of values within each group. """ # Reolve numeric_only so that std doesn't warn - numeric_only_bool = self._resolve_numeric_only(numeric_only, axis=0) - if numeric_only_bool and self.obj.ndim == 1: - raise NotImplementedError( - f"{type(self).__name__}.sem does not implement numeric_only." 
+ numeric_only_bool = self._resolve_numeric_only("sem", numeric_only, axis=0) + if ( + numeric_only_bool + and self.obj.ndim == 1 + and not is_numeric_dtype(self.obj.dtype) + ): + raise TypeError( + f"{type(self).__name__}.sem called with " + f"numeric_only={numeric_only} and dtype {self.obj.dtype}" ) result = self.std(ddof=ddof, numeric_only=numeric_only_bool) self._maybe_warn_numeric_only_depr("sem", result, numeric_only) @@ -3179,10 +3203,15 @@ def quantile( a 2.0 b 3.0 """ - numeric_only_bool = self._resolve_numeric_only(numeric_only, axis=0) - if numeric_only_bool and self.obj.ndim == 1: - raise NotImplementedError( - f"{type(self).__name__}.quantile does not implement numeric_only" + numeric_only_bool = self._resolve_numeric_only("quantile", numeric_only, axis=0) + if ( + numeric_only_bool + and self.obj.ndim == 1 + and not is_numeric_dtype(self.obj.dtype) + ): + raise TypeError( + f"{type(self).__name__}.quantile called with " + f"numeric_only={numeric_only} and dtype {self.obj.dtype}" ) def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, np.dtype | None]: @@ -3671,7 +3700,8 @@ def _get_cythonized_result( ------- `Series` or `DataFrame` with filled values """ - numeric_only_bool = self._resolve_numeric_only(numeric_only, axis=0) + how = base_func.__name__ + numeric_only_bool = self._resolve_numeric_only(how, numeric_only, axis=0) if post_processing and not callable(post_processing): raise ValueError("'post_processing' must be a callable!") @@ -3682,7 +3712,6 @@ def _get_cythonized_result( ids, _, ngroups = grouper.group_info - how = base_func.__name__ base_func = partial(base_func, labels=ids) def blk_func(values: ArrayLike) -> ArrayLike: diff --git a/pandas/core/series.py b/pandas/core/series.py index cdebb67e1b6dc..dd52ec855240e 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -80,6 +80,7 @@ is_integer, is_iterator, is_list_like, + is_numeric_dtype, is_object_dtype, is_scalar, pandas_dtype, @@ -4616,10 +4617,17 @@ def _reduce( else: # 
dispatch to numpy arrays - if numeric_only: + if numeric_only and not is_numeric_dtype(self.dtype): kwd_name = "numeric_only" if name in ["any", "all"]: kwd_name = "bool_only" + # GH#47500 - change to TypeError to match other methods + warnings.warn( + f"Calling Series.{name} with {kwd_name}={numeric_only} and " + f"dtype {self.dtype} will raise a TypeError in the future", + FutureWarning, + stacklevel=find_stack_level(), + ) raise NotImplementedError( f"Series.{name} does not implement {kwd_name}." ) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index 9631de7833cf4..869ed31b6a2d9 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -92,8 +92,9 @@ def test_cython_agg_boolean(): def test_cython_agg_nothing_to_agg(): frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25}) - with pytest.raises(NotImplementedError, match="does not implement"): - frame.groupby("a")["b"].mean(numeric_only=True) + with tm.assert_produces_warning(FutureWarning, match="This will raise a TypeError"): + with pytest.raises(NotImplementedError, match="does not implement"): + frame.groupby("a")["b"].mean(numeric_only=True) with pytest.raises(TypeError, match="Could not convert (foo|bar)*"): frame.groupby("a")["b"].mean() @@ -114,8 +115,9 @@ def test_cython_agg_nothing_to_agg_with_dates(): "dates": pd.date_range("now", periods=50, freq="T"), } ) - with pytest.raises(NotImplementedError, match="does not implement"): - frame.groupby("b").dates.mean(numeric_only=True) + with tm.assert_produces_warning(FutureWarning, match="This will raise a TypeError"): + with pytest.raises(NotImplementedError, match="does not implement"): + frame.groupby("b").dates.mean(numeric_only=True) def test_cython_agg_frame_columns(): diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 7f24143920b84..7d6c5310942e2 100644 --- 
a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1356,7 +1356,8 @@ def test_deprecate_numeric_only( method(*args, **kwargs) -def test_deprecate_numeric_only_series(groupby_func, request): +@pytest.mark.parametrize("dtype", [bool, int, float, object]) +def test_deprecate_numeric_only_series(dtype, groupby_func, request): # GH#46560 if groupby_func in ("backfill", "mad", "pad", "tshift"): pytest.skip("method is deprecated") @@ -1364,8 +1365,15 @@ def test_deprecate_numeric_only_series(groupby_func, request): msg = "corrwith is not implemented on SeriesGroupBy" request.node.add_marker(pytest.mark.xfail(reason=msg)) - ser = Series(list("xyz")) - gb = ser.groupby([0, 0, 1]) + grouper = [0, 0, 1] + + ser = Series([1, 0, 0], dtype=dtype) + gb = ser.groupby(grouper) + method = getattr(gb, groupby_func) + + expected_ser = Series([1, 0, 0]) + expected_gb = expected_ser.groupby(grouper) + expected_method = getattr(expected_gb, groupby_func) if groupby_func == "corrwith": args = (ser,) @@ -1383,48 +1391,95 @@ def test_deprecate_numeric_only_series(groupby_func, request): args = (0.5,) else: args = () - method = getattr(gb, groupby_func) - try: - _ = method(*args) - except (TypeError, ValueError) as err: - # ops that only work on numeric dtypes - assert groupby_func in ( - "corr", - "cov", - "cummax", - "cummin", - "cumprod", - "cumsum", - "diff", - "idxmax", - "idxmin", - "mean", - "median", - "pct_change", - "prod", - "quantile", - "sem", - "skew", - "std", - "var", - ) - assert ( - "could not convert" in str(err).lower() - or "unsupported operand type" in str(err) - or "not allowed for this dtype" in str(err) - or "can't multiply sequence by non-int" in str(err) - or "cannot be performed against 'object' dtypes" in str(err) - or "is not supported for object dtype" in str(err) - ), str(err) - - msgs = ( - "got an unexpected keyword argument 'numeric_only'", - f"{groupby_func} does not implement numeric_only", - f"{groupby_func} is not 
supported for object dtype", + fails_on_numeric_object = ( + "corr", + "cov", + "cummax", + "cummin", + "cumprod", + "cumsum", + "idxmax", + "idxmin", + "quantile", + ) + # ops that give an object result on object input + obj_result = ( + "first", + "last", + "nth", + "bfill", + "ffill", + "shift", + "sum", + "diff", + "pct_change", + ) + + # Test default behavior; kernels that fail may be enabled in the future but kernels + # that succeed should not be allowed to fail (without deprecation, at least) + if groupby_func in fails_on_numeric_object and dtype is object: + if groupby_func in ("idxmax", "idxmin"): + msg = "not allowed for this dtype" + elif groupby_func == "quantile": + msg = "cannot be performed against 'object' dtypes" + else: + msg = "is not supported for object dtype" + with pytest.raises(TypeError, match=msg): + method(*args) + elif dtype is object: + result = method(*args) + expected = expected_method(*args) + if groupby_func in obj_result: + expected = expected.astype(object) + tm.assert_series_equal(result, expected) + + has_numeric_only = ( + "first", + "last", + "max", + "mean", + "median", + "min", + "prod", + "quantile", + "sem", + "skew", + "std", + "sum", + "var", + "cummax", + "cummin", + "cumprod", + "cumsum", ) - with pytest.raises((NotImplementedError, TypeError), match=f"({'|'.join(msgs)})"): - _ = method(*args, numeric_only=True) + if groupby_func not in has_numeric_only: + msg = "got an unexpected keyword argument 'numeric_only'" + with pytest.raises(TypeError, match=msg): + method(*args, numeric_only=True) + elif dtype is object: + err_category = NotImplementedError + err_msg = f"{groupby_func} does not implement numeric_only" + if groupby_func.startswith("cum"): + # cum ops already exhibit future behavior + warn_category = None + warn_msg = "" + err_category = TypeError + err_msg = f"{groupby_func} is not supported for object dtype" + elif groupby_func == "skew": + warn_category = FutureWarning + warn_msg = "will raise a TypeError 
in the future" + else: + warn_category = FutureWarning + warn_msg = "This will raise a TypeError" + + with tm.assert_produces_warning(warn_category, match=warn_msg): + with pytest.raises(err_category, match=err_msg): + method(*args, numeric_only=True) + else: + result = method(*args, numeric_only=True) + expected = method(*args, numeric_only=False) + tm.assert_series_equal(result, expected) @pytest.mark.parametrize("dtype", [int, float, object]) diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index a68595546eb83..fa53ed47dbdba 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -935,13 +935,9 @@ def test_all_any_params(self): with tm.assert_produces_warning(FutureWarning): s.all(bool_only=True, level=0) - # GH#38810 bool_only is not implemented alone. - msg = "Series.any does not implement bool_only" - with pytest.raises(NotImplementedError, match=msg): - s.any(bool_only=True) - msg = "Series.all does not implement bool_only." - with pytest.raises(NotImplementedError, match=msg): - s.all(bool_only=True) + # GH#47500 - test bool_only works + assert s.any(bool_only=True) + assert not s.all(bool_only=True) @pytest.mark.parametrize("bool_agg_func", ["any", "all"]) @pytest.mark.parametrize("skipna", [True, False]) diff --git a/pandas/tests/reductions/test_stat_reductions.py b/pandas/tests/reductions/test_stat_reductions.py index 0a6c0ccc891bb..be40d7ca631eb 100644 --- a/pandas/tests/reductions/test_stat_reductions.py +++ b/pandas/tests/reductions/test_stat_reductions.py @@ -149,10 +149,9 @@ def _check_stat_op( with pytest.raises(ValueError, match=msg): f(string_series_, axis=1) - # Unimplemented numeric_only parameter. 
if "numeric_only" in inspect.getfullargspec(f).args: - with pytest.raises(NotImplementedError, match=name): - f(string_series_, numeric_only=True) + # only the index is string; dtype is float + f(string_series_, numeric_only=True) def test_sum(self): string_series = tm.makeStringSeries().rename("series") diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 5e10b9ee5277c..2d74b703b9bb1 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -922,8 +922,11 @@ def test_series_downsample_method(method, numeric_only, expected_data): func = getattr(resampled, method) if numeric_only and numeric_only is not lib.no_default: - with pytest.raises(NotImplementedError, match="not implement numeric_only"): - func(numeric_only=numeric_only) + with tm.assert_produces_warning( + FutureWarning, match="This will raise a TypeError" + ): + with pytest.raises(NotImplementedError, match="not implement numeric_only"): + func(numeric_only=numeric_only) elif method == "prod": with pytest.raises(TypeError, match="can't multiply sequence by non-int"): func(numeric_only=numeric_only) diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index 9a0c3fd5e9fed..0aab381d6e076 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -209,3 +209,97 @@ def test_series_iteritems_deprecated(self): ser = Series([1]) with tm.assert_produces_warning(FutureWarning): next(ser.iteritems()) + + @pytest.mark.parametrize( + "kernel, has_numeric_only", + [ + ("skew", True), + ("var", True), + ("all", False), + ("prod", True), + ("any", False), + ("idxmin", False), + ("quantile", False), + ("idxmax", False), + ("min", True), + ("sem", True), + ("mean", True), + ("nunique", False), + ("max", True), + ("sum", True), + ("count", False), + ("median", True), + ("std", True), + ("backfill", False), + ("rank", True), + ("pct_change", False), + ("cummax", False), 
+ ("shift", False), + ("diff", False), + ("cumsum", False), + ("cummin", False), + ("cumprod", False), + ("fillna", False), + ("ffill", False), + ("pad", False), + ("bfill", False), + ("sample", False), + ("tail", False), + ("take", False), + ("head", False), + ("cov", False), + ("corr", False), + ], + ) + @pytest.mark.parametrize("dtype", [bool, int, float, object]) + def test_numeric_only(self, kernel, has_numeric_only, dtype): + # GH#47500 + ser = Series([0, 1, 1], dtype=dtype) + if kernel == "corrwith": + args = (ser,) + elif kernel == "corr": + args = (ser,) + elif kernel == "cov": + args = (ser,) + elif kernel == "nth": + args = (0,) + elif kernel == "fillna": + args = (True,) + elif kernel == "fillna": + args = ("ffill",) + elif kernel == "take": + args = ([0],) + elif kernel == "quantile": + args = (0.5,) + else: + args = () + method = getattr(ser, kernel) + if not has_numeric_only: + msg = ( + "(got an unexpected keyword argument 'numeric_only'" + "|too many arguments passed in)" + ) + with pytest.raises(TypeError, match=msg): + method(*args, numeric_only=True) + elif dtype is object: + if kernel == "rank": + msg = "Calling Series.rank with numeric_only=True and dtype object" + with tm.assert_produces_warning(FutureWarning, match=msg): + method(*args, numeric_only=True) + else: + warn_msg = ( + f"Calling Series.{kernel} with numeric_only=True and dtype object" + ) + err_msg = f"Series.{kernel} does not implement numeric_only" + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + with pytest.raises(NotImplementedError, match=err_msg): + method(*args, numeric_only=True) + else: + result = method(*args, numeric_only=True) + expected = method(*args, numeric_only=False) + if isinstance(expected, Series): + # transformer + tm.assert_series_equal(result, expected) + else: + # reducer + assert result == expected