Skip to content

Commit 37d0194

Browse files
authored
DEPR: numeric_only in Series and SeriesGroupBy consistency (#47561)
* DEPR: numeric_only in Series and SeriesGroupBy consistency * Refactor warning, whatsnew note, fixup tests
1 parent 4fe2f31 commit 37d0194

File tree

11 files changed

+278
-81
lines changed

11 files changed

+278
-81
lines changed

doc/source/whatsnew/v1.5.0.rst

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -275,6 +275,7 @@ Other enhancements
275275
- :class:`.DataError`, :class:`.SpecificationError`, :class:`.SettingWithCopyError`, :class:`.SettingWithCopyWarning`, :class:`.NumExprClobberingError`, :class:`.UndefinedVariableError`, and :class:`.IndexingError` are now exposed in ``pandas.errors`` (:issue:`27656`)
276276
- Added ``check_like`` argument to :func:`testing.assert_series_equal` (:issue:`47247`)
277277
- Allow reading compressed SAS files with :func:`read_sas` (e.g., ``.sas7bdat.gz`` files)
278+
- :class:`Series` reducers (e.g. ``min``, ``max``, ``sum``, ``mean``) will now successfully operate when the dtype is numeric and ``numeric_only=True`` is provided; previously this would raise a ``NotImplementedError`` (:issue:`47500`)
278279

279280
.. ---------------------------------------------------------------------------
280281
.. _whatsnew_150.notable_bug_fixes:
@@ -766,7 +767,8 @@ Other Deprecations
766767
- Deprecated the argument ``na_sentinel`` in :func:`factorize`, :meth:`Index.factorize`, and :meth:`.ExtensionArray.factorize`; pass ``use_na_sentinel=True`` instead to use the sentinel ``-1`` for NaN values and ``use_na_sentinel=False`` instead of ``na_sentinel=None`` to encode NaN values (:issue:`46910`)
767768
- Deprecated :meth:`DataFrameGroupBy.transform` not aligning the result when the UDF returned DataFrame (:issue:`45648`)
768769
- Clarified warning from :func:`to_datetime` when delimited dates can't be parsed in accordance with the specified ``dayfirst`` argument (:issue:`46210`)
769-
770+
- Deprecated :class:`Series` and :class:`Resampler` reducers (e.g. ``min``, ``max``, ``sum``, ``mean``) raising a ``NotImplementedError`` when the dtype is non-numeric and ``numeric_only=True`` is provided; this will raise a ``TypeError`` in a future version (:issue:`47500`)
771+
- Deprecated :meth:`Series.rank` returning an empty result when the dtype is non-numeric and ``numeric_only=True`` is provided; this will raise a ``TypeError`` in a future version (:issue:`47500`)
770772

771773
.. ---------------------------------------------------------------------------
772774
.. _whatsnew_150.performance:

pandas/core/generic.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8691,6 +8691,15 @@ def ranker(data):
86918691
)
86928692

86938693
if numeric_only:
8694+
if self.ndim == 1 and not is_numeric_dtype(self.dtype):
8695+
# GH#47500
8696+
warnings.warn(
8697+
f"Calling Series.rank with numeric_only={numeric_only} and dtype "
8698+
f"{self.dtype} is deprecated and will raise a TypeError in a "
8699+
"future version of pandas",
8700+
category=FutureWarning,
8701+
stacklevel=find_stack_level(),
8702+
)
86948703
data = self._get_numeric_data()
86958704
else:
86968705
data = self

pandas/core/groupby/generic.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1145,7 +1145,7 @@ def _cython_transform(
11451145
) -> DataFrame:
11461146
assert axis == 0 # handled by caller
11471147
# TODO: no tests with self.ndim == 1 for DataFrameGroupBy
1148-
numeric_only_bool = self._resolve_numeric_only(numeric_only, axis)
1148+
numeric_only_bool = self._resolve_numeric_only(how, numeric_only, axis)
11491149

11501150
# With self.axis == 0, we have multi-block tests
11511151
# e.g. test_rank_min_int, test_cython_transform_frame

pandas/core/groupby/groupby.py

Lines changed: 48 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1291,7 +1291,7 @@ def _wrap_applied_output(
12911291
raise AbstractMethodError(self)
12921292

12931293
def _resolve_numeric_only(
1294-
self, numeric_only: bool | lib.NoDefault, axis: int
1294+
self, how: str, numeric_only: bool | lib.NoDefault, axis: int
12951295
) -> bool:
12961296
"""
12971297
Determine subclass-specific default value for 'numeric_only'.
@@ -1328,6 +1328,20 @@ def _resolve_numeric_only(
13281328
else:
13291329
numeric_only = False
13301330

1331+
if numeric_only and self.obj.ndim == 1 and not is_numeric_dtype(self.obj.dtype):
1332+
# GH#47500
1333+
how = "sum" if how == "add" else how
1334+
warnings.warn(
1335+
f"{type(self).__name__}.{how} called with "
1336+
f"numeric_only={numeric_only} and dtype {self.obj.dtype}. This will "
1337+
"raise a TypeError in a future version of pandas",
1338+
category=FutureWarning,
1339+
stacklevel=find_stack_level(),
1340+
)
1341+
raise NotImplementedError(
1342+
f"{type(self).__name__}.{how} does not implement numeric_only"
1343+
)
1344+
13311345
return numeric_only
13321346

13331347
def _maybe_warn_numeric_only_depr(
@@ -1704,7 +1718,7 @@ def _cython_agg_general(
17041718
):
17051719
# Note: we never get here with how="ohlc" for DataFrameGroupBy;
17061720
# that goes through SeriesGroupBy
1707-
numeric_only_bool = self._resolve_numeric_only(numeric_only, axis=0)
1721+
numeric_only_bool = self._resolve_numeric_only(how, numeric_only, axis=0)
17081722

17091723
data = self._get_data_to_aggregate()
17101724
is_ser = data.ndim == 1
@@ -2100,7 +2114,7 @@ def mean(
21002114
2 4.0
21012115
Name: B, dtype: float64
21022116
"""
2103-
numeric_only_bool = self._resolve_numeric_only(numeric_only, axis=0)
2117+
numeric_only_bool = self._resolve_numeric_only("mean", numeric_only, axis=0)
21042118

21052119
if maybe_use_numba(engine):
21062120
from pandas.core._numba.kernels import sliding_mean
@@ -2134,7 +2148,7 @@ def median(self, numeric_only: bool | lib.NoDefault = lib.no_default):
21342148
Series or DataFrame
21352149
Median of values within each group.
21362150
"""
2137-
numeric_only_bool = self._resolve_numeric_only(numeric_only, axis=0)
2151+
numeric_only_bool = self._resolve_numeric_only("median", numeric_only, axis=0)
21382152

21392153
result = self._cython_agg_general(
21402154
"median",
@@ -2196,10 +2210,15 @@ def std(
21962210
return np.sqrt(self._numba_agg_general(sliding_var, engine_kwargs, ddof))
21972211
else:
21982212
# Resolve numeric_only so that var doesn't warn
2199-
numeric_only_bool = self._resolve_numeric_only(numeric_only, axis=0)
2200-
if numeric_only_bool and self.obj.ndim == 1:
2201-
raise NotImplementedError(
2202-
f"{type(self).__name__}.std does not implement numeric_only."
2213+
numeric_only_bool = self._resolve_numeric_only("std", numeric_only, axis=0)
2214+
if (
2215+
numeric_only_bool
2216+
and self.obj.ndim == 1
2217+
and not is_numeric_dtype(self.obj.dtype)
2218+
):
2219+
raise TypeError(
2220+
f"{type(self).__name__}.std called with "
2221+
f"numeric_only={numeric_only} and dtype {self.obj.dtype}"
22032222
)
22042223
result = self._get_cythonized_result(
22052224
libgroupby.group_var,
@@ -2264,7 +2283,7 @@ def var(
22642283

22652284
return self._numba_agg_general(sliding_var, engine_kwargs, ddof)
22662285
else:
2267-
numeric_only_bool = self._resolve_numeric_only(numeric_only, axis=0)
2286+
numeric_only_bool = self._resolve_numeric_only("var", numeric_only, axis=0)
22682287
if ddof == 1:
22692288
return self._cython_agg_general(
22702289
"var",
@@ -2304,10 +2323,15 @@ def sem(self, ddof: int = 1, numeric_only: bool | lib.NoDefault = lib.no_default
23042323
Standard error of the mean of values within each group.
23052324
"""
23062325
# Resolve numeric_only so that std doesn't warn
2307-
numeric_only_bool = self._resolve_numeric_only(numeric_only, axis=0)
2308-
if numeric_only_bool and self.obj.ndim == 1:
2309-
raise NotImplementedError(
2310-
f"{type(self).__name__}.sem does not implement numeric_only."
2326+
numeric_only_bool = self._resolve_numeric_only("sem", numeric_only, axis=0)
2327+
if (
2328+
numeric_only_bool
2329+
and self.obj.ndim == 1
2330+
and not is_numeric_dtype(self.obj.dtype)
2331+
):
2332+
raise TypeError(
2333+
f"{type(self).__name__}.sem called with "
2334+
f"numeric_only={numeric_only} and dtype {self.obj.dtype}"
23112335
)
23122336
result = self.std(ddof=ddof, numeric_only=numeric_only_bool)
23132337
self._maybe_warn_numeric_only_depr("sem", result, numeric_only)
@@ -3179,10 +3203,15 @@ def quantile(
31793203
a 2.0
31803204
b 3.0
31813205
"""
3182-
numeric_only_bool = self._resolve_numeric_only(numeric_only, axis=0)
3183-
if numeric_only_bool and self.obj.ndim == 1:
3184-
raise NotImplementedError(
3185-
f"{type(self).__name__}.quantile does not implement numeric_only"
3206+
numeric_only_bool = self._resolve_numeric_only("quantile", numeric_only, axis=0)
3207+
if (
3208+
numeric_only_bool
3209+
and self.obj.ndim == 1
3210+
and not is_numeric_dtype(self.obj.dtype)
3211+
):
3212+
raise TypeError(
3213+
f"{type(self).__name__}.quantile called with "
3214+
f"numeric_only={numeric_only} and dtype {self.obj.dtype}"
31863215
)
31873216

31883217
def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, np.dtype | None]:
@@ -3671,7 +3700,8 @@ def _get_cythonized_result(
36713700
-------
36723701
`Series` or `DataFrame` with filled values
36733702
"""
3674-
numeric_only_bool = self._resolve_numeric_only(numeric_only, axis=0)
3703+
how = base_func.__name__
3704+
numeric_only_bool = self._resolve_numeric_only(how, numeric_only, axis=0)
36753705

36763706
if post_processing and not callable(post_processing):
36773707
raise ValueError("'post_processing' must be a callable!")
@@ -3682,7 +3712,6 @@ def _get_cythonized_result(
36823712

36833713
ids, _, ngroups = grouper.group_info
36843714

3685-
how = base_func.__name__
36863715
base_func = partial(base_func, labels=ids)
36873716

36883717
def blk_func(values: ArrayLike) -> ArrayLike:

pandas/core/series.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@
8080
is_integer,
8181
is_iterator,
8282
is_list_like,
83+
is_numeric_dtype,
8384
is_object_dtype,
8485
is_scalar,
8586
pandas_dtype,
@@ -4616,10 +4617,17 @@ def _reduce(
46164617

46174618
else:
46184619
# dispatch to numpy arrays
4619-
if numeric_only:
4620+
if numeric_only and not is_numeric_dtype(self.dtype):
46204621
kwd_name = "numeric_only"
46214622
if name in ["any", "all"]:
46224623
kwd_name = "bool_only"
4624+
# GH#47500 - change to TypeError to match other methods
4625+
warnings.warn(
4626+
f"Calling Series.{name} with {kwd_name}={numeric_only} and "
4627+
f"dtype {self.dtype} will raise a TypeError in the future",
4628+
FutureWarning,
4629+
stacklevel=find_stack_level(),
4630+
)
46234631
raise NotImplementedError(
46244632
f"Series.{name} does not implement {kwd_name}."
46254633
)

pandas/tests/groupby/aggregate/test_cython.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -92,8 +92,9 @@ def test_cython_agg_boolean():
9292
def test_cython_agg_nothing_to_agg():
9393
frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25})
9494

95-
with pytest.raises(NotImplementedError, match="does not implement"):
96-
frame.groupby("a")["b"].mean(numeric_only=True)
95+
with tm.assert_produces_warning(FutureWarning, match="This will raise a TypeError"):
96+
with pytest.raises(NotImplementedError, match="does not implement"):
97+
frame.groupby("a")["b"].mean(numeric_only=True)
9798

9899
with pytest.raises(TypeError, match="Could not convert (foo|bar)*"):
99100
frame.groupby("a")["b"].mean()
@@ -114,8 +115,9 @@ def test_cython_agg_nothing_to_agg_with_dates():
114115
"dates": pd.date_range("now", periods=50, freq="T"),
115116
}
116117
)
117-
with pytest.raises(NotImplementedError, match="does not implement"):
118-
frame.groupby("b").dates.mean(numeric_only=True)
118+
with tm.assert_produces_warning(FutureWarning, match="This will raise a TypeError"):
119+
with pytest.raises(NotImplementedError, match="does not implement"):
120+
frame.groupby("b").dates.mean(numeric_only=True)
119121

120122

121123
def test_cython_agg_frame_columns():

0 commit comments

Comments
 (0)