diff --git a/pandas/core/arraylike.py b/pandas/core/arraylike.py index 9a646ddc6ca7e..7fa231846e721 100644 --- a/pandas/core/arraylike.py +++ b/pandas/core/arraylike.py @@ -521,10 +521,12 @@ def dispatch_reduction_ufunc(self, ufunc: np.ufunc, method: str, *inputs, **kwar if "axis" not in kwargs: # For DataFrame reductions we don't want the default axis=0 - # FIXME: DataFrame.min ignores axis=None - # FIXME: np.minimum.reduce(df) gets here bc axis is not in kwargs, - # but np.minimum.reduce(df.values) behaves as if axis=0 - kwargs["axis"] = None + # Note: np.min is not a ufunc, but uses array_function_dispatch, + # so calls DataFrame.min (without ever getting here) with the np.min + # default of axis=None, which DataFrame.min catches and changes to axis=0. + # np.minimum.reduce(df) gets here bc axis is not in kwargs, + # so we set axis=0 to match the behavior of np.minimum.reduce(df.values) + kwargs["axis"] = 0 # By default, numpy's reductions do not skip NaNs, so we have to # pass skipna=False diff --git a/pandas/core/common.py b/pandas/core/common.py index 2ebdfccc88f4e..e3989238be849 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -562,7 +562,14 @@ def require_length_match(data, index: Index): ) -_builtin_table = {builtins.sum: np.sum, builtins.max: np.max, builtins.min: np.min} +# the ufuncs np.maximum.reduce and np.minimum.reduce default to axis=0, +# whereas np.min and np.max (which directly call obj.min and obj.max) +# default to axis=None. 
+_builtin_table = { + builtins.sum: np.sum, + builtins.max: np.maximum.reduce, + builtins.min: np.minimum.reduce, +} _cython_table = { builtins.sum: "sum", diff --git a/pandas/core/generic.py b/pandas/core/generic.py index d85c15493bed1..de40ad36a7d02 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -10556,7 +10556,7 @@ def _stat_function( self, name: str, func, - axis: Axis | None = None, + axis: Axis | None | lib.NoDefault = None, skipna: bool_t = True, level: Level | None = None, numeric_only: bool_t | None = None, @@ -10569,8 +10569,22 @@ def _stat_function( validate_bool_kwarg(skipna, "skipna", none_allowed=False) + if axis is None and level is None and self.ndim > 1: + # user must have explicitly passed axis=None + # GH#21597 + warnings.warn( + f"In a future version, DataFrame.{name}(axis=None) will return a " + f"scalar {name} over the entire DataFrame. To retain the old " + f"behavior, use 'frame.{name}(axis=0)' or just 'frame.{name}()'", + FutureWarning, + stacklevel=find_stack_level(), + ) + if axis is lib.no_default: + axis = None + if axis is None: axis = self._stat_axis_number + axis = cast(Axis, axis) if level is not None: warnings.warn( "Using the level keyword in DataFrame and Series aggregations is " @@ -10588,31 +10602,43 @@ def _stat_function( def min( self, - axis: Axis | None = None, + axis: Axis | None | lib.NoDefault = lib.no_default, skipna: bool_t = True, level: Level | None = None, numeric_only: bool_t | None = None, **kwargs, ): return self._stat_function( - "min", nanops.nanmin, axis, skipna, level, numeric_only, **kwargs + "min", + nanops.nanmin, + axis, + skipna, + level, + numeric_only, + **kwargs, ) def max( self, - axis: Axis | None = None, + axis: Axis | None | lib.NoDefault = lib.no_default, skipna: bool_t = True, level: Level | None = None, numeric_only: bool_t | None = None, **kwargs, ): return self._stat_function( - "max", nanops.nanmax, axis, skipna, level, numeric_only, **kwargs + "max", + nanops.nanmax, + 
axis, + skipna, + level, + numeric_only, + **kwargs, ) def mean( self, - axis: Axis | None = None, + axis: Axis | None | lib.NoDefault = lib.no_default, skipna: bool_t = True, level: Level | None = None, numeric_only: bool_t | None = None, @@ -10624,7 +10650,7 @@ def mean( def median( self, - axis: Axis | None = None, + axis: Axis | None | lib.NoDefault = lib.no_default, skipna: bool_t = True, level: Level | None = None, numeric_only: bool_t | None = None, @@ -10636,7 +10662,7 @@ def median( def skew( self, - axis: Axis | None = None, + axis: Axis | None | lib.NoDefault = lib.no_default, skipna: bool_t = True, level: Level | None = None, numeric_only: bool_t | None = None, @@ -10648,7 +10674,7 @@ def skew( def kurt( self, - axis: Axis | None = None, + axis: Axis | None | lib.NoDefault = lib.no_default, skipna: bool_t = True, level: Level | None = None, numeric_only: bool_t | None = None, @@ -10699,6 +10725,7 @@ def _min_count_stat_function( min_count=min_count, numeric_only=numeric_only, ) + return self._reduce( func, name=name, @@ -11039,7 +11066,14 @@ def prod( see_also="", examples="", ) - def mean(self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs): + def mean( + self, + axis: int | None | lib.NoDefault = lib.no_default, + skipna=True, + level=None, + numeric_only=None, + **kwargs, + ): return NDFrame.mean(self, axis, skipna, level, numeric_only, **kwargs) setattr(cls, "mean", mean) @@ -11054,7 +11088,14 @@ def mean(self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs): see_also="", examples="", ) - def skew(self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs): + def skew( + self, + axis: int | None | lib.NoDefault = lib.no_default, + skipna=True, + level=None, + numeric_only=None, + **kwargs, + ): return NDFrame.skew(self, axis, skipna, level, numeric_only, **kwargs) setattr(cls, "skew", skew) @@ -11072,7 +11113,14 @@ def skew(self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs): 
see_also="", examples="", ) - def kurt(self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs): + def kurt( + self, + axis: Axis | None | lib.NoDefault = lib.no_default, + skipna=True, + level=None, + numeric_only=None, + **kwargs, + ): return NDFrame.kurt(self, axis, skipna, level, numeric_only, **kwargs) setattr(cls, "kurt", kurt) @@ -11089,13 +11137,19 @@ def kurt(self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs): examples="", ) def median( - self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs + self, + axis: int | None | lib.NoDefault = lib.no_default, + skipna=True, + level=None, + numeric_only=None, + **kwargs, ): return NDFrame.median(self, axis, skipna, level, numeric_only, **kwargs) setattr(cls, "median", median) - @doc( + # error: Untyped decorator makes function "max" untyped + @doc( # type: ignore[misc] _num_doc, desc="Return the maximum of the values over the requested axis.\n\n" "If you want the *index* of the maximum, use ``idxmax``. This is " @@ -11107,12 +11161,20 @@ def median( see_also=_stat_func_see_also, examples=_max_examples, ) - def max(self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs): + def max( + self, + axis: int | None | lib.NoDefault = lib.no_default, + skipna=True, + level=None, + numeric_only=None, + **kwargs, + ): return NDFrame.max(self, axis, skipna, level, numeric_only, **kwargs) setattr(cls, "max", max) - @doc( + # error: Untyped decorator makes function "min" untyped + @doc( # type: ignore[misc] _num_doc, desc="Return the minimum of the values over the requested axis.\n\n" "If you want the *index* of the minimum, use ``idxmin``. 
This is " @@ -11124,7 +11186,14 @@ def max(self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs): see_also=_stat_func_see_also, examples=_min_examples, ) - def min(self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs): + def min( + self, + axis: int | None | lib.NoDefault = lib.no_default, + skipna=True, + level=None, + numeric_only=None, + **kwargs, + ): return NDFrame.min(self, axis, skipna, level, numeric_only, **kwargs) setattr(cls, "min", min) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 6429544869ac2..585176fc6a2d7 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1765,3 +1765,20 @@ def test_prod_sum_min_count_mixed_object(): msg = re.escape("unsupported operand type(s) for +: 'int' and 'str'") with pytest.raises(TypeError, match=msg): df.sum(axis=0, min_count=1, numeric_only=False) + + +@pytest.mark.parametrize("method", ["min", "max", "mean", "median", "skew", "kurt"]) +def test_reduction_axis_none_deprecation(method): + # GH#21597 deprecate axis=None defaulting to axis=0 so that we can change it + # to reducing over all axes. 
+ + df = DataFrame(np.random.randn(4, 4)) + meth = getattr(df, method) + + msg = f"scalar {method} over the entire DataFrame" + with tm.assert_produces_warning(FutureWarning, match=msg): + res = meth(axis=None) + with tm.assert_produces_warning(None): + expected = meth() + tm.assert_series_equal(res, expected) + tm.assert_series_equal(res, meth(axis=0)) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 585491f8664b3..5ae929be1d8cf 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -81,7 +81,7 @@ def get_stats(group): assert result.index.names[0] == "C" -def test_basic(): +def test_basic(): # TODO: split this test cats = Categorical( ["a", "a", "a", "b", "b", "b", "c", "c", "c"], @@ -142,9 +142,24 @@ def f(x): df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df["a"] ) tm.assert_frame_equal(df.groupby(c, observed=False).transform(sum), df[["a"]]) - tm.assert_frame_equal( - df.groupby(c, observed=False).transform(lambda xs: np.max(xs)), df[["a"]] - ) + + gbc = df.groupby(c, observed=False) + with tm.assert_produces_warning( + FutureWarning, match="scalar max", check_stacklevel=False + ): + # stacklevel is thrown off (i think) bc the stack goes through numpy C code + result = gbc.transform(lambda xs: np.max(xs)) + tm.assert_frame_equal(result, df[["a"]]) + + with tm.assert_produces_warning(None): + result2 = gbc.transform(lambda xs: np.max(xs, axis=0)) + result3 = gbc.transform(max) + result4 = gbc.transform(np.maximum.reduce) + result5 = gbc.transform(lambda xs: np.maximum.reduce(xs)) + tm.assert_frame_equal(result2, df[["a"]], check_dtype=False) + tm.assert_frame_equal(result3, df[["a"]], check_dtype=False) + tm.assert_frame_equal(result4, df[["a"]]) + tm.assert_frame_equal(result5, df[["a"]]) # Filter tm.assert_series_equal(df.a.groupby(c, observed=False).filter(np.all), df["a"]) diff --git a/pandas/tests/groupby/test_function.py 
b/pandas/tests/groupby/test_function.py index 39a3e82fc2d98..06f00634802b4 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -69,20 +69,33 @@ def test_builtins_apply(keys, f): df = DataFrame(np.random.randint(1, 50, (1000, 2)), columns=["jim", "joe"]) df["jolie"] = np.random.randn(1000) + gb = df.groupby(keys) + fname = f.__name__ - result = df.groupby(keys).apply(f) + result = gb.apply(f) ngroups = len(df.drop_duplicates(subset=keys)) assert_msg = f"invalid frame shape: {result.shape} (expected ({ngroups}, 3))" assert result.shape == (ngroups, 3), assert_msg - tm.assert_frame_equal( - result, # numpy's equivalent function - df.groupby(keys).apply(getattr(np, fname)), - ) + npfunc = getattr(np, fname) # numpy's equivalent function + if f in [max, min]: + warn = FutureWarning + else: + warn = None + msg = "scalar (max|min) over the entire DataFrame" + with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False): + # stacklevel can be thrown off because (i think) the stack + # goes through some of numpy's C code. 
+ expected = gb.apply(npfunc) + tm.assert_frame_equal(result, expected) + + with tm.assert_produces_warning(None): + expected2 = gb.apply(lambda x: npfunc(x, axis=0)) + tm.assert_frame_equal(result, expected2) if f != sum: - expected = df.groupby(keys).agg(fname).reset_index() + expected = gb.agg(fname).reset_index() expected.set_index(keys, inplace=True, drop=False) tm.assert_frame_equal(result, expected, check_dtype=False) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 1e78bb1e58583..2057486ae401a 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -483,9 +483,16 @@ def test_transform_coercion(): g = df.groupby("A") expected = g.transform(np.mean) - result = g.transform(lambda x: np.mean(x)) + + msg = "will return a scalar mean" + with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False): + result = g.transform(lambda x: np.mean(x)) tm.assert_frame_equal(result, expected) + with tm.assert_produces_warning(None): + result2 = g.transform(lambda x: np.mean(x, axis=0)) + tm.assert_frame_equal(result2, expected) + def test_groupby_transform_with_int(): diff --git a/pandas/tests/window/test_expanding.py b/pandas/tests/window/test_expanding.py index ad43a02724960..7ba81e84dfe3e 100644 --- a/pandas/tests/window/test_expanding.py +++ b/pandas/tests/window/test_expanding.py @@ -348,7 +348,12 @@ def test_expanding_corr_pairwise(frame): @pytest.mark.parametrize( "func,static_comp", - [("sum", np.sum), ("mean", np.mean), ("max", np.max), ("min", np.min)], + [ + ("sum", np.sum), + ("mean", lambda x: np.mean(x, axis=0)), + ("max", lambda x: np.max(x, axis=0)), + ("min", lambda x: np.min(x, axis=0)), + ], ids=["sum", "mean", "max", "min"], ) def test_expanding_func(func, static_comp, frame_or_series): @@ -356,12 +361,11 @@ def test_expanding_func(func, static_comp, frame_or_series): result = 
getattr(data.expanding(min_periods=1, axis=0), func)() assert isinstance(result, frame_or_series) + expected = static_comp(data[:11]) if frame_or_series is Series: - tm.assert_almost_equal(result[10], static_comp(data[:11])) + tm.assert_almost_equal(result[10], expected) else: - tm.assert_series_equal( - result.iloc[10], static_comp(data[:11]), check_names=False - ) + tm.assert_series_equal(result.iloc[10], expected, check_names=False) @pytest.mark.parametrize( @@ -404,9 +408,11 @@ def test_expanding_apply(engine_and_raw, frame_or_series): assert isinstance(result, frame_or_series) if frame_or_series is Series: - tm.assert_almost_equal(result[9], np.mean(data[:11])) + tm.assert_almost_equal(result[9], np.mean(data[:11], axis=0)) else: - tm.assert_series_equal(result.iloc[9], np.mean(data[:11]), check_names=False) + tm.assert_series_equal( + result.iloc[9], np.mean(data[:11], axis=0), check_names=False + ) def test_expanding_min_periods_apply(engine_and_raw):