From 6c9121173b1b43fc9eaea7d72f7ddf62315d60fe Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 3 Sep 2022 19:57:34 +0200 Subject: [PATCH 1/9] BUG: masked mean unnecessarily overflowing --- doc/source/whatsnew/v1.6.0.rst | 2 +- pandas/core/array_algos/masked_reductions.py | 15 +++++---------- pandas/tests/series/test_reductions.py | 13 +++++++++++++ 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst index 848e87f0bc029..0f6ee3f81a71d 100644 --- a/doc/source/whatsnew/v1.6.0.rst +++ b/doc/source/whatsnew/v1.6.0.rst @@ -199,7 +199,7 @@ Sparse ExtensionArray ^^^^^^^^^^^^^^ -- +- Bug in :meth:`Series.mean` overflowing unnecessarily with nullable integers (:issue:`40000`) - Styler diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index 3e59a267f7191..13b0762afad9e 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -14,7 +14,7 @@ from pandas.core.nanops import check_below_min_count -def _sumprod( +def _sumprodmean( func: Callable, values: np.ndarray, mask: npt.NDArray[np.bool_], @@ -24,7 +24,7 @@ def _sumprod( axis: int | None = None, ): """ - Sum or product for 1D masked array. + Sum, mean or product for 1D masked array. Parameters ---------- @@ -63,7 +63,7 @@ def sum( min_count: int = 0, axis: int | None = None, ): - return _sumprod( + return _sumprodmean( np.sum, values=values, mask=mask, skipna=skipna, min_count=min_count, axis=axis ) @@ -76,7 +76,7 @@ def prod( min_count: int = 0, axis: int | None = None, ): - return _sumprod( + return _sumprodmean( np.prod, values=values, mask=mask, skipna=skipna, min_count=min_count, axis=axis ) @@ -141,9 +141,4 @@ def max( # TODO: axis kwarg def mean(values: np.ndarray, mask: npt.NDArray[np.bool_], skipna: bool = True): - if not values.size or mask.all(): - return libmissing.NA - _sum = _sumprod(np.sum, values=values, mask=mask, skipna=skipna) - count = np.count_nonzero(~mask) - mean_value = _sum / count - return mean_value + return _sumprodmean(np.mean, values=values, mask=mask, skipna=skipna) diff --git a/pandas/tests/series/test_reductions.py b/pandas/tests/series/test_reductions.py index a552d9d84329f..36c9aea37f8fd 100644 --- a/pandas/tests/series/test_reductions.py +++ b/pandas/tests/series/test_reductions.py @@ -141,3 +141,16 @@ def test_validate_stat_keepdims(): ) with pytest.raises(ValueError, match=msg): np.sum(ser, keepdims=True) + + +def test_mean_masked_overflow(): + # GH# + val = 100_000_000_000_000_000 + n_elements = 100 + na = np.array([val] * n_elements) + ser = Series([val] * n_elements, dtype="Int64") + + result_numpy = np.mean(na) + result_masked = ser.mean() + assert result_masked - result_numpy == 0 + assert result_masked == 1e17 From b940634285d87efe7c7b81f12aa4c6596275c34b Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 3 Sep 2022 20:02:41 +0200 Subject: [PATCH 2/9] Change gh ref --- doc/source/whatsnew/v1.6.0.rst | 2 +- pandas/tests/series/test_reductions.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst index 0f6ee3f81a71d..80a43d3367da2 100644 --- a/doc/source/whatsnew/v1.6.0.rst +++ b/doc/source/whatsnew/v1.6.0.rst @@ -199,7 +199,7 @@ Sparse ExtensionArray ^^^^^^^^^^^^^^ -- Bug in :meth:`Series.mean` overflowing unnecessarily with nullable integers (:issue:`40000`) +- Bug in :meth:`Series.mean` overflowing unnecessarily with nullable integers (:issue:`48378`) - Styler diff --git a/pandas/tests/series/test_reductions.py b/pandas/tests/series/test_reductions.py index 36c9aea37f8fd..c8c72e2843e5f 100644 --- a/pandas/tests/series/test_reductions.py +++ b/pandas/tests/series/test_reductions.py @@ -144,7 +144,7 @@ def test_validate_stat_keepdims(): def test_mean_masked_overflow(): - # GH# + # GH#48378 val = 100_000_000_000_000_000 n_elements = 100 na = np.array([val] * n_elements) From ec5dc89ecd6c55c58723d36b40080835a4989e57 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 3 Sep 2022 20:15:52 +0200 Subject: [PATCH 3/9] Add axis --- pandas/core/array_algos/masked_reductions.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index 13b0762afad9e..5010714ab3347 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -139,6 +139,11 @@ def max( return _minmax(np.max, values=values, mask=mask, skipna=skipna, axis=axis) -# TODO: axis kwarg -def mean(values: np.ndarray, mask: npt.NDArray[np.bool_], skipna: bool = True): - return _sumprodmean(np.mean, values=values, mask=mask, skipna=skipna) +def mean( + values: np.ndarray, + mask: npt.NDArray[np.bool_], + *, + skipna: bool = True, + axis: int | None = None, +): + return _sumprodmean(np.mean, values=values, mask=mask, skipna=skipna, axis=axis) From ebafd60277d34ad353f3fb74664e4b830b48adba Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 3 Sep 2022 20:16:10 +0200 Subject: [PATCH 4/9] Rename function --- pandas/core/array_algos/masked_reductions.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index 5010714ab3347..40a48dbc20c93 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -14,7 +14,7 @@ from pandas.core.nanops import check_below_min_count -def _sumprodmean( +def _reductions( func: Callable, values: np.ndarray, mask: npt.NDArray[np.bool_], @@ -63,7 +63,7 @@ def sum( min_count: int = 0, axis: int | None = None, ): - return _sumprodmean( + return _reductions( np.sum, values=values, mask=mask, skipna=skipna, min_count=min_count, axis=axis ) @@ -76,7 +76,7 @@ def prod( min_count: int = 0, axis: int | None = None, ): - return _sumprodmean( + return _reductions( np.prod, values=values, mask=mask, skipna=skipna, min_count=min_count, axis=axis ) @@ -146,4 +146,4 @@ def mean( skipna: bool = True, axis: int | None = None, ): - return _sumprodmean(np.mean, values=values, mask=mask, skipna=skipna, axis=axis) + return _reductions(np.mean, values=values, mask=mask, skipna=skipna, axis=axis) From 3bc8324b492592679ec26df92f027db22116cfdc Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 3 Sep 2022 21:05:38 +0200 Subject: [PATCH 5/9] Fix tests --- pandas/core/array_algos/masked_reductions.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index 40a48dbc20c93..041905c993b0d 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -146,4 +146,6 @@ def mean( skipna: bool = True, axis: int | None = None, ): + if not values.size or mask.all(): + return libmissing.NA return _reductions(np.mean, values=values, mask=mask, skipna=skipna, axis=axis) From af191dc99f00e9579e37bddd16d5a8094b22c8ef Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 3 Sep 2022 21:13:13 +0200 Subject: [PATCH 6/9] Move test --- pandas/tests/reductions/test_reductions.py | 12 ++++++++++++ pandas/tests/series/test_reductions.py | 13 ------------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index fa53ed47dbdba..ef94a18016719 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -775,6 +775,18 @@ def test_sum_overflow_float(self, use_bottleneck, dtype): result = s.max(skipna=False) assert np.allclose(float(result), v[-1]) + def test_mean_masked_overflow(self): + # GH#48378 + val = 100_000_000_000_000_000 + n_elements = 100 + na = np.array([val] * n_elements) + ser = Series([val] * n_elements, dtype="Int64") + + result_numpy = np.mean(na) + result_masked = ser.mean() + assert result_masked - result_numpy == 0 + assert result_masked == 1e17 + @pytest.mark.parametrize("dtype", ("m8[ns]", "m8[ns]", "M8[ns]", "M8[ns, UTC]")) @pytest.mark.parametrize("skipna", [True, False]) def test_empty_timeseries_reductions_return_nat(self, dtype, skipna): diff --git a/pandas/tests/series/test_reductions.py b/pandas/tests/series/test_reductions.py index c8c72e2843e5f..a552d9d84329f 100644 --- a/pandas/tests/series/test_reductions.py +++ b/pandas/tests/series/test_reductions.py @@ -141,16 +141,3 @@ def test_validate_stat_keepdims(): ) with pytest.raises(ValueError, match=msg): np.sum(ser, keepdims=True) - - -def test_mean_masked_overflow(): - # GH#48378 - val = 100_000_000_000_000_000 - n_elements = 100 - na = np.array([val] * n_elements) - ser = Series([val] * n_elements, dtype="Int64") - - result_numpy = np.mean(na) - result_masked = ser.mean() - assert result_masked - result_numpy == 0 - assert result_masked == 1e17 From c3b770a098d193e83acefc9816bc373ca21574a0 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 6 Sep 2022 23:11:53 +0200 Subject: [PATCH 7/9] Add mean as method --- pandas/core/arrays/masked.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index c5f6dea7157ab..5c77a50f4a805 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1036,17 +1036,12 @@ def _quantile( # Reductions def _reduce(self, name: str, *, skipna: bool = True, **kwargs): - if name in {"any", "all", "min", "max", "sum", "prod"}: + if name in {"any", "all", "min", "max", "sum", "prod", "mean"}: return getattr(self, name)(skipna=skipna, **kwargs) data = self._data mask = self._mask - if name in {"mean"}: - op = getattr(masked_reductions, name) - result = op(data, mask, skipna=skipna, **kwargs) - return result - # coerce to a nan-aware float if needed # (we explicitly use NaN within reductions) if self._hasna: @@ -1107,6 +1102,18 @@ def prod(self, *, skipna=True, min_count=0, axis: int | None = 0, **kwargs): "prod", result, skipna=skipna, axis=axis, **kwargs ) + def mean(self, *, skipna=True, axis: int | None = 0, **kwargs): + nv.validate_mean((), kwargs) + result = masked_reductions.mean( + self._data, + self._mask, + skipna=skipna, + axis=axis, + ) + return self._wrap_reduction_result( + "mean", result, skipna=skipna, axis=axis, **kwargs + ) + def min(self, *, skipna=True, axis: int | None = 0, **kwargs): nv.validate_min((), kwargs) return masked_reductions.min( From 2c79fbdb87b0db89fc3659643cc7f0b3135e55f0 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 7 Sep 2022 00:35:37 +0200 Subject: [PATCH 8/9] Fix test --- pandas/tests/extension/base/dim2.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/base/dim2.py b/pandas/tests/extension/base/dim2.py index f71f3cf164bfc..c512d84bc8d31 100644 --- a/pandas/tests/extension/base/dim2.py +++ b/pandas/tests/extension/base/dim2.py @@ -7,6 +7,7 @@ from pandas._libs.missing import is_matching_na import pandas as pd +import pandas._testing as tm from pandas.core.arrays.integer import INT_STR_TO_DTYPE from pandas.tests.extension.base.base import BaseExtensionTests @@ -191,7 +192,11 @@ def test_reductions_2d_axis0(self, data, method): kwargs["ddof"] = 0 try: - result = getattr(arr2d, method)(axis=0, **kwargs) + if method == "mean": + with tm.assert_produces_warning(RuntimeWarning, check_stacklevel=False): + result = getattr(arr2d, method)(axis=0, **kwargs) + else: + result = getattr(arr2d, method)(axis=0, **kwargs) except Exception as err: try: getattr(data, method)() @@ -212,7 +217,7 @@ def get_reduction_result_dtype(dtype): # i.e. dtype.kind == "u" return INT_STR_TO_DTYPE[np.dtype(np.uint).name] - if method in ["mean", "median", "sum", "prod"]: + if method in ["median", "sum", "prod"]: # std and var are not dtype-preserving expected = data if method in ["sum", "prod"] and data.dtype.kind in "iub": @@ -229,6 +234,9 @@ def get_reduction_result_dtype(dtype): self.assert_extension_array_equal(result, expected) elif method == "std": self.assert_extension_array_equal(result, data - data) + elif method == "mean": + expected = data.astype("Float64") + self.assert_extension_array_equal(result, expected) # punt on method == "var" @pytest.mark.parametrize("method", ["mean", "median", "var", "std", "sum", "prod"]) From 56a5db91f62d5f37672470858d880602433f9739 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 7 Sep 2022 09:42:03 +0200 Subject: [PATCH 9/9] Fix tests --- pandas/tests/extension/base/dim2.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/pandas/tests/extension/base/dim2.py b/pandas/tests/extension/base/dim2.py index c512d84bc8d31..d2c1e6971c56e 100644 --- a/pandas/tests/extension/base/dim2.py +++ b/pandas/tests/extension/base/dim2.py @@ -6,6 +6,11 @@ from pandas._libs.missing import is_matching_na +from pandas.core.dtypes.common import ( + is_bool_dtype, + is_integer_dtype, +) + import pandas as pd import pandas._testing as tm from pandas.core.arrays.integer import INT_STR_TO_DTYPE @@ -192,7 +197,8 @@ def test_reductions_2d_axis0(self, data, method): kwargs["ddof"] = 0 try: - if method == "mean": + if method == "mean" and hasattr(data, "_mask"): + # Empty slices produced by the mask cause RuntimeWarnings by numpy with tm.assert_produces_warning(RuntimeWarning, check_stacklevel=False): result = getattr(arr2d, method)(axis=0, **kwargs) else: @@ -235,8 +241,9 @@ def get_reduction_result_dtype(dtype): elif method == "std": self.assert_extension_array_equal(result, data - data) elif method == "mean": - expected = data.astype("Float64") - self.assert_extension_array_equal(result, expected) + if is_integer_dtype(data) or is_bool_dtype(data): + data = data.astype("Float64") + self.assert_extension_array_equal(result, data) # punt on method == "var" @pytest.mark.parametrize("method", ["mean", "median", "var", "std", "sum", "prod"])