Skip to content

BUG: masked mean unnecessarily overflowing #48378

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Sep 7, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.6.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ Sparse

ExtensionArray
^^^^^^^^^^^^^^
-
- Bug in :meth:`Series.mean` overflowing unnecessarily with nullable integers (:issue:`48378`)
-

Styler
Expand Down
22 changes: 12 additions & 10 deletions pandas/core/array_algos/masked_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from pandas.core.nanops import check_below_min_count


def _sumprod(
def _reductions(
func: Callable,
values: np.ndarray,
mask: npt.NDArray[np.bool_],
Expand All @@ -24,7 +24,7 @@ def _sumprod(
axis: int | None = None,
):
"""
Sum or product for 1D masked array.
Sum, mean or product for 1D masked array.

Parameters
----------
Expand Down Expand Up @@ -63,7 +63,7 @@ def sum(
min_count: int = 0,
axis: int | None = None,
):
return _sumprod(
return _reductions(
np.sum, values=values, mask=mask, skipna=skipna, min_count=min_count, axis=axis
)

Expand All @@ -76,7 +76,7 @@ def prod(
min_count: int = 0,
axis: int | None = None,
):
return _sumprod(
return _reductions(
np.prod, values=values, mask=mask, skipna=skipna, min_count=min_count, axis=axis
)

Expand Down Expand Up @@ -139,11 +139,13 @@ def max(
return _minmax(np.max, values=values, mask=mask, skipna=skipna, axis=axis)


# TODO: axis kwarg
def mean(values: np.ndarray, mask: npt.NDArray[np.bool_], skipna: bool = True):
def mean(
values: np.ndarray,
mask: npt.NDArray[np.bool_],
*,
skipna: bool = True,
axis: int | None = None,
):
if not values.size or mask.all():
return libmissing.NA
_sum = _sumprod(np.sum, values=values, mask=mask, skipna=skipna)
count = np.count_nonzero(~mask)
mean_value = _sum / count
return mean_value
return _reductions(np.mean, values=values, mask=mask, skipna=skipna, axis=axis)
19 changes: 13 additions & 6 deletions pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -1036,17 +1036,12 @@ def _quantile(
# Reductions

def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
if name in {"any", "all", "min", "max", "sum", "prod"}:
if name in {"any", "all", "min", "max", "sum", "prod", "mean"}:
return getattr(self, name)(skipna=skipna, **kwargs)

data = self._data
mask = self._mask

if name in {"mean"}:
op = getattr(masked_reductions, name)
result = op(data, mask, skipna=skipna, **kwargs)
return result

# coerce to a nan-aware float if needed
# (we explicitly use NaN within reductions)
if self._hasna:
Expand Down Expand Up @@ -1107,6 +1102,18 @@ def prod(self, *, skipna=True, min_count=0, axis: int | None = 0, **kwargs):
"prod", result, skipna=skipna, axis=axis, **kwargs
)

def mean(self, *, skipna=True, axis: int | None = 0, **kwargs):
nv.validate_mean((), kwargs)
result = masked_reductions.mean(
self._data,
self._mask,
skipna=skipna,
axis=axis,
)
return self._wrap_reduction_result(
"mean", result, skipna=skipna, axis=axis, **kwargs
)

def min(self, *, skipna=True, axis: int | None = 0, **kwargs):
nv.validate_min((), kwargs)
return masked_reductions.min(
Expand Down
19 changes: 17 additions & 2 deletions pandas/tests/extension/base/dim2.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,13 @@

from pandas._libs.missing import is_matching_na

from pandas.core.dtypes.common import (
is_bool_dtype,
is_integer_dtype,
)

import pandas as pd
import pandas._testing as tm
from pandas.core.arrays.integer import INT_STR_TO_DTYPE
from pandas.tests.extension.base.base import BaseExtensionTests

Expand Down Expand Up @@ -191,7 +197,12 @@ def test_reductions_2d_axis0(self, data, method):
kwargs["ddof"] = 0

try:
result = getattr(arr2d, method)(axis=0, **kwargs)
if method == "mean" and hasattr(data, "_mask"):
# Empty slices produced by the mask cause RuntimeWarnings by numpy
with tm.assert_produces_warning(RuntimeWarning, check_stacklevel=False):
result = getattr(arr2d, method)(axis=0, **kwargs)
else:
result = getattr(arr2d, method)(axis=0, **kwargs)
except Exception as err:
try:
getattr(data, method)()
Expand All @@ -212,7 +223,7 @@ def get_reduction_result_dtype(dtype):
# i.e. dtype.kind == "u"
return INT_STR_TO_DTYPE[np.dtype(np.uint).name]

if method in ["mean", "median", "sum", "prod"]:
if method in ["median", "sum", "prod"]:
# std and var are not dtype-preserving
expected = data
if method in ["sum", "prod"] and data.dtype.kind in "iub":
Expand All @@ -229,6 +240,10 @@ def get_reduction_result_dtype(dtype):
self.assert_extension_array_equal(result, expected)
elif method == "std":
self.assert_extension_array_equal(result, data - data)
elif method == "mean":
if is_integer_dtype(data) or is_bool_dtype(data):
data = data.astype("Float64")
self.assert_extension_array_equal(result, data)
# punt on method == "var"

@pytest.mark.parametrize("method", ["mean", "median", "var", "std", "sum", "prod"])
Expand Down
12 changes: 12 additions & 0 deletions pandas/tests/reductions/test_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -775,6 +775,18 @@ def test_sum_overflow_float(self, use_bottleneck, dtype):
result = s.max(skipna=False)
assert np.allclose(float(result), v[-1])

def test_mean_masked_overflow(self):
# GH#48378
val = 100_000_000_000_000_000
n_elements = 100
na = np.array([val] * n_elements)
ser = Series([val] * n_elements, dtype="Int64")

result_numpy = np.mean(na)
result_masked = ser.mean()
assert result_masked - result_numpy == 0
assert result_masked == 1e17

@pytest.mark.parametrize("dtype", ("m8[ns]", "m8[ns]", "M8[ns]", "M8[ns, UTC]"))
@pytest.mark.parametrize("skipna", [True, False])
def test_empty_timeseries_reductions_return_nat(self, dtype, skipna):
Expand Down