diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index cfac916157649..9bc1499d5511e 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -913,6 +913,7 @@ Numeric - Bug in :meth:`DataFrame.diff` with ``axis=1`` returning incorrect results with mixed dtypes (:issue:`32995`) - Bug in :meth:`DataFrame.corr` and :meth:`DataFrame.cov` raising when handling nullable integer columns with ``pandas.NA`` (:issue:`33803`) - Bug in :class:`DataFrame` and :class:`Series` addition and subtraction between object-dtype objects and ``datetime64`` dtype objects (:issue:`33824`) +- Bug in :class:`DataFrame` reductions (e.g. ``df.min()``, ``df.max()``) with ``ExtensionArray`` dtypes (:issue:`34520`, :issue:`32651`) Conversion ^^^^^^^^^^ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index cfe5621fec14e..3b6eb9e3a27c0 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -118,6 +118,7 @@ from pandas.core.arrays import Categorical, ExtensionArray from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray from pandas.core.arrays.sparse import SparseFrameAccessor +from pandas.core.construction import extract_array from pandas.core.generic import NDFrame, _shared_docs from pandas.core.indexes import base as ibase from pandas.core.indexes.api import Index, ensure_index, ensure_index_from_sequences @@ -8499,7 +8500,14 @@ def _count_level(self, level, axis=0, numeric_only=False): return result def _reduce( - self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds + self, + op, + name: str, + axis=0, + skipna=True, + numeric_only=None, + filter_type=None, + **kwds, ): assert filter_type is None or filter_type == "bool", filter_type @@ -8531,8 +8539,11 @@ def _reduce( labels = self._get_agg_axis(axis) constructor = self._constructor - def f(x): - return op(x, axis=axis, skipna=skipna, **kwds) + def func(values): + if is_extension_array_dtype(values.dtype): + return extract_array(values)._reduce(name, skipna=skipna, **kwds) + else: + return op(values, axis=axis, skipna=skipna, **kwds) def _get_data(axis_matters): if filter_type is None: @@ -8579,7 +8590,7 @@ def blk_func(values): out[:] = coerce_to_dtypes(out.values, df.dtypes) return out - if not self._is_homogeneous_type: + if not self._is_homogeneous_type or self._mgr.any_extension_types: # try to avoid self.values call if filter_type is None and axis == 0 and len(self) > 0: @@ -8599,7 +8610,7 @@ def blk_func(values): from pandas.core.apply import frame_apply opa = frame_apply( - self, func=f, result_type="expand", ignore_failures=True + self, func=func, result_type="expand", ignore_failures=True ) result = opa.get_result() if result.ndim == self.ndim: @@ -8611,7 +8622,7 @@ def blk_func(values): values = data.values try: - result = f(values) + result = func(values) except TypeError: # e.g. in nanops trying to convert strs to float @@ -8622,7 +8633,7 @@ def blk_func(values): values = data.values with np.errstate(all="ignore"): - result = f(values) + result = func(values) else: if numeric_only: @@ -8633,7 +8644,7 @@ def blk_func(values): else: data = self values = data.values - result = f(values) + result = func(values) if filter_type == "bool" and is_object_dtype(values) and axis is None: # work around https://github.com/numpy/numpy/issues/10489 diff --git a/pandas/tests/arrays/integer/test_function.py b/pandas/tests/arrays/integer/test_function.py index 44c3077228e80..a81434339fdae 100644 --- a/pandas/tests/arrays/integer/test_function.py +++ b/pandas/tests/arrays/integer/test_function.py @@ -133,6 +133,15 @@ def test_integer_array_numpy_sum(values, expected): assert result == expected +@pytest.mark.parametrize("op", ["sum", "prod", "min", "max"]) +def test_dataframe_reductions(op): + # https://github.com/pandas-dev/pandas/pull/32867 + # ensure the integers are not cast to float during reductions + df = pd.DataFrame({"a": pd.array([1, 2], dtype="Int64")}) + result = df.max() + assert isinstance(result["a"], np.int64) + + # TODO(jreback) - these need testing / are broken # shift diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index db8bb5ca3c437..9d6b9f39a0578 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1303,3 +1303,26 @@ def test_preserve_timezone(self, initial: str, method): df = DataFrame([expected]) result = getattr(df, method)(axis=1) tm.assert_series_equal(result, expected) + + +def test_mixed_frame_with_integer_sum(): + # https://github.com/pandas-dev/pandas/issues/34520 + df = pd.DataFrame([["a", 1]], columns=list("ab")) + df = df.astype({"b": "Int64"}) + result = df.sum() + expected = pd.Series(["a", 1], index=["a", "b"]) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("numeric_only", [True, False, None]) +@pytest.mark.parametrize("method", ["min", "max"]) +def test_minmax_extensionarray(method, numeric_only): + # https://github.com/pandas-dev/pandas/issues/32651 + int64_info = np.iinfo("int64") + ser = Series([int64_info.max, None, int64_info.min], dtype=pd.Int64Dtype()) + df = DataFrame({"Int64": ser}) + result = getattr(df, method)(numeric_only=numeric_only) + expected = Series( + [getattr(int64_info, method)], index=pd.Index(["Int64"], dtype="object") + ) + tm.assert_series_equal(result, expected)