From 465a69b8350a6af53fb0ae5ace981bbf09f77db7 Mon Sep 17 00:00:00 2001 From: Richard Date: Sat, 21 Mar 2020 14:21:14 -0400 Subject: [PATCH 1/7] [BUG] Aggregated bool has inconsistent dtype Addresses: GH7001 --- pandas/core/dtypes/cast.py | 8 ++++- pandas/core/dtypes/common.py | 24 +++++++++++++++ pandas/core/groupby/generic.py | 6 ++-- pandas/core/groupby/groupby.py | 11 ++++--- pandas/tests/groupby/test_groupby.py | 45 ++++++++++++++++++++++++++++ 5 files changed, 86 insertions(+), 8 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 8173e95c9d3d6..97e77d1add7f0 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -30,6 +30,7 @@ ensure_int64, ensure_object, ensure_str, + groupby_result_dtype, is_bool, is_bool_dtype, is_complex, @@ -172,7 +173,9 @@ def maybe_downcast_to_dtype(result, dtype): return result -def maybe_downcast_numeric(result, dtype, do_round: bool = False): +def maybe_downcast_numeric( + result, dtype, do_round: bool = False, how: str = "", +): """ Subset of maybe_downcast_to_dtype restricted to numeric dtypes. @@ -181,6 +184,7 @@ def maybe_downcast_numeric(result, dtype, do_round: bool = False): result : ndarray or ExtensionArray dtype : np.dtype or ExtensionDtype do_round : bool + how : str Returns ------- @@ -195,6 +199,8 @@ def maybe_downcast_numeric(result, dtype, do_round: bool = False): # earlier result = np.array(result) + dtype = groupby_result_dtype(dtype, how) + def trans(x): if do_round: return x.round() diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index b4b7fb36ee4d0..ff4a70ff8b76a 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1788,3 +1788,27 @@ def pandas_dtype(dtype) -> DtypeObj: raise TypeError(f"dtype '{dtype}' not understood") return npdtype + + +def groupby_result_dtype(dtype, how) -> DtypeObj: + """ + Get the desired dtype of an aggregation result based on the + input dtype and how the aggregation is done. + + Parameters + ---------- + dtype : dtype, type + The input dtype for the groupby. + how : str + How the aggregation is performed. + + Returns + ------- + The desired dtype of the aggregation result. + """ + d = { + (np.dtype(np.bool), "add"): np.dtype(np.int64), + (np.dtype(np.bool), "cumsum"): np.dtype(np.int64), + (np.dtype(np.bool), "sum"): np.dtype(np.int64), + } + return d.get((dtype, how), dtype) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 4102b8527b6aa..db0cbca8fcd17 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -526,7 +526,7 @@ def _transform_fast(self, result, func_nm: str) -> Series: cast = self._transform_should_cast(func_nm) out = algorithms.take_1d(result._values, ids) if cast: - out = self._try_cast(out, self.obj) + out = self._try_cast(out, self.obj, how=func_nm) return Series(out, index=self.obj.index, name=self.obj.name) def filter(self, func, dropna=True, *args, **kwargs): @@ -1073,7 +1073,7 @@ def _cython_agg_blocks( if result is not no_result: # see if we can cast the block back to the original dtype - result = maybe_downcast_numeric(result, block.dtype) + result = maybe_downcast_numeric(result, block.dtype, how=how) if block.is_extension and isinstance(result, np.ndarray): # e.g. 
block.values was an IntegerArray @@ -1460,7 +1460,7 @@ def _transform_fast(self, result: DataFrame, func_nm: str) -> DataFrame: # TODO: we have no test cases that get here with EA dtypes; # try_cast may not be needed if EAs never get here if cast: - res = self._try_cast(res, obj.iloc[:, i]) + res = self._try_cast(res, obj.iloc[:, i], how=func_nm) output.append(res) return DataFrame._from_arrays(output, columns=result.columns, index=obj.index) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 19e51d05feb92..46fb0f37885c7 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -42,6 +42,7 @@ class providing the base-class of operations. from pandas.core.dtypes.cast import maybe_downcast_to_dtype from pandas.core.dtypes.common import ( ensure_float, + groupby_result_dtype, is_datetime64_dtype, is_extension_array_dtype, is_integer_dtype, @@ -792,7 +793,7 @@ def _cumcount_array(self, ascending: bool = True): rev[sorter] = np.arange(count, dtype=np.intp) return out[rev].astype(np.int64, copy=False) - def _try_cast(self, result, obj, numeric_only: bool = False): + def _try_cast(self, result, obj, numeric_only: bool = False, how: str = ""): """ Try to cast the result to our obj original type, we may have roundtripped through object in the mean-time. @@ -806,6 +807,8 @@ def _try_cast(self, result, obj, numeric_only: bool = False): else: dtype = obj.dtype + dtype = groupby_result_dtype(dtype, how) + if not is_scalar(result): if is_extension_array_dtype(dtype) and dtype.kind != "M": # The function can return something of any type, so check @@ -852,7 +855,7 @@ def _cython_transform(self, how: str, numeric_only: bool = True, **kwargs): continue if self._transform_should_cast(how): - result = self._try_cast(result, obj) + result = self._try_cast(result, obj, how=how) key = base.OutputKey(label=name, position=idx) output[key] = result @@ -895,12 +898,12 @@ def _cython_agg_general( assert len(agg_names) == result.shape[1] for result_column, result_name in zip(result.T, agg_names): key = base.OutputKey(label=result_name, position=idx) - output[key] = self._try_cast(result_column, obj) + output[key] = self._try_cast(result_column, obj, how=how) idx += 1 else: assert result.ndim == 1 key = base.OutputKey(label=name, position=idx) - output[key] = self._try_cast(result, obj) + output[key] = self._try_cast(result, obj, how=how) idx += 1 if len(output) == 0: diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index b8d8f56512a69..f60ae6eb61a07 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -7,6 +7,8 @@ from pandas.errors import PerformanceWarning +from pandas.core.dtypes.common import is_integer_dtype + import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, date_range, read_csv import pandas._testing as tm @@ -2057,3 +2059,46 @@ def test_groups_repr_truncates(max_seq_items, expected): result = df.groupby(np.array(df.a)).groups.__repr__() assert result == expected + + +def test_bool_agg_dtype(): + # GH 7001 + # Bool aggregation results in int + df = pd.DataFrame({"a": [1, 1], "b": [False, True]}) + s = df.set_index("a")["b"] + + result = df.groupby("a").sum()["b"].dtype + assert is_integer_dtype(result) + + result = s.groupby("a").sum().dtype + assert is_integer_dtype(result) + + result = df.groupby("a").cumsum()["b"].dtype + assert is_integer_dtype(result) + + result = s.groupby("a").cumsum().dtype + assert is_integer_dtype(result) + + 
result = df.groupby("a").agg("sum")["b"].dtype + assert is_integer_dtype(result) + + result = s.groupby("a").agg("sum").dtype + assert is_integer_dtype(result) + + result = df.groupby("a").agg("cumsum")["b"].dtype + assert is_integer_dtype(result) + + result = s.groupby("a").agg("cumsum").dtype + assert is_integer_dtype(result) + + result = df.groupby("a").transform("sum")["b"].dtype + assert is_integer_dtype(result) + + result = s.groupby("a").transform("sum").dtype + assert is_integer_dtype(result) + + result = df.groupby("a").transform("cumsum")["b"].dtype + assert is_integer_dtype(result) + + result = s.groupby("a").transform("cumsum").dtype + assert is_integer_dtype(result) From 3bc35f88ed16453e3bede746681e371e9507594c Mon Sep 17 00:00:00 2001 From: Richard Date: Sun, 22 Mar 2020 10:45:20 -0400 Subject: [PATCH 2/7] Moved function groupby_result_dtype to _GroupBy._result_dtype - Reverted maybe_downcast_numeric to its original state - Parameterized tests --- pandas/core/dtypes/cast.py | 8 +---- pandas/core/dtypes/common.py | 24 -------------- pandas/core/groupby/generic.py | 6 ++-- pandas/core/groupby/groupby.py | 32 +++++++++++++++--- pandas/tests/groupby/test_groupby.py | 49 +++++++++------------------- 5 files changed, 47 insertions(+), 72 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 97e77d1add7f0..8173e95c9d3d6 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -30,7 +30,6 @@ ensure_int64, ensure_object, ensure_str, - groupby_result_dtype, is_bool, is_bool_dtype, is_complex, @@ -173,9 +172,7 @@ def maybe_downcast_to_dtype(result, dtype): return result -def maybe_downcast_numeric( - result, dtype, do_round: bool = False, how: str = "", -): +def maybe_downcast_numeric(result, dtype, do_round: bool = False): """ Subset of maybe_downcast_to_dtype restricted to numeric dtypes. @@ -184,7 +181,6 @@ def maybe_downcast_numeric( result : ndarray or ExtensionArray dtype : np.dtype or ExtensionDtype do_round : bool - how : str Returns ------- @@ -199,8 +195,6 @@ def maybe_downcast_numeric( # earlier result = np.array(result) - dtype = groupby_result_dtype(dtype, how) - def trans(x): if do_round: return x.round() diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index ff4a70ff8b76a..b4b7fb36ee4d0 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1788,27 +1788,3 @@ def pandas_dtype(dtype) -> DtypeObj: raise TypeError(f"dtype '{dtype}' not understood") return npdtype - - -def groupby_result_dtype(dtype, how) -> DtypeObj: - """ - Get the desired dtype of an aggregation result based on the - input dtype and how the aggregation is done. - - Parameters - ---------- - dtype : dtype, type - The input dtype for the groupby. - how : str - How the aggregation is performed. - - Returns - ------- - The desired dtype of the aggregation result. 
- """ - d = { - (np.dtype(np.bool), "add"): np.dtype(np.int64), - (np.dtype(np.bool), "cumsum"): np.dtype(np.int64), - (np.dtype(np.bool), "sum"): np.dtype(np.int64), - } - return d.get((dtype, how), dtype) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index db0cbca8fcd17..1a222472de8af 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1072,8 +1072,10 @@ def _cython_agg_blocks( assert not isinstance(result, DataFrame) if result is not no_result: - # see if we can cast the block back to the original dtype - result = maybe_downcast_numeric(result, block.dtype, how=how) + # see if we can cast the block to the desired dtype + # this may not be the original dtype + dtype = self._result_dtype(block.dtype, how) + result = maybe_downcast_numeric(result, dtype) if block.is_extension and isinstance(result, np.ndarray): # e.g. block.values was an IntegerArray diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 46fb0f37885c7..381f1dc0bb960 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -33,7 +33,7 @@ class providing the base-class of operations. from pandas._libs import Timestamp import pandas._libs.groupby as libgroupby -from pandas._typing import FrameOrSeries, Scalar +from pandas._typing import DtypeObj, FrameOrSeries, Scalar from pandas.compat import set_function_name from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError @@ -42,7 +42,6 @@ class providing the base-class of operations. from pandas.core.dtypes.cast import maybe_downcast_to_dtype from pandas.core.dtypes.common import ( ensure_float, - groupby_result_dtype, is_datetime64_dtype, is_extension_array_dtype, is_integer_dtype, @@ -795,7 +794,7 @@ def _cumcount_array(self, ascending: bool = True): def _try_cast(self, result, obj, numeric_only: bool = False, how: str = ""): """ - Try to cast the result to our obj original type, + Try to cast the result to the desired type, we may have roundtripped through object in the mean-time. If numeric_only is True, then only try to cast numerics @@ -806,8 +805,7 @@ def _try_cast(self, result, obj, numeric_only: bool = False, how: str = ""): dtype = obj._values.dtype else: dtype = obj.dtype - - dtype = groupby_result_dtype(dtype, how) + dtype = self._result_dtype(dtype, how) if not is_scalar(result): if is_extension_array_dtype(dtype) and dtype.kind != "M": @@ -1028,6 +1026,30 @@ def _apply_filter(self, indices, dropna): filtered = self._selected_obj.where(mask) # Fill with NaNs. return filtered + @staticmethod + def _result_dtype(dtype, how) -> DtypeObj: + """ + Get the desired dtype of a groupby result based on the + input dtype and how the aggregation is done. + + Parameters + ---------- + dtype : dtype, type + The input dtype of the groupby. + how : str + How the aggregation is performed. + + Returns + ------- + The desired dtype of the aggregation result. 
+ """ + d = { + (np.dtype(np.bool), "add"): np.dtype(np.int64), + (np.dtype(np.bool), "cumsum"): np.dtype(np.int64), + (np.dtype(np.bool), "sum"): np.dtype(np.int64), + } + return d.get((dtype, how), dtype) + class GroupBy(_GroupBy): """ diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index f60ae6eb61a07..20e9bf65db8d5 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2061,44 +2061,25 @@ def test_groups_repr_truncates(max_seq_items, expected): assert result == expected -def test_bool_agg_dtype(): +@pytest.mark.parametrize( + "op", + [ + lambda x: x.sum(), + lambda x: x.cumsum(), + lambda x: x.transform("sum"), + lambda x: x.transform("cumsum"), + lambda x: x.agg("sum"), + lambda x: x.agg("cumsum"), + ], +) +def test_bool_agg_dtype(op): # GH 7001 - # Bool aggregation results in int + # Bool sum aggregations result in int df = pd.DataFrame({"a": [1, 1], "b": [False, True]}) s = df.set_index("a")["b"] - result = df.groupby("a").sum()["b"].dtype - assert is_integer_dtype(result) - - result = s.groupby("a").sum().dtype - assert is_integer_dtype(result) - - result = df.groupby("a").cumsum()["b"].dtype - assert is_integer_dtype(result) - - result = s.groupby("a").cumsum().dtype - assert is_integer_dtype(result) - - result = df.groupby("a").agg("sum")["b"].dtype - assert is_integer_dtype(result) - - result = s.groupby("a").agg("sum").dtype - assert is_integer_dtype(result) - - result = df.groupby("a").agg("cumsum")["b"].dtype - assert is_integer_dtype(result) - - result = s.groupby("a").agg("cumsum").dtype - assert is_integer_dtype(result) - - result = df.groupby("a").transform("sum")["b"].dtype - assert is_integer_dtype(result) - - result = s.groupby("a").transform("sum").dtype - assert is_integer_dtype(result) - - result = df.groupby("a").transform("cumsum")["b"].dtype + result = op(df.groupby("a"))["b"].dtype assert is_integer_dtype(result) - result = s.groupby("a").transform("cumsum").dtype + result = op(s.groupby("a")).dtype assert is_integer_dtype(result) From e6f2408e494e2203368a8e2eb44c4d3ac5d73ab5 Mon Sep 17 00:00:00 2001 From: Richard Date: Tue, 24 Mar 2020 18:18:18 -0400 Subject: [PATCH 3/7] Moved cast functions to core.dtypes.cast, tests to test_aggregate --- pandas/core/arrays/__init__.py | 3 +- pandas/core/arrays/base.py | 24 +----- pandas/core/dtypes/cast.py | 79 +++++++++++++++++++ pandas/core/groupby/generic.py | 10 ++- pandas/core/groupby/groupby.py | 72 ++--------------- pandas/core/series.py | 8 +- .../tests/groupby/aggregate/test_aggregate.py | 26 ++++++ pandas/tests/groupby/test_groupby.py | 26 ------ 8 files changed, 128 insertions(+), 120 deletions(-) diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py index bf3469924a700..98baeda40b494 100644 --- a/pandas/core/arrays/__init__.py +++ b/pandas/core/arrays/__init__.py @@ -1,8 +1,9 @@ +from pandas.core.dtypes.cast import try_cast_to_ea + from pandas.core.arrays.base import ( ExtensionArray, ExtensionOpsMixin, ExtensionScalarOpsMixin, - try_cast_to_ea, ) from pandas.core.arrays.boolean import BooleanArray from pandas.core.arrays.categorical import Categorical diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 67e3807c477fb..147deec94f130 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -19,6 +19,7 @@ from pandas.util._decorators import Appender, Substitution from pandas.util._validators import validate_fillna_kwargs +from pandas.core.dtypes.cast import try_cast_to_ea 
from pandas.core.dtypes.common import is_array_like, is_list_like from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries @@ -32,29 +33,6 @@ _extension_array_shared_docs: Dict[str, str] = dict() -def try_cast_to_ea(cls_or_instance, obj, dtype=None): - """ - Call to `_from_sequence` that returns the object unchanged on Exception. - - Parameters - ---------- - cls_or_instance : ExtensionArray subclass or instance - obj : arraylike - Values to pass to cls._from_sequence - dtype : ExtensionDtype, optional - - Returns - ------- - ExtensionArray or obj - """ - try: - result = cls_or_instance._from_sequence(obj, dtype=dtype) - except Exception: - # We can't predict what downstream EA constructors may raise - result = obj - return result - - class ExtensionArray: """ Abstract base class for custom 1-D array types. diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 8173e95c9d3d6..c7ad4e8648598 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -246,6 +246,85 @@ def trans(x): return result +def maybe_cast_result(result, obj, numeric_only: bool = False, how: str = ""): + """ + Try to cast the result to the desired type, + we may have roundtripped through object in the mean-time. + + If numeric_only is True, then only try to cast numerics + and not datetimelikes. + + """ + if obj.ndim > 1: + dtype = obj._values.dtype + else: + dtype = obj.dtype + dtype = maybe_cast_result_dtype(dtype, how) + + if not is_scalar(result): + if is_extension_array_dtype(dtype) and dtype.kind != "M": + # The function can return something of any type, so check + # if the type is compatible with the calling EA. + # datetime64tz is handled correctly in agg_series, + # so is excluded here. + + if len(result) and isinstance(result[0], dtype.type): + cls = dtype.construct_array_type() + result = try_cast_to_ea(cls, result, dtype=dtype) + + elif numeric_only and is_numeric_dtype(dtype) or not numeric_only: + result = maybe_downcast_to_dtype(result, dtype) + + return result + + +def maybe_cast_result_dtype(dtype, how): + """ + Get the desired dtype of a groupby result based on the + input dtype and how the aggregation is done. + + Parameters + ---------- + dtype : dtype, type + The input dtype of the groupby. + how : str + How the aggregation is performed. + + Returns + ------- + The desired dtype of the aggregation result. + """ + d = { + (np.dtype(np.bool), "add"): np.dtype(np.int64), + (np.dtype(np.bool), "cumsum"): np.dtype(np.int64), + (np.dtype(np.bool), "sum"): np.dtype(np.int64), + } + return d.get((dtype, how), dtype) + + +def try_cast_to_ea(cls_or_instance, obj, dtype=None): + """ + Call to `_from_sequence` that returns the object unchanged on Exception. + + Parameters + ---------- + cls_or_instance : ExtensionArray subclass or instance + obj : arraylike + Values to pass to cls._from_sequence + dtype : ExtensionDtype, optional + + Returns + ------- + ExtensionArray or obj + """ + try: + result = cls_or_instance._from_sequence(obj, dtype=dtype) + except Exception: + # We can't predict what downstream EA constructors may raise + result = obj + return result + + def maybe_upcast_putmask(result: np.ndarray, mask: np.ndarray, other): """ A safe version of putmask that potentially upcasts the result. 
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 1a222472de8af..b7c071a8dfbbf 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -34,6 +34,8 @@ from pandas.util._decorators import Appender, Substitution from pandas.core.dtypes.cast import ( + maybe_cast_result, + maybe_cast_result_dtype, maybe_convert_objects, maybe_downcast_numeric, maybe_downcast_to_dtype, @@ -526,7 +528,7 @@ def _transform_fast(self, result, func_nm: str) -> Series: cast = self._transform_should_cast(func_nm) out = algorithms.take_1d(result._values, ids) if cast: - out = self._try_cast(out, self.obj, how=func_nm) + out = maybe_cast_result(out, self.obj, how=func_nm) return Series(out, index=self.obj.index, name=self.obj.name) def filter(self, func, dropna=True, *args, **kwargs): @@ -1074,7 +1076,7 @@ def _cython_agg_blocks( if result is not no_result: # see if we can cast the block to the desired dtype # this may not be the original dtype - dtype = self._result_dtype(block.dtype, how) + dtype = maybe_cast_result_dtype(block.dtype, how) result = maybe_downcast_numeric(result, dtype) if block.is_extension and isinstance(result, np.ndarray): @@ -1177,7 +1179,7 @@ def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame: else: if cast: - result[item] = self._try_cast(result[item], data) + result[item] = maybe_cast_result(result[item], data) result_columns = obj.columns if cannot_agg: @@ -1462,7 +1464,7 @@ def _transform_fast(self, result: DataFrame, func_nm: str) -> DataFrame: # TODO: we have no test cases that get here with EA dtypes; # try_cast may not be needed if EAs never get here if cast: - res = self._try_cast(res, obj.iloc[:, i], how=func_nm) + res = maybe_cast_result(res, obj.iloc[:, i], how=func_nm) output.append(res) return DataFrame._from_arrays(output, columns=result.columns, index=obj.index) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 381f1dc0bb960..86171944d0c78 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -33,17 +33,16 @@ class providing the base-class of operations. from pandas._libs import Timestamp import pandas._libs.groupby as libgroupby -from pandas._typing import DtypeObj, FrameOrSeries, Scalar +from pandas._typing import FrameOrSeries, Scalar from pandas.compat import set_function_name from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError from pandas.util._decorators import Appender, Substitution, cache_readonly -from pandas.core.dtypes.cast import maybe_downcast_to_dtype +from pandas.core.dtypes.cast import maybe_cast_result from pandas.core.dtypes.common import ( ensure_float, is_datetime64_dtype, - is_extension_array_dtype, is_integer_dtype, is_numeric_dtype, is_object_dtype, @@ -53,7 +52,7 @@ class providing the base-class of operations. 
from pandas.core import nanops import pandas.core.algorithms as algorithms -from pandas.core.arrays import Categorical, DatetimeArray, try_cast_to_ea +from pandas.core.arrays import Categorical, DatetimeArray from pandas.core.base import DataError, PandasObject, SelectionMixin import pandas.core.common as com from pandas.core.frame import DataFrame @@ -792,37 +791,6 @@ def _cumcount_array(self, ascending: bool = True): rev[sorter] = np.arange(count, dtype=np.intp) return out[rev].astype(np.int64, copy=False) - def _try_cast(self, result, obj, numeric_only: bool = False, how: str = ""): - """ - Try to cast the result to the desired type, - we may have roundtripped through object in the mean-time. - - If numeric_only is True, then only try to cast numerics - and not datetimelikes. - - """ - if obj.ndim > 1: - dtype = obj._values.dtype - else: - dtype = obj.dtype - dtype = self._result_dtype(dtype, how) - - if not is_scalar(result): - if is_extension_array_dtype(dtype) and dtype.kind != "M": - # The function can return something of any type, so check - # if the type is compatible with the calling EA. - # datetime64tz is handled correctly in agg_series, - # so is excluded here. - - if len(result) and isinstance(result[0], dtype.type): - cls = dtype.construct_array_type() - result = try_cast_to_ea(cls, result, dtype=dtype) - - elif numeric_only and is_numeric_dtype(dtype) or not numeric_only: - result = maybe_downcast_to_dtype(result, dtype) - - return result - def _transform_should_cast(self, func_nm: str) -> bool: """ Parameters @@ -853,7 +821,7 @@ def _cython_transform(self, how: str, numeric_only: bool = True, **kwargs): continue if self._transform_should_cast(how): - result = self._try_cast(result, obj, how=how) + result = maybe_cast_result(result, obj, how=how) key = base.OutputKey(label=name, position=idx) output[key] = result @@ -896,12 +864,12 @@ def _cython_agg_general( assert len(agg_names) == result.shape[1] for result_column, result_name in zip(result.T, agg_names): key = base.OutputKey(label=result_name, position=idx) - output[key] = self._try_cast(result_column, obj, how=how) + output[key] = maybe_cast_result(result_column, obj, how=how) idx += 1 else: assert result.ndim == 1 key = base.OutputKey(label=name, position=idx) - output[key] = self._try_cast(result, obj, how=how) + output[key] = maybe_cast_result(result, obj, how=how) idx += 1 if len(output) == 0: @@ -930,7 +898,7 @@ def _python_agg_general(self, func, *args, **kwargs): assert result is not None key = base.OutputKey(label=name, position=idx) - output[key] = self._try_cast(result, obj, numeric_only=True) + output[key] = maybe_cast_result(result, obj, numeric_only=True) if len(output) == 0: return self._python_apply_general(f) @@ -945,7 +913,7 @@ def _python_agg_general(self, func, *args, **kwargs): if is_numeric_dtype(values.dtype): values = ensure_float(values) - output[key] = self._try_cast(values[mask], result) + output[key] = maybe_cast_result(values[mask], result) return self._wrap_aggregated_output(output) @@ -1026,30 +994,6 @@ def _apply_filter(self, indices, dropna): filtered = self._selected_obj.where(mask) # Fill with NaNs. return filtered - @staticmethod - def _result_dtype(dtype, how) -> DtypeObj: - """ - Get the desired dtype of a groupby result based on the - input dtype and how the aggregation is done. - - Parameters - ---------- - dtype : dtype, type - The input dtype of the groupby. - how : str - How the aggregation is performed. - - Returns - ------- - The desired dtype of the aggregation result. 
- """ - d = { - (np.dtype(np.bool), "add"): np.dtype(np.int64), - (np.dtype(np.bool), "cumsum"): np.dtype(np.int64), - (np.dtype(np.bool), "sum"): np.dtype(np.int64), - } - return d.get((dtype, how), dtype) - class GroupBy(_GroupBy): """ diff --git a/pandas/core/series.py b/pandas/core/series.py index dfca19b7a8050..cd8dde1557607 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -27,7 +27,11 @@ from pandas.util._decorators import Appender, Substitution, doc from pandas.util._validators import validate_bool_kwarg, validate_percentile -from pandas.core.dtypes.cast import convert_dtypes, validate_numeric_casting +from pandas.core.dtypes.cast import ( + convert_dtypes, + try_cast_to_ea, + validate_numeric_casting, +) from pandas.core.dtypes.common import ( _is_unorderable_exception, ensure_platform_int, @@ -59,7 +63,7 @@ import pandas as pd from pandas.core import algorithms, base, generic, nanops, ops from pandas.core.accessor import CachedAccessor -from pandas.core.arrays import ExtensionArray, try_cast_to_ea +from pandas.core.arrays import ExtensionArray from pandas.core.arrays.categorical import CategoricalAccessor from pandas.core.arrays.sparse import SparseAccessor import pandas.core.common as com diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 1265547653d7b..e860ea1a3d052 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas.core.dtypes.common import is_integer_dtype + import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, concat import pandas._testing as tm @@ -340,6 +342,30 @@ def test_groupby_agg_coercing_bools(): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize( + "op", + [ + lambda x: x.sum(), + lambda x: x.cumsum(), + lambda x: x.transform("sum"), + lambda x: x.transform("cumsum"), + lambda x: x.agg("sum"), + lambda x: x.agg("cumsum"), + ], +) +def test_bool_agg_dtype(op): + # GH 7001 + # Bool sum aggregations result in int + df = pd.DataFrame({"a": [1, 1], "b": [False, True]}) + s = df.set_index("a")["b"] + + result = op(df.groupby("a"))["b"].dtype + assert is_integer_dtype(result) + + result = op(s.groupby("a")).dtype + assert is_integer_dtype(result) + + def test_order_aggregate_multiple_funcs(): # GH 25692 df = pd.DataFrame({"A": [1, 1, 2, 2], "B": [1, 2, 3, 4]}) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 20e9bf65db8d5..b8d8f56512a69 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -7,8 +7,6 @@ from pandas.errors import PerformanceWarning -from pandas.core.dtypes.common import is_integer_dtype - import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, date_range, read_csv import pandas._testing as tm @@ -2059,27 +2057,3 @@ def test_groups_repr_truncates(max_seq_items, expected): result = df.groupby(np.array(df.a)).groups.__repr__() assert result == expected - - -@pytest.mark.parametrize( - "op", - [ - lambda x: x.sum(), - lambda x: x.cumsum(), - lambda x: x.transform("sum"), - lambda x: x.transform("cumsum"), - lambda x: x.agg("sum"), - lambda x: x.agg("cumsum"), - ], -) -def test_bool_agg_dtype(op): - # GH 7001 - # Bool sum aggregations result in int - df = pd.DataFrame({"a": [1, 1], "b": [False, True]}) - s = df.set_index("a")["b"] - - result = op(df.groupby("a"))["b"].dtype - assert 
is_integer_dtype(result) - - result = op(s.groupby("a")).dtype - assert is_integer_dtype(result) From 8902484ad67ed4128690d703dd208b09cca59008 Mon Sep 17 00:00:00 2001 From: Richard Date: Wed, 25 Mar 2020 08:05:07 -0400 Subject: [PATCH 4/7] Added/improved docstrings and type hints - Removed unnecessary import of try_cast_to_ea from arrays.__init__ --- pandas/core/arrays/__init__.py | 3 --- pandas/core/dtypes/cast.py | 40 +++++++++++++++++++++++----------- 2 files changed, 27 insertions(+), 16 deletions(-) diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py index 98baeda40b494..1d538824e6d82 100644 --- a/pandas/core/arrays/__init__.py +++ b/pandas/core/arrays/__init__.py @@ -1,5 +1,3 @@ -from pandas.core.dtypes.cast import try_cast_to_ea - from pandas.core.arrays.base import ( ExtensionArray, ExtensionOpsMixin, @@ -20,7 +18,6 @@ "ExtensionArray", "ExtensionOpsMixin", "ExtensionScalarOpsMixin", - "try_cast_to_ea", "BooleanArray", "Categorical", "DatetimeArray", diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index c7ad4e8648598..2d3a5f8a6bdf4 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -16,7 +16,7 @@ iNaT, ) from pandas._libs.tslibs.timezones import tz_compare -from pandas._typing import Dtype +from pandas._typing import Dtype, DtypeObj from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.common import ( @@ -246,14 +246,27 @@ def trans(x): return result -def maybe_cast_result(result, obj, numeric_only: bool = False, how: str = ""): +def maybe_cast_result( + result, obj: ABCSeries, numeric_only: bool = False, how: str = "" +): """ - Try to cast the result to the desired type, - we may have roundtripped through object in the mean-time. + Try casting result to a different type if appropriate - If numeric_only is True, then only try to cast numerics - and not datetimelikes. + Parameters + ---------- + result : array-like + Result to cast. + obj : ABCSeries + Input series from which result was calculated. + numeric_only : bool, default False + Whether to cast only numerics or datetimes as well. + how : str, default "" + If result was aggregated, how the aggregation was performed. + Returns + ------- + result : array-like + result maybe casted to the dtype. """ if obj.ndim > 1: dtype = obj._values.dtype @@ -278,21 +291,22 @@ def maybe_cast_result(result, obj, numeric_only: bool = False, how: str = ""): return result -def maybe_cast_result_dtype(dtype, how): +def maybe_cast_result_dtype(dtype: DtypeObj, how: str) -> DtypeObj: """ - Get the desired dtype of a groupby result based on the - input dtype and how the aggregation is done. + Get the desired dtype of a result based on the + input dtype and how it was computed. Parameters ---------- - dtype : dtype, type - The input dtype of the groupby. + dtype : DtypeObj + Input dtype. how : str - How the aggregation is performed. + How the result was computed. Returns ------- - The desired dtype of the aggregation result. + DtypeObj + The desired dtype of the aggregation result. """ d = { (np.dtype(np.bool), "add"): np.dtype(np.int64), From 15028eb93c3f0c1c74751bf399c9a8d34ed7a826 Mon Sep 17 00:00:00 2001 From: Richard Date: Wed, 25 Mar 2020 08:10:28 -0400 Subject: [PATCH 5/7] Removed references to aggregation in docstrings. 
--- pandas/core/dtypes/cast.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 2d3a5f8a6bdf4..a0a789aee2b3e 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -261,7 +261,7 @@ def maybe_cast_result( numeric_only : bool, default False Whether to cast only numerics or datetimes as well. how : str, default "" - If result was aggregated, how the aggregation was performed. + How the result was computed. Returns ------- @@ -306,7 +306,7 @@ def maybe_cast_result_dtype(dtype: DtypeObj, how: str) -> DtypeObj: Returns ------- DtypeObj - The desired dtype of the aggregation result. + The desired dtype of the result. """ d = { (np.dtype(np.bool), "add"): np.dtype(np.int64), From 3c484f34dd44e87bd50f43c9ef6d19f61ee57947 Mon Sep 17 00:00:00 2001 From: Richard Date: Thu, 26 Mar 2020 16:41:59 -0400 Subject: [PATCH 6/7] Renamed try_cast_to_ea, updated comment, whatsnew --- doc/source/whatsnew/v1.1.0.rst | 2 ++ pandas/core/arrays/base.py | 4 ++-- pandas/core/arrays/categorical.py | 4 ++-- pandas/core/dtypes/cast.py | 11 ++++------- pandas/core/series.py | 4 ++-- 5 files changed, 12 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index dcbfe6aeb9a12..667e5b9ff390a 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -370,6 +370,8 @@ Groupby/resample/rolling - Bug in :meth:`GroupBy.apply` raises ``ValueError`` when the ``by`` axis is not sorted and has duplicates and the applied ``func`` does not mutate passed in objects (:issue:`30667`) - Bug in :meth:`DataFrameGroupby.transform` produces incorrect result with transformation functions (:issue:`30918`) +- Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` produces inconsistent type when aggregating Boolean series (:issue:`32894`) + Reshaping ^^^^^^^^^ diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 147deec94f130..b2626ff0e6588 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -19,7 +19,7 @@ from pandas.util._decorators import Appender, Substitution from pandas.util._validators import validate_fillna_kwargs -from pandas.core.dtypes.cast import try_cast_to_ea +from pandas.core.dtypes.cast import maybe_cast_to_extension_array from pandas.core.dtypes.common import is_array_like, is_list_like from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries @@ -1192,7 +1192,7 @@ def _maybe_convert(arr): # https://github.com/pandas-dev/pandas/issues/22850 # We catch all regular exceptions here, and fall back # to an ndarray. - res = try_cast_to_ea(self, arr) + res = maybe_cast_to_extension_array(self, arr) if not isinstance(res, type(self)): # exception raised in _from_sequence; ensure we have ndarray res = np.asarray(arr) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index bfccc6f244219..ca5e7ea8faee8 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -50,7 +50,7 @@ from pandas.core.arrays.base import ( ExtensionArray, _extension_array_shared_docs, - try_cast_to_ea, + maybe_cast_to_extension_array, ) from pandas.core.base import NoNewAttributesMixin, PandasObject, _shared_docs import pandas.core.common as com @@ -2568,7 +2568,7 @@ def _get_codes_for_values(values, categories): # scalar objects. e.g. 
# Categorical(array[Period, Period], categories=PeriodIndex(...)) cls = categories.dtype.construct_array_type() - values = try_cast_to_ea(cls, values) + values = maybe_cast_to_extension_array(cls, values) if not isinstance(values, cls): # exception raised in _from_sequence values = ensure_object(values) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index a0a789aee2b3e..2bd10d0ea5000 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -276,14 +276,11 @@ def maybe_cast_result( if not is_scalar(result): if is_extension_array_dtype(dtype) and dtype.kind != "M": - # The function can return something of any type, so check - # if the type is compatible with the calling EA. - # datetime64tz is handled correctly in agg_series, - # so is excluded here. - + # The result may be of any type, cast back to original + # type if it's compatible. if len(result) and isinstance(result[0], dtype.type): cls = dtype.construct_array_type() - result = try_cast_to_ea(cls, result, dtype=dtype) + result = maybe_cast_to_extension_array(cls, result, dtype=dtype) elif numeric_only and is_numeric_dtype(dtype) or not numeric_only: result = maybe_downcast_to_dtype(result, dtype) @@ -316,7 +313,7 @@ def maybe_cast_result_dtype(dtype: DtypeObj, how: str) -> DtypeObj: return d.get((dtype, how), dtype) -def try_cast_to_ea(cls_or_instance, obj, dtype=None): +def maybe_cast_to_extension_array(cls_or_instance, obj, dtype=None): """ Call to `_from_sequence` that returns the object unchanged on Exception. diff --git a/pandas/core/series.py b/pandas/core/series.py index cd8dde1557607..08fb2dbee3401 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -29,7 +29,7 @@ from pandas.core.dtypes.cast import ( convert_dtypes, - try_cast_to_ea, + maybe_cast_to_extension_array, validate_numeric_casting, ) from pandas.core.dtypes.common import ( @@ -2725,7 +2725,7 @@ def combine(self, other, func, fill_value=None) -> "Series": # TODO: can we do this for only SparseDtype? # The function can return something of any type, so check # if the type is compatible with the calling EA. - new_values = try_cast_to_ea(self._values, new_values) + new_values = maybe_cast_to_extension_array(self._values, new_values) return self._constructor(new_values, index=new_index, name=new_name) def combine_first(self, other) -> "Series": From 9b32d0eda7d2d25352d89e558557ef56243bc6a4 Mon Sep 17 00:00:00 2001 From: Richard Date: Thu, 26 Mar 2020 18:59:46 -0400 Subject: [PATCH 7/7] Fixed import from refactor, restricted type in maybe_cast_to_extension_array --- pandas/core/arrays/base.py | 2 +- pandas/core/arrays/categorical.py | 12 ++++++------ pandas/core/dtypes/cast.py | 7 ++++--- pandas/core/series.py | 2 +- 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index b2626ff0e6588..af897e86a14d4 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1192,7 +1192,7 @@ def _maybe_convert(arr): # https://github.com/pandas-dev/pandas/issues/22850 # We catch all regular exceptions here, and fall back # to an ndarray. 
- res = maybe_cast_to_extension_array(self, arr) + res = maybe_cast_to_extension_array(type(self), arr) if not isinstance(res, type(self)): # exception raised in _from_sequence; ensure we have ndarray res = np.asarray(arr) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index ca5e7ea8faee8..c11d879840fb9 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -19,7 +19,11 @@ ) from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs -from pandas.core.dtypes.cast import coerce_indexer_dtype, maybe_infer_to_datetimelike +from pandas.core.dtypes.cast import ( + coerce_indexer_dtype, + maybe_cast_to_extension_array, + maybe_infer_to_datetimelike, +) from pandas.core.dtypes.common import ( ensure_int64, ensure_object, @@ -47,11 +51,7 @@ from pandas.core.accessor import PandasDelegate, delegate_names import pandas.core.algorithms as algorithms from pandas.core.algorithms import _get_data_algo, factorize, take, take_1d, unique1d -from pandas.core.arrays.base import ( - ExtensionArray, - _extension_array_shared_docs, - maybe_cast_to_extension_array, -) +from pandas.core.arrays.base import ExtensionArray, _extension_array_shared_docs from pandas.core.base import NoNewAttributesMixin, PandasObject, _shared_docs import pandas.core.common as com from pandas.core.construction import array, extract_array, sanitize_array diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 2bd10d0ea5000..da9646aa8c46f 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -313,13 +313,13 @@ def maybe_cast_result_dtype(dtype: DtypeObj, how: str) -> DtypeObj: return d.get((dtype, how), dtype) -def maybe_cast_to_extension_array(cls_or_instance, obj, dtype=None): +def maybe_cast_to_extension_array(cls, obj, dtype=None): """ Call to `_from_sequence` that returns the object unchanged on Exception. Parameters ---------- - cls_or_instance : ExtensionArray subclass or instance + cls : ExtensionArray subclass obj : arraylike Values to pass to cls._from_sequence dtype : ExtensionDtype, optional @@ -328,8 +328,9 @@ def maybe_cast_to_extension_array(cls_or_instance, obj, dtype=None): ------- ExtensionArray or obj """ + assert isinstance(cls, type), f"must pass a type: {cls}" try: - result = cls_or_instance._from_sequence(obj, dtype=dtype) + result = cls._from_sequence(obj, dtype=dtype) except Exception: # We can't predict what downstream EA constructors may raise result = obj diff --git a/pandas/core/series.py b/pandas/core/series.py index 08fb2dbee3401..39e1178a3a5c3 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2725,7 +2725,7 @@ def combine(self, other, func, fill_value=None) -> "Series": # TODO: can we do this for only SparseDtype? # The function can return something of any type, so check # if the type is compatible with the calling EA. - new_values = maybe_cast_to_extension_array(self._values, new_values) + new_values = maybe_cast_to_extension_array(type(self._values), new_values) return self._constructor(new_values, index=new_index, name=new_name) def combine_first(self, other) -> "Series":
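
As a quick illustration of the behavior the new test_bool_agg_dtype test asserts (a sketch, assuming a pandas build that includes these patches; the example frame below is made up, not taken from the test suite):

    import pandas as pd
    from pandas.core.dtypes.common import is_integer_dtype

    # A boolean column grouped and summed: with maybe_cast_result_dtype in
    # place, sum/cumsum (and their agg/transform spellings) are expected to
    # come back with an integer dtype (counts of True), consistently across
    # the DataFrameGroupBy and SeriesGroupBy paths.
    df = pd.DataFrame({"a": [1, 1, 2], "b": [False, True, True]})

    assert is_integer_dtype(df.groupby("a")["b"].sum().dtype)
    assert is_integer_dtype(df.groupby("a")["b"].cumsum().dtype)
    assert is_integer_dtype(df.groupby("a")["b"].transform("sum").dtype)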