
BUG: DataFrameGroupBy agg with multi-column object block #41111


Merged · 6 commits · Apr 29, 2021
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.0.rst
@@ -850,6 +850,7 @@ Groupby/resample/rolling
- Bug in :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` computing wrong result with nullable data types too large to roundtrip when casting to float (:issue:`37493`)
- Bug in :meth:`DataFrame.rolling` returning a mean of zero for an all-``NaN`` window with ``min_periods=0`` if the calculation is not numerically stable (:issue:`41053`)
- Bug in :meth:`DataFrame.rolling` returning a non-zero sum for an all-``NaN`` window with ``min_periods=0`` if the calculation is not numerically stable (:issue:`41053`)
- Bug in :meth:`DataFrameGroupBy.min` and :meth:`DataFrameGroupBy.max` with multiple object-dtype columns and ``numeric_only=False`` incorrectly raising ``ValueError`` (:issue:`41111`)

Reshaping
^^^^^^^^^
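As a minimal reproduction of the bug fixed here (adapted from the regression test added below; the output shown is for pandas 1.3, where columns that fail to aggregate are still silently dropped):

import pandas as pd

df = pd.DataFrame(
    {
        "A": [1, 1, 2, 2, 3],
        "B": [1, "foo", 2, "bar", False],  # max/min not valid for this mix
        "C": ["a", "b", "c", "d", "e"],    # max/min valid here
    }
)

# Before this fix: ValueError, because "B" and "C" share one object block
# and the whole block failed together. After: "B" is dropped, "C" aggregated.
print(df.groupby("A").max(numeric_only=False))
#    C
# A
# 1  b
# 2  d
# 3  e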
41 changes: 15 additions & 26 deletions pandas/core/groupby/generic.py
@@ -1088,9 +1088,7 @@ def _cython_agg_general(

using_array_manager = isinstance(data, ArrayManager)

def cast_agg_result(
result: ArrayLike, values: ArrayLike, how: str
) -> ArrayLike:
def cast_agg_result(result: ArrayLike, values: ArrayLike) -> ArrayLike:
# see if we can cast the values to the desired dtype
# this may not be the original dtype

@@ -1102,7 +1100,7 @@ def cast_agg_result(

elif (
not using_array_manager
and isinstance(result, np.ndarray)
and isinstance(result.dtype, np.dtype)
and result.ndim == 1
):
# We went through a SeriesGroupByPath and need to reshape
@@ -1129,34 +1127,26 @@ def py_fallback(values: ArrayLike) -> ArrayLike:
else:
# We only get here with values.dtype == object
# TODO special case not needed with ArrayManager
obj = DataFrame(values.T)
if obj.shape[1] == 1:
# Avoid call to self.values that can occur in DataFrame
# reductions; see GH#28949
obj = obj.iloc[:, 0]
df = DataFrame(values.T)
# bc we split object blocks in grouped_reduce, we have only 1 col
# otherwise we'd have to worry about block-splitting GH#39329
assert df.shape[1] == 1
# Avoid call to self.values that can occur in DataFrame
# reductions; see GH#28949
obj = df.iloc[:, 0]

# Create SeriesGroupBy with observed=True so that it does
# not try to add missing categories if grouping over multiple
# Categoricals. This will be done later by self._reindex_output()
# Doing it here creates an error. See GH#34951
sgb = get_groupby(obj, self.grouper, observed=True)
result = sgb.aggregate(lambda x: alt(x, axis=self.axis))
# Note: bc obj is always a Series here, we can ignore axis and pass
# `alt` directly instead of `lambda x: alt(x, axis=self.axis)`
res_ser = sgb.aggregate(alt) # this will go through sgb._python_agg_general

# In the case of object dtype block, it may have been split
# in the operation. We un-split here.
result = result._consolidate()
# unwrap DataFrame/Series to get array
mgr = result._mgr
arrays = mgr.arrays
if len(arrays) != 1:
# We've split an object block! Everything we've assumed
# about a single block input returning a single block output
# is a lie. See eg GH-39329
return mgr.as_array()
else:
# We are a single block from a BlockManager
# or one array from SingleArrayManager
return arrays[0]
# unwrap Series to get array
res_values = res_ser._mgr.arrays[0]
return cast_agg_result(res_values, values)

def array_func(values: ArrayLike) -> ArrayLike:

@@ -1170,7 +1160,6 @@ def array_func(values: ArrayLike) -> ArrayLike:
# try to python agg
result = py_fallback(values)

return cast_agg_result(result, values, how)
return result

# TypeError -> we may have an exception in trying to aggregate
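The reworked py_fallback leans on the fact that grouped_reduce now hands it one column at a time, so it can aggregate through a SeriesGroupBy and unwrap a single array. A self-contained sketch of that idea using only public API; the names values, keys, and the lambda are illustrative stand-ins for the internal values, self.grouper, and alt:

import numpy as np
import pandas as pd

values = np.array(["a", "b", "c", "d", "e"], dtype=object)  # one object column
keys = np.array([1, 1, 2, 2, 3])                            # group labels

ser = pd.Series(values)
sgb = ser.groupby(keys, observed=True)      # observed=True: don't add missing categories
res_ser = sgb.aggregate(lambda x: x.max())  # python-level aggregation path
res_values = res_ser.to_numpy()             # unwrap the Series to get the array
print(res_values)  # ['b' 'd' 'e']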
26 changes: 19 additions & 7 deletions pandas/core/internals/managers.py
@@ -302,13 +302,25 @@ def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T:
result_blocks: list[Block] = []

for blk in self.blocks:
try:
applied = blk.apply(func)
except (TypeError, NotImplementedError):
if not ignore_failures:
raise
continue
result_blocks = extend_blocks(applied, result_blocks)
if blk.is_object:
# split on object-dtype blocks bc some columns may raise
# while others do not.
for sb in blk._split():
try:
applied = sb.apply(func)
except (TypeError, NotImplementedError):
if not ignore_failures:
raise
continue
result_blocks = extend_blocks(applied, result_blocks)
else:
try:
applied = blk.apply(func)
except (TypeError, NotImplementedError):
if not ignore_failures:
raise
continue
result_blocks = extend_blocks(applied, result_blocks)

if len(result_blocks) == 0:
index = Index([None]) # placeholder
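The split-and-skip pattern grouped_reduce now applies to object blocks, sketched here with plain numpy arrays in place of Block objects; reduce_columns and its arguments are illustrative, not pandas internals:

import numpy as np

def reduce_columns(block, func, ignore_failures=True):
    # pandas blocks store values as (n_columns, n_rows), so each row of
    # `block` plays the role of one single-column sub-block from _split()
    results = []
    for col in block:
        try:
            results.append(func(col))
        except (TypeError, NotImplementedError):
            if not ignore_failures:
                raise
            # this column does not support func; skip it rather than
            # failing the whole block
    return results

block = np.array(
    [
        [1, "foo", 2, "bar", False],  # mixed types: .max() raises TypeError
        ["a", "b", "c", "d", "e"],    # homogeneous strings: .max() works
    ],
    dtype=object,
)
print(reduce_columns(block, lambda col: col.max()))  # ['e']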
31 changes: 31 additions & 0 deletions pandas/tests/groupby/test_function.py
@@ -69,6 +69,37 @@ def test_max_min_non_numeric():
assert "ss" in result


def test_max_min_object_multiple_columns(using_array_manager):
# GH#41111 case where the aggregation is valid for some columns but not
# others; we split object blocks column-wise, consistent with
# DataFrame._reduce

df = DataFrame(
{
"A": [1, 1, 2, 2, 3],
"B": [1, "foo", 2, "bar", False],
"C": ["a", "b", "c", "d", "e"],
}
)
df._consolidate_inplace()  # should already be consolidated, but double-check
if not using_array_manager:
assert len(df._mgr.blocks) == 2

gb = df.groupby("A")

result = gb.max(numeric_only=False)
# "max" is valid for column "C" but not for "B"
ei = Index([1, 2, 3], name="A")
expected = DataFrame({"C": ["b", "d", "e"]}, index=ei)
tm.assert_frame_equal(result, expected)

result = gb.min(numeric_only=False)
# "min" is valid for column "C" but not for "B"
ei = Index([1, 2, 3], name="A")
expected = DataFrame({"C": ["a", "c", "e"]}, index=ei)
tm.assert_frame_equal(result, expected)


def test_min_date_with_nans():
# GH26321
dates = pd.to_datetime(
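A quick way to see the block layout the test asserts, assuming the default BlockManager (not ArrayManager) and using the internal _mgr attribute for inspection only: "B" and "C" are consolidated into a single two-column object block, which is exactly the multi-column object block grouped_reduce now splits.

import pandas as pd

df = pd.DataFrame(
    {
        "A": [1, 1, 2, 2, 3],
        "B": [1, "foo", 2, "bar", False],
        "C": ["a", "b", "c", "d", "e"],
    }
)
df._consolidate_inplace()
for blk in df._mgr.blocks:
    print(blk.dtype, blk.shape)
# int64 (1, 5)   <- column "A"
# object (2, 5)  <- columns "B" and "C" in one block (block order may vary)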