From 9ba0c868f5116319a365c12a6bb6f401261d15ee Mon Sep 17 00:00:00 2001
From: Brock
Date: Mon, 20 Jul 2020 11:13:34 -0700
Subject: [PATCH] REF: Avoid post-processing in blockwise op

---
 pandas/core/groupby/generic.py | 97 ++++++++++++++++------------------
 1 file changed, 47 insertions(+), 50 deletions(-)

diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index ec7b14f27c5a1..b7aa186b64ad8 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -1031,11 +1031,36 @@ def _cython_agg_blocks(
         agg_blocks: List[Block] = []
         new_items: List[np.ndarray] = []
         deleted_items: List[np.ndarray] = []
-        # Some object-dtype blocks might be split into List[Block[T], Block[U]]
-        split_items: List[np.ndarray] = []
-        split_frames: List[DataFrame] = []
 
         no_result = object()
+
+        def cast_result_block(result, block: "Block", how: str) -> "Block":
+            # see if we can cast the block to the desired dtype
+            # this may not be the original dtype
+            assert not isinstance(result, DataFrame)
+            assert result is not no_result
+
+            dtype = maybe_cast_result_dtype(block.dtype, how)
+            result = maybe_downcast_numeric(result, dtype)
+
+            if block.is_extension and isinstance(result, np.ndarray):
+                # e.g. block.values was an IntegerArray
+                # (1, N) case can occur if block.values was Categorical
+                # and result is ndarray[object]
+                # TODO(EA2D): special casing not needed with 2D EAs
+                assert result.ndim == 1 or result.shape[0] == 1
+                try:
+                    # Cast back if feasible
+                    result = type(block.values)._from_sequence(
+                        result.ravel(), dtype=block.values.dtype
+                    )
+                except (ValueError, TypeError):
+                    # reshape to be valid for non-Extension Block
+                    result = result.reshape(1, -1)
+
+            agg_block: Block = block.make_block(result)
+            return agg_block
+
         for block in data.blocks:
             # Avoid inheriting result from earlier in the loop
             result = no_result
@@ -1067,9 +1092,9 @@ def _cython_agg_blocks(
                 # not try to add missing categories if grouping over multiple
                 # Categoricals. This will done by later self._reindex_output()
                 # Doing it here creates an error. See GH#34951
-                s = get_groupby(obj, self.grouper, observed=True)
+                sgb = get_groupby(obj, self.grouper, observed=True)
                 try:
-                    result = s.aggregate(lambda x: alt(x, axis=self.axis))
+                    result = sgb.aggregate(lambda x: alt(x, axis=self.axis))
                 except TypeError:
                     # we may have an exception in trying to aggregate
                     # continue and exclude the block
@@ -1083,54 +1108,26 @@ def _cython_agg_blocks(
                         # about a single block input returning a single block output
                         # is a lie. To keep the code-path for the typical non-split case
                         # clean, we choose to clean up this mess later on.
-                        split_items.append(locs)
-                        split_frames.append(result)
-                        continue
-
-                    assert len(result._mgr.blocks) == 1
-                    result = result._mgr.blocks[0].values
-                    if isinstance(result, np.ndarray) and result.ndim == 1:
-                        result = result.reshape(1, -1)
-
-            assert not isinstance(result, DataFrame)
-
-            if result is not no_result:
-                # see if we can cast the block to the desired dtype
-                # this may not be the original dtype
-                dtype = maybe_cast_result_dtype(block.dtype, how)
-                result = maybe_downcast_numeric(result, dtype)
-
-                if block.is_extension and isinstance(result, np.ndarray):
-                    # e.g. block.values was an IntegerArray
-                    # (1, N) case can occur if block.values was Categorical
-                    # and result is ndarray[object]
-                    # TODO(EA2D): special casing not needed with 2D EAs
-                    assert result.ndim == 1 or result.shape[0] == 1
-                    try:
-                        # Cast back if feasible
-                        result = type(block.values)._from_sequence(
-                            result.ravel(), dtype=block.values.dtype
-                        )
-                    except (ValueError, TypeError):
-                        # reshape to be valid for non-Extension Block
-                        result = result.reshape(1, -1)
-
-                agg_block: Block = block.make_block(result)
-
-            new_items.append(locs)
-            agg_blocks.append(agg_block)
+                        assert len(locs) == result.shape[1]
+                        for i, loc in enumerate(locs):
+                            new_items.append(np.array([loc], dtype=locs.dtype))
+                            agg_block = result.iloc[:, [i]]._mgr.blocks[0]
+                            agg_blocks.append(agg_block)
+                    else:
+                        result = result._mgr.blocks[0].values
+                        if isinstance(result, np.ndarray) and result.ndim == 1:
+                            result = result.reshape(1, -1)
+                        agg_block = cast_result_block(result, block, how)
+                        new_items.append(locs)
+                        agg_blocks.append(agg_block)
+            else:
+                agg_block = cast_result_block(result, block, how)
+                new_items.append(locs)
+                agg_blocks.append(agg_block)
 
-        if not (agg_blocks or split_frames):
+        if not agg_blocks:
             raise DataError("No numeric types to aggregate")
 
-        if split_items:
-            # Clean up the mess left over from split blocks.
-            for locs, result in zip(split_items, split_frames):
-                assert len(locs) == result.shape[1]
-                for i, loc in enumerate(locs):
-                    new_items.append(np.array([loc], dtype=locs.dtype))
-                    agg_blocks.append(result.iloc[:, [i]]._mgr.blocks[0])
-
         # reset the locs in the blocks to correspond to our
         # current ordering
         indexer = np.concatenate(new_items)
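
Note (not part of the patch): a minimal standalone sketch of the cast-back step that the new cast_result_block helper performs, assuming an Int64 block whose cython aggregation comes back as a plain float64 ndarray. The variable names and sample values below are illustrative only, not taken from the patch.

    import numpy as np
    import pandas as pd

    values = pd.array([1, 2, None, 4], dtype="Int64")  # stands in for block.values
    result = np.array([3.0, 4.0])  # stands in for the per-group cython result

    try:
        # Cast back to the original extension type if feasible
        restored = type(values)._from_sequence(result.ravel(), dtype=values.dtype)
    except (ValueError, TypeError):
        # otherwise reshape to (1, N) so it is valid for a non-Extension Block
        restored = result.reshape(1, -1)

    # restored is an Int64 extension array ([3, 4]) when the cast succeeds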