From bd97cfd0070f37ac36dec9316372eb41469c8f61 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 4 Mar 2021 20:07:58 -0800 Subject: [PATCH 1/7] REF: use _split instead of split_and_operate for fillna --- pandas/core/internals/blocks.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 597023cb5b000..64edc47140769 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -489,17 +489,19 @@ def fillna( # we can't process the value, but nothing to do return [self] if inplace else [self.copy()] - # operate column-by-column - def f(mask, val, idx): - block = self.coerce_to_target_dtype(value) - - # slice out our block - if idx is not None: - # i.e. self.ndim == 2 - block = block.getitem_block(slice(idx, idx + 1)) - return block.fillna(value, limit=limit, inplace=inplace, downcast=None) + elif self.ndim == 1 or self.shape[0] == 1: + blk = self.coerce_to_target_dtype(value) + # bc we have already cast, inplace=True may avoid an extra copy + return blk.fillna(value, limit=limit, inplace=True, downcast=None) - return self.split_and_operate(None, f, inplace) + else: + # operate column-by-column + res_blocks = [] + nbs = self._split() + for nb in nbs: + rbs = nb.fillna(value, limit=limit, inplace=inplace, downcast=None) + res_blocks.extend(rbs) + return res_blocks @final def _split(self) -> List[Block]: From b38bbedc8db816f78ca99e184c8f64450886f3f4 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 4 Mar 2021 20:27:12 -0800 Subject: [PATCH 2/7] REF: dont use split_and_operate in Block.downcast --- pandas/core/internals/blocks.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 64edc47140769..3a81e9e8cc45d 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -627,13 +627,18 @@ def downcast(self, dtypes=None) -> List[Block]: elif dtypes != "infer": raise AssertionError("dtypes as dict is not supported yet") - # operate column-by-column - # this is expensive as it splits the blocks items-by-item - def f(mask, val, idx): - val = maybe_downcast_to_dtype(val, dtype="infer") - return val - - return self.split_and_operate(None, f, False) + if self.ndim == 1 or self.shape[0] == 1: + new_values = maybe_downcast_to_dtype(self.values, dtype="infer") + return [self.make_block(new_values)] + else: + # operate column-by-column + # this is expensive as it splits the blocks items-by-item + res_blocks = [] + nbs = self._split() + for nb in nbs: + rbs = nb.downcast(dtypes="infer") + res_blocks.extend(rbs) + return res_blocks @final def astype(self, dtype, copy: bool = False, errors: str = "raise"): From 3d7951cb43d40d389baa1e8ea322838e4aa6c47d Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 4 Mar 2021 22:02:28 -0800 Subject: [PATCH 3/7] REF: dont use split_and_operate in ObjectBlock.convert --- pandas/core/internals/blocks.py | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 3a81e9e8cc45d..f20aef63d1923 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2186,29 +2186,28 @@ def convert( the block (if copy = True) by definition we ARE an ObjectBlock!!!!! 
""" - # operate column-by-column - def f(mask, val, idx): - shape = val.shape - values = soft_convert_objects( - val.ravel(), + if self.ndim == 1 or self.shape[0] == 1: + # no need to operate column-wise + res_values = soft_convert_objects( + self.values.ravel(), datetime=datetime, numeric=numeric, timedelta=timedelta, copy=copy, ) - if isinstance(values, np.ndarray): - # TODO(EA2D): allow EA once reshape is supported - values = values.reshape(shape) - - return values + res_values = ensure_block_shape(res_values, self.ndim) + return [self.make_block(res_values)] - if self.ndim == 2: - blocks = self.split_and_operate(None, f, False) else: - values = f(None, self.values.ravel(), None) - blocks = [self.make_block(values)] - - return blocks + # operate column-wise + res_blocks = [] + nbs = self._split() + for nb in nbs: + rbs = nb.convert( + copy=copy, datetime=datetime, numeric=numeric, timedelta=timedelta + ) + res_blocks.extend(rbs) + return res_blocks def _maybe_downcast(self, blocks: List[Block], downcast=None) -> List[Block]: From 78b15d376746fd464f46fd4e690165fb0aa09c9f Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 4 Mar 2021 22:22:51 -0800 Subject: [PATCH 4/7] REF: dont use split_and_operate in ObjectBlock.reduce --- pandas/core/internals/blocks.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index f20aef63d1923..fe1cc43bcb7bf 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2150,20 +2150,16 @@ def reduce(self, func, ignore_failures: bool = False) -> List[Block]: """ assert self.ndim == 2 - values = self.values - if len(values) > 1: - # split_and_operate expects func with signature (mask, values, inplace) - def mask_func(mask, values, inplace): - if values.ndim == 1: - values = values.reshape(1, -1) - return func(values) - - return self.split_and_operate( - None, mask_func, False, ignore_failures=ignore_failures - ) + if self.shape[0] > 1: + res_blocks = [] + nbs = self._split() + for nb in nbs: + rbs = nb.reduce(func, ignore_failures) + res_blocks.extend(rbs) + return res_blocks try: - res = func(values) + res = func(self.values) except TypeError: if not ignore_failures: raise From d2bb7926dbf0f413905530d045ec2926dfa36c5a Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 5 Mar 2021 07:26:21 -0800 Subject: [PATCH 5/7] REF: implement maybe_split --- pandas/core/internals/blocks.py | 66 +++++++++++++++++---------------- 1 file changed, 35 insertions(+), 31 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index fe1cc43bcb7bf..a1d2bc986833d 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1,5 +1,6 @@ from __future__ import annotations +from functools import wraps import re from typing import ( TYPE_CHECKING, @@ -29,6 +30,7 @@ ArrayLike, Dtype, DtypeObj, + F, Shape, final, ) @@ -118,6 +120,28 @@ from pandas.core.arrays._mixins import NDArrayBackedExtensionArray +def maybe_split(meth: F) -> F: + """ + If we have a multi-column block, split and operate block-wise. Otherwise + use the original method. 
+ """ + + @wraps(meth) + def newfunc(self, *args, **kwargs) -> List[Block]: + + if self.ndim == 1 or self.shape[0] == 1: + return meth(self, *args, **kwargs) + else: + # Split and operate column-by-column + res_blocks = [] + for nb in self._split(): + rbs = meth(nb, *args, **kwargs) + res_blocks.extend(rbs) + return res_blocks + + return cast(F, newfunc) + + class Block(PandasObject): """ Canonical n-dimensional unit of homogeneous dtype contained in a pandas @@ -2144,20 +2168,13 @@ def is_bool(self): """ return lib.is_bool_array(self.values.ravel("K")) + @maybe_split def reduce(self, func, ignore_failures: bool = False) -> List[Block]: """ For object-dtype, we operate column-wise. """ assert self.ndim == 2 - if self.shape[0] > 1: - res_blocks = [] - nbs = self._split() - for nb in nbs: - rbs = nb.reduce(func, ignore_failures) - res_blocks.extend(rbs) - return res_blocks - try: res = func(self.values) except TypeError: @@ -2170,6 +2187,7 @@ def reduce(self, func, ignore_failures: bool = False) -> List[Block]: res = res.reshape(1, -1) return [self.make_block_same_class(res)] + @maybe_split def convert( self, copy: bool = True, @@ -2181,29 +2199,15 @@ def convert( attempt to cast any object types to better types return a copy of the block (if copy = True) by definition we ARE an ObjectBlock!!!!! """ - - if self.ndim == 1 or self.shape[0] == 1: - # no need to operate column-wise - res_values = soft_convert_objects( - self.values.ravel(), - datetime=datetime, - numeric=numeric, - timedelta=timedelta, - copy=copy, - ) - res_values = ensure_block_shape(res_values, self.ndim) - return [self.make_block(res_values)] - - else: - # operate column-wise - res_blocks = [] - nbs = self._split() - for nb in nbs: - rbs = nb.convert( - copy=copy, datetime=datetime, numeric=numeric, timedelta=timedelta - ) - res_blocks.extend(rbs) - return res_blocks + res_values = soft_convert_objects( + self.values.ravel(), + datetime=datetime, + numeric=numeric, + timedelta=timedelta, + copy=copy, + ) + res_values = ensure_block_shape(res_values, self.ndim) + return [self.make_block(res_values)] def _maybe_downcast(self, blocks: List[Block], downcast=None) -> List[Block]: From 1caeeda96f73dd457cef3b78cba6a66d2d3409f0 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 5 Mar 2021 08:32:32 -0800 Subject: [PATCH 6/7] REF: use maybe_split for downcast --- pandas/core/internals/blocks.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index a1d2bc986833d..4e0f073ec6922 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -651,18 +651,17 @@ def downcast(self, dtypes=None) -> List[Block]: elif dtypes != "infer": raise AssertionError("dtypes as dict is not supported yet") - if self.ndim == 1 or self.shape[0] == 1: - new_values = maybe_downcast_to_dtype(self.values, dtype="infer") - return [self.make_block(new_values)] - else: - # operate column-by-column - # this is expensive as it splits the blocks items-by-item - res_blocks = [] - nbs = self._split() - for nb in nbs: - rbs = nb.downcast(dtypes="infer") - res_blocks.extend(rbs) - return res_blocks + return self._downcast_2d() + + @maybe_split + def _downcast_2d(self) -> List[Block]: + """ + downcast specialized to 2D case post-validation. + + Refactored to allow use of maybe_split. 
+ """ + new_values = maybe_downcast_to_dtype(self.values, dtype="infer") + return [self.make_block(new_values)] @final def astype(self, dtype, copy: bool = False, errors: str = "raise"): From a0bf89a334ef0172af37236115418dc2b1755b77 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 9 Mar 2021 10:49:33 -0800 Subject: [PATCH 7/7] REF: re-implement split_and_operate --- pandas/core/internals/blocks.py | 115 ++++++++------------------------ 1 file changed, 28 insertions(+), 87 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index db2e328fefe42..833a92753f886 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -133,11 +133,7 @@ def newfunc(self, *args, **kwargs) -> List[Block]: return meth(self, *args, **kwargs) else: # Split and operate column-by-column - res_blocks = [] - for nb in self._split(): - rbs = meth(nb, *args, **kwargs) - res_blocks.extend(rbs) - return res_blocks + return self.split_and_operate(meth, *args, **kwargs) return cast(F, newfunc) @@ -520,12 +516,9 @@ def fillna( else: # operate column-by-column - res_blocks = [] - nbs = self._split() - for nb in nbs: - rbs = nb.fillna(value, limit=limit, inplace=inplace, downcast=None) - res_blocks.extend(rbs) - return res_blocks + return self.split_and_operate( + type(self).fillna, value, limit=limit, inplace=inplace, downcast=None + ) @final def _split(self) -> List[Block]: @@ -538,80 +531,32 @@ def _split(self) -> List[Block]: for i, ref_loc in enumerate(self.mgr_locs): vals = self.values[slice(i, i + 1)] - nb = self.make_block(vals, [ref_loc]) + nb = self.make_block(vals, BlockPlacement(ref_loc)) new_blocks.append(nb) return new_blocks @final - def split_and_operate( - self, mask, f, inplace: bool, ignore_failures: bool = False - ) -> List[Block]: + def split_and_operate(self, func, *args, **kwargs) -> List[Block]: """ - split the block per-column, and apply the callable f - per-column, return a new block for each. Handle - masking which will not change a block unless needed. + Split the block and apply func column-by-column. Parameters ---------- - mask : 2-d boolean mask - f : callable accepting (1d-mask, 1d values, indexer) - inplace : bool - ignore_failures : bool, default False + func : Block method + *args + **kwargs Returns ------- - list of blocks + List[Block] """ - if mask is None: - mask = np.broadcast_to(True, shape=self.shape) - - new_values = self.values - - def make_a_block(nv, ref_loc): - if isinstance(nv, list): - assert len(nv) == 1, nv - assert isinstance(nv[0], Block) - block = nv[0] - else: - # Put back the dimension that was taken from it and make - # a block out of the result. 
- nv = ensure_block_shape(nv, ndim=self.ndim) - block = self.make_block(values=nv, placement=ref_loc) - return block - - # ndim == 1 - if self.ndim == 1: - if mask.any(): - nv = f(mask, new_values, None) - else: - nv = new_values if inplace else new_values.copy() - block = make_a_block(nv, self.mgr_locs) - return [block] - - # ndim > 1 - new_blocks = [] - for i, ref_loc in enumerate(self.mgr_locs): - m = mask[i] - v = new_values[i] - - # need a new block - if m.any() or m.size == 0: - # Apply our function; we may ignore_failures if this is a - # reduction that is dropping nuisance columns GH#37827 - try: - nv = f(m, v, i) - except TypeError: - if ignore_failures: - continue - else: - raise - else: - nv = v if inplace else v.copy() + assert self.ndim == 2 and self.shape[0] != 1 - block = make_a_block(nv, [ref_loc]) - new_blocks.append(block) - - return new_blocks + res_blocks = [] + for nb in self._split(): + rbs = func(nb, *args, **kwargs) + res_blocks.extend(rbs) + return res_blocks def _maybe_downcast(self, blocks: List[Block], downcast=None) -> List[Block]: @@ -788,18 +733,13 @@ def replace( # bc _can_hold_element is incorrect. return [self] if inplace else [self.copy()] - if not self._can_hold_element(value): - if self.ndim == 2 and self.shape[0] > 1: - # split so that we only upcast where necessary - nbs = self._split() - res_blocks = extend_blocks( - [ - blk.replace(to_replace, value, inplace=inplace, regex=regex) - for blk in nbs - ] - ) - return res_blocks + elif self._can_hold_element(value): + blk = self if inplace else self.copy() + putmask_inplace(blk.values, mask, value) + blocks = blk.convert(numeric=False, copy=False) + return blocks + elif self.ndim == 1 or self.shape[0] == 1: blk = self.coerce_to_target_dtype(value) return blk.replace( to_replace=to_replace, @@ -808,10 +748,11 @@ def replace( regex=regex, ) - blk = self if inplace else self.copy() - putmask_inplace(blk.values, mask, value) - blocks = blk.convert(numeric=False, copy=False) - return blocks + else: + # split so that we only upcast where necessary + return self.split_and_operate( + type(self).replace, to_replace, value, inplace=inplace, regex=regex + ) @final def _replace_regex(
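
For reference, a minimal standalone sketch of the maybe_split pattern the patches above converge on. The ToyBlock class and the lowercase_strings method below are illustrative stand-ins, not the pandas internals; only the decorator's control flow mirrors the patched code.

    from functools import wraps
    from typing import Callable, List

    import numpy as np


    def maybe_split(meth: Callable) -> Callable:
        # If the block holds more than one row (i.e. more than one column of
        # the frame), apply `meth` to each single-row block produced by
        # _split() and collect the results; otherwise call `meth` directly.
        @wraps(meth)
        def newfunc(self, *args, **kwargs) -> List["ToyBlock"]:
            if self.values.ndim == 1 or self.values.shape[0] == 1:
                return meth(self, *args, **kwargs)
            res_blocks = []
            for nb in self._split():
                res_blocks.extend(meth(nb, *args, **kwargs))
            return res_blocks

        return newfunc


    class ToyBlock:
        # Minimal stand-in for a 2-D block: one row per column of the frame.
        def __init__(self, values: np.ndarray):
            self.values = values

        def _split(self) -> List["ToyBlock"]:
            # Break an (n, k) block into n single-row (1, k) blocks.
            return [
                ToyBlock(self.values[i : i + 1])
                for i in range(self.values.shape[0])
            ]

        @maybe_split
        def lowercase_strings(self) -> List["ToyBlock"]:
            # Example columnwise operation: lowercase a row only if every
            # element in it is a string; otherwise leave it untouched.
            vals = self.values
            if all(isinstance(x, str) for x in vals.ravel()):
                vals = np.array(
                    [[x.lower() for x in row] for row in vals], dtype=object
                )
            return [ToyBlock(vals)]


    blk = ToyBlock(np.array([[1, 2], ["C", "D"]], dtype=object))
    # The decorator splits the two-row block, so the numeric row passes
    # through unchanged while the string row is lowercased.
    print([nb.values.tolist() for nb in blk.lowercase_strings()])
    # [[[1, 2]], [['c', 'd']]]

Centralizing the "split when the block has more than one row" branch in a decorator lets each method body assume a single-column block, which is the design the later patches rely on when fillna, downcast, convert, reduce, and replace are rewritten on top of _split and split_and_operate.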