Skip to content

PERF: avoid object conversion in fillna(method=pad|backfill) for masked arrays #39953

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Mar 5, 2021
Merged
24 changes: 18 additions & 6 deletions asv_bench/benchmarks/frame_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,15 +351,27 @@ def time_isnull_obj(self):

class Fillna:

params = ([True, False], ["pad", "bfill"])
param_names = ["inplace", "method"]

def setup(self, inplace, method):
params = (
[True, False],
["pad", "bfill"],
[
"float64",
"float32",
"object",
"Int64",
"Float64",
],
)
param_names = ["inplace", "method", "dtype"]

def setup(self, inplace, method, dtype):
values = np.random.randn(10000, 100)
values[::2] = np.nan
self.df = DataFrame(values)
if dtype == "Int64":
values = values.round()
self.df = DataFrame(values, dtype=dtype)

def time_frame_fillna(self, inplace, method):
def time_frame_fillna(self, inplace, method, dtype):
self.df.fillna(inplace=inplace, method=method)


Expand Down
32 changes: 32 additions & 0 deletions pandas/_libs/algos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -623,6 +623,38 @@ def pad_inplace(algos_t[:] values, const uint8_t[:] mask, limit=None):
val = values[i]


@cython.boundscheck(False)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

rather than adding new, can we simply always use the mask (for all dtypes)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

probably, the 1d versions of pad and backfill are only used by EA.fillna and Series.replace.

some of the EAs override fillna and use 2d algorithms. NDArrayBackedExtensionArray uses the 1d versions but should be using a 2d version (not an issue since only called columnwise, but would be fail if called directly on a 2d EA) The other EAs that use this convert to object so are not performant anyway.

not sure about Series.replace, will look some more.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we don't support any 2D EA atm, so can leave those out for now

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done. if we are always updating and returning the mask, maybe we should return the same dtype as the original mask, or only allow boolean mask in python space and always return a boolean mask.

@cython.wraparound(False)
def pad_inplace_masked(algos_t[:] values, uint8_t[:] mask, limit=None):
cdef:
Py_ssize_t i, N
algos_t val
uint8_t prev_mask
int lim, fill_count = 0

N = len(values)

# GH#2778
if N == 0:
return

lim = validate_limit(N, limit)

val = values[0]
prev_mask = mask[0]
for i in range(1, N):
if mask[i]:
if fill_count >= lim:
continue
fill_count += 1
values[i] = val
mask[i] = prev_mask
else:
fill_count = 0
val = values[i]
prev_mask = mask[i]


@cython.boundscheck(False)
@cython.wraparound(False)
def pad_2d_inplace(algos_t[:, :] values, const uint8_t[:, :] mask, limit=None):
Expand Down
41 changes: 40 additions & 1 deletion pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
cache_readonly,
doc,
)
from pandas.util._validators import validate_fillna_kwargs

from pandas.core.dtypes.base import ExtensionDtype
from pandas.core.dtypes.common import (
Expand All @@ -38,12 +39,16 @@
is_string_dtype,
pandas_dtype,
)
from pandas.core.dtypes.inference import is_array_like
from pandas.core.dtypes.missing import (
isna,
notna,
)

from pandas.core import nanops
from pandas.core import (
missing,
nanops,
)
from pandas.core.algorithms import (
factorize_array,
isin,
Expand Down Expand Up @@ -144,6 +149,40 @@ def __getitem__(

return type(self)(self._data[item], self._mask[item])

@doc(ExtensionArray.fillna)
def fillna(
self: BaseMaskedArrayT, value=None, method=None, limit=None
) -> BaseMaskedArrayT:
value, method = validate_fillna_kwargs(value, method)

mask = self._mask

if is_array_like(value):
if len(value) != len(self):
raise ValueError(
f"Length of 'value' does not match. Got ({len(value)}) "
f" expected {len(self)}"
)
value = value[mask]

if mask.any():
if method is not None:
func = missing.get_fill_func(method)
new_values, new_mask = func(
self._data.copy(),
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ExtensionBlock.interpolate creates another copy L1774. if we always copy in EA.fillna then that copy shouldn't be needed.

limit=limit,
mask=mask.copy(),
update_mask=True,
)
return type(self)(new_values, new_mask.view(np.bool_))
else:
# fill with value
new_values = self.copy()
new_values[mask] = value
else:
new_values = self.copy()
return new_values

def _coerce_to_array(self, values) -> Tuple[np.ndarray, np.ndarray]:
raise AbstractMethodError(self)

Expand Down
30 changes: 24 additions & 6 deletions pandas/core/missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -692,16 +692,34 @@ def _fillna_prep(values, mask=None):
return values, mask


def _pad_1d(values, limit=None, mask=None):
def _pad_1d(
values: np.ndarray,
limit: int | None = None,
mask: np.ndarray | None = None,
update_mask: bool = False,
) -> np.ndarray | tuple[np.ndarray, np.ndarray]:
values, mask = _fillna_prep(values, mask)
algos.pad_inplace(values, mask, limit=limit)
return values
if update_mask:
algos.pad_inplace_masked(values, mask, limit=limit)
return values, mask
else:
algos.pad_inplace(values, mask, limit=limit)
return values


def _backfill_1d(values, limit=None, mask=None):
def _backfill_1d(
values: np.ndarray,
limit: int | None = None,
mask: np.ndarray | None = None,
update_mask: bool = False,
) -> np.ndarray | tuple[np.ndarray, np.ndarray]:
values, mask = _fillna_prep(values, mask)
algos.backfill_inplace(values, mask, limit=limit)
return values
if update_mask:
algos.pad_inplace_masked(values[::-1], mask[::-1], limit=limit)
return values, mask
else:
algos.backfill_inplace(values, mask, limit=limit)
return values


def _pad_2d(values, limit=None, mask=None):
Expand Down