-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
PERF: avoid object conversion in fillna(method=pad|backfill) for masked arrays #39953
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
d3ae0cf
0b4d283
c661788
9024d95
8f4417b
5955dad
245d99c
9f10cf2
fb26e31
bf1ecab
6407fc8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -623,6 +623,38 @@ def pad_inplace(algos_t[:] values, const uint8_t[:] mask, limit=None): | |
val = values[i] | ||
|
||
|
||
@cython.boundscheck(False) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. rather than adding new, can we simply always use the mask (for all dtypes) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. probably, the 1d versions of pad and backfill are only used by EA.fillna and Series.replace. some of the EAs override fillna and use 2d algorithms. NDArrayBackedExtensionArray uses the 1d versions but should be using a 2d version (not an issue since only called columnwise, but would be fail if called directly on a 2d EA) The other EAs that use this convert to object so are not performant anyway. not sure about Series.replace, will look some more. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we don't support any 2D EA atm, so can leave those out for now There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done. if we are always updating and returning the mask, maybe we should return the same dtype as the original mask, or only allow boolean mask in python space and always return a boolean mask. |
||
@cython.wraparound(False) | ||
def pad_inplace_masked(algos_t[:] values, uint8_t[:] mask, limit=None): | ||
cdef: | ||
Py_ssize_t i, N | ||
algos_t val | ||
uint8_t prev_mask | ||
int lim, fill_count = 0 | ||
|
||
N = len(values) | ||
|
||
# GH#2778 | ||
if N == 0: | ||
return | ||
|
||
lim = validate_limit(N, limit) | ||
|
||
val = values[0] | ||
prev_mask = mask[0] | ||
for i in range(1, N): | ||
if mask[i]: | ||
if fill_count >= lim: | ||
continue | ||
fill_count += 1 | ||
values[i] = val | ||
mask[i] = prev_mask | ||
else: | ||
fill_count = 0 | ||
val = values[i] | ||
prev_mask = mask[i] | ||
|
||
|
||
@cython.boundscheck(False) | ||
@cython.wraparound(False) | ||
def pad_2d_inplace(algos_t[:, :] values, const uint8_t[:, :] mask, limit=None): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -28,6 +28,7 @@ | |
cache_readonly, | ||
doc, | ||
) | ||
from pandas.util._validators import validate_fillna_kwargs | ||
|
||
from pandas.core.dtypes.base import ExtensionDtype | ||
from pandas.core.dtypes.common import ( | ||
|
@@ -38,12 +39,16 @@ | |
is_string_dtype, | ||
pandas_dtype, | ||
) | ||
from pandas.core.dtypes.inference import is_array_like | ||
from pandas.core.dtypes.missing import ( | ||
isna, | ||
notna, | ||
) | ||
|
||
from pandas.core import nanops | ||
from pandas.core import ( | ||
missing, | ||
nanops, | ||
) | ||
from pandas.core.algorithms import ( | ||
factorize_array, | ||
isin, | ||
|
@@ -144,6 +149,40 @@ def __getitem__( | |
|
||
return type(self)(self._data[item], self._mask[item]) | ||
|
||
@doc(ExtensionArray.fillna) | ||
def fillna( | ||
self: BaseMaskedArrayT, value=None, method=None, limit=None | ||
) -> BaseMaskedArrayT: | ||
value, method = validate_fillna_kwargs(value, method) | ||
|
||
mask = self._mask | ||
|
||
if is_array_like(value): | ||
if len(value) != len(self): | ||
raise ValueError( | ||
f"Length of 'value' does not match. Got ({len(value)}) " | ||
f" expected {len(self)}" | ||
) | ||
value = value[mask] | ||
|
||
if mask.any(): | ||
if method is not None: | ||
func = missing.get_fill_func(method) | ||
new_values, new_mask = func( | ||
self._data.copy(), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ExtensionBlock.interpolate creates another copy L1774. if we always copy in EA.fillna then that copy shouldn't be needed. |
||
limit=limit, | ||
mask=mask.copy(), | ||
update_mask=True, | ||
) | ||
return type(self)(new_values, new_mask.view(np.bool_)) | ||
else: | ||
# fill with value | ||
new_values = self.copy() | ||
new_values[mask] = value | ||
else: | ||
new_values = self.copy() | ||
return new_values | ||
|
||
def _coerce_to_array(self, values) -> Tuple[np.ndarray, np.ndarray]: | ||
raise AbstractMethodError(self) | ||
|
||
|
Uh oh!
There was an error while loading. Please reload this page.