Skip to content

PERF: ffill/bfill with non-numpy dtypes #53950

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jul 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,7 @@ Performance improvements
- Performance improvement in :meth:`DataFrame.loc` when selecting rows and columns (:issue:`53014`)
- Performance improvement in :meth:`Series.add` for pyarrow string and binary dtypes (:issue:`53150`)
- Performance improvement in :meth:`Series.corr` and :meth:`Series.cov` for extension dtypes (:issue:`52502`)
- Performance improvement in :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, :meth:`DataFrame.bfill` with pyarrow dtypes (:issue:`53950`)
- Performance improvement in :meth:`Series.str.get_dummies` for pyarrow-backed strings (:issue:`53655`)
- Performance improvement in :meth:`Series.str.get` for pyarrow-backed strings (:issue:`53152`)
- Performance improvement in :meth:`Series.str.split` with ``expand=True`` for pyarrow-backed strings (:issue:`53585`)
Expand Down
4 changes: 4 additions & 0 deletions pandas/_libs/algos.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,10 @@ def nancorr_spearman(
# ----------------------------------------------------------------------

def validate_limit(nobs: int | None, limit=...) -> int: ...
# Indexer used for ffill: the result is passed to ``take`` on the array being
# filled. ``mask`` marks the NA positions; ``limit`` is optional.
def get_fill_indexer(
    mask: npt.NDArray[np.bool_],
    limit: int | None = None,
) -> npt.NDArray[np.intp]: ...
def pad(
old: np.ndarray, # ndarray[numeric_object_t]
new: np.ndarray, # ndarray[numeric_object_t]
Expand Down
36 changes: 36 additions & 0 deletions pandas/_libs/algos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -525,6 +525,42 @@ def validate_limit(nobs: int | None, limit=None) -> int:
return lim


# TODO: overlap with libgroupby.group_fillna_indexer?
@cython.boundscheck(False)
@cython.wraparound(False)
def get_fill_indexer(const uint8_t[:] mask, limit=None):
    """
    Build the positional indexer that forward-fills an array through `take`.

    Parameters
    ----------
    mask : const uint8_t[:]
        Truthy where the corresponding element of the array being filled
        is NA.
    limit : int, optional
        Forwarded to ``validate_limit`` together with ``len(mask)``; the
        validated value caps how many consecutive NA positions are filled.

    Returns
    -------
    ndarray[intp]
        One entry per element of ``mask``. A non-NA position maps to itself.
        An NA position maps to the most recent non-NA position, or to -1 when
        no non-NA position has been seen yet or the current run of NAs has
        already used up the limit.
    """
    cdef:
        ndarray[intp_t, ndim=1] result
        Py_ssize_t pos, prev_valid, n_obs = len(mask)
        int max_fill

        # Length of the current run of consecutive NAs; once it reaches
        # max_fill the remaining NAs in the run are left unfilled.
        int na_run = 0

    max_fill = validate_limit(n_obs, limit)
    result = np.empty(n_obs, dtype=np.intp)

    # -1 doubles as "nothing valid seen yet", so leading NAs stay unfilled.
    prev_valid = -1

    for pos in range(n_obs):
        if mask[pos]:
            # NA slot: point at the last valid position while under the cap.
            result[pos] = prev_valid if na_run < max_fill else -1
            na_run += 1
        else:
            # Valid slot: it is its own source and restarts the NA run.
            result[pos] = pos
            prev_valid = pos
            na_run = 0

    return result


@cython.boundscheck(False)
@cython.wraparound(False)
def pad(
Expand Down
3 changes: 0 additions & 3 deletions pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,6 @@

from pandas.core.dtypes.dtypes import ArrowDtype

from pandas.core.arrays.arrow._arrow_utils import fallback_performancewarning

ARROW_CMP_FUNCS = {
"eq": pc.equal,
"ne": pc.not_equal,
Expand Down Expand Up @@ -918,7 +916,6 @@ def fillna(
return super().fillna(value=value, method=method, limit=limit)

if method is not None:
fallback_performancewarning()
return super().fillna(value=value, method=method, limit=limit)

if isinstance(value, (np.ndarray, ExtensionArray)):
Expand Down
19 changes: 14 additions & 5 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,10 @@

import numpy as np

from pandas._libs import lib
from pandas._libs import (
algos as libalgos,
lib,
)
from pandas.compat import set_function_name
from pandas.compat.numpy import function as nv
from pandas.errors import AbstractMethodError
Expand Down Expand Up @@ -824,10 +827,16 @@ def fillna(

if mask.any():
if method is not None:
func = missing.get_fill_func(method)
npvalues = self.astype(object)
func(npvalues, limit=limit, mask=mask)
new_values = self._from_sequence(npvalues, dtype=self.dtype)
meth = missing.clean_fill_method(method)

npmask = np.asarray(mask)
if meth == "pad":
indexer = libalgos.get_fill_indexer(npmask, limit=limit)
return self.take(indexer, allow_fill=True)
else:
# i.e. meth == "backfill"
indexer = libalgos.get_fill_indexer(npmask[::-1], limit=limit)[::-1]
return self[::-1].take(indexer, allow_fill=True)
else:
# fill with value
new_values = self.copy()
Expand Down
7 changes: 0 additions & 7 deletions pandas/tests/extension/test_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@
pa_version_under9p0,
pa_version_under11p0,
)
from pandas.errors import PerformanceWarning

from pandas.core.dtypes.dtypes import (
ArrowDtype,
Expand Down Expand Up @@ -698,12 +697,6 @@ def test_fillna_no_op_returns_copy(self, data):
assert result is not data
self.assert_extension_array_equal(result, data)

def test_fillna_series_method(self, data_missing, fillna_method):
    """
    Run the inherited fillna-method test inside ``tm.maybe_produces_warning``
    for ``PerformanceWarning``, conditioned on a fill method being given.
    """
    expect_warning = fillna_method is not None
    warning_ctx = tm.maybe_produces_warning(
        PerformanceWarning, expect_warning, check_stacklevel=False
    )
    with warning_ctx:
        super().test_fillna_series_method(data_missing, fillna_method)


class TestBasePrinting(base.BasePrintingTests):
pass
Expand Down
11 changes: 0 additions & 11 deletions pandas/tests/extension/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,7 @@
import numpy as np
import pytest

from pandas.errors import PerformanceWarning

import pandas as pd
import pandas._testing as tm
from pandas.api.types import is_string_dtype
from pandas.core.arrays import ArrowStringArray
from pandas.core.arrays.string_ import StringDtype
Expand Down Expand Up @@ -169,14 +166,6 @@ def test_fillna_no_op_returns_copy(self, data):
assert result is not data
self.assert_extension_array_equal(result, data)

def test_fillna_series_method(self, data_missing, fillna_method):
    """
    Run the inherited fillna-method test inside ``tm.maybe_produces_warning``
    for ``PerformanceWarning``, conditioned on a fill method being given and
    the dtype's storage being "pyarrow".
    """
    # Keep the short-circuit: the storage attribute is only read when a
    # fill method was actually supplied.
    expect_warning = (
        fillna_method is not None and data_missing.dtype.storage == "pyarrow"
    )
    warning_ctx = tm.maybe_produces_warning(
        PerformanceWarning,
        expect_warning,
        check_stacklevel=False,
    )
    with warning_ctx:
        super().test_fillna_series_method(data_missing, fillna_method)


class TestNoReduce(base.BaseNoReduceTests):
@pytest.mark.parametrize("skipna", [True, False])
Expand Down