diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index ebbdbcb0f61f5..6390fbeed8548 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -342,6 +342,7 @@ Performance improvements - Performance improvement in :meth:`DataFrame.loc` when selecting rows and columns (:issue:`53014`) - Performance improvement in :meth:`Series.add` for pyarrow string and binary dtypes (:issue:`53150`) - Performance improvement in :meth:`Series.corr` and :meth:`Series.cov` for extension dtypes (:issue:`52502`) +- Performance improvement in :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, :meth:`DataFrame.bfill` with pyarrow dtypes (:issue:`53950`) - Performance improvement in :meth:`Series.str.get_dummies` for pyarrow-backed strings (:issue:`53655`) - Performance improvement in :meth:`Series.str.get` for pyarrow-backed strings (:issue:`53152`) - Performance improvement in :meth:`Series.str.split` with ``expand=True`` for pyarrow-backed strings (:issue:`53585`) diff --git a/pandas/_libs/algos.pyi b/pandas/_libs/algos.pyi index 20a805533e8cc..cbbe418c8ab48 100644 --- a/pandas/_libs/algos.pyi +++ b/pandas/_libs/algos.pyi @@ -60,6 +60,10 @@ def nancorr_spearman( # ---------------------------------------------------------------------- def validate_limit(nobs: int | None, limit=...) -> int: ... +def get_fill_indexer( + mask: npt.NDArray[np.bool_], + limit: int | None = None, +) -> npt.NDArray[np.intp]: ... def pad( old: np.ndarray, # ndarray[numeric_object_t] new: np.ndarray, # ndarray[numeric_object_t] diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 854eacc5e1df5..0b6ea58f987d4 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -525,6 +525,42 @@ def validate_limit(nobs: int | None, limit=None) -> int: return lim +# TODO: overlap with libgroupby.group_fillna_indexer? +@cython.boundscheck(False) +@cython.wraparound(False) +def get_fill_indexer(const uint8_t[:] mask, limit=None): + """ + Find an indexer to use for ffill to `take` on the array being filled. + """ + cdef: + ndarray[intp_t, ndim=1] indexer + Py_ssize_t i, N = len(mask), last_valid + int lim + + # fill_count is the number of consecutive NAs we have seen. + # If it exceeds the given limit, we stop padding. + int fill_count = 0 + + lim = validate_limit(N, limit) + indexer = np.empty(N, dtype=np.intp) + + last_valid = -1 # haven't yet seen anything non-NA + + for i in range(N): + if not mask[i]: + indexer[i] = i + last_valid = i + fill_count = 0 + else: + if fill_count < lim: + indexer[i] = last_valid + else: + indexer[i] = -1 + fill_count += 1 + + return indexer + + @cython.boundscheck(False) @cython.wraparound(False) def pad( diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 85a75fff25ebd..17120d0de5c5f 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -67,8 +67,6 @@ from pandas.core.dtypes.dtypes import ArrowDtype - from pandas.core.arrays.arrow._arrow_utils import fallback_performancewarning - ARROW_CMP_FUNCS = { "eq": pc.equal, "ne": pc.not_equal, @@ -918,7 +916,6 @@ def fillna( return super().fillna(value=value, method=method, limit=limit) if method is not None: - fallback_performancewarning() return super().fillna(value=value, method=method, limit=limit) if isinstance(value, (np.ndarray, ExtensionArray)): diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index ceac8e22426d9..64f917a419391 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -23,7 +23,10 @@ import numpy as np -from pandas._libs import lib +from pandas._libs import ( + algos as libalgos, + lib, +) from pandas.compat import set_function_name from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError @@ -824,10 +827,16 @@ def fillna( if mask.any(): if method is not None: - func = missing.get_fill_func(method) - npvalues = self.astype(object) - func(npvalues, limit=limit, mask=mask) - new_values = self._from_sequence(npvalues, dtype=self.dtype) + meth = missing.clean_fill_method(method) + + npmask = np.asarray(mask) + if meth == "pad": + indexer = libalgos.get_fill_indexer(npmask, limit=limit) + return self.take(indexer, allow_fill=True) + else: + # i.e. meth == "backfill" + indexer = libalgos.get_fill_indexer(npmask[::-1], limit=limit)[::-1] + return self[::-1].take(indexer, allow_fill=True) else: # fill with value new_values = self.copy() diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index abcca16340365..56e35d30ad83c 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -38,7 +38,6 @@ pa_version_under9p0, pa_version_under11p0, ) -from pandas.errors import PerformanceWarning from pandas.core.dtypes.dtypes import ( ArrowDtype, @@ -698,12 +697,6 @@ def test_fillna_no_op_returns_copy(self, data): assert result is not data self.assert_extension_array_equal(result, data) - def test_fillna_series_method(self, data_missing, fillna_method): - with tm.maybe_produces_warning( - PerformanceWarning, fillna_method is not None, check_stacklevel=False - ): - super().test_fillna_series_method(data_missing, fillna_method) - class TestBasePrinting(base.BasePrintingTests): pass diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 1f39e8e9b450e..eb166691d3314 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -18,10 +18,7 @@ import numpy as np import pytest -from pandas.errors import PerformanceWarning - import pandas as pd -import pandas._testing as tm from pandas.api.types import is_string_dtype from pandas.core.arrays import ArrowStringArray from pandas.core.arrays.string_ import StringDtype @@ -169,14 +166,6 @@ def test_fillna_no_op_returns_copy(self, data): assert result is not data self.assert_extension_array_equal(result, data) - def test_fillna_series_method(self, data_missing, fillna_method): - with tm.maybe_produces_warning( - PerformanceWarning, - fillna_method is not None and data_missing.dtype.storage == "pyarrow", - check_stacklevel=False, - ): - super().test_fillna_series_method(data_missing, fillna_method) - class TestNoReduce(base.BaseNoReduceTests): @pytest.mark.parametrize("skipna", [True, False])