From defe83bd4a5abd74ff5f96faafc1fabbd8c07af2 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 30 Jun 2023 09:58:30 -0700 Subject: [PATCH 1/3] PERF: ffill/bfill with non-numpy dtypes --- doc/source/whatsnew/v2.1.0.rst | 1 + pandas/_libs/algos.pyx | 36 +++++++++++++++++++++++++++ pandas/core/arrays/arrow/array.py | 3 --- pandas/core/arrays/base.py | 18 ++++++++++---- pandas/tests/extension/test_arrow.py | 7 ------ pandas/tests/extension/test_string.py | 11 -------- 6 files changed, 50 insertions(+), 26 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index ebbdbcb0f61f5..85daf51ebeb20 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -350,6 +350,7 @@ Performance improvements - Performance improvement in :meth:`~arrays.ArrowExtensionArray.to_numpy` (:issue:`52525`) - Performance improvement when doing various reshaping operations on :class:`arrays.IntegerArrays` & :class:`arrays.FloatingArray` by avoiding doing unnecessary validation (:issue:`53013`) - Performance improvement when indexing with pyarrow timestamp and duration dtypes (:issue:`53368`) +- Performance improvement in :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, :meth:`DataFrame.bfill` with pyarrow dtypes (:issue:`??`) - .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 854eacc5e1df5..0b6ea58f987d4 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -525,6 +525,42 @@ def validate_limit(nobs: int | None, limit=None) -> int: return lim +# TODO: overlap with libgroupby.group_fillna_indexer? +@cython.boundscheck(False) +@cython.wraparound(False) +def get_fill_indexer(const uint8_t[:] mask, limit=None): + """ + Find an indexer to use for ffill to `take` on the array being filled. + """ + cdef: + ndarray[intp_t, ndim=1] indexer + Py_ssize_t i, N = len(mask), last_valid + int lim + + # fill_count is the number of consecutive NAs we have seen. + # If it exceeds the given limit, we stop padding. + int fill_count = 0 + + lim = validate_limit(N, limit) + indexer = np.empty(N, dtype=np.intp) + + last_valid = -1 # haven't yet seen anything non-NA + + for i in range(N): + if not mask[i]: + indexer[i] = i + last_valid = i + fill_count = 0 + else: + if fill_count < lim: + indexer[i] = last_valid + else: + indexer[i] = -1 + fill_count += 1 + + return indexer + + @cython.boundscheck(False) @cython.wraparound(False) def pad( diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 85a75fff25ebd..17120d0de5c5f 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -67,8 +67,6 @@ from pandas.core.dtypes.dtypes import ArrowDtype - from pandas.core.arrays.arrow._arrow_utils import fallback_performancewarning - ARROW_CMP_FUNCS = { "eq": pc.equal, "ne": pc.not_equal, @@ -918,7 +916,6 @@ def fillna( return super().fillna(value=value, method=method, limit=limit) if method is not None: - fallback_performancewarning() return super().fillna(value=value, method=method, limit=limit) if isinstance(value, (np.ndarray, ExtensionArray)): diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index ceac8e22426d9..9bb18d4e0debb 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -23,7 +23,10 @@ import numpy as np -from pandas._libs import lib +from pandas._libs import ( + algos as libalgos, + lib, +) from pandas.compat import set_function_name from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError @@ -824,10 +827,15 @@ def fillna( if mask.any(): if method is not None: - func = missing.get_fill_func(method) - npvalues = self.astype(object) - func(npvalues, limit=limit, mask=mask) - new_values = self._from_sequence(npvalues, dtype=self.dtype) + meth = missing.clean_fill_method(method) + + if meth == "pad": + indexer = libalgos.get_fill_indexer(mask, limit=limit) + return self.take(indexer, allow_fill=True) + else: + # i.e. meth == "backfill" + indexer = libalgos.get_fill_indexer(mask[::-1], limit=limit)[::-1] + return self[::-1].take(indexer, allow_fill=True) else: # fill with value new_values = self.copy() diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index abcca16340365..56e35d30ad83c 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -38,7 +38,6 @@ pa_version_under9p0, pa_version_under11p0, ) -from pandas.errors import PerformanceWarning from pandas.core.dtypes.dtypes import ( ArrowDtype, @@ -698,12 +697,6 @@ def test_fillna_no_op_returns_copy(self, data): assert result is not data self.assert_extension_array_equal(result, data) - def test_fillna_series_method(self, data_missing, fillna_method): - with tm.maybe_produces_warning( - PerformanceWarning, fillna_method is not None, check_stacklevel=False - ): - super().test_fillna_series_method(data_missing, fillna_method) - class TestBasePrinting(base.BasePrintingTests): pass diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 1f39e8e9b450e..eb166691d3314 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -18,10 +18,7 @@ import numpy as np import pytest -from pandas.errors import PerformanceWarning - import pandas as pd -import pandas._testing as tm from pandas.api.types import is_string_dtype from pandas.core.arrays import ArrowStringArray from pandas.core.arrays.string_ import StringDtype @@ -169,14 +166,6 @@ def test_fillna_no_op_returns_copy(self, data): assert result is not data self.assert_extension_array_equal(result, data) - def test_fillna_series_method(self, data_missing, fillna_method): - with tm.maybe_produces_warning( - PerformanceWarning, - fillna_method is not None and data_missing.dtype.storage == "pyarrow", - check_stacklevel=False, - ): - super().test_fillna_series_method(data_missing, fillna_method) - class TestNoReduce(base.BaseNoReduceTests): @pytest.mark.parametrize("skipna", [True, False]) From 9a1e5945baca2cd6e2e38c4800aaa9e337c5a90a Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 30 Jun 2023 09:59:51 -0700 Subject: [PATCH 2/3] GH ref --- doc/source/whatsnew/v2.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 85daf51ebeb20..6390fbeed8548 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -342,6 +342,7 @@ Performance improvements - Performance improvement in :meth:`DataFrame.loc` when selecting rows and columns (:issue:`53014`) - Performance improvement in :meth:`Series.add` for pyarrow string and binary dtypes (:issue:`53150`) - Performance improvement in :meth:`Series.corr` and :meth:`Series.cov` for extension dtypes (:issue:`52502`) +- Performance improvement in :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, :meth:`DataFrame.bfill` with pyarrow dtypes (:issue:`53950`) - Performance improvement in :meth:`Series.str.get_dummies` for pyarrow-backed strings (:issue:`53655`) - Performance improvement in :meth:`Series.str.get` for pyarrow-backed strings (:issue:`53152`) - Performance improvement in :meth:`Series.str.split` with ``expand=True`` for pyarrow-backed strings (:issue:`53585`) @@ -350,7 +351,6 @@ Performance improvements - Performance improvement in :meth:`~arrays.ArrowExtensionArray.to_numpy` (:issue:`52525`) - Performance improvement when doing various reshaping operations on :class:`arrays.IntegerArrays` & :class:`arrays.FloatingArray` by avoiding doing unnecessary validation (:issue:`53013`) - Performance improvement when indexing with pyarrow timestamp and duration dtypes (:issue:`53368`) -- Performance improvement in :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, :meth:`DataFrame.bfill` with pyarrow dtypes (:issue:`??`) - .. --------------------------------------------------------------------------- From 79408120b98901b289f131bf3a5d671d630768ec Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 30 Jun 2023 10:46:44 -0700 Subject: [PATCH 3/3] mypy fixup --- pandas/_libs/algos.pyi | 4 ++++ pandas/core/arrays/base.py | 5 +++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/algos.pyi b/pandas/_libs/algos.pyi index 20a805533e8cc..cbbe418c8ab48 100644 --- a/pandas/_libs/algos.pyi +++ b/pandas/_libs/algos.pyi @@ -60,6 +60,10 @@ def nancorr_spearman( # ---------------------------------------------------------------------- def validate_limit(nobs: int | None, limit=...) -> int: ... +def get_fill_indexer( + mask: npt.NDArray[np.bool_], + limit: int | None = None, +) -> npt.NDArray[np.intp]: ... def pad( old: np.ndarray, # ndarray[numeric_object_t] new: np.ndarray, # ndarray[numeric_object_t] diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 9bb18d4e0debb..64f917a419391 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -829,12 +829,13 @@ def fillna( if method is not None: meth = missing.clean_fill_method(method) + npmask = np.asarray(mask) if meth == "pad": - indexer = libalgos.get_fill_indexer(mask, limit=limit) + indexer = libalgos.get_fill_indexer(npmask, limit=limit) return self.take(indexer, allow_fill=True) else: # i.e. meth == "backfill" - indexer = libalgos.get_fill_indexer(mask[::-1], limit=limit)[::-1] + indexer = libalgos.get_fill_indexer(npmask[::-1], limit=limit)[::-1] return self[::-1].take(indexer, allow_fill=True) else: # fill with value