diff --git a/asv_bench/benchmarks/replace.py b/asv_bench/benchmarks/replace.py
index 2a115fb0b4fe3..c09e70915ef81 100644
--- a/asv_bench/benchmarks/replace.py
+++ b/asv_bench/benchmarks/replace.py
@@ -13,10 +13,29 @@ def setup(self, inplace):
         rng = pd.date_range("1/1/2000", periods=N, freq="min")
         data = np.random.randn(N)
         data[::2] = np.nan
-        self.ts = pd.Series(data, index=rng)
+        # float Series in which every other value is NaN
+        self.series = pd.Series(data, index=rng)
+        # datetime64 Series in which every other value is NaT
+        self.ts = rng.to_series()
+        self.ts[::2] = np.datetime64("nat")
+        # float frame that does not contain any missing value
+        self.df = pd.DataFrame(np.random.randn(10 ** 3, 10 ** 3))
 
     def time_fillna(self, inplace):
-        self.ts.fillna(0.0, inplace=inplace)
+        self.series.fillna(0.0, inplace=inplace)
+
+    def time_fillna_ts(self, inplace):
+        self.ts.fillna(np.datetime64("2021"), inplace=inplace)
+
+    def peakmem_fillna(self, inplace):
+        self.series.fillna(0.0, inplace=inplace)
+
+    def time_fillna_limit(self, inplace):
+        self.series.fillna(0.0, inplace=inplace, limit=10 ** 5)
+
+    def time_fillna_df(self, inplace):
+        self.df.fillna(0.0, inplace=inplace)
 
     def time_replace(self, inplace):
-        self.ts.replace(np.nan, 0.0, inplace=inplace)
+        # keep timing the float Series now that ``self.ts`` holds datetime64
+        self.series.replace(np.nan, 0.0, inplace=inplace)
diff --git a/pandas/_libs/algos.pyi b/pandas/_libs/algos.pyi
index d0f664c323a89..6e52872fbf3e8 100644
--- a/pandas/_libs/algos.pyi
+++ b/pandas/_libs/algos.pyi
@@ -106,6 +106,44 @@ def backfill_2d_inplace(
     mask: np.ndarray,  # const uint8_t[:, :]
     limit=None,
 ) -> None: ...
+
+# ----------------------------------------------------------------------
+# Fillna
+# ----------------------------------------------------------------------
+# ctypedef fused fillna_t:
+#    float64_t
+#    float32_t
+#    object
+#    int8_t  # Categorical
+#    int16_t  # Categorical
+#    int32_t  # Categorical
+#    int64_t  # Categorical/Datetime64
+#    uint16_t  # Float 16
+#    complex64_t
+#    complex128_t
+# ctypedef fused fillna_values_t:
+#    algos_t
+#    uint16_t  # Float 16
+#    complex64_t
+#    complex128_t
+def fillna1d(
+    arr: np.ndarray,  # fillna_t[:]
+    value: object,  # fillna_t
+    limit: int,
+    inf_as_na: bool = False,
+) -> None: ...
+def fillna1d_multi_values(
+    arr: np.ndarray,  # fillna_t[:]
+    value: np.ndarray,  # algos_t[:]
+    limit: int,
+    inf_as_na: bool = False,
+) -> None: ...
+def fillna2d(
+    arr: np.ndarray,  # fillna_t[:, :]
+    value: object,  # fillna_t
+    limit: int,
+    inf_as_na: bool = False,
+) -> None: ...
 def is_monotonic(
     arr: np.ndarray,  # ndarray[algos_t, ndim=1]
     timelike: bool,
diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
index 4efc30e40654c..c4c29d1130331 100644
--- a/pandas/_libs/algos.pyx
+++ b/pandas/_libs/algos.pyx
@@ -26,6 +26,8 @@ from numpy cimport (
     NPY_UINT16,
     NPY_UINT32,
     NPY_UINT64,
+    complex64_t,
+    complex128_t,
     float32_t,
     float64_t,
     int8_t,
@@ -52,6 +54,10 @@ from pandas._libs.khash cimport (
     kh_resize_int64,
     khiter_t,
 )
+from pandas._libs.missing cimport (
+    checknull,
+    checknull_old,
+)
 from pandas._libs.util cimport (
     get_nat,
     numeric,
@@ -62,7 +68,13 @@ import pandas._libs.missing as missing
 cdef:
     float64_t FP_ERR = 1e-13
     float64_t NaN = np.NaN
+    # float16 is processed through a uint16 view (most compilers lack halfs)
+    # https://docs.scipy.org/doc/numpy-1.13.0/reference/c-api.coremath.html#half-precision-functions
+    uint16_t HALF_EXP_MASK = 0x7C00  # IEEE 754 binary16 exponent bits
+    uint16_t HALF_MANT_MASK = 0x03FF  # IEEE 754 binary16 mantissa bits
     int64_t NPY_NAT = get_nat()
+    float64_t INF = np.inf
+    float64_t NEGINF = -INF
 
 cdef enum TiebreakEnumType:
     TIEBREAK_AVERAGE
@@ -832,6 +844,199 @@ def backfill_2d_inplace(algos_t[:, :] values,
     pad_2d_inplace(values[:, ::-1], mask[:, ::-1], limit)
 
 
+# Fillna logic
+# We have our own fused type instead of algos_t
+# since we don't need to support types that can't hold NAs(ints, etc)
+ctypedef fused fillna_t:
+    float64_t
+    float32_t
+    object
+    int8_t  # Categorical
+    int16_t  # Categorical
+    int32_t  # Categorical
+    int64_t  # Categorical/Datetime64
+    uint16_t  # Float 16
+    complex64_t
+    complex128_t
+
+
+cdef inline bint _fillna_isna(fillna_t val, bint inf_as_na):
+    """
+    Check whether a single element counts as missing for the fillna routines.
+
+    Parameters
+    ----------
+    val : fillna_t
+    inf_as_na : bool
+        Whether to consider INF and NEGINF as NA
+    """
+    if fillna_t is object:
+        if inf_as_na:
+            return checknull_old(val)
+        return checknull(val)
+    elif fillna_t is int64_t:
+        # Datetime64/Timedelta64
+        return val == NPY_NAT
+    elif fillna_t is uint16_t:
+        # Float 16 viewed as uint16: NaN and inf have every exponent bit set
+        # and only NaN has a non-zero mantissa. Testing the bits catches any
+        # NaN payload or sign instead of one single bit pattern.
+        if (val & HALF_EXP_MASK) != HALF_EXP_MASK:
+            return False
+        return inf_as_na or (val & HALF_MANT_MASK) != 0
+    else:
+        # NOTE(review): the int8/int16/int32 specialisations end up here too
+        # and never compare as missing -- confirm that this is intended
+        if val != val:
+            return True
+        return inf_as_na and (val == INF or val == NEGINF)
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def fillna1d(fillna_t[:] arr,
+             fillna_t value,
+             Py_ssize_t limit,
+             bint inf_as_na=False
+             ):
+    """
+    Fill the na-like elements of a 1D array inplace.
+
+    Parameters
+    ----------
+    arr : ndarray
+        Modified inplace.
+    value : object
+        The value to use to replace nans
+    limit : int
+        The maximum number of elements to fill. None is not accepted: pass
+        at least len(arr) to fill every na-like element.
+    inf_as_na : bool, default False
+        Whether to consider INF and NEGINF as NA
+    """
+    cdef:
+        Py_ssize_t i, N
+        Py_ssize_t count = 0
+        fillna_t val
+
+    assert arr.ndim == 1, "'arr' must be 1-D."
+
+    N = len(arr)
+    for i in range(N):
+        if count >= limit:
+            # the limit is exhausted, nothing further can be filled
+            break
+        val = arr[i]
+        if _fillna_isna(val, inf_as_na):
+            arr[i] = value
+            count += 1
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def fillna1d_multi_values(fillna_t[:] arr,
+                          algos_t[:] value,
+                          Py_ssize_t limit,
+                          bint inf_as_na=False
+                          ):
+    """
+    Fill the na-like elements of a 1D array inplace with per-position values.
+
+    Parameters
+    ----------
+    arr : ndarray
+        Modified inplace.
+    value : ndarray
+        At least as long as arr, describing which fill value to use at
+        each position, with a value of np.nan indicating that a position
+        should not be filled
+    limit : int
+        The maximum number of elements to fill. None is not accepted: pass
+        at least len(arr) to fill every na-like element.
+    inf_as_na : bool, default False
+        Whether to consider INF and NEGINF as NA
+
+    Raises
+    ------
+    ValueError
+        If value is shorter than arr.
+    """
+    cdef:
+        Py_ssize_t i, N
+        Py_ssize_t count = 0
+        fillna_t val
+        algos_t fill_value
+
+    assert arr.ndim == 1, "'arr' must be 1-D."
+
+    N = len(arr)
+    if len(value) < N:
+        # bounds checking is disabled below, so a short 'value' would
+        # otherwise be read past its end
+        raise ValueError("'value' must be at least as long as 'arr'.")
+
+    for i in range(N):
+        if count >= limit:
+            # the limit is exhausted, nothing further can be filled
+            break
+        fill_value = value[i]
+        if algos_t is object or algos_t is float64_t or algos_t is float32_t:
+            if fill_value != fill_value:
+                # np.nan don't fill
+                continue
+        val = arr[i]
+        if _fillna_isna(val, inf_as_na):
+            # The two fused types can differ (e.g. int64 -> float32), so the
+            # assignment converts the fill value. There shouldn't be any
+            # risk here since BlockManager should check that the element
+            # can be held
+            arr[i] = fill_value
+            count += 1
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def fillna2d(fillna_t[:, :] arr,
+             fillna_t value,
+             Py_ssize_t limit,
+             bint inf_as_na=False
+             ):
+    """
+    Fill the na-like elements of a 2D array inplace.
+
+    Parameters
+    ----------
+    arr : ndarray
+        Modified inplace.
+    value : object
+        The value to use to replace nans
+    limit : int
+        The maximum number of elements to fill in each row of arr. None is
+        not accepted: pass at least arr.shape[1] to fill everything.
+    inf_as_na : bool, default False
+        Whether to consider INF and NEGINF as NA
+    """
+    cdef:
+        Py_ssize_t i, j, n, m
+        Py_ssize_t count
+        fillna_t val
+
+    assert arr.ndim == 2, "'arr' must be 2-D."
+
+    n = arr.shape[0]
+    m = arr.shape[1]
+    for i in range(n):
+        count = 0  # Limit is per axis
+        for j in range(m):
+            if count >= limit:
+                # this row used up its limit, move on to the next one
+                break
+            val = arr[i, j]
+            if _fillna_isna(val, inf_as_na):
+                arr[i, j] = value
+                count += 1
+
+
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def is_monotonic(ndarray[algos_t, ndim=1] arr, bint timelike):
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index 237d06402a0ee..12faec2bdc9a4 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -50,6 +50,7 @@
     is_dtype_equal,
     is_extension_array_dtype,
     is_list_like,
+    is_scalar,
     is_sparse,
     is_string_dtype,
     pandas_dtype,
@@ -434,40 +435,68 @@ def fillna(
         fillna on the block with the value.
If we fail, then convert to ObjectBlock and try again """ + # TODO: Handle inf_as_na, we need to get option and pass to cython funcs inplace = validate_bool_kwarg(inplace, "inplace") + blk = self if inplace else self.copy() + nobs = len(self) if self.ndim == 1 else self.shape[-1] + limit = libalgos.validate_limit(nobs if nobs > 0 else None, limit=limit) - mask = isna(self.values) - mask, noop = validate_putmask(self.values, mask) - - if limit is not None: - limit = libalgos.validate_limit(None, limit=limit) - mask[mask.cumsum(self.ndim - 1) > limit] = False + if not self._can_hold_na or nobs == 0: + return [blk] - if not self._can_hold_na: - if inplace: - return [self] + if not self.is_extension: + if self._can_hold_element(value): + if blk.dtype == np.float16: + # Float16 not supported by compiler, use view as uint16 hack + arr = blk.values.view(np.uint16) + if is_scalar(value): + value = np.float16(value).view(np.uint16) + else: + value = value.astype(np.float16).view(np.uint16) + else: + arr = blk.values + if self.ndim == 1: + if is_list_like(value): + # TODO: Verify EA case + if is_extension_array_dtype(value): + mask = value.isna() + value = np.asarray(value[mask], dtype=object) + libalgos.fillna1d_multi_values( + arr[mask], value=value, limit=limit + ) + else: + libalgos.fillna1d_multi_values( + arr, value=value, limit=limit + ) + else: + libalgos.fillna1d(arr, value=value, limit=limit) + else: + libalgos.fillna2d(arr, value=value, limit=limit) + return blk._maybe_downcast([blk], downcast) + elif self.ndim == 1 or self.shape[0] == 1: + coerced_blk = self.coerce_to_target_dtype(value) + # bc we have already cast, inplace=True may avoid an extra copy + return coerced_blk.fillna( + value, limit=limit, inplace=True, downcast=None + ) else: - return [self.copy()] - - if self._can_hold_element(value): - nb = self if inplace else self.copy() - putmask_inplace(nb.values, mask, value) - return nb._maybe_downcast([nb], downcast) - - if noop: - # we can't process the 
value, but nothing to do - return [self] if inplace else [self.copy()] - - elif self.ndim == 1 or self.shape[0] == 1: - blk = self.coerce_to_target_dtype(value) - # bc we have already cast, inplace=True may avoid an extra copy - return blk.fillna(value, limit=limit, inplace=True, downcast=None) - + # operate column-by-column + return self.split_and_operate( + type(self).fillna, + value, + limit=limit, + inplace=inplace, + downcast=None, + ) else: - # operate column-by-column - return self.split_and_operate( - type(self).fillna, value, limit=limit, inplace=inplace, downcast=None - ) + # TODO: This seems to work for EAS, verify it does + return [ + self.make_block_same_class( + values=cast(ExtensionArray, self.values).fillna( + value=value, limit=limit + ) + ) + ] @final def _split(self) -> list[Block]: