diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 7802c5cbdbfb3..f08812cf3d65b 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -2,7 +2,7 @@ Routines for filling missing data. """ -from typing import Any, List, Optional, Set, Union +from typing import Any, Optional import numpy as np @@ -230,41 +230,26 @@ def interpolate_1d( # default limit is unlimited GH #16282 limit = algos._validate_limit(nobs=None, limit=limit) - # These are sets of index pointers to invalid values... i.e. {0, 1, etc... - all_nans = set(np.flatnonzero(invalid)) - start_nans = set(range(find_valid_index(yvalues, "first"))) - end_nans = set(range(1 + find_valid_index(yvalues, "last"), len(valid))) - mid_nans = all_nans - start_nans - end_nans - - # Like the sets above, preserve_nans contains indices of invalid values, - # but in this case, it is the final set of indices that need to be - # preserved as NaN after the interpolation. - - # For example if limit_direction='forward' then preserve_nans will - # contain indices of NaNs at the beginning of the series, and NaNs that - # are more than'limit' away from the prior non-NaN. - - # set preserve_nans based on direction using _interp_limit - preserve_nans: Union[List, Set] if limit_direction == "forward": - preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0)) + nans_to_interpolate = _interp_limit(invalid, limit, 0) elif limit_direction == "backward": - preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit)) + nans_to_interpolate = _interp_limit(invalid, 0, limit) else: # both directions... just use _interp_limit - preserve_nans = set(_interp_limit(invalid, limit, limit)) + nans_to_interpolate = _interp_limit(invalid, limit, limit) # if limit_area is set, add either mid or outside indices # to preserve_nans GH #16284 - if limit_area == "inside": - # preserve NaNs on the outside - preserve_nans |= start_nans | end_nans - elif limit_area == "outside": - # preserve NaNs on the inside - preserve_nans |= mid_nans - - # sort preserve_nans and covert to list - preserve_nans = sorted(preserve_nans) + if limit_area: + first = find_valid_index(yvalues, "first") + last = find_valid_index(yvalues, "last") + if limit_area == "inside": + # preserve NaNs on the outside + nans_to_interpolate[:first] = False + nans_to_interpolate[last + 1 :] = False + else: + # preserve NaNs on the inside + nans_to_interpolate[first : last + 1] = False yvalues = getattr(yvalues, "values", yvalues) result = yvalues.copy() @@ -288,14 +273,14 @@ def interpolate_1d( if method in NP_METHODS: # np.interp requires sorted X values, #21037 indexer = np.argsort(inds[valid]) - result[invalid] = np.interp( - inds[invalid], inds[valid][indexer], yvalues[valid][indexer] + result[nans_to_interpolate] = np.interp( + inds[nans_to_interpolate], inds[valid][indexer], yvalues[valid][indexer] ) else: - result[invalid] = _interpolate_scipy_wrapper( + result[nans_to_interpolate] = _interpolate_scipy_wrapper( inds[valid], yvalues[valid], - inds[invalid], + inds[nans_to_interpolate], method=method, fill_value=fill_value, bounds_error=bounds_error, @@ -303,7 +288,6 @@ def interpolate_1d( **kwargs, ) - result[preserve_nans] = np.nan return result @@ -666,10 +650,11 @@ def clean_reindex_fill_method(method): return clean_fill_method(method, allow_nearest=True) -def _interp_limit(invalid, fw_limit, bw_limit): +def _interp_limit( + invalid: np.ndarray, fw_limit: Optional[int], bw_limit: Optional[int] +) -> np.ndarray: """ - Get indexers of values that won't be filled - because they exceed the limits. + Update mask to exclude elements not within limits Parameters ---------- @@ -681,68 +666,121 @@ def _interp_limit(invalid, fw_limit, bw_limit): Returns ------- - set of indexers + boolean ndarray Notes ----- - This is equivalent to the more readable, but slower + There follows a description of the implementation used for creating a mask + for forward interpolation with a limit. To create a backwards fill, we first + reverse the array and use the same algorithm. + To fill in both directions we combine the masks from both forward and backwards + fills. - .. code-block:: python + Say we start with the following array - def _interp_limit(invalid, fw_limit, bw_limit): - for x in np.where(invalid)[0]: - if invalid[max(0, x - fw_limit):x + bw_limit + 1].all(): - yield x - """ - # handle forward first; the backward direction is the same except - # 1. operate on the reversed array - # 2. subtract the returned indices from N - 1 - N = len(invalid) - f_idx = set() - b_idx = set() - - def inner(invalid, limit): - limit = min(limit, N) - windowed = _rolling_window(invalid, limit + 1).all(1) - idx = set(np.where(windowed)[0] + limit) | set( - np.where((~invalid[: limit + 1]).cumsum() == 0)[0] - ) - return idx + array([nan, nan, 1., 3., nan, nan, nan, 11., nan, nan]) - if fw_limit is not None: + create (or get from masked arrays) a boolean array of missing values - if fw_limit == 0: - f_idx = set(np.where(invalid)[0]) - else: - f_idx = inner(invalid, fw_limit) + >>> arr = pd.core.missing.isna(arr) + >>> arr + array([ True, True, False, False, True, True, True, False, True, + True]) - if bw_limit is not None: + we convert the boolean array to integer array for counting the streaks - if bw_limit == 0: - # then we don't even need to care about backwards - # just use forwards - return f_idx - else: - b_idx = list(inner(invalid[::-1], bw_limit)) - b_idx = set(N - 1 - np.asarray(b_idx)) - if fw_limit == 0: - return b_idx + >>> arr = arr.astype(int) + >>> arr + array([1, 1, 0, 0, 1, 1, 1, 0, 1, 1]) - return f_idx & b_idx + cumsum will get us off to a good start, we store this as we will need this later + >>> cumsum = arr.cumsum() + >>> cumsum + array([1, 2, 2, 2, 3, 4, 5, 5, 6, 7], dtype=int32) -def _rolling_window(a, window): - """ - [True, True, False, True, False], 2 -> + multiplying this accumulation with the original array of ones to get non-zero + values where we originally had ones - [ - [True, True], - [True, False], - [False, True], - [True, False], - ] + >>> arr = cumsum * arr + >>> arr + array([1, 2, 0, 0, 3, 4, 5, 0, 6, 7]) + + the previous result is close to what we want, but we want to restart + each streak at one. start by using the diff method to substract the previous + value for each element + + >>> arr = np.diff(arr, prepend=0) + >>> arr + array([ 1, 1, -2, 0, 3, 1, 1, -5, 6, 1]) + + a negative value now represents the end of a streak of missing values + so let's first select just the negative values + + >>> arr = np.where(arr < 0, arr, 0) + >>> arr + array([ 0, 0, -2, 0, 0, 0, 0, -5, 0, 0]) + + we will need to propegate the negative values + + >>> arr = np.minimum.accumulate(arr) + >>> arr + array([ 0, 0, -2, -2, -2, -2, -2, -5, -5, -5], dtype=int32) + + and then subtract the excess accumlation + + >>> arr = arr + cumsum + >>> arr + array([1, 2, 0, 0, 1, 2, 3, 0, 1, 2], dtype=int32) + + remember that positive values represent missing values and zeros represent + valid values. We have a array with some missing values at the start. For a + forward fill algorithm, we want to update the mask to leave these missing + values unchanged. + + >>> arr[: arr.argmin()] = 0 + >>> arr + array([0, 0, 0, 0, 1, 2, 3, 0, 1, 2], dtype=int32) + + we will now select only values within a set limit, say 2 + + >>> arr = np.where(arr > 2, 0, arr) + >>> arr + array([0, 0, 0, 0, 1, 2, 0, 0, 1, 2], dtype=int32) + + and finally convert back into a boolean mask + + >>> arr.astype(bool) + array([ False, False, False, False, True, True, False, False, True, + True]) """ - # https://stackoverflow.com/a/6811241 - shape = a.shape[:-1] + (a.shape[-1] - window + 1, window) - strides = a.strides + (a.strides[-1],) - return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides) + + def inner(arr, limit): + arr = arr.astype(int) + arr[: arr.argmin()] = 0 + if limit: + cumsum = arr.cumsum() + arr = cumsum * arr + arr = np.diff(arr) + arr = np.pad(arr, (1, 0), mode="constant") + arr = np.where(arr < 0, arr, 0) + arr = np.minimum.accumulate(arr) + arr = arr + cumsum + arr = np.where(arr > limit, 0, arr) + return arr.astype(bool) + + if fw_limit == 0: + f_idx = invalid + else: + f_idx = inner(invalid, fw_limit) + + if bw_limit == 0: + # then we don't even need to care about backwards + # just use forwards + return f_idx + else: + b_idx = inner(invalid[::-1], bw_limit)[::-1] + if fw_limit == 0: + return b_idx + + return f_idx | b_idx