diff --git a/doc/source/computation.rst b/doc/source/computation.rst index ebda0cde9fb5c..79d85ae9586ed 100644 --- a/doc/source/computation.rst +++ b/doc/source/computation.rst @@ -240,7 +240,11 @@ accept the following arguments: or :ref:`DateOffset ` to pre-conform the data to. Note that prior to pandas v0.8.0, a keyword argument ``time_rule`` was used instead of ``freq`` that referred to the legacy time rule constants - + - ``how``: optionally specify method for down or re-sampling. Default is + is min for ``rolling_min``, max for ``rolling_max``, median for + ``rolling_median``, and mean for all other rolling functions. See + :meth:`DataFrame.resample`'s how argument for more information. + These functions can be applied to ndarrays or Series objects: .. ipython:: python diff --git a/doc/source/release.rst b/doc/source/release.rst index 03b89f9077994..caf0afd830217 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -265,6 +265,8 @@ Improvements to existing features - ``Float64Index`` is now backed by a ``float64`` dtype ndarray instead of an ``object`` dtype array (:issue:`6471`). - Add option to turn off escaping in ``DataFrame.to_latex`` (:issue:`6472`) +- Added ``how`` option to rolling-moment functions to dictate how to handle resampling; :func:``rolling_max`` defaults to max, + :func:``rolling_min`` defaults to min, and all others default to mean (:issue:`6297`) .. _release.bug_fixes-0.14.0: diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index 187a757f53d45..9e776201a5cc9 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -471,6 +471,8 @@ Enhancements - ``Float64Index`` is now backed by a ``float64`` dtype ndarray instead of an ``object`` dtype array (:issue:`6471`). - Implemented ``Panel.pct_change`` (:issue:`6904`) +- Added ``how`` option to rolling-moment functions to dictate how to handle resampling; :func:``rolling_max`` defaults to max, + :func:``rolling_min`` defaults to min, and all others default to mean (:issue:`6297`) Performance ~~~~~~~~~~~ diff --git a/pandas/stats/moments.py b/pandas/stats/moments.py index 3f6352d5cbfbf..246037c7d7009 100644 --- a/pandas/stats/moments.py +++ b/pandas/stats/moments.py @@ -57,6 +57,8 @@ as a frequency string or DateOffset object. center : boolean, default False Set the labels at the center of the window. +how : string, default '%s' + Method for down- or re-sampling """ _roll_notes = r""" @@ -85,6 +87,8 @@ adjust : boolean, default True Divide by decaying adjustment factor in beginning periods to account for imbalance in relative weightings (viewing EWMA as a moving average) +how : string, default 'mean' + Method for down- or re-sampling """ _ewm_notes = r""" @@ -148,7 +152,7 @@ """ -def rolling_count(arg, window, freq=None, center=False): +def rolling_count(arg, window, freq=None, center=False, how=None): """ Rolling count of number of non-NaN observations inside provided window. @@ -163,6 +167,8 @@ def rolling_count(arg, window, freq=None, center=False): as a frequency string or DateOffset object. center : boolean, default False Whether the label should correspond with center of window + how : string, default 'mean' + Method for down- or re-sampling Returns ------- @@ -174,7 +180,7 @@ def rolling_count(arg, window, freq=None, center=False): frequency by resampling the data. This is done with the default parameters of :meth:`~pandas.Series.resample` (i.e. using the `mean`). """ - arg = _conv_timerule(arg, freq) + arg = _conv_timerule(arg, freq, how) window = min(window, len(arg)) return_hook, values = _process_data_structure(arg, kill_inf=False) @@ -190,10 +196,10 @@ def rolling_count(arg, window, freq=None, center=False): @Substitution("Unbiased moving covariance.", _binary_arg_flex, - _roll_kw+_pairwise_kw, _flex_retval, _roll_notes) + _roll_kw%'None'+_pairwise_kw, _flex_retval, _roll_notes) @Appender(_doc_template) def rolling_cov(arg1, arg2=None, window=None, min_periods=None, freq=None, - center=False, pairwise=None): + center=False, pairwise=None, how=None): if window is None and isinstance(arg2, (int, float)): window = arg2 arg2 = arg1 @@ -201,8 +207,8 @@ def rolling_cov(arg1, arg2=None, window=None, min_periods=None, freq=None, elif arg2 is None: arg2 = arg1 pairwise = True if pairwise is None else pairwise # only default unset - arg1 = _conv_timerule(arg1, freq) - arg2 = _conv_timerule(arg2, freq) + arg1 = _conv_timerule(arg1, freq, how) + arg2 = _conv_timerule(arg2, freq, how) window = min(window, len(arg1), len(arg2)) def _get_cov(X, Y): @@ -215,10 +221,10 @@ def _get_cov(X, Y): @Substitution("Moving sample correlation.", _binary_arg_flex, - _roll_kw+_pairwise_kw, _flex_retval, _roll_notes) + _roll_kw%'None'+_pairwise_kw, _flex_retval, _roll_notes) @Appender(_doc_template) def rolling_corr(arg1, arg2=None, window=None, min_periods=None, freq=None, - center=False, pairwise=None): + center=False, pairwise=None, how=None): if window is None and isinstance(arg2, (int, float)): window = arg2 arg2 = arg1 @@ -226,8 +232,8 @@ def rolling_corr(arg1, arg2=None, window=None, min_periods=None, freq=None, elif arg2 is None: arg2 = arg1 pairwise = True if pairwise is None else pairwise # only default unset - arg1 = _conv_timerule(arg1, freq) - arg2 = _conv_timerule(arg2, freq) + arg1 = _conv_timerule(arg1, freq, how) + arg2 = _conv_timerule(arg2, freq, how) window = min(window, len(arg1), len(arg2)) def _get_corr(a, b): @@ -289,7 +295,7 @@ def _flex_binary_moment(arg1, arg2, f, pairwise=False): @Substitution("Deprecated. Use rolling_corr(..., pairwise=True) instead.\n\n" "Pairwise moving sample correlation", _pairwise_arg, - _roll_kw, _pairwise_retval, _roll_notes) + _roll_kw%'None', _pairwise_retval, _roll_notes) @Appender(_doc_template) def rolling_corr_pairwise(df1, df2=None, window=None, min_periods=None, freq=None, center=False): @@ -301,7 +307,7 @@ def rolling_corr_pairwise(df1, df2=None, window=None, min_periods=None, def _rolling_moment(arg, window, func, minp, axis=0, freq=None, center=False, - args=(), kwargs={}, **kwds): + how=None, args=(), kwargs={}, **kwds): """ Rolling statistical measure using supplied function. Designed to be used with passed-in Cython array-based functions. @@ -318,6 +324,8 @@ def _rolling_moment(arg, window, func, minp, axis=0, freq=None, center=False, Frequency to conform to before computing statistic center : boolean, default False Whether the label should correspond with center of window + how : string, default 'mean' + Method for down- or re-sampling args : tuple Passed on to func kwargs : dict @@ -327,7 +335,7 @@ def _rolling_moment(arg, window, func, minp, axis=0, freq=None, center=False, ------- y : type of input """ - arg = _conv_timerule(arg, freq) + arg = _conv_timerule(arg, freq, how) calc = lambda x: func(x, window, minp=minp, args=args, kwargs=kwargs, **kwds) return_hook, values = _process_data_structure(arg) @@ -413,9 +421,9 @@ def _get_center_of_mass(com, span, halflife): _type_of_input_retval, _ewm_notes) @Appender(_doc_template) def ewma(arg, com=None, span=None, halflife=None, min_periods=0, freq=None, - adjust=True): + adjust=True, how=None): com = _get_center_of_mass(com, span, halflife) - arg = _conv_timerule(arg, freq) + arg = _conv_timerule(arg, freq, how) def _ewma(v): result = algos.ewma(v, com, int(adjust)) @@ -437,9 +445,9 @@ def _first_valid_index(arr): _ewm_kw+_bias_kw, _type_of_input_retval, _ewm_notes) @Appender(_doc_template) def ewmvar(arg, com=None, span=None, halflife=None, min_periods=0, bias=False, - freq=None): + freq=None, how=None): com = _get_center_of_mass(com, span, halflife) - arg = _conv_timerule(arg, freq) + arg = _conv_timerule(arg, freq, how) moment2nd = ewma(arg * arg, com=com, min_periods=min_periods) moment1st = ewma(arg, com=com, min_periods=min_periods) @@ -465,7 +473,7 @@ def ewmstd(arg, com=None, span=None, halflife=None, min_periods=0, bias=False): _ewm_kw+_pairwise_kw, _type_of_input_retval, _ewm_notes) @Appender(_doc_template) def ewmcov(arg1, arg2=None, com=None, span=None, halflife=None, min_periods=0, - bias=False, freq=None, pairwise=None): + bias=False, freq=None, pairwise=None, how=None): if arg2 is None: arg2 = arg1 pairwise = True if pairwise is None else pairwise @@ -473,8 +481,8 @@ def ewmcov(arg1, arg2=None, com=None, span=None, halflife=None, min_periods=0, com = arg2 arg2 = arg1 pairwise = True if pairwise is None else pairwise - arg1 = _conv_timerule(arg1, freq) - arg2 = _conv_timerule(arg2, freq) + arg1 = _conv_timerule(arg1, freq, how) + arg2 = _conv_timerule(arg2, freq, how) def _get_ewmcov(X, Y): mean = lambda x: ewma(x, com=com, span=span, halflife=halflife, min_periods=min_periods) @@ -492,7 +500,7 @@ def _get_ewmcov(X, Y): _ewm_kw+_pairwise_kw, _type_of_input_retval, _ewm_notes) @Appender(_doc_template) def ewmcorr(arg1, arg2=None, com=None, span=None, halflife=None, min_periods=0, - freq=None, pairwise=None): + freq=None, pairwise=None, how=None): if arg2 is None: arg2 = arg1 pairwise = True if pairwise is None else pairwise @@ -500,8 +508,8 @@ def ewmcorr(arg1, arg2=None, com=None, span=None, halflife=None, min_periods=0, com = arg2 arg2 = arg1 pairwise = True if pairwise is None else pairwise - arg1 = _conv_timerule(arg1, freq) - arg2 = _conv_timerule(arg2, freq) + arg1 = _conv_timerule(arg1, freq, how) + arg2 = _conv_timerule(arg2, freq, how) def _get_ewmcorr(X, Y): mean = lambda x: ewma(x, com=com, span=span, halflife=halflife, min_periods=min_periods) @@ -541,12 +549,12 @@ def _prep_binary(arg1, arg2): # Python interface to Cython functions -def _conv_timerule(arg, freq): +def _conv_timerule(arg, freq, how): types = (DataFrame, Series) if freq is not None and isinstance(arg, types): # Conform to whatever frequency needed. - arg = arg.resample(freq) + arg = arg.resample(freq, how=how) return arg @@ -567,25 +575,32 @@ def _use_window(minp, window): return minp -def _rolling_func(func, desc, check_minp=_use_window): - @Substitution(desc, _unary_arg, _roll_kw, _type_of_input_retval, _roll_notes) +def _rolling_func(func, desc, check_minp=_use_window, how=None): + if how is None: + how_arg_str = 'None' + else: + how_arg_str = "'%s"%how + + @Substitution(desc, _unary_arg, _roll_kw%how_arg_str, _type_of_input_retval, + _roll_notes) @Appender(_doc_template) @wraps(func) - def f(arg, window, min_periods=None, freq=None, center=False, + def f(arg, window, min_periods=None, freq=None, center=False, how=how, **kwargs): def call_cython(arg, window, minp, args=(), kwargs={}, **kwds): minp = check_minp(minp, window) return func(arg, window, minp, **kwds) return _rolling_moment(arg, window, call_cython, min_periods, freq=freq, - center=center, **kwargs) + center=center, how=how, **kwargs) return f -rolling_max = _rolling_func(algos.roll_max2, 'Moving maximum.') -rolling_min = _rolling_func(algos.roll_min2, 'Moving minimum.') +rolling_max = _rolling_func(algos.roll_max2, 'Moving maximum.', how='max') +rolling_min = _rolling_func(algos.roll_min2, 'Moving minimum.', how='min') rolling_sum = _rolling_func(algos.roll_sum, 'Moving sum.') rolling_mean = _rolling_func(algos.roll_mean, 'Moving mean.') -rolling_median = _rolling_func(algos.roll_median_cython, 'Moving median.') +rolling_median = _rolling_func(algos.roll_median_cython, 'Moving median.', + how='median') _ts_std = lambda *a, **kw: _zsqrt(algos.roll_var(*a, **kw)) rolling_std = _rolling_func(_ts_std, 'Unbiased moving standard deviation.', @@ -687,7 +702,7 @@ def call_cython(arg, window, minp, args, kwargs): def rolling_window(arg, window=None, win_type=None, min_periods=None, freq=None, center=False, mean=True, - axis=0, **kwargs): + axis=0, how=None, **kwargs): """ Applies a moving window of type ``window_type`` and size ``window`` on the data. @@ -711,6 +726,8 @@ def rolling_window(arg, window=None, win_type=None, min_periods=None, mean : boolean, default True If True computes weighted mean, else weighted sum axis : {0, 1}, default 0 + how : string, default 'mean' + Method for down- or re-sampling Returns ------- @@ -761,7 +778,7 @@ def rolling_window(arg, window=None, win_type=None, min_periods=None, minp = _use_window(min_periods, len(window)) - arg = _conv_timerule(arg, freq) + arg = _conv_timerule(arg, freq, how) return_hook, values = _process_data_structure(arg) f = lambda x: algos.roll_window(x, window, minp, avg=mean) diff --git a/pandas/stats/tests/test_moments.py b/pandas/stats/tests/test_moments.py index 22661ea7cacda..8c9eb080cfc61 100644 --- a/pandas/stats/tests/test_moments.py +++ b/pandas/stats/tests/test_moments.py @@ -848,6 +848,97 @@ def _check_expanding(self, func, static_comp, has_min_periods=True, preserve_nan=preserve_nan) self._check_expanding_structures(func) + def test_rolling_max_gh6297(self): + """Replicate result expected in GH #6297""" + + indices = [datetime(1975, 1, i) for i in range(1, 6)] + # So that we can have 2 datapoints on one of the days + indices.append(datetime(1975, 1, 3, 6, 0)) + series = Series(range(1, 7), index=indices) + # Use floats instead of ints as values + series = series.map(lambda x: float(x)) + # Sort chronologically + series = series.sort_index() + + expected = Series([1.0, 2.0, 6.0, 4.0, 5.0], + index=[datetime(1975, 1, i, 0) + for i in range(1, 6)]) + x = mom.rolling_max(series, window=1, freq='D') + assert_series_equal(expected, x) + + def test_rolling_max_how_resample(self): + + indices = [datetime(1975, 1, i) for i in range(1, 6)] + # So that we can have 3 datapoints on last day (4, 10, and 20) + indices.append(datetime(1975, 1, 5, 1)) + indices.append(datetime(1975, 1, 5, 2)) + series = Series(list(range(0, 5)) + [10, 20], index=indices) + # Use floats instead of ints as values + series = series.map(lambda x: float(x)) + # Sort chronologically + series = series.sort_index() + + # Default how should be max + expected = Series([0.0, 1.0, 2.0, 3.0, 20.0], + index=[datetime(1975, 1, i, 0) + for i in range(1, 6)]) + x = mom.rolling_max(series, window=1, freq='D') + assert_series_equal(expected, x) + + # Now specify median (10.0) + expected = Series([0.0, 1.0, 2.0, 3.0, 10.0], + index=[datetime(1975, 1, i, 0) + for i in range(1, 6)]) + x = mom.rolling_max(series, window=1, freq='D', how='median') + assert_series_equal(expected, x) + + # Now specify mean (4+10+20)/3 + v = (4.0+10.0+20.0)/3.0 + expected = Series([0.0, 1.0, 2.0, 3.0, v], + index=[datetime(1975, 1, i, 0) + for i in range(1, 6)]) + x = mom.rolling_max(series, window=1, freq='D', how='mean') + assert_series_equal(expected, x) + + + def test_rolling_min_how_resample(self): + + indices = [datetime(1975, 1, i) for i in range(1, 6)] + # So that we can have 3 datapoints on last day (4, 10, and 20) + indices.append(datetime(1975, 1, 5, 1)) + indices.append(datetime(1975, 1, 5, 2)) + series = Series(list(range(0, 5)) + [10, 20], index=indices) + # Use floats instead of ints as values + series = series.map(lambda x: float(x)) + # Sort chronologically + series = series.sort_index() + + # Default how should be min + expected = Series([0.0, 1.0, 2.0, 3.0, 4.0], + index=[datetime(1975, 1, i, 0) + for i in range(1, 6)]) + x = mom.rolling_min(series, window=1, freq='D') + assert_series_equal(expected, x) + + def test_rolling_median_how_resample(self): + + indices = [datetime(1975, 1, i) for i in range(1, 6)] + # So that we can have 3 datapoints on last day (4, 10, and 20) + indices.append(datetime(1975, 1, 5, 1)) + indices.append(datetime(1975, 1, 5, 2)) + series = Series(list(range(0, 5)) + [10, 20], index=indices) + # Use floats instead of ints as values + series = series.map(lambda x: float(x)) + # Sort chronologically + series = series.sort_index() + + # Default how should be median + expected = Series([0.0, 1.0, 2.0, 3.0, 10], + index=[datetime(1975, 1, i, 0) + for i in range(1, 6)]) + x = mom.rolling_median(series, window=1, freq='D') + assert_series_equal(expected, x) + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],