From 3c1410f56de973ea0e4932f37bb83461280b110e Mon Sep 17 00:00:00 2001 From: stahlous Date: Sun, 7 Sep 2014 12:59:56 -0700 Subject: [PATCH] BUG: rolling_window() properly averages weights in mean=True mode; removed scikits-timeseries dependency for testing; added further tests for rolling_window() --- doc/source/computation.rst | 17 ++- doc/source/v0.15.0.txt | 28 +++++ pandas/algos.pyx | 6 +- pandas/stats/tests/test_moments.py | 191 ++++++++++++++++++++++------- pandas/util/print_versions.py | 1 - 5 files changed, 192 insertions(+), 51 deletions(-) diff --git a/doc/source/computation.rst b/doc/source/computation.rst index b8559eb51ece8..56dc551268a37 100644 --- a/doc/source/computation.rst +++ b/doc/source/computation.rst @@ -310,7 +310,7 @@ keyword. The list of recognized types are: rolling_window(ser, 5, 'triang') -Note that the ``boxcar`` window is equivalent to ``rolling_mean``: +Note that the ``boxcar`` window is equivalent to ``rolling_mean``. .. ipython:: python @@ -336,6 +336,19 @@ This keyword is available in other rolling functions as well. rolling_mean(ser, 5, center=True) +.. _stats.moments.normalization: + +.. note:: + + In rolling sum mode (``mean=False``) there is no normalization done to the + weights. Passing custom weights of ``[1, 1, 1]`` will yield a different + result than passing weights of ``[2, 2, 2]``, for example. When passing a + ``win_type`` instead of explicitly specifying the weights, the weights are + already normalized so that the largest weight is 1. + + In contrast, the nature of the rolling mean calculation (``mean=True``) is + such that the weights are normalized with respect to each other. Weights + of ``[1, 1, 1]`` and ``[2, 2, 2]`` yield the same result. .. _stats.moments.binary: @@ -610,4 +623,4 @@ are scaled by debiasing factors (For :math:`w_i = 1`, this reduces to the usual :math:`N / (N - 1)` factor, with :math:`N = t + 1`.) 
See http://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Weighted_sample_variance -for further details. \ No newline at end of file +for further details. diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt index f49c919e80d50..73b8b7ddbcba3 100644 --- a/doc/source/v0.15.0.txt +++ b/doc/source/v0.15.0.txt @@ -383,6 +383,34 @@ Rolling/Expanding Moments API changes rolling_sum(Series(range(4)), window=3, min_periods=0, center=True) +- :func:`rolling_window` now normalizes the weights properly in rolling mean mode (``mean=True``) so that + the calculated weighted means (e.g. 'triang', 'gaussian') are distributed about the same means as those + calculated without weighting (i.e. 'boxcar'). See :ref:`the note on normalization + <stats.moments.normalization>` for further details. (:issue:`7618`) + + .. ipython:: python + + s = Series([10.5, 8.8, 11.4, 9.7, 9.3]) + + Behavior prior to 0.15.0: + + .. code-block:: python + + In [39]: rolling_window(s, window=3, win_type='triang', center=True) + Out[39]: + 0 NaN + 1 6.583333 + 2 6.883333 + 3 6.683333 + 4 NaN + dtype: float64 + + New behavior: + + .. 
ipython:: python + + rolling_window(s, window=3, win_type='triang', center=True) + - Removed ``center`` argument from :func:`expanding_max`, :func:`expanding_min`, :func:`expanding_sum`, :func:`expanding_mean`, :func:`expanding_median`, :func:`expanding_std`, :func:`expanding_var`, :func:`expanding_skew`, :func:`expanding_kurt`, :func:`expanding_quantile`, :func:`expanding_count`, diff --git a/pandas/algos.pyx b/pandas/algos.pyx index 8f37d76e50f9c..316a282b71609 100644 --- a/pandas/algos.pyx +++ b/pandas/algos.pyx @@ -1897,7 +1897,7 @@ def roll_generic(ndarray[float64_t, cast=True] input, def roll_window(ndarray[float64_t, ndim=1, cast=True] input, ndarray[float64_t, ndim=1, cast=True] weights, - int minp, bint avg=True, bint avg_wgt=False): + int minp, bint avg=True): """ Assume len(weights) << len(input) """ @@ -1915,7 +1915,7 @@ def roll_window(ndarray[float64_t, ndim=1, cast=True] input, minp = _check_minp(len(weights), minp, in_n) - if avg_wgt: + if avg: for win_i from 0 <= win_i < win_n: val_win = weights[win_i] if val_win != val_win: @@ -1956,8 +1956,6 @@ def roll_window(ndarray[float64_t, ndim=1, cast=True] input, c = counts[in_i] if c < minp: output[in_i] = NaN - elif avg: - output[in_i] /= c return output diff --git a/pandas/stats/tests/test_moments.py b/pandas/stats/tests/test_moments.py index 94c2521ff6938..fab25f955fa76 100644 --- a/pandas/stats/tests/test_moments.py +++ b/pandas/stats/tests/test_moments.py @@ -65,47 +65,40 @@ def test_rolling_mean(self): self._check_moment_func(mom.rolling_mean, np.mean) def test_cmov_mean(self): + # GH 8238 tm._skip_if_no_scipy() - try: - from scikits.timeseries.lib import cmov_mean - except ImportError: - raise nose.SkipTest("no scikits.timeseries") - vals = np.random.randn(10) - xp = cmov_mean(vals, 5) + vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, 13.49, + 16.68, 9.48, 10.63, 14.48]) + xp = np.array([np.nan, np.nan, 9.962, 11.27 , 11.564, 12.516, + 12.818, 12.952, np.nan, np.nan]) rs = mom.rolling_mean(vals, 
5, center=True) - assert_almost_equal(xp.compressed(), rs[2:-2]) - assert_almost_equal(xp.mask, np.isnan(rs)) + assert_almost_equal(xp, rs) xp = Series(rs) rs = mom.rolling_mean(Series(vals), 5, center=True) assert_series_equal(xp, rs) def test_cmov_window(self): + # GH 8238 tm._skip_if_no_scipy() - try: - from scikits.timeseries.lib import cmov_window - except ImportError: - raise nose.SkipTest("no scikits.timeseries") - vals = np.random.randn(10) - xp = cmov_window(vals, 5, 'boxcar') + vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, + 13.49, 16.68, 9.48, 10.63, 14.48]) + xp = np.array([np.nan, np.nan, 9.962, 11.27 , 11.564, 12.516, + 12.818, 12.952, np.nan, np.nan]) rs = mom.rolling_window(vals, 5, 'boxcar', center=True) - assert_almost_equal(xp.compressed(), rs[2:-2]) - assert_almost_equal(xp.mask, np.isnan(rs)) + assert_almost_equal(xp, rs) xp = Series(rs) rs = mom.rolling_window(Series(vals), 5, 'boxcar', center=True) assert_series_equal(xp, rs) def test_cmov_window_corner(self): + # GH 8238 tm._skip_if_no_scipy() - try: - from scikits.timeseries.lib import cmov_window - except ImportError: - raise nose.SkipTest("no scikits.timeseries") # all nan vals = np.empty(10, dtype=float) @@ -125,24 +118,37 @@ def test_cmov_window_corner(self): self.assertEqual(len(rs), 5) def test_cmov_window_frame(self): + # Gh 8238 tm._skip_if_no_scipy() - try: - from scikits.timeseries.lib import cmov_window - except ImportError: - raise nose.SkipTest("no scikits.timeseries") + + vals = np.array([[ 12.18, 3.64], + [ 10.18, 9.16], + [ 13.24, 14.61], + [ 4.51, 8.11], + [ 6.15, 11.44], + [ 9.14, 6.21], + [ 11.31, 10.67], + [ 2.94, 6.51], + [ 9.42, 8.39], + [ 12.44, 7.34 ]]) + + xp = np.array([[ np.nan, np.nan], + [ np.nan, np.nan], + [ 9.252, 9.392], + [ 8.644, 9.906], + [ 8.87 , 10.208], + [ 6.81 , 8.588], + [ 7.792, 8.644], + [ 9.05 , 7.824], + [ np.nan, np.nan], + [ np.nan, np.nan]]) # DataFrame - vals = np.random.randn(10, 2) - xp = cmov_window(vals, 5, 'boxcar') rs = 
mom.rolling_window(DataFrame(vals), 5, 'boxcar', center=True) assert_frame_equal(DataFrame(xp), rs) def test_cmov_window_na_min_periods(self): tm._skip_if_no_scipy() - try: - from scikits.timeseries.lib import cmov_window - except ImportError: - raise nose.SkipTest("no scikits.timeseries") # min_periods vals = Series(np.random.randn(10)) @@ -155,39 +161,136 @@ def test_cmov_window_na_min_periods(self): assert_series_equal(xp, rs) def test_cmov_window_regular(self): + # GH 8238 tm._skip_if_no_scipy() - try: - from scikits.timeseries.lib import cmov_window - except ImportError: - raise nose.SkipTest("no scikits.timeseries") win_types = ['triang', 'blackman', 'hamming', 'bartlett', 'bohman', 'blackmanharris', 'nuttall', 'barthann'] + + vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, + 13.49, 16.68, 9.48, 10.63, 14.48]) + xps = { + 'hamming': [np.nan, np.nan, 8.71384, 9.56348, 12.38009, + 14.03687, 13.8567, 11.81473, np.nan, np.nan], + 'triang': [np.nan, np.nan, 9.28667, 10.34667, 12.00556, + 13.33889, 13.38, 12.33667, np.nan, np.nan], + 'barthann': [np.nan, np.nan, 8.4425, 9.1925, 12.5575, + 14.3675, 14.0825, 11.5675, np.nan, np.nan], + 'bohman': [np.nan, np.nan, 7.61599, 9.1764, 12.83559, + 14.17267, 14.65923, 11.10401, np.nan, np.nan], + 'blackmanharris': [np.nan, np.nan, 6.97691, 9.16438, 13.05052, + 14.02156, 15.10512, 10.74574, np.nan, np.nan], + 'nuttall': [np.nan, np.nan, 7.04618, 9.16786, 13.02671, + 14.03559, 15.05657, 10.78514, np.nan, np.nan], + 'blackman': [np.nan, np.nan, 7.73345, 9.17869, 12.79607, + 14.20036, 14.57726, 11.16988, np.nan, np.nan], + 'bartlett': [np.nan, np.nan, 8.4425, 9.1925, 12.5575, + 14.3675, 14.0825, 11.5675, np.nan, np.nan]} + for wt in win_types: - vals = np.random.randn(10) - xp = cmov_window(vals, 5, wt) + xp = Series(xps[wt]) + rs = mom.rolling_window(Series(vals), 5, wt, center=True) + assert_series_equal(xp, rs) + + def test_cmov_window_regular_linear_range(self): + # GH 8238 + tm._skip_if_no_scipy() + win_types = ['triang', 
'blackman', 'hamming', 'bartlett', 'bohman', + 'blackmanharris', 'nuttall', 'barthann'] + + vals = np.array(range(10), dtype=np.float) + xp = vals.copy() + xp[:2] = np.nan + xp[-2:] = np.nan + xp = Series(xp) + + for wt in win_types: rs = mom.rolling_window(Series(vals), 5, wt, center=True) - assert_series_equal(Series(xp), rs) + assert_series_equal(xp, rs) + + def test_cmov_window_regular_missing_data(self): + # GH 8238 + tm._skip_if_no_scipy() + + win_types = ['triang', 'blackman', 'hamming', 'bartlett', 'bohman', + 'blackmanharris', 'nuttall', 'barthann'] + + vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, + 13.49, 16.68, np.nan, 10.63, 14.48]) + xps = { + 'bartlett': [np.nan, np.nan, 9.70333, 10.5225, 8.4425, + 9.1925, 12.5575, 14.3675, 15.61667, 13.655], + 'blackman': [np.nan, np.nan, 9.04582, 11.41536, 7.73345, + 9.17869, 12.79607, 14.20036, 15.8706, 13.655], + 'barthann': [np.nan, np.nan, 9.70333, 10.5225, 8.4425, + 9.1925, 12.5575, 14.3675, 15.61667, 13.655], + 'bohman': [np.nan, np.nan, 8.9444, 11.56327, 7.61599, + 9.1764, 12.83559, 14.17267, 15.90976, 13.655], + 'hamming': [np.nan, np.nan, 9.59321, 10.29694, 8.71384, + 9.56348, 12.38009, 14.20565, 15.24694, 13.69758], + 'nuttall': [np.nan, np.nan, 8.47693, 12.2821, 7.04618, + 9.16786, 13.02671, 14.03673, 16.08759, 13.65553], + 'triang': [np.nan, np.nan, 9.33167, 9.76125, 9.28667, + 10.34667, 12.00556, 13.82125, 14.49429, 13.765], + 'blackmanharris': [np.nan, np.nan, 8.42526, 12.36824, 6.97691, + 9.16438, 13.05052, 14.02175, 16.1098, + 13.65509] + } + + for wt in win_types: + xp = Series(xps[wt]) + rs = mom.rolling_window(Series(vals), 5, wt, min_periods=3) + assert_series_equal(xp, rs) def test_cmov_window_special(self): + # GH 8238 tm._skip_if_no_scipy() - try: - from scikits.timeseries.lib import cmov_window - except ImportError: - raise nose.SkipTest("no scikits.timeseries") win_types = ['kaiser', 'gaussian', 'general_gaussian', 'slepian'] kwds = [{'beta': 1.}, {'std': 1.}, {'power': 2., 'width': 2.}, 
{'width': 0.5}] + vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, + 13.49, 16.68, 9.48, 10.63, 14.48]) + + xps = { + 'gaussian': [np.nan, np.nan, 8.97297, 9.76077, 12.24763, + 13.89053, 13.65671, 12.01002, np.nan, np.nan], + 'general_gaussian': [np.nan, np.nan, 9.85011, 10.71589, + 11.73161, 13.08516, 12.95111, 12.74577, + np.nan, np.nan], + 'slepian': [np.nan, np.nan, 9.81073, 10.89359, 11.70284, + 12.88331, 12.96079, 12.77008, np.nan, np.nan], + 'kaiser': [np.nan, np.nan, 9.86851, 11.02969, 11.65161, + 12.75129, 12.90702, 12.83757, np.nan, np.nan] + } + for wt, k in zip(win_types, kwds): - vals = np.random.randn(10) - xp = cmov_window(vals, 5, (wt,) + tuple(k.values())) + xp = Series(xps[wt]) rs = mom.rolling_window(Series(vals), 5, wt, center=True, **k) - assert_series_equal(Series(xp), rs) + assert_series_equal(xp, rs) + + def test_cmov_window_special_linear_range(self): + # GH 8238 + tm._skip_if_no_scipy() + + win_types = ['kaiser', 'gaussian', 'general_gaussian', 'slepian'] + kwds = [{'beta': 1.}, {'std': 1.}, {'power': 2., 'width': 2.}, + {'width': 0.5}] + + vals = np.array(range(10), dtype=np.float) + xp = vals.copy() + xp[:2] = np.nan + xp[-2:] = np.nan + xp = Series(xp) + + for wt, k in zip(win_types, kwds): + rs = mom.rolling_window(Series(vals), 5, wt, center=True, + **k) + assert_series_equal(xp, rs) def test_rolling_median(self): self._check_moment_func(mom.rolling_median, np.median) diff --git a/pandas/util/print_versions.py b/pandas/util/print_versions.py index 3d793698c7caa..d3dbeef1af4d2 100644 --- a/pandas/util/print_versions.py +++ b/pandas/util/print_versions.py @@ -68,7 +68,6 @@ def show_versions(as_json=False): ("IPython", lambda mod: mod.__version__), ("sphinx", lambda mod: mod.__version__), ("patsy", lambda mod: mod.__version__), - ("scikits.timeseries", lambda mod: mod.__version__), ("dateutil", lambda mod: mod.__version__), ("pytz", lambda mod: mod.VERSION), ("bottleneck", lambda mod: mod.__version__),