diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 354c510b843dd..8909f5b33066b 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -1888,6 +1888,34 @@ Those two examples are equivalent for this time series: Note the use of ``'start'`` for ``origin`` on the last example. In that case, ``origin`` will be set to the first value of the timeseries. +Backward resample +~~~~~~~~~~~~~~~~~ + +.. versionadded:: 1.3.0 + +Instead of adjusting the beginning of bins, sometimes we need to fix the end of the bins to make a backward resample with a given ``freq``. The backward resample sets ``closed`` to ``'right'`` by default since the last value should be considered as the edge point for the last bin. + +We can set ``origin`` to ``'end'``. The value for a specific ``Timestamp`` index stands for the resample result from the current ``Timestamp`` minus ``freq`` to the current ``Timestamp``, closed on the right. + +.. ipython:: python + + ts.resample('17min', origin='end').sum() + +Besides, in contrast with the ``'start_day'`` option, ``'end_day'`` is supported. This will set the origin as the ceiling midnight of the largest ``Timestamp``. + +.. ipython:: python + + ts.resample('17min', origin='end_day').sum() + +The above result uses ``2000-10-02 00:29:00`` as the last bin's right edge, as the following computation shows. + +.. ipython:: python + + ceil_mid = rng.max().ceil('D') + freq = pd.offsets.Minute(17) + bin_res = ceil_mid - freq * ((ceil_mid - rng.max()) // freq) + bin_res + .. 
_timeseries.periods: Time span representation diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 7671962018144..da9740f9dd58b 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -40,6 +40,7 @@ Other enhancements ^^^^^^^^^^^^^^^^^^ - Added :meth:`MultiIndex.dtypes` (:issue:`37062`) +- Added ``end`` and ``end_day`` options for ``origin`` in :meth:`DataFrame.resample` (:issue:`37804`) - Improve error message when ``usecols`` and ``names`` do not match for :func:`read_csv` and ``engine="c"`` (:issue:`29042`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ee09d4d7a274b..ebf311ae429cb 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8050,7 +8050,8 @@ def resample( level : str or int, optional For a MultiIndex, level (name or number) to use for resampling. `level` must be datetime-like. - origin : {{'epoch','start','start_day'}}, Timestamp or str, default 'start_day' + origin : {{'epoch', 'start', 'start_day', 'end', 'end_day'}}, Timestamp + or str, default 'start_day' The timestamp on which to adjust the grouping. The timezone of origin must match the timezone of the index. If a timestamp is not used, these values are also supported: @@ -8061,6 +8062,11 @@ def resample( .. versionadded:: 1.1.0 + - 'end': `origin` is the last value of the timeseries + - 'end_day': `origin` is the ceiling midnight of the last day + + .. versionadded:: 1.3.0 + offset : Timedelta or str, default is None An offset timedelta added to the origin. 
@@ -8343,6 +8349,26 @@ def resample( 2000-10-02 00:21:00 24 Freq: 17T, dtype: int64 + If you want to take the largest Timestamp as the end of the bins: + + >>> ts.resample('17min', origin='end').sum() + 2000-10-01 23:35:00 0 + 2000-10-01 23:52:00 18 + 2000-10-02 00:09:00 27 + 2000-10-02 00:26:00 63 + Freq: 17T, dtype: int64 + + In contrast with the `start_day`, you can use `end_day` to take the ceiling + midnight of the largest Timestamp as the end of the bins and drop the bins + not containing data: + + >>> ts.resample('17min', origin='end_day').sum() + 2000-10-01 23:38:00 3 + 2000-10-01 23:55:00 15 + 2000-10-02 00:12:00 45 + 2000-10-02 00:29:00 45 + Freq: 17T, dtype: int64 + To replace the use of the deprecated `base` argument, you can now use `offset`, in this example it is equivalent to have `base=2`: diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 820952d6e85f3..1e6645686f93f 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -82,7 +82,8 @@ class Grouper: However, loffset is also deprecated for ``.resample(...)`` See: :class:`DataFrame.resample` - origin : {'epoch', 'start', 'start_day'}, Timestamp or str, default 'start_day' + origin : {'epoch', 'start', 'start_day', 'end', 'end_day'}, Timestamp + or str, default 'start_day' The timestamp on which to adjust the grouping. The timezone of origin must match the timezone of the index. If a timestamp is not used, these values are also supported: @@ -93,6 +94,11 @@ class Grouper: .. versionadded:: 1.1.0 + - 'end': `origin` is the last value of the timeseries + - 'end_day': `origin` is the ceiling midnight of the last day + + .. versionadded:: 1.3.0 + offset : Timedelta or str, default is None An offset timedelta added to the origin. 
diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 01a9cfd0c532b..b8b372e7666b8 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1388,10 +1388,22 @@ def __init__( if label is None: label = "right" else: - if closed is None: - closed = "left" - if label is None: - label = "left" + # The backward resample sets ``closed`` to ``'right'`` by default + # since the last value should be considered as the edge point for + # the last bin. When origin is "end" or "end_day", the value for a + # specific ``Timestamp`` index stands for the resample result from + # the current ``Timestamp`` minus ``freq`` to the current + # ``Timestamp``, closed on the right. + if origin in ["end", "end_day"]: + if closed is None: + closed = "right" + if label is None: + label = "right" + else: + if closed is None: + closed = "left" + if label is None: + label = "left" self.closed = closed self.label = label @@ -1404,14 +1416,15 @@ def __init__( self.fill_method = fill_method self.limit = limit - if origin in ("epoch", "start", "start_day"): + if origin in ("epoch", "start", "start_day", "end", "end_day"): self.origin = origin else: try: self.origin = Timestamp(origin) except Exception as e: raise ValueError( - "'origin' should be equal to 'epoch', 'start', 'start_day' or " + "'origin' should be equal to 'epoch', 'start', 'start_day', " + "'end', 'end_day' or " f"should be a Timestamp convertible type. Got '{origin}' instead." ) from e @@ -1846,6 +1859,13 @@ def _adjust_dates_anchored( origin_nanos = first.value elif isinstance(origin, Timestamp): origin_nanos = origin.value + elif origin in ["end", "end_day"]: + origin = last if origin == "end" else last.ceil("D") + sub_freq_times = (origin.value - first.value) // freq.nanos + if closed == "left": + sub_freq_times += 1 + first = origin - sub_freq_times * freq + origin_nanos = first.value origin_nanos += offset.value if offset else 0 # GH 10117 & GH 19375. 
If first and last contain timezone information, diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 8bf40c924ec86..c23a22448fbb0 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -772,8 +772,9 @@ def test_resample_bad_origin(origin): rng = date_range("2000-01-01 00:00:00", "2000-01-01 02:00", freq="s") ts = Series(np.random.randn(len(rng)), index=rng) msg = ( - "'origin' should be equal to 'epoch', 'start', 'start_day' or " - f"should be a Timestamp convertible type. Got '{origin}' instead." + "'origin' should be equal to 'epoch', 'start', 'start_day', " + "'end', 'end_day' or should be a Timestamp convertible type. Got " + f"'{origin}' instead." ) with pytest.raises(ValueError, match=msg): ts.resample("5min", origin=origin) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 5588b185793cc..2cd9bb70385bf 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -611,3 +611,80 @@ def test_resample_agg_readonly(): result = rs.agg("min") tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "start,end,freq,data,resample_freq,origin,closed,exp_data,exp_end,exp_periods", + [ + ( + "2000-10-01 23:30:00", + "2000-10-02 00:26:00", + "7min", + [0, 3, 6, 9, 12, 15, 18, 21, 24], + "17min", + "end", + None, + [0, 18, 27, 63], + "20001002 00:26:00", + 4, + ), + ( + "20200101 8:26:35", + "20200101 9:31:58", + "77s", + [1] * 51, + "7min", + "end", + "right", + [1, 6, 5, 6, 5, 6, 5, 6, 5, 6], + "2020-01-01 09:30:45", + 10, + ), + ( + "2000-10-01 23:30:00", + "2000-10-02 00:26:00", + "7min", + [0, 3, 6, 9, 12, 15, 18, 21, 24], + "17min", + "end", + "left", + [0, 18, 27, 39, 24], + "20001002 00:43:00", + 5, + ), + ( + "2000-10-01 23:30:00", + "2000-10-02 00:26:00", + "7min", + [0, 3, 6, 9, 12, 15, 18, 21, 24], + "17min", + "end_day", + None, 
+ [3, 15, 45, 45], + "2000-10-02 00:29:00", + 4, + ), + ], +) +def test_end_and_end_day_origin( + start, + end, + freq, + data, + resample_freq, + origin, + closed, + exp_data, + exp_end, + exp_periods, +): + rng = date_range(start, end, freq=freq) + ts = Series(data, index=rng) + + res = ts.resample(resample_freq, origin=origin, closed=closed).sum() + expected = Series( + exp_data, + index=date_range(end=exp_end, freq=resample_freq, periods=exp_periods), + ) + + tm.assert_series_equal(res, expected)