From 5e8f03412595dfd5ae58f1af560c5540e1dc1c5b Mon Sep 17 00:00:00 2001 From: GYHHAHA <1801214626@qq.com> Date: Fri, 11 Dec 2020 14:20:35 +0800 Subject: [PATCH 01/14] add end and end_day origin for resample --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/generic.py | 8 +- pandas/core/groupby/grouper.py | 8 +- pandas/core/resample.py | 22 +++- pandas/tests/resample/test_datetime_index.py | 5 +- pandas/tests/resample/test_resample_api.py | 73 ++++++++++++ test.ipynb | 114 +++++++++++++++++++ 7 files changed, 223 insertions(+), 8 deletions(-) create mode 100644 test.ipynb diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index af9219bc25931..16ffd5ea3276b 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -308,6 +308,7 @@ Other enhancements - Improved error reporting for subsetting columns of a :class:`.DataFrameGroupBy` with ``axis=1`` (:issue:`37725`) - Implement method ``cross`` for :meth:`DataFrame.merge` and :meth:`DataFrame.join` (:issue:`5401`) - When :func:`read_csv/sas/json` are called with ``chuncksize``/``iterator`` they can be used in a ``with`` statement as they return context-managers (:issue:`38225`) +- Added ``end`` and ``end_day`` options for ``origin`` in :meth:`DataFrame.resample` (:issue:`37804`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 41cb76d88957e..3272077e9029c 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8027,7 +8027,8 @@ def resample( level : str or int, optional For a MultiIndex, level (name or number) to use for resampling. `level` must be datetime-like. - origin : {'epoch', 'start', 'start_day'}, Timestamp or str, default 'start_day' + origin : {'epoch', 'start', 'start_day', 'end', 'end_day'}, Timestamp \ + or str, default 'start_day' The timestamp on which to adjust the grouping. The timezone of origin must match the timezone of the index. 
If a timestamp is not used, these values are also supported: @@ -8038,6 +8039,11 @@ def resample( .. versionadded:: 1.1.0 + - 'end': `origin` is the last value of the timeseries + - 'end_day': `origin` is the ceiling midnight of the last day + + .. versionadded:: 1.2.0 + offset : Timedelta or str, default is None An offset timedelta added to the origin. diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index d814a7cee436e..c3fdf98e3dedc 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -82,7 +82,8 @@ class Grouper: However, loffset is also deprecated for ``.resample(...)`` See: :class:`DataFrame.resample` - origin : {'epoch', 'start', 'start_day'}, Timestamp or str, default 'start_day' + origin : {'epoch', 'start', 'start_day', 'end', 'end_day'}, Timestamp \ + or str, default 'start_day' The timestamp on which to adjust the grouping. The timezone of origin must match the timezone of the index. If a timestamp is not used, these values are also supported: @@ -93,6 +94,11 @@ class Grouper: .. versionadded:: 1.1.0 + - 'end': `origin` is the last value of the timeseries + - 'end_day': `origin` is the ceiling midnight of the last day + + .. versionadded:: 1.2.0 + offset : Timedelta or str, default is None An offset timedelta added to the origin. 
diff --git a/pandas/core/resample.py b/pandas/core/resample.py index afd189ad16b5d..1781883909dd4 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1389,9 +1389,15 @@ def __init__( label = "right" else: if closed is None: - closed = "left" + if origin in ["end", "end_day"]: + closed = "right" + else: + closed = "left" if label is None: - label = "left" + if origin in ["end", "end_day"]: + label = "right" + else: + label = "left" self.closed = closed self.label = label @@ -1404,14 +1410,15 @@ def __init__( self.fill_method = fill_method self.limit = limit - if origin in ("epoch", "start", "start_day"): + if origin in ("epoch", "start", "start_day", "end", "end_day"): self.origin = origin else: try: self.origin = Timestamp(origin) except Exception as e: raise ValueError( - "'origin' should be equal to 'epoch', 'start', 'start_day' or " + "'origin' should be equal to 'epoch', 'start', 'start_day', " + "'end' or 'end_day' " f"should be a Timestamp convertible type. Got '{origin}' instead." ) from e @@ -1846,6 +1853,13 @@ def _adjust_dates_anchored( origin_nanos = first.value elif isinstance(origin, Timestamp): origin_nanos = origin.value + elif origin in ["end", "end_day"]: + origin = last if origin == "end" else last.ceil("D") + sub_freq_times = (origin.value - first.value) // freq.nanos + if closed == "left": + sub_freq_times += 1 + first = origin - sub_freq_times * freq + origin_nanos = first.value origin_nanos += offset.value if offset else 0 # GH 10117 & GH 19375. 
If first and last contain timezone information, diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 8bf40c924ec86..a5d1576510462 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -772,8 +772,9 @@ def test_resample_bad_origin(origin): rng = date_range("2000-01-01 00:00:00", "2000-01-01 02:00", freq="s") ts = Series(np.random.randn(len(rng)), index=rng) msg = ( - "'origin' should be equal to 'epoch', 'start', 'start_day' or " - f"should be a Timestamp convertible type. Got '{origin}' instead." + "'origin' should be equal to 'epoch', 'start', 'start_day', " + "'end' or 'end_day' should be a Timestamp convertible type. Got " + f"'{origin}' instead." ) with pytest.raises(ValueError, match=msg): ts.resample("5min", origin=origin) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 5588b185793cc..0393e9bd8098d 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -611,3 +611,76 @@ def test_resample_agg_readonly(): result = rs.agg("min") tm.assert_series_equal(result, expected) + + +def test_end_origin(): + + start, end = "2000-10-01 23:30:00", "2000-10-02 00:26:00" + rng = date_range(start, end, freq="7min") + ts = Series(np.arange(len(rng)) * 3, index=rng) + + res = ts.resample("17min", origin="end").sum().astype("int64") + data = [0, 18, 27, 63] + expected = Series( + data, + index=date_range( + end="20001002 00:26:00", + freq="17min", + periods=4, + ), + ) + + tm.assert_series_equal(res, expected) + + # an extra test case + idx = date_range("20200101 8:26:35", "20200101 9:31:58", freq="77s") + data = np.ones(len(idx)) + s = Series(data, index=idx) + result = s.resample("7min", origin="end", closed="right").sum() + + exp_idx = date_range("2020-01-01 08:27:45", "2020-01-01 09:30:45", freq="7T") + exp_data = [1.0, 6.0, 5.0, 6.0, 5.0, 6.0, 5.0, 
6.0, 5.0, 6.0] + expected = Series(exp_data, index=exp_idx) + + tm.assert_series_equal(result, expected) + + +def test_end_with_left_closed(): + + start, end = "2000-10-01 23:30:00", "2000-10-02 00:26:00" + rng = date_range(start, end, freq="7min") + ts = Series(np.arange(len(rng)) * 3, index=rng) + + res = ts.resample("17min", origin="end", closed="left").sum().astype("int64") + data = [0, 18, 27, 39, 24] + expected = Series( + data, + index=date_range( + end="20001002 00:43:00", + freq="17min", + periods=5, + ), + ) + + tm.assert_series_equal(res, expected) + + +def test_end_day_origin(): + + start, end = "2000-10-01 23:30:00", "2000-10-02 00:26:00" + rng = date_range(start, end, freq="7min") + ts = Series(np.arange(len(rng)) * 3, index=rng) + + # 12 == 24 * 60 - 84 * 17 <= 26 (last value) <= 24 * 60 - 83 * 17 == 29 + res = ts.resample("17min", origin="end_day").sum().astype("int64") + data = [3, 15, 45, 45] + expected = Series( + data, + index=date_range( + end="2000-10-02 00:29:00", + freq="17min", + periods=4, + ), + ) + + tm.assert_series_equal(res, expected) diff --git a/test.ipynb b/test.ipynb new file mode 100644 index 0000000000000..40c4b3c272212 --- /dev/null +++ b/test.ipynb @@ -0,0 +1,114 @@ +{ + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.9-final" + }, + "orig_nbformat": 2, + "kernelspec": { + "name": "pycharm-f7427e7c", + "display_name": "PyCharm (pythonProject)", + "language": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2, + "cells": [ + { + "source": [ + "import numpy as np\n", + "import temptemptemp.pandas.pandas as pd" + ], + "cell_type": "code", + "metadata": {}, + "execution_count": 1, + "outputs": [ + { + "output_type": "error", + "ename": "ImportError", + "evalue": "cannot import name 'np_version_under1p17' from 
'pandas.compat.numpy' (C:\\Users\\gyh\\anaconda3\\envs\\ML\\lib\\site-packages\\pandas\\compat\\numpy\\__init__.py)", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mImportError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[1;32mimport\u001b[0m \u001b[0mtemptemptemp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpandas\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpandas\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mpd\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[1;32mc:\\Users\\gyh\\Desktop\\temptemptemp\\pandas\\pandas\\__init__.py\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 20\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 21\u001b[0m \u001b[1;31m# numpy compat\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 22\u001b[1;33m from pandas.compat.numpy import (\n\u001b[0m\u001b[0;32m 23\u001b[0m \u001b[0mnp_version_under1p17\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0m_np_version_under1p17\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 24\u001b[0m \u001b[0mnp_version_under1p18\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0m_np_version_under1p18\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mImportError\u001b[0m: cannot import name 'np_version_under1p17' from 'pandas.compat.numpy' (C:\\Users\\gyh\\anaconda3\\envs\\ML\\lib\\site-packages\\pandas\\compat\\numpy\\__init__.py)" + ] + } + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# 第一题" + ] + }, + { + "cell_type": 
"code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "32.7" + ] + }, + "metadata": {}, + "execution_count": 3 + } + ], + "source": [ + "start, end = \"2000-10-01 23:30:00\", \"2000-10-02 00:26:00\"\n", + "rng = date_range(start, end, freq=\"7min\")\n", + "ts = Series(np.arange(len(rng)) * 3, index=rng)\n", + "\n", + "res = ts.resample(\"17min\", origin=\"end\").sum().astype(\"int64\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# 第二题\n", + "# 100, 50, 20, 10, 5, 1, 0.5, 0.1" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "f = open('ex2.txt')\n", + "money = np.array([float(i) for i in f.readlines()[1].split(' ')])\n", + "f2 = open('output.txt','w')\n", + "for i in money:\n", + " L = [100, 50, 20, 10, 5, 1, 0.5, 0.1]\n", + " res = []\n", + " for n in L:\n", + " if i//n > 0:\n", + " res.append(i//n)\n", + " i -= n*(i//n)\n", + " else:\n", + " res.append(0)\n", + " f2.write(str(res)[1:-1]+'\\n')\n", + "f2.close()" + ] + } + ] +} \ No newline at end of file From 8dce4e3914663b5f2aa4b2820535655c2a502c54 Mon Sep 17 00:00:00 2001 From: GYHHAHA <1801214626@qq.com> Date: Fri, 11 Dec 2020 14:31:12 +0800 Subject: [PATCH 02/14] Delete test.ipynb --- test.ipynb | 114 ----------------------------------------------------- 1 file changed, 114 deletions(-) delete mode 100644 test.ipynb diff --git a/test.ipynb b/test.ipynb deleted file mode 100644 index 40c4b3c272212..0000000000000 --- a/test.ipynb +++ /dev/null @@ -1,114 +0,0 @@ -{ - "metadata": { - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.9-final" - }, - "orig_nbformat": 2, - "kernelspec": { - "name": 
"pycharm-f7427e7c", - "display_name": "PyCharm (pythonProject)", - "language": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 2, - "cells": [ - { - "source": [ - "import numpy as np\n", - "import temptemptemp.pandas.pandas as pd" - ], - "cell_type": "code", - "metadata": {}, - "execution_count": 1, - "outputs": [ - { - "output_type": "error", - "ename": "ImportError", - "evalue": "cannot import name 'np_version_under1p17' from 'pandas.compat.numpy' (C:\\Users\\gyh\\anaconda3\\envs\\ML\\lib\\site-packages\\pandas\\compat\\numpy\\__init__.py)", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mImportError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[1;32mimport\u001b[0m \u001b[0mtemptemptemp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpandas\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpandas\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mpd\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[1;32mc:\\Users\\gyh\\Desktop\\temptemptemp\\pandas\\pandas\\__init__.py\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 20\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 21\u001b[0m \u001b[1;31m# numpy compat\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 22\u001b[1;33m from pandas.compat.numpy import (\n\u001b[0m\u001b[0;32m 23\u001b[0m \u001b[0mnp_version_under1p17\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0m_np_version_under1p17\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 24\u001b[0m \u001b[0mnp_version_under1p18\u001b[0m \u001b[1;32mas\u001b[0m 
\u001b[0m_np_version_under1p18\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;31mImportError\u001b[0m: cannot import name 'np_version_under1p17' from 'pandas.compat.numpy' (C:\\Users\\gyh\\anaconda3\\envs\\ML\\lib\\site-packages\\pandas\\compat\\numpy\\__init__.py)" - ] - } - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# 第一题" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "32.7" - ] - }, - "metadata": {}, - "execution_count": 3 - } - ], - "source": [ - "start, end = \"2000-10-01 23:30:00\", \"2000-10-02 00:26:00\"\n", - "rng = date_range(start, end, freq=\"7min\")\n", - "ts = Series(np.arange(len(rng)) * 3, index=rng)\n", - "\n", - "res = ts.resample(\"17min\", origin=\"end\").sum().astype(\"int64\")" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "# 第二题\n", - "# 100, 50, 20, 10, 5, 1, 0.5, 0.1" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "f = open('ex2.txt')\n", - "money = np.array([float(i) for i in f.readlines()[1].split(' ')])\n", - "f2 = open('output.txt','w')\n", - "for i in money:\n", - " L = [100, 50, 20, 10, 5, 1, 0.5, 0.1]\n", - " res = []\n", - " for n in L:\n", - " if i//n > 0:\n", - " res.append(i//n)\n", - " i -= n*(i//n)\n", - " else:\n", - " res.append(0)\n", - " f2.write(str(res)[1:-1]+'\\n')\n", - "f2.close()" - ] - } - ] -} \ No newline at end of file From f316aead568ef9d712cbcb1ece690783f4a76cf2 Mon Sep 17 00:00:00 2001 From: GYHHAHA <1801214626@qq.com> Date: Fri, 11 Dec 2020 19:20:47 +0800 Subject: [PATCH 03/14] fix --- pandas/core/resample.py | 14 +-- pandas/tests/resample/test_resample_api.py | 139 +++++++++++---------- test.ipynb | 132 +++++++++++++++++++ 3 files changed, 213 
insertions(+), 72 deletions(-) create mode 100644 test.ipynb diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 1781883909dd4..2dff009e6abad 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1388,15 +1388,15 @@ def __init__( if label is None: label = "right" else: - if closed is None: - if origin in ["end", "end_day"]: + if origin in ["end", "end_day"]: + if closed is None: closed = "right" - else: - closed = "left" - if label is None: - if origin in ["end", "end_day"]: + if label is None: label = "right" - else: + else: + if closed is None: + closed = "left" + if label is None: label = "left" self.closed = closed diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 0393e9bd8098d..59b97a8758ba6 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -613,73 +613,82 @@ def test_resample_agg_readonly(): tm.assert_series_equal(result, expected) -def test_end_origin(): - - start, end = "2000-10-01 23:30:00", "2000-10-02 00:26:00" - rng = date_range(start, end, freq="7min") - ts = Series(np.arange(len(rng)) * 3, index=rng) - - res = ts.resample("17min", origin="end").sum().astype("int64") - data = [0, 18, 27, 63] - expected = Series( - data, - index=date_range( - end="20001002 00:26:00", - freq="17min", - periods=4, +@pytest.mark.parametrize( + "start,end,freq,data,resample_freq,origin,closed," + "exp_data,exp_end,exp_periods", + [ + ( + "2000-10-01 23:30:00", + "2000-10-02 00:26:00", + "7min", + [0, 3, 6, 9, 12, 15, 18, 21, 24], + "17min", + "end", + None, + [0, 18, 27, 63], + "20001002 00:26:00", + 4, ), - ) - - tm.assert_series_equal(res, expected) - - # an extra test case - idx = date_range("20200101 8:26:35", "20200101 9:31:58", freq="77s") - data = np.ones(len(idx)) - s = Series(data, index=idx) - result = s.resample("7min", origin="end", closed="right").sum() - - exp_idx = date_range("2020-01-01 08:27:45", "2020-01-01 
09:30:45", freq="7T") - exp_data = [1.0, 6.0, 5.0, 6.0, 5.0, 6.0, 5.0, 6.0, 5.0, 6.0] - expected = Series(exp_data, index=exp_idx) - - tm.assert_series_equal(result, expected) - - -def test_end_with_left_closed(): - - start, end = "2000-10-01 23:30:00", "2000-10-02 00:26:00" - rng = date_range(start, end, freq="7min") - ts = Series(np.arange(len(rng)) * 3, index=rng) - - res = ts.resample("17min", origin="end", closed="left").sum().astype("int64") - data = [0, 18, 27, 39, 24] - expected = Series( - data, - index=date_range( - end="20001002 00:43:00", - freq="17min", - periods=5, + ( + "20200101 8:26:35", + "20200101 9:31:58", + "77s", + [1] * 51, + "7min", + "end", + "right", + [1, 6, 5, 6, 5, 6, 5, 6, 5, 6], + "2020-01-01 09:30:45", + 10, ), - ) - - tm.assert_series_equal(res, expected) - - -def test_end_day_origin(): - - start, end = "2000-10-01 23:30:00", "2000-10-02 00:26:00" - rng = date_range(start, end, freq="7min") - ts = Series(np.arange(len(rng)) * 3, index=rng) - - # 12 == 24 * 60 - 84 * 17 <= 26 (last value) <= 24 * 60 - 83 * 17 == 29 - res = ts.resample("17min", origin="end_day").sum().astype("int64") - data = [3, 15, 45, 45] - expected = Series( - data, - index=date_range( - end="2000-10-02 00:29:00", - freq="17min", - periods=4, + ( + "2000-10-01 23:30:00", + "2000-10-02 00:26:00", + "7min", + [0, 3, 6, 9, 12, 15, 18, 21, 24], + "17min", + "end", + "left", + [0, 18, 27, 39, 24], + "20001002 00:43:00", + 5, + ), + ( + "2000-10-01 23:30:00", + "2000-10-02 00:26:00", + "7min", + [0, 3, 6, 9, 12, 15, 18, 21, 24], + "17min", + "end_day", + None, + [3, 15, 45, 45], + "2000-10-02 00:29:00", + 4, + ), + ], +) +def test_end_and_end_day_origin( + start, + end, + freq, + data, + resample_freq, + origin, + closed, + exp_data, + exp_end, + exp_periods, +): + rng = pd.date_range(start, end, freq=freq) + ts = pd.Series(data, index=rng) + + res = ts.resample(resample_freq, origin=origin, closed=closed).sum() + expected = pd.Series( + exp_data, + index=pd.date_range( 
+ end=exp_end, + freq=resample_freq, + periods=exp_periods ), ) diff --git a/test.ipynb b/test.ipynb new file mode 100644 index 0000000000000..ea39612a77905 --- /dev/null +++ b/test.ipynb @@ -0,0 +1,132 @@ +{ + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.9-final" + }, + "orig_nbformat": 2, + "kernelspec": { + "name": "python3", + "display_name": "Python 3", + "language": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2, + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "2000-10-01 23:35:00 0\n", + "2000-10-01 23:52:00 18\n", + "2000-10-02 00:09:00 27\n", + "2000-10-02 00:26:00 63\n", + "Freq: 17T, dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 17 + } + ], + "source": [ + "start, end = \"2000-10-01 23:30:00\", \"2000-10-02 00:26:00\"\n", + "rng = pd.date_range(start, end, freq=\"7min\")\n", + "ts = pd.Series([0, 3, 6, 9, 12, 15, 18, 21, 24], index=rng)\n", + "\n", + "res = ts.resample(\"17min\", origin=\"end\").sum()\n", + "data = [0, 18, 27, 63]\n", + "expected = pd.Series(\n", + " data,\n", + " index=pd.date_range(\n", + " end=\"20001002 00:26:00\",\n", + " freq=\"17min\",\n", + " periods=4,\n", + " ),\n", + " )\n", + "res" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "array([ 0, 3, 6, 9, 12, 15, 18, 21, 24])" + ] + }, + "metadata": {}, + "execution_count": 16 + } + ], + "source": [ + "np.arange(len(rng)) * 3" + ] + }, + { + "cell_type": 
"code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "2020-01-01 08:27:45 1\n", + "2020-01-01 08:34:45 6\n", + "2020-01-01 08:41:45 5\n", + "2020-01-01 08:48:45 6\n", + "2020-01-01 08:55:45 5\n", + "2020-01-01 09:02:45 6\n", + "2020-01-01 09:09:45 5\n", + "2020-01-01 09:16:45 6\n", + "2020-01-01 09:23:45 5\n", + "2020-01-01 09:30:45 6\n", + "Freq: 7T, dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 14 + } + ], + "source": [ + "idx = pd.date_range(\"20200101 8:26:35\", \"20200101 9:31:58\", freq=\"77s\")\n", + "data = [1]*len(idx)\n", + "s = pd.Series(data, index=idx)\n", + "result = s.resample(\"7min\", origin=\"end\", closed=\"right\").sum()\n", + "\n", + "exp_idx = pd.date_range(\"2020-01-01 08:27:45\", \"2020-01-01 09:30:45\", freq=\"7T\")\n", + "exp_data = [1, 6, 5, 6, 5, 6, 5, 6, 5, 6]\n", + "expected = pd.Series(exp_data, index=exp_idx)\n", + "expected" + ] + } + ] +} \ No newline at end of file From 20e15e4aa94699361b529d81a4361d3ca5510ac3 Mon Sep 17 00:00:00 2001 From: GYHHAHA <1801214626@qq.com> Date: Fri, 11 Dec 2020 19:29:46 +0800 Subject: [PATCH 04/14] fix --- pandas/tests/resample/test_resample_api.py | 15 +-- test.ipynb | 132 --------------------- 2 files changed, 5 insertions(+), 142 deletions(-) delete mode 100644 test.ipynb diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 59b97a8758ba6..2cd9bb70385bf 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -614,8 +614,7 @@ def test_resample_agg_readonly(): @pytest.mark.parametrize( - "start,end,freq,data,resample_freq,origin,closed," - "exp_data,exp_end,exp_periods", + "start,end,freq,data,resample_freq,origin,closed,exp_data,exp_end,exp_periods", [ ( "2000-10-01 23:30:00", @@ -679,17 +678,13 @@ def test_end_and_end_day_origin( exp_end, exp_periods, ): - rng = pd.date_range(start, end, 
freq=freq) - ts = pd.Series(data, index=rng) + rng = date_range(start, end, freq=freq) + ts = Series(data, index=rng) res = ts.resample(resample_freq, origin=origin, closed=closed).sum() - expected = pd.Series( + expected = Series( exp_data, - index=pd.date_range( - end=exp_end, - freq=resample_freq, - periods=exp_periods - ), + index=date_range(end=exp_end, freq=resample_freq, periods=exp_periods), ) tm.assert_series_equal(res, expected) diff --git a/test.ipynb b/test.ipynb deleted file mode 100644 index ea39612a77905..0000000000000 --- a/test.ipynb +++ /dev/null @@ -1,132 +0,0 @@ -{ - "metadata": { - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.9-final" - }, - "orig_nbformat": 2, - "kernelspec": { - "name": "python3", - "display_name": "Python 3", - "language": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 2, - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import numpy as np" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "2000-10-01 23:35:00 0\n", - "2000-10-01 23:52:00 18\n", - "2000-10-02 00:09:00 27\n", - "2000-10-02 00:26:00 63\n", - "Freq: 17T, dtype: int64" - ] - }, - "metadata": {}, - "execution_count": 17 - } - ], - "source": [ - "start, end = \"2000-10-01 23:30:00\", \"2000-10-02 00:26:00\"\n", - "rng = pd.date_range(start, end, freq=\"7min\")\n", - "ts = pd.Series([0, 3, 6, 9, 12, 15, 18, 21, 24], index=rng)\n", - "\n", - "res = ts.resample(\"17min\", origin=\"end\").sum()\n", - "data = [0, 18, 27, 63]\n", - "expected = pd.Series(\n", - " data,\n", - " index=pd.date_range(\n", - " end=\"20001002 00:26:00\",\n", - " freq=\"17min\",\n", - " 
periods=4,\n", - " ),\n", - " )\n", - "res" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "array([ 0, 3, 6, 9, 12, 15, 18, 21, 24])" - ] - }, - "metadata": {}, - "execution_count": 16 - } - ], - "source": [ - "np.arange(len(rng)) * 3" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "2020-01-01 08:27:45 1\n", - "2020-01-01 08:34:45 6\n", - "2020-01-01 08:41:45 5\n", - "2020-01-01 08:48:45 6\n", - "2020-01-01 08:55:45 5\n", - "2020-01-01 09:02:45 6\n", - "2020-01-01 09:09:45 5\n", - "2020-01-01 09:16:45 6\n", - "2020-01-01 09:23:45 5\n", - "2020-01-01 09:30:45 6\n", - "Freq: 7T, dtype: int64" - ] - }, - "metadata": {}, - "execution_count": 14 - } - ], - "source": [ - "idx = pd.date_range(\"20200101 8:26:35\", \"20200101 9:31:58\", freq=\"77s\")\n", - "data = [1]*len(idx)\n", - "s = pd.Series(data, index=idx)\n", - "result = s.resample(\"7min\", origin=\"end\", closed=\"right\").sum()\n", - "\n", - "exp_idx = pd.date_range(\"2020-01-01 08:27:45\", \"2020-01-01 09:30:45\", freq=\"7T\")\n", - "exp_data = [1, 6, 5, 6, 5, 6, 5, 6, 5, 6]\n", - "expected = pd.Series(exp_data, index=exp_idx)\n", - "expected" - ] - } - ] -} \ No newline at end of file From 11e1252a986e0cc61cbf84c40939b99888007443 Mon Sep 17 00:00:00 2001 From: GYHHAHA <1801214626@qq.com> Date: Sat, 12 Dec 2020 07:46:24 +0800 Subject: [PATCH 05/14] update --- doc/source/whatsnew/v1.2.0.rst | 1 - pandas/core/resample.py | 2 +- pandas/tests/resample/test_datetime_index.py | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 16ffd5ea3276b..af9219bc25931 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -308,7 +308,6 @@ Other enhancements - Improved error reporting 
for subsetting columns of a :class:`.DataFrameGroupBy` with ``axis=1`` (:issue:`37725`) - Implement method ``cross`` for :meth:`DataFrame.merge` and :meth:`DataFrame.join` (:issue:`5401`) - When :func:`read_csv/sas/json` are called with ``chuncksize``/``iterator`` they can be used in a ``with`` statement as they return context-managers (:issue:`38225`) -- Added ``end`` and ``end_day`` options for ``origin`` in :meth:`DataFrame.resample` (:issue:`37804`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 2dff009e6abad..81fbafdcd4efa 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1418,7 +1418,7 @@ def __init__( except Exception as e: raise ValueError( "'origin' should be equal to 'epoch', 'start', 'start_day', " - "'end' or 'end_day' " + "'end', 'end_day' or " f"should be a Timestamp convertible type. Got '{origin}' instead." ) from e diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index a5d1576510462..c23a22448fbb0 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -773,7 +773,7 @@ def test_resample_bad_origin(origin): ts = Series(np.random.randn(len(rng)), index=rng) msg = ( "'origin' should be equal to 'epoch', 'start', 'start_day', " - "'end' or 'end_day' should be a Timestamp convertible type. Got " + "'end', 'end_day' or should be a Timestamp convertible type. Got " f"'{origin}' instead." 
) with pytest.raises(ValueError, match=msg): From edfe52a2104f70661e110510adbe52c232ad97e3 Mon Sep 17 00:00:00 2001 From: GYHHAHA <1801214626@qq.com> Date: Sat, 12 Dec 2020 08:17:23 +0800 Subject: [PATCH 06/14] Update v1.3.0.rst --- doc/source/whatsnew/v1.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index ab9f303bec6aa..8709b10413a2e 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -20,7 +20,7 @@ Other enhancements ^^^^^^^^^^^^^^^^^^ - Added :meth:`MultiIndex.dtypes` (:issue:`37062`) -- +- Added ``end`` and ``end_day`` options for ``origin`` in :meth:`DataFrame.resample` (:issue:`37804`) .. --------------------------------------------------------------------------- From f47b6ea794da8c2990848066d230c074b198d697 Mon Sep 17 00:00:00 2001 From: GYHHAHA <1801214626@qq.com> Date: Sat, 12 Dec 2020 08:19:20 +0800 Subject: [PATCH 07/14] update version --- pandas/core/generic.py | 2 +- pandas/core/groupby/grouper.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 3272077e9029c..3bff3f246654e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8042,7 +8042,7 @@ def resample( - 'end': `origin` is the last value of the timeseries - 'end_day': `origin` is the ceiling midnight of the last day - .. versionadded:: 1.2.0 + .. versionadded:: 1.3.0 offset : Timedelta or str, default is None An offset timedelta added to the origin. diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index c3fdf98e3dedc..30a7f35dd7379 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -97,7 +97,7 @@ class Grouper: - 'end': `origin` is the last value of the timeseries - 'end_day': `origin` is the ceiling midnight of the last day - .. versionadded:: 1.2.0 + .. 
versionadded:: 1.3.0 offset : Timedelta or str, default is None An offset timedelta added to the origin. From 0cc22cd6495f9f39a2ee870a70fc553709b501ee Mon Sep 17 00:00:00 2001 From: GYHHAHA <1801214626@qq.com> Date: Sun, 13 Dec 2020 10:42:15 +0800 Subject: [PATCH 08/14] Update timeseries.rst --- doc/source/user_guide/timeseries.rst | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 354c510b843dd..7d75be80f6baf 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -1888,6 +1888,33 @@ Those two examples are equivalent for this time series: Note the use of ``'start'`` for ``origin`` on the last example. In that case, ``origin`` will be set to the first value of the timeseries. +Backward resample +~~~~~~~~~~~~~~~~~ + +.. versionadded:: 1.3.0 + +Instead of adjusting the beginning of bins, sometimes we need to fix the end of the bins to make a backward resample with a given ``freq``. The backward resample sets ``closed`` to ``'right'`` by default since the last value should be considered as the edge point for the last bin. + +We can set ``origin`` to ``'end'``. The value for a specific ``Timestamp`` index stands for the resample result from the current ``Timestamp`` minus ``freq`` to the current ``Timestamp``, closed on the right. + +.. ipython:: python + + ts.resample('17min', origin='end').sum() + +Besides, in contrast with the ``'start_day'`` option, ``end_day`` is supported. This will set the origin as the ceiling midnight of the largest ``Timestamp``. + +.. ipython:: python + + ts.resample('17min', origin='end_day').sum() + +The above result uses ``2000-10-02 00:29:00`` as the last bin's right edge since that is what the following computation yields. + +.. ipython:: python + + ceil_mid = rng.max().ceil('D') + freq = pd.offsets.Minute(17) + bin_res = ceil_mid - freq * ((ceil_mid - rng.max()) // freq) + ..
_timeseries.periods: Time span representation From e6ae1a5f74a22cfa64ffcbb296ebe057c1bd22d6 Mon Sep 17 00:00:00 2001 From: GYHHAHA <1801214626@qq.com> Date: Sun, 13 Dec 2020 10:48:29 +0800 Subject: [PATCH 09/14] Update timeseries.rst --- doc/source/user_guide/timeseries.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 7d75be80f6baf..1d8972a8b4422 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -1901,7 +1901,7 @@ We can set ``origin`` to ``'end'``. The value for a specific ``Timestamp`` index ts.resample('17min', origin='end').sum() -Besides, in contrast with the ``'start_day'`` option, ``end_day`` is supported. This will set the origin as the ceiling midnight of the largest ``Timestamp``. +Besides, in contrast with the ``'start_day'`` option, ``end_day`` is supported. This will set the origin as the ceiling midnight of the largest ``Timestamp``. .. ipython:: python From a2e0a57e740bb5d49bf54fcf88b8574f04fc7c42 Mon Sep 17 00:00:00 2001 From: GYHHAHA <1801214626@qq.com> Date: Sun, 13 Dec 2020 11:12:13 +0800 Subject: [PATCH 10/14] Update timeseries.rst --- doc/source/user_guide/timeseries.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 1d8972a8b4422..8909f5b33066b 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -1914,6 +1914,7 @@ The above result uses ``2000-10-02 00:29:00`` as the last bin's right edge since ceil_mid = rng.max().ceil('D') freq = pd.offsets.Minute(17) bin_res = ceil_mid - freq * ((ceil_mid - rng.max()) // freq) + bin_res .. 
_timeseries.periods: From 2ffce3c5eb3bcd12246fe2938067c9311930b11b Mon Sep 17 00:00:00 2001 From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com> Date: Mon, 14 Dec 2020 10:34:47 +0800 Subject: [PATCH 11/14] Update resample.py --- pandas/core/resample.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 81fbafdcd4efa..d0f66b0daa301 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1388,6 +1388,12 @@ def __init__( if label is None: label = "right" else: + # The backward resample sets ``closed`` to ``'right'`` by default + # since the last value should be considered as the edge point for + # the last bin. When origin is "end" or "end_day", the value for a + # specific ``Timestamp`` index stands for the resample result from + # the current ``Timestamp`` minus ``freq`` to the current + # ``Timestamp``, closed on the right. if origin in ["end", "end_day"]: if closed is None: closed = "right" From a554cad52af582c6e291ce30fadff3377a26e614 Mon Sep 17 00:00:00 2001 From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com> Date: Mon, 14 Dec 2020 10:40:37 +0800 Subject: [PATCH 12/14] Update generic.py --- pandas/core/generic.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 3bff3f246654e..4b1766c4c7831 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8323,6 +8323,26 @@ def resample( 2000-10-02 00:21:00 24 Freq: 17T, dtype: int64 + If you want to take the largest Timestamp as the end of the bins: + + >>> ts.resample('17min', origin='end').sum() + 2000-10-01 23:35:00 0 + 2000-10-01 23:52:00 18 + 2000-10-02 00:09:00 27 + 2000-10-02 00:26:00 63 + Freq: 17T, dtype: int64 + + In contrast with the `start_day`, you can use `end_day` to take the ceiling + midnight of the largest Timestamp as the end of the bins and drop the bins + not containing data: + + >>> ts.resample('17min',
origin='end_day').sum() + 2000-10-01 23:38:00 3 + 2000-10-01 23:55:00 15 + 2000-10-02 00:12:00 45 + 2000-10-02 00:29:00 45 + Freq: 17T, dtype: int64 + To replace the use of the deprecated `base` argument, you can now use `offset`, in this example it is equivalent to have `base=2`: From ab473ce52b278722b91e2bb304fb8815b344b45c Mon Sep 17 00:00:00 2001 From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com> Date: Tue, 22 Dec 2020 10:04:14 +0800 Subject: [PATCH 13/14] Update generic.py --- pandas/core/generic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6bbf337f34247..ebf311ae429cb 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8050,8 +8050,8 @@ def resample( level : str or int, optional For a MultiIndex, level (name or number) to use for resampling. `level` must be datetime-like. - origin : {'epoch', 'start', 'start_day', 'end', 'end_day'}, Timestamp \ - or str, default 'start_day' + origin : {{'epoch', 'start', 'start_day', 'end', 'end_day'}}, Timestamp + or str, default 'start_day' The timestamp on which to adjust the grouping. The timezone of origin must match the timezone of the index. 
If a timestamp is not used, these values are also supported: From 8331a8e6bceeb16269c6d4e53ae2dab03719f813 Mon Sep 17 00:00:00 2001 From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com> Date: Tue, 22 Dec 2020 10:51:05 +0800 Subject: [PATCH 14/14] Update grouper.py --- pandas/core/groupby/grouper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 70cf3f1930d31..1e6645686f93f 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -82,8 +82,8 @@ class Grouper: However, loffset is also deprecated for ``.resample(...)`` See: :class:`DataFrame.resample` - origin : {'epoch', 'start', 'start_day', 'end', 'end_day'}, Timestamp \ - or str, default 'start_day' + origin : {{'epoch', 'start', 'start_day', 'end', 'end_day'}}, Timestamp + or str, default 'start_day' The timestamp on which to adjust the grouping. The timezone of origin must match the timezone of the index. If a timestamp is not used, these values are also supported: