From 93d1503ccedf5f9dc3ebec5593414717e4d61ec8 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 16 Jan 2021 13:58:08 -0500 Subject: [PATCH 1/6] BUG: Series.resample fails on NaT index --- pandas/core/resample.py | 15 +++++++++---- pandas/tests/resample/test_base.py | 26 +++++++++++++++++++++- pandas/tests/resample/test_period_index.py | 7 +++--- 3 files changed, 40 insertions(+), 8 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index f6c1da723a1d9..6751b5813fb6f 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -454,7 +454,8 @@ def _wrap_result(self, result): if isinstance(result, ABCSeries) and result.empty: obj = self.obj - result.index = _asfreq_compat(obj.index, freq=self.freq) + # When index is all NaT, result is empty but index is not + result.index = _asfreq_compat(obj.index[:0], freq=self.freq) result.name = getattr(obj, "name", None) return result @@ -1653,10 +1654,16 @@ def _get_period_bins(self, ax: PeriodIndex): nat_count = np.sum(memb._isnan) memb = memb[~memb._isnan] - # if index contains no valid (non-NaT) values, return empty index if not len(memb): - binner = labels = PeriodIndex(data=[], freq=self.freq, name=ax.name) - return binner, [], labels + if len(ax) == 0: + # if index is empty, return empty bins + data = bins = [] + else: + # if index is all NaT, return a single bin + data = [NaT] + bins = [len(ax)] + binner = labels = PeriodIndex(data=data, freq=self.freq, name=ax.name) + return binner, bins, labels freq_mult = self.freq.n diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index 7389fa31109f8..1154bc3316ae8 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -3,7 +3,7 @@ import numpy as np import pytest -from pandas import DataFrame, Series +from pandas import DataFrame, NaT, PeriodIndex, Series import pandas._testing as tm from pandas.core.groupby.groupby import DataError from pandas.core.groupby.grouper 
import Grouper @@ -110,6 +110,30 @@ def test_resample_empty_series(freq, empty_series_dti, resample_method): tm.assert_series_equal(result, expected, check_dtype=False) +@all_ts +@pytest.mark.parametrize("freq", ["M", "D", "H"]) +def test_resample_nat_index_series(request, freq, series, resample_method): + # GH39227 + + if freq == "M": + request.node.add_marker(pytest.mark.xfail(reason="Don't know why this fails")) + + s = series.copy() + s.index = PeriodIndex([NaT] * len(s), freq=freq) + result = getattr(s.resample(freq), resample_method)() + + if resample_method == "ohlc": + expected = DataFrame( + [], index=s.index[:0].copy(), columns=["open", "high", "low", "close"] + ) + tm.assert_frame_equal(result, expected, check_dtype=False) + else: + expected = s[:0].copy() + tm.assert_series_equal(result, expected, check_dtype=False) + tm.assert_index_equal(result.index, expected.index) + assert result.index.freq == expected.index.freq + + @all_ts @pytest.mark.parametrize("freq", ["M", "D", "H"]) @pytest.mark.parametrize("resample_method", ["count", "size"]) diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index e83196e9c7d56..26a168c8a18d3 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -787,10 +787,11 @@ def test_resample_with_nat(self, periods, values, freq, expected_values): def test_resample_with_only_nat(self): # GH 13224 pi = PeriodIndex([pd.NaT] * 3, freq="S") - frame = DataFrame([2, 3, 5], index=pi) + frame = DataFrame([2, 3, 5], index=pi, columns=["a"]) expected_index = PeriodIndex(data=[], freq=pi.freq) - expected = DataFrame(index=expected_index) - result = frame.resample("1s").mean() + expected = DataFrame(index=expected_index, columns=["a"], dtype=int) + rs = frame.resample("1s") + result = rs.mean() tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( From 211b6a6e85208bedcb4f3e67948ddfcce1d1b230 Mon Sep 17 00:00:00 2001 From: 
Richard Shadrach Date: Sun, 17 Jan 2021 09:27:37 -0500 Subject: [PATCH 2/6] whatsnew --- doc/source/whatsnew/v1.3.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index ab00b749d5725..058e5d5a39cb7 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -338,6 +338,7 @@ Groupby/resample/rolling - Fixed bug in :meth:`DataFrameGroupBy.sum` and :meth:`SeriesGroupBy.sum` causing loss of precision through using Kahan summation (:issue:`38778`) - Fixed bug in :meth:`DataFrameGroupBy.cumsum`, :meth:`SeriesGroupBy.cumsum`, :meth:`DataFrameGroupBy.mean` and :meth:`SeriesGroupBy.mean` causing loss of precision through using Kahan summation (:issue:`38934`) - Bug in :meth:`.Resampler.aggregate` and :meth:`DataFrame.transform` raising ``TypeError`` instead of ``SpecificationError`` when missing keys had mixed dtypes (:issue:`39025`) +- Bug in :meth:`Series.resample` would raise when index consisted of ``NaT`` (:issue:`39227`) Reshaping ^^^^^^^^^ From 16236a020ceee6eda9a34df2d3fab0208215b41d Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 17 Jan 2021 09:41:17 -0500 Subject: [PATCH 3/6] Fixup --- pandas/tests/resample/test_period_index.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index 26a168c8a18d3..3721dfa0068f9 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -790,8 +790,7 @@ def test_resample_with_only_nat(self): frame = DataFrame([2, 3, 5], index=pi, columns=["a"]) expected_index = PeriodIndex(data=[], freq=pi.freq) expected = DataFrame(index=expected_index, columns=["a"], dtype=int) - rs = frame.resample("1s") - result = rs.mean() + result = frame.resample("1s").mean() tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( From 43bf26f5e18b2dcfb294dee7407767babab3a95f Mon Sep 17 
00:00:00 2001 From: Richard Shadrach Date: Sun, 17 Jan 2021 11:29:19 -0500 Subject: [PATCH 4/6] Fix dtype and mypy errors --- pandas/core/resample.py | 5 +++-- pandas/tests/resample/test_period_index.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 6751b5813fb6f..75eb5a7687f85 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1657,11 +1657,12 @@ def _get_period_bins(self, ax: PeriodIndex): if not len(memb): if len(ax) == 0: # if index is empty, return empty bins - data = bins = [] + data = [] + bins = np.array([], dtype=np.int64) else: # if index is all NaT, return a single bin data = [NaT] - bins = [len(ax)] + bins = np.array([len(ax)]) binner = labels = PeriodIndex(data=data, freq=self.freq, name=ax.name) return binner, bins, labels diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index 3721dfa0068f9..2fe3fb91768e6 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -789,7 +789,7 @@ def test_resample_with_only_nat(self): pi = PeriodIndex([pd.NaT] * 3, freq="S") frame = DataFrame([2, 3, 5], index=pi, columns=["a"]) expected_index = PeriodIndex(data=[], freq=pi.freq) - expected = DataFrame(index=expected_index, columns=["a"], dtype=int) + expected = DataFrame(index=expected_index, columns=["a"], dtype="int64") result = frame.resample("1s").mean() tm.assert_frame_equal(result, expected) From 3a47781221860b75d3389edb90fe02cb05c5c206 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 17 Jan 2021 14:22:14 -0500 Subject: [PATCH 5/6] whatsnew - indicate issue only for PeriodIndex --- doc/source/whatsnew/v1.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 95784c780c4f5..c1c5224c09ad6 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ 
-339,7 +339,7 @@ Groupby/resample/rolling - Fixed bug in :meth:`DataFrameGroupBy.sum` and :meth:`SeriesGroupBy.sum` causing loss of precision through using Kahan summation (:issue:`38778`) - Fixed bug in :meth:`DataFrameGroupBy.cumsum`, :meth:`SeriesGroupBy.cumsum`, :meth:`DataFrameGroupBy.mean` and :meth:`SeriesGroupBy.mean` causing loss of precision through using Kahan summation (:issue:`38934`) - Bug in :meth:`.Resampler.aggregate` and :meth:`DataFrame.transform` raising ``TypeError`` instead of ``SpecificationError`` when missing keys had mixed dtypes (:issue:`39025`) -- Bug in :meth:`Series.resample` would raise when index consisted of ``NaT`` (:issue:`39227`) +- Bug in :meth:`Series.resample` where it would raise when the index was a :class:`PeriodIndex` consisting entirely of ``NaT`` (:issue:`39227`) Reshaping ^^^^^^^^^ From 252cc29b4ecbfb049b0bf084d447c4c044473a13 Mon Sep 17 00:00:00 2001 From: rhshadrach Date: Mon, 25 Jan 2021 17:56:45 -0500 Subject: [PATCH 6/6] Consolidate logic into _insert_nat_bin --- pandas/core/resample.py | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 2412cf13c48fc..1c8f47374860c 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1653,15 +1653,12 @@ def _get_period_bins(self, ax: PeriodIndex): memb = memb[~memb._isnan] if not len(memb): - if len(ax) == 0: - # if index is empty, return empty bins - data = [] - bins = np.array([], dtype=np.int64) - else: - # if index is all NaT, return a single bin - data = [NaT] - bins = np.array([len(ax)]) - binner = labels = PeriodIndex(data=data, freq=self.freq, name=ax.name) + # index contains no valid (non-NaT) values + bins = np.array([], dtype=np.int64) + binner = labels = PeriodIndex(data=[], freq=self.freq, name=ax.name) + if len(ax) > 0: + # index is all NaT + binner, bins, labels = _insert_nat_bin(binner, bins, labels, len(ax)) return binner, bins, labels freq_mult =
self.freq.n @@ -1708,12 +1705,7 @@ def _get_period_bins(self, ax: PeriodIndex): bins = memb.searchsorted(prng, side="left") if nat_count > 0: - # NaT handling as in pandas._lib.lib.generate_bins_dt64() - # shift bins by the number of NaT - bins += nat_count - bins = np.insert(bins, 0, nat_count) - binner = binner.insert(0, NaT) - labels = labels.insert(0, NaT) + binner, bins, labels = _insert_nat_bin(binner, bins, labels, nat_count) return binner, bins, labels @@ -1857,6 +1849,19 @@ def _get_period_range_edges( return first, last +def _insert_nat_bin( + binner: PeriodIndex, bins: np.ndarray, labels: PeriodIndex, nat_count: int +) -> Tuple[PeriodIndex, np.ndarray, PeriodIndex]: + # NaT handling as in pandas._libs.lib.generate_bins_dt64() + # shift bins by the number of NaT + assert nat_count > 0 + bins += nat_count + bins = np.insert(bins, 0, nat_count) + binner = binner.insert(0, NaT) + labels = labels.insert(0, NaT) + return binner, bins, labels + + def _adjust_dates_anchored( first, last, freq, closed="right", origin="start_day", offset=None ):