From 222aa32591e5203f6df899e7cbd388f22b9d98e5 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 6 Dec 2023 15:16:46 -0800 Subject: [PATCH 1/6] BUG: resample with ArrowDtype --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/resample.py | 19 +++++++++++++++++-- pandas/tests/resample/test_datetime_index.py | 12 ++++++++++++ pandas/tests/resample/test_timedelta.py | 10 ++++++++++ 4 files changed, 40 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 67b4052b386c0..b4e3389d89f75 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -643,6 +643,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.asfreq` and :meth:`Series.asfreq` with a :class:`DatetimeIndex` with non-nanosecond resolution incorrectly converting to nanosecond resolution (:issue:`55958`) - Bug in :meth:`DataFrame.ewm` when passed ``times`` with non-nanosecond ``datetime64`` or :class:`DatetimeTZDtype` dtype (:issue:`56262`) - Bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`) +- Bug in :meth:`DataFrame.resample` when resampling on a :class:`ArrowDtype` of ``pyarrow.timestamp`` or ``pyarrow.duration`` type (:issue:`55989`) - Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`) - Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.MonthBegin` (:issue:`55271`) - diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 8af81cd43d62e..b559d8bfc1a66 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -38,6 +38,7 @@ rewrite_warning, ) +from pandas.core.dtypes.dtypes import ArrowDtype from pandas.core.dtypes.generic import ( ABCDataFrame, ABCSeries, @@ -68,6 +69,7 @@ from pandas.core.groupby.grouper import Grouper from pandas.core.groupby.ops import BinGrouper from pandas.core.indexes.api import MultiIndex +from pandas.core.indexes.base import Index from pandas.core.indexes.datetimes import ( DatetimeIndex, date_range, @@ -109,7 +111,6 @@ from pandas import ( DataFrame, - Index, Series, ) @@ -511,6 +512,9 @@ def _wrap_result(self, result): result.index = _asfreq_compat(obj.index[:0], freq=self.freq) result.name = getattr(obj, "name", None) + if self._timegrouper._arrow_dtype is not None: + result.index = result.index.astype(self._timegrouper._arrow_dtype) + return result @final @@ -2163,6 +2167,7 @@ def __init__( self.fill_method = fill_method self.limit = limit self.group_keys = group_keys + self._arrow_dtype: ArrowDtype | None = None if origin in ("epoch", "start", "start_day", "end", "end_day"): # error: Incompatible types in assignment (expression has type "Union[Union[ @@ -2213,7 +2218,7 @@ def _get_resampler(self, obj: NDFrame, kind=None) -> Resampler: TypeError if incompatible axis """ - _, ax, indexer = self._set_grouper(obj, gpr_index=None) + _, ax, _ = self._set_grouper(obj, gpr_index=None) if isinstance(ax, DatetimeIndex): return DatetimeIndexResampler( obj, @@ -2495,6 +2500,16 @@ def _get_period_bins(self, ax: PeriodIndex): return binner, bins, labels + @final + def _set_grouper( + self, obj: NDFrameT, sort: bool = False, *, gpr_index: Index | None = None + ) -> tuple[NDFrameT, Index, npt.NDArray[np.intp] | None]: + obj, ax, indexer = super()._set_grouper(obj, sort, gpr_index=gpr_index) + if isinstance(ax.dtype, ArrowDtype) and ax.dtype.kind in "Mm": + self._arrow_dtype = ax.dtype + ax = Index(ax.array._maybe_convert_datelike_array()) + return obj, ax, indexer + def _take_new_index( obj: NDFrameT, indexer: npt.NDArray[np.intp], new_index: Index, axis: AxisInt = 0 diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 8a725c6e51e3f..03ccfefadf71e 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -7,6 +7,7 @@ from pandas._libs import lib from pandas._typing import DatetimeNaTType +import pandas.util._test_decorators as td import pandas as pd from pandas import ( @@ -2195,3 +2196,14 @@ def test_resample_b_55282(unit): index=exp_dti, ) tm.assert_series_equal(result, expected) + + +@td.skip_if_no("pyarrow") +@pytest.mark.parametrize("tz", [None, "UTC"]) +def test_arrow_timestamp_resample(tz): + idx = Series(date_range("2020-01-01", periods=5), dtype="timestamp[ns][pyarrow]") + if tz is not None: + idx = idx.dt.tz_localize(tz) + expected = Series(np.arange(5, dtype=np.float64), index=idx) + result = expected.resample("1D").mean() + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/resample/test_timedelta.py b/pandas/tests/resample/test_timedelta.py index 5d6876343a0c9..57e6628095e5c 100644 --- a/pandas/tests/resample/test_timedelta.py +++ b/pandas/tests/resample/test_timedelta.py @@ -3,6 +3,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( DataFrame, @@ -207,3 +209,11 @@ def test_resample_closed_right(): ), ) tm.assert_series_equal(result, expected) + + +@td.skip_if_no("pyarrow") +def test_arrow_duration_resample(): + idx = pd.Index(timedelta_range("1 day", periods=5), dtype="duration[ns][pyarrow]") + expected = Series(np.arange(5, dtype=np.float64), index=idx) + result = expected.resample("1D").mean() + tm.assert_series_equal(result, expected) From 72d7fb02b67f57da46b655d5a4f7185a0675f42c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 6 Dec 2023 16:10:04 -0800 Subject: [PATCH 2/6] Typing --- pandas/core/groupby/grouper.py | 1 - pandas/core/resample.py | 6 ++++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index fc914831b7a72..4703c12db602d 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -330,7 +330,6 @@ def _get_grouper( return grouper, obj - @final def _set_grouper( self, obj: NDFrameT, sort: bool = False, *, gpr_index: Index | None = None ) -> tuple[NDFrameT, Index, npt.NDArray[np.intp] | None]: diff --git a/pandas/core/resample.py b/pandas/core/resample.py index b559d8bfc1a66..3831af23dc16e 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -49,6 +49,7 @@ ResamplerWindowApply, warn_alias_replacement, ) +from pandas.core.arrays import ArrowExtensionArray from pandas.core.base import ( PandasObject, SelectionMixin, @@ -2500,14 +2501,15 @@ def _get_period_bins(self, ax: PeriodIndex): return binner, bins, labels - @final def _set_grouper( self, obj: NDFrameT, sort: bool = False, *, gpr_index: Index | None = None ) -> tuple[NDFrameT, Index, npt.NDArray[np.intp] | None]: obj, ax, indexer = super()._set_grouper(obj, sort, gpr_index=gpr_index) if isinstance(ax.dtype, ArrowDtype) and ax.dtype.kind in "Mm": self._arrow_dtype = ax.dtype - ax = Index(ax.array._maybe_convert_datelike_array()) + ax = Index( + cast(ArrowExtensionArray, ax.array)._maybe_convert_datelike_array() + ) return obj, ax, indexer From 6e4aa4701c0095eb293ae9e0fd70bb90f0d03c8e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 6 Dec 2023 17:31:15 -0800 Subject: [PATCH 3/6] xfail for windows --- pandas/tests/resample/test_datetime_index.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 03ccfefadf71e..3abb58ac961c0 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -7,6 +7,7 @@ from pandas._libs import lib from pandas._typing import DatetimeNaTType +from pandas.compat import is_platform_windows import pandas.util._test_decorators as td import pandas as pd @@ -2199,7 +2200,22 @@ def test_resample_b_55282(unit): @td.skip_if_no("pyarrow") -@pytest.mark.parametrize("tz", [None, "UTC"]) +@pytest.mark.parametrize( + "tz", + [ + None, + pytest.param( + "UTC", + marks=pytest.mark.xfail( + is_platform_windows(), + reason=( + "TODO: Set ARROW_TIMEZONE_DATABASE environment " + "variable on CI to path to the tzdata for pyarrow.", + ), + ), + ), + ], +) def test_arrow_timestamp_resample(tz): idx = Series(date_range("2020-01-01", periods=5), dtype="timestamp[ns][pyarrow]") if tz is not None: From 13cca601989bcaa8ff46199e5c82e68f0bbee200 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 7 Dec 2023 10:10:14 -0800 Subject: [PATCH 4/6] Fix again? --- pandas/tests/resample/test_datetime_index.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 3abb58ac961c0..0662194a52569 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -2207,7 +2207,7 @@ def test_resample_b_55282(unit): pytest.param( "UTC", marks=pytest.mark.xfail( - is_platform_windows(), + condition=is_platform_windows(), reason=( "TODO: Set ARROW_TIMEZONE_DATABASE environment " "variable on CI to path to the tzdata for pyarrow.", From a2037e988bf25625033934e244b417374222dc32 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 7 Dec 2023 11:15:00 -0800 Subject: [PATCH 5/6] Avoid tuple --- pandas/tests/resample/test_datetime_index.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 0662194a52569..ee32375caff8c 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -2208,10 +2208,7 @@ def test_resample_b_55282(unit): "UTC", marks=pytest.mark.xfail( condition=is_platform_windows(), - reason=( - "TODO: Set ARROW_TIMEZONE_DATABASE environment " - "variable on CI to path to the tzdata for pyarrow.", - ), + reason="TODO: Set ARROW_TIMEZONE_DATABASE env var in CI", ), ), ], From 17ab06a0caac68d52b9e282913573ddd789df1ad Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 7 Dec 2023 16:36:56 -0800 Subject: [PATCH 6/6] Add gh numbers --- pandas/tests/resample/test_datetime_index.py | 1 + pandas/tests/resample/test_timedelta.py | 1 + 2 files changed, 2 insertions(+) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index ee32375caff8c..760ed35bab678 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -2214,6 +2214,7 @@ def test_resample_b_55282(unit): ], ) def test_arrow_timestamp_resample(tz): + # GH 56371 idx = Series(date_range("2020-01-01", periods=5), dtype="timestamp[ns][pyarrow]") if tz is not None: idx = idx.dt.tz_localize(tz) diff --git a/pandas/tests/resample/test_timedelta.py b/pandas/tests/resample/test_timedelta.py index 57e6628095e5c..7c70670d42908 100644 --- a/pandas/tests/resample/test_timedelta.py +++ b/pandas/tests/resample/test_timedelta.py @@ -213,6 +213,7 @@ def test_resample_closed_right(): @td.skip_if_no("pyarrow") def test_arrow_duration_resample(): + # GH 56371 idx = pd.Index(timedelta_range("1 day", periods=5), dtype="duration[ns][pyarrow]") expected = Series(np.arange(5, dtype=np.float64), index=idx) result = expected.resample("1D").mean()