From 61379f76ea5894916625f72bf3f048a83bab18f1 Mon Sep 17 00:00:00 2001 From: Duarte Sousa Date: Sat, 15 Jun 2024 05:15:40 +0000 Subject: [PATCH] BUG: fixed Series.dt methods in ArrowDtype class that were returning incorrect time values. (#57355) --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/arrays/arrow/array.py | 35 ++++++++++++++-------------- pandas/tests/extension/test_arrow.py | 25 ++++++++++++++++++++ 3 files changed, 44 insertions(+), 17 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 80e5e89b79690..36a0dd3718feb 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -608,6 +608,7 @@ Other - Bug in :meth:`DataFrame.where` where using a non-bool type array in the function would return a ``ValueError`` instead of a ``TypeError`` (:issue:`56330`) - Bug in :meth:`Index.sort_values` when passing a key function that turns values into tuples, e.g. ``key=natsort.natsort_key``, would raise ``TypeError`` (:issue:`56081`) - Bug in :meth:`Series.diff` allowing non-integer values for the ``periods`` argument. (:issue:`56607`) +- Bug in :meth:`Series.dt` methods in :class:`ArrowDtype` that were returning incorrect values. (:issue:`57355`) - Bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`) - Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when ``regex=True`` and missing values are present. (:issue:`56599`) - Bug in Dataframe Interchange Protocol implementation was returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index d073cf0b11c6b..8c39e0d87df4e 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -18,7 +18,6 @@ from pandas._libs import lib from pandas._libs.tslibs import ( - NaT, Timedelta, Timestamp, timezones, @@ -2612,17 +2611,19 @@ def _str_wrap(self, width: int, **kwargs) -> Self: @property def _dt_days(self) -> Self: return type(self)( - pa.array(self._to_timedeltaarray().days, from_pandas=True, type=pa.int32()) + pa.array( + self._to_timedeltaarray().components.days, + from_pandas=True, + type=pa.int32(), + ) ) @property def _dt_hours(self) -> Self: return type(self)( pa.array( - [ - td.components.hours if td is not NaT else None - for td in self._to_timedeltaarray() - ], + self._to_timedeltaarray().components.hours, + from_pandas=True, type=pa.int32(), ) ) @@ -2631,10 +2632,8 @@ def _dt_hours(self) -> Self: def _dt_minutes(self) -> Self: return type(self)( pa.array( - [ - td.components.minutes if td is not NaT else None - for td in self._to_timedeltaarray() - ], + self._to_timedeltaarray().components.minutes, + from_pandas=True, type=pa.int32(), ) ) @@ -2643,7 +2642,9 @@ def _dt_minutes(self) -> Self: def _dt_seconds(self) -> Self: return type(self)( pa.array( - self._to_timedeltaarray().seconds, from_pandas=True, type=pa.int32() + self._to_timedeltaarray().components.seconds, + from_pandas=True, + type=pa.int32(), ) ) @@ -2651,10 +2652,8 @@ def _dt_seconds(self) -> Self: def _dt_milliseconds(self) -> Self: return type(self)( pa.array( - [ - td.components.milliseconds if td is not NaT else None - for td in self._to_timedeltaarray() - ], + self._to_timedeltaarray().components.milliseconds, + from_pandas=True, type=pa.int32(), ) ) @@ -2663,7 +2662,7 @@ def _dt_milliseconds(self) -> Self: def _dt_microseconds(self) -> Self: return type(self)( pa.array( - self._to_timedeltaarray().microseconds, + self._to_timedeltaarray().components.microseconds, from_pandas=True, type=pa.int32(), ) @@ -2673,7 +2672,9 @@ def _dt_microseconds(self) -> Self: def _dt_nanoseconds(self) -> Self: return type(self)( pa.array( - self._to_timedeltaarray().nanoseconds, from_pandas=True, type=pa.int32() + self._to_timedeltaarray().components.nanoseconds, + from_pandas=True, + type=pa.int32(), ) ) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 5926d23b44dd0..f2e9d2321f33e 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -2905,6 +2905,31 @@ def test_dt_components(): tm.assert_frame_equal(result, expected) +def test_dt_components_large_values(): + ser = pd.Series( + [ + pd.Timedelta("365 days 23:59:59.999000"), + None, + ], + dtype=ArrowDtype(pa.duration("ns")), + ) + result = ser.dt.components + expected = pd.DataFrame( + [[365, 23, 59, 59, 999, 0, 0], [None, None, None, None, None, None, None]], + columns=[ + "days", + "hours", + "minutes", + "seconds", + "milliseconds", + "microseconds", + "nanoseconds", + ], + dtype="int32[pyarrow]", + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("skipna", [True, False]) def test_boolean_reduce_series_all_null(all_boolean_reductions, skipna): # GH51624