From 6a6f51c0c6f5ff5af5f6e5ecde82491d20d1059d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 14 Apr 2023 16:11:09 -0700 Subject: [PATCH 1/6] BUG: tz_localize with ArrowDtype --- doc/source/whatsnew/v2.0.1.rst | 1 + pandas/core/arrays/arrow/array.py | 19 ++++++++++++------- pandas/tests/extension/test_arrow.py | 26 +++++++++++++++++++++++--- 3 files changed, 36 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v2.0.1.rst b/doc/source/whatsnew/v2.0.1.rst index c64f7a46d3058..37434de0d25c3 100644 --- a/doc/source/whatsnew/v2.0.1.rst +++ b/doc/source/whatsnew/v2.0.1.rst @@ -18,6 +18,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.sort_values` not resetting index when :class:`DataFrame` is already sorted and ``ignore_index=True`` (:issue:`52553`) - Fixed regression in :meth:`MultiIndex.isin` raising ``TypeError`` for ``Generator`` (:issue:`52568`) - Fixed regression in :meth:`Series.describe` showing ``RuntimeWarning`` for extension dtype :class:`Series` with one element (:issue:`52515`) +- Fixed regression in :meth:`Series.dt.tz_localize` incorrectly localizing timestamps (:issue:`?`) .. --------------------------------------------------------------------------- .. _whatsnew_201.bug_fixes: diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 8982465d28417..2daf3c653ff13 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2207,12 +2207,17 @@ def _dt_tz_localize( ): if ambiguous != "raise": raise NotImplementedError(f"{ambiguous=} is not supported") - if nonexistent != "raise": + nonexistent_pa = { + "raise": "raise", + "shift_backward": "earliest", + "shift_forward": "latest", + }.get(nonexistent, None) + if nonexistent_pa is None: raise NotImplementedError(f"{nonexistent=} is not supported") if tz is None: - new_type = pa.timestamp(self.dtype.pyarrow_dtype.unit) - return type(self)(self._pa_array.cast(new_type)) - pa_tz = str(tz) - return type(self)( - self._pa_array.cast(pa.timestamp(self.dtype.pyarrow_dtype.unit, pa_tz)) - ) + result = self._pa_array.cast(pa.timestamp(self.dtype.pyarrow_dtype.unit)) + else: + result = pc.assume_timezone( + self._pa_array, str(tz), ambiguous=ambiguous, nonexistent=nonexistent_pa + ) + return type(self)(result) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 49b7679392590..acde0ec9b2d8f 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -2398,10 +2398,30 @@ def test_dt_tz_localize(unit): dtype=ArrowDtype(pa.timestamp(unit)), ) result = ser.dt.tz_localize("US/Pacific") - expected = pd.Series( - [datetime(year=2023, month=1, day=2, hour=3), None], - dtype=ArrowDtype(pa.timestamp(unit, "US/Pacific")), + exp_data = pa.array( + [datetime(year=2023, month=1, day=2, hour=3), None], type=pa.timestamp(unit) + ) + exp_data = pa.compute.assume_timezone(exp_data, "US/Pacific") + expected = pd.Series(ArrowExtensionArray(exp_data)) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "nonexistent, exp_date", + [ + ["shift_forward", datetime(year=2023, month=3, day=12, hour=3)], + ["shift_backward", pd.Timestamp("2023-03-12 01:59:59.999999999")], + ], +) +def test_dt_tz_localize_nonexistent(nonexistent, exp_date): + ser = pd.Series( + [datetime(year=2023, month=3, day=12, hour=2, minute=30), None], + dtype=ArrowDtype(pa.timestamp("ns")), ) + result = ser.dt.tz_localize("US/Pacific", nonexistent=nonexistent) + exp_data = pa.array([exp_date, None], type=pa.timestamp("ns")) + exp_data = pa.compute.assume_timezone(exp_data, "US/Pacific") + expected = pd.Series(ArrowExtensionArray(exp_data)) tm.assert_series_equal(result, expected) From 3a33c6eb46797defed3c1e76901f9ff18ba08928 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 14 Apr 2023 16:12:03 -0700 Subject: [PATCH 2/6] Add issue number --- doc/source/whatsnew/v2.0.1.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.1.rst b/doc/source/whatsnew/v2.0.1.rst index 37434de0d25c3..571e4432ce746 100644 --- a/doc/source/whatsnew/v2.0.1.rst +++ b/doc/source/whatsnew/v2.0.1.rst @@ -18,7 +18,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.sort_values` not resetting index when :class:`DataFrame` is already sorted and ``ignore_index=True`` (:issue:`52553`) - Fixed regression in :meth:`MultiIndex.isin` raising ``TypeError`` for ``Generator`` (:issue:`52568`) - Fixed regression in :meth:`Series.describe` showing ``RuntimeWarning`` for extension dtype :class:`Series` with one element (:issue:`52515`) -- Fixed regression in :meth:`Series.dt.tz_localize` incorrectly localizing timestamps (:issue:`?`) +- Fixed regression in :meth:`Series.dt.tz_localize` incorrectly localizing timestamps (:issue:`52677`) .. --------------------------------------------------------------------------- .. _whatsnew_201.bug_fixes: From 4627a29fc3a93415672723c112a42936f257f6d7 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 14 Apr 2023 16:12:42 -0700 Subject: [PATCH 3/6] Add issue number --- doc/source/whatsnew/v2.0.1.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.1.rst b/doc/source/whatsnew/v2.0.1.rst index 571e4432ce746..9c867544a324b 100644 --- a/doc/source/whatsnew/v2.0.1.rst +++ b/doc/source/whatsnew/v2.0.1.rst @@ -18,7 +18,6 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.sort_values` not resetting index when :class:`DataFrame` is already sorted and ``ignore_index=True`` (:issue:`52553`) - Fixed regression in :meth:`MultiIndex.isin` raising ``TypeError`` for ``Generator`` (:issue:`52568`) - Fixed regression in :meth:`Series.describe` showing ``RuntimeWarning`` for extension dtype :class:`Series` with one element (:issue:`52515`) -- Fixed regression in :meth:`Series.dt.tz_localize` incorrectly localizing timestamps (:issue:`52677`) .. --------------------------------------------------------------------------- .. _whatsnew_201.bug_fixes: @@ -34,6 +33,7 @@ Bug fixes - Bug in :meth:`ArrowDtype.__from_arrow__` not respecting if dtype is explicitly given (:issue:`52533`) - Bug in :meth:`DataFrame.max` and related casting different :class:`Timestamp` resolutions always to nanoseconds (:issue:`52524`) - Bug in :meth:`Series.describe` not returning :class:`ArrowDtype` with ``pyarrow.float64`` type with numeric data (:issue:`52427`) +- Bug in :meth:`Series.dt.tz_localize` incorrectly localizing timestamps with :class:`ArrowDtype` (:issue:`52677`) - Fixed bug in :func:`merge` when merging with ``ArrowDtype`` one one and a NumPy dtype on the other side (:issue:`52406`) - Fixed segfault in :meth:`Series.to_numpy` with ``null[pyarrow]`` dtype (:issue:`52443`) From 1c78e97721c7b89a3301415c8728003a5a74ea2d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 14 Apr 2023 17:36:44 -0700 Subject: [PATCH 4/6] typing --- pandas/core/arrays/arrow/array.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 2daf3c653ff13..814a6b26b7090 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2211,7 +2211,9 @@ def _dt_tz_localize( "raise": "raise", "shift_backward": "earliest", "shift_forward": "latest", - }.get(nonexistent, None) + }.get( + nonexistent, None + ) # type: ignore[arg-type] if nonexistent_pa is None: raise NotImplementedError(f"{nonexistent=} is not supported") if tz is None: From 93c4663f6ab876eda673de11eb2219442e14c786 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 14 Apr 2023 17:44:55 -0700 Subject: [PATCH 5/6] Re-place --- pandas/core/arrays/arrow/array.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 814a6b26b7090..bfb00eac13d6b 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2212,8 +2212,8 @@ def _dt_tz_localize( "shift_backward": "earliest", "shift_forward": "latest", }.get( - nonexistent, None - ) # type: ignore[arg-type] + nonexistent, None # type: ignore[arg-type] + ) if nonexistent_pa is None: raise NotImplementedError(f"{nonexistent=} is not supported") if tz is None: From 564941df2a16f117c8bb1377067c051d12b92e56 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 17 Apr 2023 11:15:36 -0700 Subject: [PATCH 6/6] xfail for windows tzdata --- pandas/tests/extension/test_arrow.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index aafed898d4842..7e4532b1ee326 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -2392,7 +2392,17 @@ def test_dt_tz_localize_none(): @pytest.mark.parametrize("unit", ["us", "ns"]) -def test_dt_tz_localize(unit): +def test_dt_tz_localize(unit, request): + if is_platform_windows() and is_ci_environment(): + request.node.add_marker( + pytest.mark.xfail( + raises=pa.ArrowInvalid, + reason=( + "TODO: Set ARROW_TIMEZONE_DATABASE environment variable " + "on CI to path to the tzdata for pyarrow." + ), + ) + ) ser = pd.Series( [datetime(year=2023, month=1, day=2, hour=3), None], dtype=ArrowDtype(pa.timestamp(unit)), @@ -2413,7 +2423,17 @@ def test_dt_tz_localize(unit): ["shift_backward", pd.Timestamp("2023-03-12 01:59:59.999999999")], ], ) -def test_dt_tz_localize_nonexistent(nonexistent, exp_date): +def test_dt_tz_localize_nonexistent(nonexistent, exp_date, request): + if is_platform_windows() and is_ci_environment(): + request.node.add_marker( + pytest.mark.xfail( + raises=pa.ArrowInvalid, + reason=( + "TODO: Set ARROW_TIMEZONE_DATABASE environment variable " + "on CI to path to the tzdata for pyarrow." + ), + ) + ) ser = pd.Series( [datetime(year=2023, month=3, day=12, hour=2, minute=30), None], dtype=ArrowDtype(pa.timestamp("ns")),