From 234021aee5aa387a8875fe53cad6a30d9562dc9b Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 18 May 2023 21:50:01 -0700 Subject: [PATCH 01/11] BUG: to_datetime re-parsing Arrow-backed objects --- pandas/core/tools/datetimes.py | 17 ++++++++++++++++- pandas/tests/tools/test_to_datetime.py | 14 ++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 74210a1ce5ad8..a409d964ddeb1 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -57,7 +57,10 @@ is_list_like, is_numeric_dtype, ) -from pandas.core.dtypes.dtypes import DatetimeTZDtype +from pandas.core.dtypes.dtypes import ( + ArrowDtype, + DatetimeTZDtype, +) from pandas.core.dtypes.generic import ( ABCDataFrame, ABCSeries, @@ -400,6 +403,18 @@ def _convert_listlike_datetimes( arg = arg.tz_convert(None).tz_localize("utc") return arg + elif isinstance(arg_dtype, ArrowDtype) and np.issubdtype( + arg_dtype.numpy_dtype, np.datetime64 + ): + # TODO: Combine with above if DTI/DTA supports Arrow timestamps + if utc: + import pyarrow as pa + + # array attribute has to exist since arg is Index/Series at this point + arg_arr = arg.array._pa_array + arg._pa_array = arg_arr.cast(pa.timestamp(arg_arr.type.unit, "utc")) + return arg + elif lib.is_np_dtype(arg_dtype, "M"): arg_dtype = cast(np.dtype, arg_dtype) if not is_supported_unit(get_unit_from_dtype(arg_dtype)): diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 92b4370e9d736..bbf3264bcd824 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -933,6 +933,20 @@ def test_to_datetime_dtarr(self, tz): result = to_datetime(arr) assert result is arr + @td.skip_if_no("pyarrow") + @pytest.mark.parametrize("utc", [True, False]) + @pytest.mark.parametrize("tz", [None, "US/Central"]) + def test_to_datetime_arrow(self, tz, utc): + import pyarrow as pa + + dti = date_range("1965-04-03", periods=19, freq="2W", tz=tz) + dti_arrow = dti.astype(pd.ArrowDtype(pa.timestamp(unit="ns"))) + + result = to_datetime(dti_arrow, utc=utc) + expected = to_datetime(dti, utc=utc).astype("timestamp[ns][pyarrow]") + assert result is dti_arrow + tm.assert_index_equal(result, expected, exact=False) + def test_to_datetime_pydatetime(self): actual = to_datetime(datetime(2008, 1, 15)) assert actual == datetime(2008, 1, 15) From e4b3d0f67d9f2dc80dd3aca972e7ae62b125c8d6 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sat, 20 May 2023 11:41:38 -0700 Subject: [PATCH 02/11] Address code comments --- doc/source/whatsnew/v2.1.0.rst | 2 +- pandas/core/tools/datetimes.py | 4 +--- pandas/tests/tools/test_to_datetime.py | 3 +-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index da95bca7fb2a6..710572c9794f2 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -315,11 +315,11 @@ Datetimelike ^^^^^^^^^^^^ - :meth:`DatetimeIndex.map` with ``na_action="ignore"`` now works as expected. (:issue:`51644`) - Bug in :func:`date_range` when ``freq`` was a :class:`DateOffset` with ``nanoseconds`` (:issue:`46877`) +- Bug in :func:`to_datetime` converting :class:`Series` or :class:`DataFrame` containing :class:`arrays.ArrowExtensionArray` of ``pyarrow`` timestamps to numpy datetimes (:issue:`52545`) - Bug in :meth:`Timestamp.round` with values close to the implementation bounds returning incorrect results instead of raising ``OutOfBoundsDatetime`` (:issue:`51494`) - Bug in :meth:`arrays.DatetimeArray.map` and :meth:`DatetimeIndex.map`, where the supplied callable operated array-wise instead of element-wise (:issue:`51977`) - Bug in constructing a :class:`Series` or :class:`DataFrame` from a datetime or timedelta scalar always inferring nanosecond resolution instead of inferring from the input (:issue:`52212`) - Bug in parsing datetime strings with weekday but no day e.g. "2023 Sept Thu" incorrectly raising ``AttributeError`` instead of ``ValueError`` (:issue:`52659`) -- Timedelta ^^^^^^^^^ diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index a409d964ddeb1..469a99f2bceff 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -403,9 +403,7 @@ def _convert_listlike_datetimes( arg = arg.tz_convert(None).tz_localize("utc") return arg - elif isinstance(arg_dtype, ArrowDtype) and np.issubdtype( - arg_dtype.numpy_dtype, np.datetime64 - ): + elif isinstance(arg_dtype, ArrowDtype) and arg_dtype.kind == "M": # TODO: Combine with above if DTI/DTA supports Arrow timestamps if utc: import pyarrow as pa diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index bbf3264bcd824..843c4c7952278 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -933,11 +933,10 @@ def test_to_datetime_dtarr(self, tz): result = to_datetime(arr) assert result is arr - @td.skip_if_no("pyarrow") @pytest.mark.parametrize("utc", [True, False]) @pytest.mark.parametrize("tz", [None, "US/Central"]) def test_to_datetime_arrow(self, tz, utc): - import pyarrow as pa + pa = pytest.importorskip("pyarrow") dti = date_range("1965-04-03", periods=19, freq="2W", tz=tz) dti_arrow = dti.astype(pd.ArrowDtype(pa.timestamp(unit="ns"))) From 76fd635978d36e9ce272093ce2e04dc32078d727 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Fri, 26 May 2023 16:16:17 -0700 Subject: [PATCH 03/11] address code review --- pandas/core/tools/datetimes.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 469a99f2bceff..b863f7e8a9c65 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -406,11 +406,7 @@ def _convert_listlike_datetimes( elif isinstance(arg_dtype, ArrowDtype) and arg_dtype.kind == "M": # TODO: Combine with above if DTI/DTA supports Arrow timestamps if utc: - import pyarrow as pa - - # array attribute has to exist since arg is Index/Series at this point - arg_arr = arg.array._pa_array - arg._pa_array = arg_arr.cast(pa.timestamp(arg_arr.type.unit, "utc")) + arg = arg.astype("timestamp[ns, UTC][pyarrow]") return arg elif lib.is_np_dtype(arg_dtype, "M"): From 4ba7a656c1a13400298d3660fcda3b601f3c112b Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Fri, 26 May 2023 16:21:38 -0700 Subject: [PATCH 04/11] fix test --- pandas/tests/tools/test_to_datetime.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 8768a698e62cc..dc8d9e3ecb65c 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -943,7 +943,9 @@ def test_to_datetime_arrow(self, tz, utc): result = to_datetime(dti_arrow, utc=utc) expected = to_datetime(dti, utc=utc).astype("timestamp[ns][pyarrow]") - assert result is dti_arrow + if not utc: + # Doesn't hold for utc=True, since that will astype + assert result is dti_arrow tm.assert_index_equal(result, expected, exact=False) def test_to_datetime_pydatetime(self): From ed80297044ebbf203c74327303497ae3d9b51da7 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 1 Jun 2023 17:18:55 -0700 Subject: [PATCH 05/11] fix test --- pandas/tests/tools/test_to_datetime.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index dc8d9e3ecb65c..65431bca22916 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -942,7 +942,9 @@ def test_to_datetime_arrow(self, tz, utc): dti_arrow = dti.astype(pd.ArrowDtype(pa.timestamp(unit="ns"))) result = to_datetime(dti_arrow, utc=utc) - expected = to_datetime(dti, utc=utc).astype("timestamp[ns][pyarrow]") + expected = to_datetime(dti, utc=utc).astype( + f"timestamp[{'ns, UTC' if utc else 'ns'}][pyarrow]" + ) if not utc: # Doesn't hold for utc=True, since that will astype assert result is dti_arrow From 2630740a16994314f535ced27670c2fad9181a3d Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 5 Jul 2023 20:28:57 -0700 Subject: [PATCH 06/11] update --- pandas/core/tools/datetimes.py | 9 +++++++-- pandas/tests/tools/test_to_datetime.py | 12 +++++++++--- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index b1fc97ba63a2f..9e3332f208ecf 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -406,10 +406,15 @@ def _convert_listlike_datetimes( arg = arg.tz_convert(None).tz_localize("utc") return arg - elif isinstance(arg_dtype, ArrowDtype) and arg_dtype.kind == "M": + elif isinstance(arg_dtype, ArrowDtype) and arg_dtype.type is Timestamp: # TODO: Combine with above if DTI/DTA supports Arrow timestamps if utc: - arg = arg.astype("timestamp[ns, UTC][pyarrow]") + # pyarrow uses UTC, not lowercase utc + if isinstance(arg, Index): + arg = Index(arg.array._dt_tz_localize("UTC")) + else: + # ArrowExtensionArray + arg = arg._dt_tz_localize("UTC") return arg elif lib.is_np_dtype(arg_dtype, "M"): diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 65431bca22916..eb36bea051195 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -933,22 +933,28 @@ def test_to_datetime_dtarr(self, tz): result = to_datetime(arr) assert result is arr + @pytest.mark.parametrize("arg_class", [Series, Index]) @pytest.mark.parametrize("utc", [True, False]) @pytest.mark.parametrize("tz", [None, "US/Central"]) - def test_to_datetime_arrow(self, tz, utc): + def test_to_datetime_arrow(self, tz, utc, arg_class): pa = pytest.importorskip("pyarrow") dti = date_range("1965-04-03", periods=19, freq="2W", tz=tz) + dti = arg_class(dti) dti_arrow = dti.astype(pd.ArrowDtype(pa.timestamp(unit="ns"))) result = to_datetime(dti_arrow, utc=utc) expected = to_datetime(dti, utc=utc).astype( f"timestamp[{'ns, UTC' if utc else 'ns'}][pyarrow]" ) - if not utc: + if not utc and arg_class is not Series: # Doesn't hold for utc=True, since that will astype + # to_datetime also returns a new object for series assert result is dti_arrow - tm.assert_index_equal(result, expected, exact=False) + if arg_class is Series: + tm.assert_series_equal(result, expected) + else: + tm.assert_index_equal(result, expected, exact=False) def test_to_datetime_pydatetime(self): actual = to_datetime(datetime(2008, 1, 15)) From 325e7922e95e1dffbc6d221b5694647772a57543 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 6 Jul 2023 06:23:34 -0700 Subject: [PATCH 07/11] xfail on windows --- pandas/tests/tools/test_to_datetime.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index eb36bea051195..4b9bc8de8dfd6 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -22,6 +22,7 @@ iNaT, parsing, ) +from pandas.compat import is_platform_windows from pandas.errors import ( OutOfBoundsDatetime, OutOfBoundsTimedelta, @@ -933,6 +934,7 @@ def test_to_datetime_dtarr(self, tz): result = to_datetime(arr) assert result is arr + @pytest.mark.xfail(is_platform_windows(), reason="TZDATA path not correct") @pytest.mark.parametrize("arg_class", [Series, Index]) @pytest.mark.parametrize("utc", [True, False]) @pytest.mark.parametrize("tz", [None, "US/Central"]) From bccd932e6bc2b32b125055af31ccaf1910aa78c6 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 6 Jul 2023 08:40:50 -0700 Subject: [PATCH 08/11] skip on windows instead --- pandas/tests/tools/test_to_datetime.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 4b9bc8de8dfd6..63519dd5d308c 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -934,7 +934,7 @@ def test_to_datetime_dtarr(self, tz): result = to_datetime(arr) assert result is arr - @pytest.mark.xfail(is_platform_windows(), reason="TZDATA path not correct") + @pytest.mark.skipif(is_platform_windows(), reason="TZDATA path not correct") @pytest.mark.parametrize("arg_class", [Series, Index]) @pytest.mark.parametrize("utc", [True, False]) @pytest.mark.parametrize("tz", [None, "US/Central"]) From f372d25d7b9476cdb80e1842a28b221a0c03ffa6 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 6 Jul 2023 11:42:20 -0700 Subject: [PATCH 09/11] fix mypy --- pandas/core/tools/datetimes.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 9e3332f208ecf..fb71bf788ac3d 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -74,6 +74,7 @@ ) from pandas.core import algorithms from pandas.core.algorithms import unique +from pandas.core.arrays import ArrowExtensionArray from pandas.core.arrays.base import ExtensionArray from pandas.core.arrays.datetimes import ( maybe_convert_dtype, @@ -411,7 +412,8 @@ def _convert_listlike_datetimes( if utc: # pyarrow uses UTC, not lowercase utc if isinstance(arg, Index): - arg = Index(arg.array._dt_tz_localize("UTC")) + arg_array = cast(ArrowExtensionArray, arg.array) + arg = Index(arg_array._dt_tz_localize("UTC")) else: # ArrowExtensionArray arg = arg._dt_tz_localize("UTC") From 64478d762b286f4b26d0d61e74356b88963574ff Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 6 Jul 2023 13:56:10 -0700 Subject: [PATCH 10/11] fix --- pandas/core/tools/datetimes.py | 11 +++++++++-- pandas/tests/tools/test_to_datetime.py | 9 +++++---- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index fb71bf788ac3d..b422ee1a08e9f 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -413,10 +413,17 @@ def _convert_listlike_datetimes( # pyarrow uses UTC, not lowercase utc if isinstance(arg, Index): arg_array = cast(ArrowExtensionArray, arg.array) - arg = Index(arg_array._dt_tz_localize("UTC")) + if arg_dtype.pyarrow_dtype.tz is not None: + arg_array = arg_array._dt_tz_convert("UTC") + else: + arg_array = arg_array._dt_tz_localize("UTC") + arg = Index(arg_array) else: # ArrowExtensionArray - arg = arg._dt_tz_localize("UTC") + if arg_dtype.pyarrow_dtype.tz is not None: + arg = arg._dt_tz_convert("UTC") + else: + arg = arg._dt_tz_localize("UTC") return arg elif lib.is_np_dtype(arg_dtype, "M"): diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 63519dd5d308c..67ec0c3023ad4 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -22,7 +22,6 @@ iNaT, parsing, ) -from pandas.compat import is_platform_windows from pandas.errors import ( OutOfBoundsDatetime, OutOfBoundsTimedelta, @@ -934,7 +933,8 @@ def test_to_datetime_dtarr(self, tz): result = to_datetime(arr) assert result is arr - @pytest.mark.skipif(is_platform_windows(), reason="TZDATA path not correct") + # Doesn't work on Windows since tzpath not set correctly + @td.skip_if_windows @pytest.mark.parametrize("arg_class", [Series, Index]) @pytest.mark.parametrize("utc", [True, False]) @pytest.mark.parametrize("tz", [None, "US/Central"]) @@ -943,11 +943,12 @@ def test_to_datetime_arrow(self, tz, utc, arg_class): dti = date_range("1965-04-03", periods=19, freq="2W", tz=tz) dti = arg_class(dti) - dti_arrow = dti.astype(pd.ArrowDtype(pa.timestamp(unit="ns"))) + + dti_arrow = dti.astype(pd.ArrowDtype(pa.timestamp(unit="ns", tz=tz))) result = to_datetime(dti_arrow, utc=utc) expected = to_datetime(dti, utc=utc).astype( - f"timestamp[{'ns, UTC' if utc else 'ns'}][pyarrow]" + pd.ArrowDtype(pa.timestamp(unit="ns", tz=tz if not utc else "UTC")) ) if not utc and arg_class is not Series: # Doesn't hold for utc=True, since that will astype From a03c823d30cfa839b6f87e8423f56d6314a4c894 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sat, 8 Jul 2023 15:05:31 -0700 Subject: [PATCH 11/11] remove accidental --- pandas/tests/tools/test_to_datetime.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 67ec0c3023ad4..5ea0ca1fddbd3 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -957,7 +957,7 @@ def test_to_datetime_arrow(self, tz, utc, arg_class): if arg_class is Series: tm.assert_series_equal(result, expected) else: - tm.assert_index_equal(result, expected, exact=False) + tm.assert_index_equal(result, expected) def test_to_datetime_pydatetime(self): actual = to_datetime(datetime(2008, 1, 15))