From 1992db14699be673be9b5e9ed7bc291a2be8b796 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 20 Dec 2022 18:34:05 -0800 Subject: [PATCH 1/6] ENH/API: preserve non-nano in to_datetime --- doc/source/whatsnew/v2.0.0.rst | 3 +- pandas/_libs/tslibs/np_datetime.pxd | 1 + pandas/_libs/tslibs/np_datetime.pyi | 1 + pandas/_libs/tslibs/np_datetime.pyx | 9 ++++-- pandas/core/reshape/tile.py | 9 ++++-- pandas/core/tools/datetimes.py | 16 +++++---- pandas/tests/tools/test_to_datetime.py | 45 +++++++++++++------------- 7 files changed, 50 insertions(+), 34 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 208bbfa10b9b2..0daa3f569e832 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -465,7 +465,8 @@ Other API changes - :meth:`Index.astype` now allows casting from ``float64`` dtype to datetime-like dtypes, matching :class:`Series` behavior (:issue:`49660`) - Passing data with dtype of "timedelta64[s]", "timedelta64[ms]", or "timedelta64[us]" to :class:`TimedeltaIndex`, :class:`Series`, or :class:`DataFrame` constructors will now retain that dtype instead of casting to "timedelta64[ns]"; timedelta64 data with lower resolution will be cast to the lowest supported resolution "timedelta64[s]" (:issue:`49014`) - Passing ``dtype`` of "timedelta64[s]", "timedelta64[ms]", or "timedelta64[us]" to :class:`TimedeltaIndex`, :class:`Series`, or :class:`DataFrame` constructors will now retain that dtype instead of casting to "timedelta64[ns]"; passing a dtype with lower resolution for :class:`Series` or :class:`DataFrame` will be cast to the lowest supported resolution "timedelta64[s]" (:issue:`49014`) -- Passing a ``np.datetime64`` object with non-nanosecond resolution to :class:`Timestamp` will retain the input resolution if it is "s", "ms", or "ns"; otherwise it will be cast to the closest supported resolution (:issue:`49008`) +- Passing a ``np.datetime64`` object with non-nanosecond resolution to :class:`Timestamp` will retain the input resolution if it is "s", "ms", "us", or "ns"; otherwise it will be cast to the closest supported resolution (:issue:`49008`) +- Passing ``datetime64`` values with resolution other than nanosecond to :func:`to_datetime` will retain the input resolution if it is "s", "ms", "us", or "ns"; otherwise it will be cast to the closest supported resolution (:issue:`???`) - The ``other`` argument in :meth:`DataFrame.mask` and :meth:`Series.mask` now defaults to ``no_default`` instead of ``np.nan`` consistent with :meth:`DataFrame.where` and :meth:`Series.where`. Entries will be filled with the corresponding NULL value (``np.nan`` for numpy dtypes, ``pd.NA`` for extension dtypes). (:issue:`49111`) - Changed behavior of :meth:`Series.quantile` and :meth:`DataFrame.quantile` with :class:`SparseDtype` to retain sparse dtype (:issue:`49583`) - When creating a :class:`Series` with a object-dtype :class:`Index` of datetime objects, pandas no longer silently converts the index to a :class:`DatetimeIndex` (:issue:`39307`, :issue:`23598`) diff --git a/pandas/_libs/tslibs/np_datetime.pxd b/pandas/_libs/tslibs/np_datetime.pxd index de81c611c9ee9..dedb0ad4d75e2 100644 --- a/pandas/_libs/tslibs/np_datetime.pxd +++ b/pandas/_libs/tslibs/np_datetime.pxd @@ -106,6 +106,7 @@ cpdef cnp.ndarray astype_overflowsafe( cnp.dtype dtype, # ndarray[datetime64[anyunit]] bint copy=*, bint round_ok=*, + bint coerce=*, ) cdef int64_t get_conversion_factor(NPY_DATETIMEUNIT from_unit, NPY_DATETIMEUNIT to_unit) except? -1 diff --git a/pandas/_libs/tslibs/np_datetime.pyi b/pandas/_libs/tslibs/np_datetime.pyi index d80d26375412b..11cbb281bbd24 100644 --- a/pandas/_libs/tslibs/np_datetime.pyi +++ b/pandas/_libs/tslibs/np_datetime.pyi @@ -13,6 +13,7 @@ def astype_overflowsafe( dtype: np.dtype, copy: bool = ..., round_ok: bool = ..., + coerce: bool = ..., ) -> np.ndarray: ... def is_unitless(dtype: np.dtype) -> bool: ... def compare_mismatched_resolutions( diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index 9db3f7cb4648e..ca52209b41d3b 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -304,6 +304,7 @@ cpdef ndarray astype_overflowsafe( cnp.dtype dtype, bint copy=True, bint round_ok=True, + bint coerce=False, ): """ Convert an ndarray with datetime64[X] to datetime64[Y] @@ -381,7 +382,9 @@ cpdef ndarray astype_overflowsafe( try: check_dts_bounds(&dts, to_unit) except OutOfBoundsDatetime as err: - if is_td: + if coerce: + new_value = NPY_DATETIME_NAT + elif is_td: from_abbrev = np.datetime_data(values.dtype)[0] np_val = np.timedelta64(value, from_abbrev) msg = ( @@ -391,8 +394,8 @@ cpdef ndarray astype_overflowsafe( raise OutOfBoundsTimedelta(msg) from err else: raise - - new_value = npy_datetimestruct_to_datetime(to_unit, &dts) + else: + new_value = npy_datetimestruct_to_datetime(to_unit, &dts) # Analogous to: iresult[i] = new_value (cnp.PyArray_MultiIter_DATA(mi, 0))[0] = new_value diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 243a7c547bbb5..267abdb8d0104 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -483,7 +483,7 @@ def _coerce_to_type(x): if is_datetime64tz_dtype(x.dtype): dtype = x.dtype elif is_datetime64_dtype(x.dtype): - x = to_datetime(x) + x = to_datetime(x).astype("datetime64[ns]", copy=False) dtype = np.dtype("datetime64[ns]") elif is_timedelta64_dtype(x.dtype): x = to_timedelta(x) @@ -527,7 +527,12 @@ def _convert_bin_to_numeric_type(bins, dtype): raise ValueError("bins must be of timedelta64 dtype") elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype): if bins_dtype in ["datetime", "datetime64"]: - bins = to_datetime(bins).view(np.int64) + bins = to_datetime(bins) + if is_datetime64_dtype(bins): + # As of 2.0, to_datetime may give non-nano, so we need to convert + # here until the rest of this file recognizes non-nano + bins = bins.astype("datetime64[ns]", copy=False) + bins = bins.view(np.int64) else: raise ValueError("bins must be of datetime64 dtype") diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index a97a866a8406e..88e88ff6b7386 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -28,7 +28,9 @@ Timedelta, Timestamp, astype_overflowsafe, + get_unit_from_dtype, iNaT, + is_supported_unit, nat_strings, parsing, timezones as libtimezones, @@ -51,7 +53,6 @@ from pandas.core.dtypes.common import ( ensure_object, is_datetime64_dtype, - is_datetime64_ns_dtype, is_datetime64tz_dtype, is_float, is_integer, @@ -396,12 +397,15 @@ def _convert_listlike_datetimes( arg = arg.tz_convert(None).tz_localize("utc") return arg - elif is_datetime64_ns_dtype(arg_dtype): + elif is_datetime64_dtype(arg_dtype): + if not is_supported_unit(get_unit_from_dtype(arg_dtype)): + # We go to closest supported reso, i.e. "s" + arg = astype_overflowsafe( + np.asarray(arg), np.dtype("M8[s]"), coerce=errors == "coerce" + ) + if not isinstance(arg, (DatetimeArray, DatetimeIndex)): - try: - return DatetimeIndex(arg, tz=tz, name=name) - except ValueError: - pass + return DatetimeIndex(arg, tz=tz, name=name) elif utc: # DatetimeArray, DatetimeIndex return arg.tz_localize("utc") diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 83e40f5f1d98b..31b18002e8cef 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -22,7 +22,6 @@ iNaT, parsing, ) -from pandas._libs.tslibs.dtypes import NpyDatetimeUnit from pandas.errors import ( OutOfBoundsDatetime, OutOfBoundsTimedelta, @@ -854,22 +853,31 @@ def test_to_datetime_dt64s_and_str(self, arg, format): "dt", [np.datetime64("1000-01-01"), np.datetime64("5000-01-02")] ) def test_to_datetime_dt64s_out_of_bounds(self, cache, dt): - msg = "Out of bounds .* present at position 0" - with pytest.raises(OutOfBoundsDatetime, match=msg): - to_datetime(dt, errors="raise") + # We cast to the nearest supported reso, i.e. "s" + ts = to_datetime(dt, errors="raise", cache=cache) + assert isinstance(ts, Timestamp) + assert ts.unit == "s" + assert ts.asm8 == dt + + ts = to_datetime(dt, errors="coerce", cache=cache) + assert isinstance(ts, Timestamp) + assert ts.unit == "s" + assert ts.asm8 == dt - # TODO(2.0): The Timestamp and to_datetime behaviors should match; - # as of 2022-09-28, the Timestamp constructor has been updated - # to cast to M8[s] but to_datetime has not ts = Timestamp(dt) - assert ts._creso == NpyDatetimeUnit.NPY_FR_s.value + assert ts.unit == "s" assert ts.asm8 == dt + def test_to_datetime_dt64d_out_of_bounds(self, cache): + dt64 = np.datetime64(np.iinfo(np.int64).max, "D") + msg = "Out of bounds nanosecond timestamp" with pytest.raises(OutOfBoundsDatetime, match=msg): - Timestamp(np.datetime64(np.iinfo(np.int64).max, "D")) + Timestamp(dt64) + with pytest.raises(OutOfBoundsDatetime, match=msg): + to_datetime(dt64, errors="raise", cache=cache) - assert to_datetime(dt, errors="coerce", cache=cache) is NaT + assert to_datetime(dt64, errors="coerce", cache=cache) is NaT @pytest.mark.parametrize("unit", ["s", "D"]) def test_to_datetime_array_of_dt64s(self, cache, unit): @@ -2264,23 +2272,16 @@ def test_string_na_nat_conversion_with_name(self, cache): assert dresult.name == "foo" @pytest.mark.parametrize( - "dtype", - [ - "datetime64[h]", - "datetime64[m]", - "datetime64[s]", - "datetime64[ms]", - "datetime64[us]", - "datetime64[ns]", - ], + "unit", + ["h", "m", "s", "ms", "us", "ns"], ) - def test_dti_constructor_numpy_timeunits(self, cache, dtype): + def test_dti_constructor_numpy_timeunits(self, cache, unit): # GH 9114 + dtype = np.dtype(f"M8[{unit}]") base = to_datetime(["2000-01-01T00:00", "2000-01-02T00:00", "NaT"], cache=cache) values = base.values.astype(dtype) - unit = dtype.split("[")[-1][:-1] if unit in ["h", "m"]: # we cast to closest supported unit unit = "s" @@ -2289,7 +2290,7 @@ def test_dti_constructor_numpy_timeunits(self, cache, dtype): assert expected.dtype == exp_dtype tm.assert_index_equal(DatetimeIndex(values), expected) - tm.assert_index_equal(to_datetime(values, cache=cache), base) + tm.assert_index_equal(to_datetime(values, cache=cache), expected) def test_dayfirst(self, cache): # GH 5917 From 06caddf8be367b3fd066efbfe184d11381f25c73 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 20 Dec 2022 18:35:18 -0800 Subject: [PATCH 2/6] GH ref --- doc/source/whatsnew/v2.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 0daa3f569e832..d9b88d336b548 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -466,7 +466,7 @@ Other API changes - Passing data with dtype of "timedelta64[s]", "timedelta64[ms]", or "timedelta64[us]" to :class:`TimedeltaIndex`, :class:`Series`, or :class:`DataFrame` constructors will now retain that dtype instead of casting to "timedelta64[ns]"; timedelta64 data with lower resolution will be cast to the lowest supported resolution "timedelta64[s]" (:issue:`49014`) - Passing ``dtype`` of "timedelta64[s]", "timedelta64[ms]", or "timedelta64[us]" to :class:`TimedeltaIndex`, :class:`Series`, or :class:`DataFrame` constructors will now retain that dtype instead of casting to "timedelta64[ns]"; passing a dtype with lower resolution for :class:`Series` or :class:`DataFrame` will be cast to the lowest supported resolution "timedelta64[s]" (:issue:`49014`) - Passing a ``np.datetime64`` object with non-nanosecond resolution to :class:`Timestamp` will retain the input resolution if it is "s", "ms", "us", or "ns"; otherwise it will be cast to the closest supported resolution (:issue:`49008`) -- Passing ``datetime64`` values with resolution other than nanosecond to :func:`to_datetime` will retain the input resolution if it is "s", "ms", "us", or "ns"; otherwise it will be cast to the closest supported resolution (:issue:`???`) +- Passing ``datetime64`` values with resolution other than nanosecond to :func:`to_datetime` will retain the input resolution if it is "s", "ms", "us", or "ns"; otherwise it will be cast to the closest supported resolution (:issue:`50369`) - The ``other`` argument in :meth:`DataFrame.mask` and :meth:`Series.mask` now defaults to ``no_default`` instead of ``np.nan`` consistent with :meth:`DataFrame.where` and :meth:`Series.where`. Entries will be filled with the corresponding NULL value (``np.nan`` for numpy dtypes, ``pd.NA`` for extension dtypes). (:issue:`49111`) - Changed behavior of :meth:`Series.quantile` and :meth:`DataFrame.quantile` with :class:`SparseDtype` to retain sparse dtype (:issue:`49583`) - When creating a :class:`Series` with a object-dtype :class:`Index` of datetime objects, pandas no longer silently converts the index to a :class:`DatetimeIndex` (:issue:`39307`, :issue:`23598`) From 79565d706920014011e25907aab4c03a95b31f75 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 21 Dec 2022 15:46:16 -0800 Subject: [PATCH 3/6] Fix PandasArray case --- pandas/core/tools/datetimes.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 88e88ff6b7386..0adfa0f1f619e 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -70,6 +70,7 @@ from pandas.arrays import ( DatetimeArray, IntegerArray, + PandasArray, ) from pandas.core import algorithms from pandas.core.algorithms import unique @@ -386,6 +387,8 @@ def _convert_listlike_datetimes( """ if isinstance(arg, (list, tuple)): arg = np.array(arg, dtype="O") + elif isinstance(arg, PandasArray): + arg = np.array(arg) arg_dtype = getattr(arg, "dtype", None) # these are shortcutable From f2c3eff9637253a6b3087a8c86f32807c0aae7c0 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 24 Dec 2022 12:11:15 -0800 Subject: [PATCH 4/6] mypy fixup --- pandas/core/tools/datetimes.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 0adfa0f1f619e..af7ee4e419e97 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -401,6 +401,7 @@ def _convert_listlike_datetimes( return arg elif is_datetime64_dtype(arg_dtype): + arg_dtype = cast(np.dtype, arg_dtype) if not is_supported_unit(get_unit_from_dtype(arg_dtype)): # We go to closest supported reso, i.e. "s" arg = astype_overflowsafe( From 52c383c388274e4969918658637ad470d2c11692 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 27 Dec 2022 14:47:03 -0800 Subject: [PATCH 5/6] comment about errors=ignore --- pandas/core/tools/datetimes.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index af7ee4e419e97..e1fc9f598a18e 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -405,7 +405,10 @@ def _convert_listlike_datetimes( if not is_supported_unit(get_unit_from_dtype(arg_dtype)): # We go to closest supported reso, i.e. "s" arg = astype_overflowsafe( - np.asarray(arg), np.dtype("M8[s]"), coerce=errors == "coerce" + # TODO: looks like we incorrectly raise with errors=="ignore" + np.asarray(arg), + np.dtype("M8[s]"), + coerce=errors == "coerce", ) if not isinstance(arg, (DatetimeArray, DatetimeIndex)): From dd8b4c893229fd6eec995e95b72fe26ccfc8827d Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 4 Jan 2023 12:57:41 -0800 Subject: [PATCH 6/6] suggested edits --- pandas/_libs/tslibs/np_datetime.pxd | 2 +- pandas/_libs/tslibs/np_datetime.pyi | 2 +- pandas/_libs/tslibs/np_datetime.pyx | 4 ++-- pandas/core/tools/datetimes.py | 2 +- pandas/tests/tools/test_to_datetime.py | 12 ++++-------- 5 files changed, 9 insertions(+), 13 deletions(-) diff --git a/pandas/_libs/tslibs/np_datetime.pxd b/pandas/_libs/tslibs/np_datetime.pxd index 8cc2a3f7a55c3..fa560cd0853f6 100644 --- a/pandas/_libs/tslibs/np_datetime.pxd +++ b/pandas/_libs/tslibs/np_datetime.pxd @@ -106,7 +106,7 @@ cpdef cnp.ndarray astype_overflowsafe( cnp.dtype dtype, # ndarray[datetime64[anyunit]] bint copy=*, bint round_ok=*, - bint coerce=*, + bint is_coerce=*, ) cdef int64_t get_conversion_factor(NPY_DATETIMEUNIT from_unit, NPY_DATETIMEUNIT to_unit) except? -1 diff --git a/pandas/_libs/tslibs/np_datetime.pyi b/pandas/_libs/tslibs/np_datetime.pyi index 11cbb281bbd24..0cb0e3b0237d7 100644 --- a/pandas/_libs/tslibs/np_datetime.pyi +++ b/pandas/_libs/tslibs/np_datetime.pyi @@ -13,7 +13,7 @@ def astype_overflowsafe( dtype: np.dtype, copy: bool = ..., round_ok: bool = ..., - coerce: bool = ..., + is_coerce: bool = ..., ) -> np.ndarray: ... def is_unitless(dtype: np.dtype) -> bool: ... def compare_mismatched_resolutions( diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index fd39e6542f5b9..aa3411385595b 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -308,7 +308,7 @@ cpdef ndarray astype_overflowsafe( cnp.dtype dtype, bint copy=True, bint round_ok=True, - bint coerce=False, + bint is_coerce=False, ): """ Convert an ndarray with datetime64[X] to datetime64[Y] @@ -386,7 +386,7 @@ cpdef ndarray astype_overflowsafe( try: check_dts_bounds(&dts, to_unit) except OutOfBoundsDatetime as err: - if coerce: + if is_coerce: new_value = NPY_DATETIME_NAT elif is_td: from_abbrev = np.datetime_data(values.dtype)[0] diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index e510f1d0a9676..eaa7339f3747a 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -407,7 +407,7 @@ def _convert_listlike_datetimes( # TODO: looks like we incorrectly raise with errors=="ignore" np.asarray(arg), np.dtype("M8[s]"), - coerce=errors == "coerce", + is_coerce=errors == "coerce", ) if not isinstance(arg, (DatetimeArray, DatetimeIndex)): diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index b748e0c3974f6..dda2eda2f1906 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1025,14 +1025,10 @@ def test_to_datetime_dt64s_and_str(self, arg, format): @pytest.mark.parametrize( "dt", [np.datetime64("1000-01-01"), np.datetime64("5000-01-02")] ) - def test_to_datetime_dt64s_out_of_bounds(self, cache, dt): - # We cast to the nearest supported reso, i.e. "s" - ts = to_datetime(dt, errors="raise", cache=cache) - assert isinstance(ts, Timestamp) - assert ts.unit == "s" - assert ts.asm8 == dt - - ts = to_datetime(dt, errors="coerce", cache=cache) + @pytest.mark.parametrize("errors", ["raise", "ignore", "coerce"]) + def test_to_datetime_dt64s_out_of_ns_bounds(self, cache, dt, errors): + # GH#50369 We cast to the nearest supported reso, i.e. "s" + ts = to_datetime(dt, errors=errors, cache=cache) assert isinstance(ts, Timestamp) assert ts.unit == "s" assert ts.asm8 == dt