diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 00f83be0b51c4..132d742b78e9c 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -31,6 +31,7 @@ from pandas._libs.tslibs.np_datetime cimport ( NPY_DATETIMEUNIT, NPY_FR_ns, _string_to_dts, + astype_overflowsafe, check_dts_bounds, dt64_to_dtstruct, dtstruct_to_dt64, @@ -215,54 +216,20 @@ def ensure_datetime64ns(arr: ndarray, copy: bool = True): ------- ndarray with dtype datetime64[ns] """ - cdef: - Py_ssize_t i, n = arr.size - const int64_t[:] ivalues - int64_t[:] iresult - NPY_DATETIMEUNIT unit - npy_datetimestruct dts - - shape = (arr).shape - if (arr).dtype.byteorder == ">": # GH#29684 we incorrectly get OutOfBoundsDatetime if we dont swap dtype = arr.dtype arr = arr.astype(dtype.newbyteorder("<")) if arr.size == 0: + # Fastpath; doesn't matter but we have old tests for result.base + # being arr. result = arr.view(DT64NS_DTYPE) if copy: result = result.copy() return result - if arr.dtype.kind != "M": - raise TypeError("ensure_datetime64ns arr must have datetime64 dtype") - unit = get_unit_from_dtype(arr.dtype) - if unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC: - # without raising explicitly here, we end up with a SystemError - # built-in function ensure_datetime64ns returned a result with an error - raise ValueError("datetime64/timedelta64 must have a unit specified") - - if unit == NPY_FR_ns: - # Check this before allocating result for perf, might save some memory - if copy: - return arr.copy() - return arr - - ivalues = arr.view(np.int64).ravel("K") - - result = np.empty_like(arr, dtype=DT64NS_DTYPE) - iresult = result.ravel("K").view(np.int64) - - for i in range(n): - if ivalues[i] != NPY_NAT: - pandas_datetime_to_datetimestruct(ivalues[i], unit, &dts) - iresult[i] = dtstruct_to_dt64(&dts) - check_dts_bounds(&dts) - else: - iresult[i] = NPY_NAT - - return result + return astype_overflowsafe(arr, DT64NS_DTYPE, copy=copy) def 
ensure_timedelta64ns(arr: ndarray, copy: bool = True): diff --git a/pandas/_libs/tslibs/np_datetime.pxd b/pandas/_libs/tslibs/np_datetime.pxd index dd1494404e933..211b47cc3dc20 100644 --- a/pandas/_libs/tslibs/np_datetime.pxd +++ b/pandas/_libs/tslibs/np_datetime.pxd @@ -52,6 +52,8 @@ cdef extern from "numpy/ndarraytypes.h": NPY_FR_as NPY_FR_GENERIC + int64_t NPY_DATETIME_NAT # elsewhere we call this NPY_NAT + cdef extern from "src/datetime/np_datetime.h": ctypedef struct pandas_timedeltastruct: int64_t days @@ -67,7 +69,7 @@ cdef extern from "src/datetime/np_datetime.h": cdef bint cmp_scalar(int64_t lhs, int64_t rhs, int op) except -1 -cdef check_dts_bounds(npy_datetimestruct *dts) +cdef check_dts_bounds(npy_datetimestruct *dts, NPY_DATETIMEUNIT unit=?) cdef int64_t dtstruct_to_dt64(npy_datetimestruct* dts) nogil cdef void dt64_to_dtstruct(int64_t dt64, npy_datetimestruct* out) nogil @@ -86,3 +88,9 @@ cdef int _string_to_dts(str val, npy_datetimestruct* dts, bint want_exc) except? -1 cdef NPY_DATETIMEUNIT get_unit_from_dtype(cnp.dtype dtype) + +cpdef cnp.ndarray astype_overflowsafe( + cnp.ndarray values, # ndarray[datetime64[anyunit]] + cnp.dtype dtype, # datetime64[anyunit] + bint copy=*, +) diff --git a/pandas/_libs/tslibs/np_datetime.pyi b/pandas/_libs/tslibs/np_datetime.pyi index 5227de4e72f44..222e770ef03d5 100644 --- a/pandas/_libs/tslibs/np_datetime.pyi +++ b/pandas/_libs/tslibs/np_datetime.pyi @@ -4,3 +4,6 @@ class OutOfBoundsDatetime(ValueError): ... # only exposed for testing def py_get_unit_from_dtype(dtype: np.dtype): ... +def astype_overflowsafe( + arr: np.ndarray, dtype: np.dtype, copy: bool = ... +) -> np.ndarray: ... 
diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index 32d2f1ca4e406..f7f7a29671d48 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -22,7 +22,10 @@ import_datetime() cimport numpy as cnp cnp.import_array() -from numpy cimport int64_t +from numpy cimport ( + int64_t, + ndarray, +) from pandas._libs.tslibs.util cimport get_c_string_buf_and_size @@ -36,7 +39,12 @@ cdef extern from "src/datetime/np_datetime.h": pandas_timedeltastruct *result ) nogil + # AS, FS, PS versions exist but are not imported because they are not used. npy_datetimestruct _NS_MIN_DTS, _NS_MAX_DTS + npy_datetimestruct _US_MIN_DTS, _US_MAX_DTS + npy_datetimestruct _MS_MIN_DTS, _MS_MAX_DTS + npy_datetimestruct _S_MIN_DTS, _S_MAX_DTS + npy_datetimestruct _M_MIN_DTS, _M_MAX_DTS PyArray_DatetimeMetaData get_datetime_metadata_from_dtype(cnp.PyArray_Descr *dtype); @@ -119,22 +127,40 @@ class OutOfBoundsDatetime(ValueError): pass -cdef inline check_dts_bounds(npy_datetimestruct *dts): +cdef check_dts_bounds(npy_datetimestruct *dts, NPY_DATETIMEUNIT unit=NPY_FR_ns): """Raises OutOfBoundsDatetime if the given date is outside the range that can be represented by nanosecond-resolution 64-bit integers.""" cdef: bint error = False - - if (dts.year <= 1677 and - cmp_npy_datetimestruct(dts, &_NS_MIN_DTS) == -1): + npy_datetimestruct cmp_upper, cmp_lower + + if unit == NPY_FR_ns: + cmp_upper = _NS_MAX_DTS + cmp_lower = _NS_MIN_DTS + elif unit == NPY_FR_us: + cmp_upper = _US_MAX_DTS + cmp_lower = _US_MIN_DTS + elif unit == NPY_FR_ms: + cmp_upper = _MS_MAX_DTS + cmp_lower = _MS_MIN_DTS + elif unit == NPY_FR_s: + cmp_upper = _S_MAX_DTS + cmp_lower = _S_MIN_DTS + elif unit == NPY_FR_m: + cmp_upper = _M_MAX_DTS + cmp_lower = _M_MIN_DTS + else: + raise NotImplementedError(unit) + + if cmp_npy_datetimestruct(dts, &cmp_lower) == -1: error = True - elif (dts.year >= 2262 and - cmp_npy_datetimestruct(dts, &_NS_MAX_DTS) == 1): + elif 
cmp_npy_datetimestruct(dts, &cmp_upper) == 1: error = True if error: fmt = (f'{dts.year}-{dts.month:02d}-{dts.day:02d} ' f'{dts.hour:02d}:{dts.min:02d}:{dts.sec:02d}') + # TODO: "nanosecond" in the message assumes NPY_FR_ns raise OutOfBoundsDatetime(f'Out of bounds nanosecond timestamp: {fmt}') @@ -202,3 +228,68 @@ cdef inline int _string_to_dts(str val, npy_datetimestruct* dts, buf = get_c_string_buf_and_size(val, &length) return parse_iso_8601_datetime(buf, length, want_exc, dts, out_local, out_tzoffset) + + +cpdef ndarray astype_overflowsafe( + ndarray values, + cnp.dtype dtype, + bint copy=True, +): + """ + Convert an ndarray with datetime64[X] to datetime64[Y], raising on overflow. + """ + if values.descr.type_num != cnp.NPY_DATETIME: + # aka values.dtype.kind != "M" + raise TypeError("astype_overflowsafe values must have datetime64 dtype") + if dtype.type_num != cnp.NPY_DATETIME: + raise TypeError("astype_overflowsafe dtype must be datetime64") + + cdef: + NPY_DATETIMEUNIT from_unit = get_unit_from_dtype(values.dtype) + NPY_DATETIMEUNIT to_unit = get_unit_from_dtype(dtype) + + if ( + from_unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC + or to_unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC + ): + # without raising explicitly here, we end up with a SystemError + # built-in function [...] 
returned a result with an error + raise ValueError("datetime64 values and dtype must have a unit specified") + + if from_unit == to_unit: + # Check this before allocating result for perf, might save some memory + if copy: + return values.copy() + return values + + cdef: + ndarray i8values = values.view("i8") + + # equiv: result = np.empty((values).shape, dtype="i8") + ndarray iresult = cnp.PyArray_EMPTY( + values.ndim, values.shape, cnp.NPY_INT64, 0 + ) + + cnp.broadcast mi = cnp.PyArray_MultiIterNew2(iresult, i8values) + cnp.flatiter it + Py_ssize_t i, N = values.size + int64_t value, new_value + npy_datetimestruct dts + + for i in range(N): + # Analogous to: item = values[i] + value = (cnp.PyArray_MultiIter_DATA(mi, 1))[0] + + if value == NPY_DATETIME_NAT: + new_value = NPY_DATETIME_NAT + else: + pandas_datetime_to_datetimestruct(value, from_unit, &dts) + check_dts_bounds(&dts, to_unit) + new_value = npy_datetimestruct_to_datetime(to_unit, &dts) + + # Analogous to: iresult[i] = new_value + (cnp.PyArray_MultiIter_DATA(mi, 0))[0] = new_value + + cnp.PyArray_MultiIter_NEXT(mi) + + return iresult.view(dtype) diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime.c b/pandas/_libs/tslibs/src/datetime/np_datetime.c index 12e20df256293..69c073d223e67 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime.c +++ b/pandas/_libs/tslibs/src/datetime/np_datetime.c @@ -27,10 +27,40 @@ This file is derived from NumPy 1.7. 
See NUMPY_LICENSE.txt #include #include "np_datetime.h" + +const npy_datetimestruct _AS_MIN_DTS = { + 1969, 12, 31, 23, 59, 50, 776627, 963145, 224193}; +const npy_datetimestruct _FS_MIN_DTS = { + 1969, 12, 31, 21, 26, 16, 627963, 145224, 193000}; +const npy_datetimestruct _PS_MIN_DTS = { + 1969, 9, 16, 5, 57, 7, 963145, 224193, 0}; const npy_datetimestruct _NS_MIN_DTS = { 1677, 9, 21, 0, 12, 43, 145224, 193000, 0}; +const npy_datetimestruct _US_MIN_DTS = { + -290308, 12, 21, 19, 59, 05, 224193, 0, 0}; +const npy_datetimestruct _MS_MIN_DTS = { + -292275055, 5, 16, 16, 47, 4, 193000, 0, 0}; +const npy_datetimestruct _S_MIN_DTS = { + -292277022657, 1, 27, 8, 29, 53, 0, 0, 0}; +const npy_datetimestruct _M_MIN_DTS = { + -17536621475646, 5, 4, 5, 53, 0, 0, 0, 0}; + +const npy_datetimestruct _AS_MAX_DTS = { + 1970, 1, 1, 0, 0, 9, 223372, 36854, 775807}; +const npy_datetimestruct _FS_MAX_DTS = { + 1970, 1, 1, 2, 33, 43, 372036, 854775, 807000}; +const npy_datetimestruct _PS_MAX_DTS = { + 1970, 4, 17, 18, 2, 52, 36854, 775807, 0}; const npy_datetimestruct _NS_MAX_DTS = { 2262, 4, 11, 23, 47, 16, 854775, 807000, 0}; +const npy_datetimestruct _US_MAX_DTS = { + 294247, 1, 10, 4, 0, 54, 775807, 0, 0}; +const npy_datetimestruct _MS_MAX_DTS = { + 292278994, 8, 17, 7, 12, 55, 807000, 0, 0}; +const npy_datetimestruct _S_MAX_DTS = { + 292277026596, 12, 4, 15, 30, 7, 0, 0, 0}; +const npy_datetimestruct _M_MAX_DTS = { + 17536621479585, 8, 30, 18, 7, 0, 0, 0, 0}; const int days_per_month_table[2][12] = { diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime.h b/pandas/_libs/tslibs/src/datetime/np_datetime.h index 8e58be1ca8383..065f09a6d93b5 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime.h +++ b/pandas/_libs/tslibs/src/datetime/np_datetime.h @@ -28,8 +28,22 @@ typedef struct { npy_int32 hrs, min, sec, ms, us, ns, seconds, microseconds, nanoseconds; } pandas_timedeltastruct; +extern const npy_datetimestruct _AS_MIN_DTS; +extern const npy_datetimestruct _AS_MAX_DTS; 
+extern const npy_datetimestruct _FS_MIN_DTS; +extern const npy_datetimestruct _FS_MAX_DTS; +extern const npy_datetimestruct _PS_MIN_DTS; +extern const npy_datetimestruct _PS_MAX_DTS; extern const npy_datetimestruct _NS_MIN_DTS; extern const npy_datetimestruct _NS_MAX_DTS; +extern const npy_datetimestruct _US_MIN_DTS; +extern const npy_datetimestruct _US_MAX_DTS; +extern const npy_datetimestruct _MS_MIN_DTS; +extern const npy_datetimestruct _MS_MAX_DTS; +extern const npy_datetimestruct _S_MIN_DTS; +extern const npy_datetimestruct _S_MAX_DTS; +extern const npy_datetimestruct _M_MIN_DTS; +extern const npy_datetimestruct _M_MAX_DTS; // stuff pandas needs // ---------------------------------------------------------------------------- diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index ff77566e1d559..9190585b2882d 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -534,7 +534,7 @@ cdef const int64_t[:] _tz_convert_from_utc(const int64_t[:] stamps, tzinfo tz): int64_t[::1] result - if is_utc(tz): + if is_utc(tz) or tz is None: # Much faster than going through the "standard" pattern below return stamps.copy() diff --git a/pandas/tests/tslibs/test_np_datetime.py b/pandas/tests/tslibs/test_np_datetime.py index 00a2f90217434..8fff63eaeac05 100644 --- a/pandas/tests/tslibs/test_np_datetime.py +++ b/pandas/tests/tslibs/test_np_datetime.py @@ -1,6 +1,13 @@ import numpy as np +import pytest -from pandas._libs.tslibs.np_datetime import py_get_unit_from_dtype +from pandas._libs.tslibs.np_datetime import ( + OutOfBoundsDatetime, + astype_overflowsafe, + py_get_unit_from_dtype, +) + +import pandas._testing as tm def test_get_unit_from_dtype(): @@ -35,3 +42,50 @@ def test_get_unit_from_dtype(): assert py_get_unit_from_dtype(np.dtype("m8[ps]")) == 11 assert py_get_unit_from_dtype(np.dtype("m8[fs]")) == 12 assert py_get_unit_from_dtype(np.dtype("m8[as]")) == 13 + + +class TestAstypeOverflowSafe: + 
def test_pass_non_dt64_array(self): + # check that we raise, not segfault + arr = np.arange(5) + dtype = np.dtype("M8[ns]") + + msg = "astype_overflowsafe values must have datetime64 dtype" + with pytest.raises(TypeError, match=msg): + astype_overflowsafe(arr, dtype, copy=True) + + with pytest.raises(TypeError, match=msg): + astype_overflowsafe(arr, dtype, copy=False) + + def test_pass_non_dt64_dtype(self): + # check that we raise, not segfault + arr = np.arange(5, dtype="i8").view("M8[D]") + dtype = np.dtype("m8[ns]") + + msg = "astype_overflowsafe dtype must be datetime64" + with pytest.raises(TypeError, match=msg): + astype_overflowsafe(arr, dtype, copy=True) + + with pytest.raises(TypeError, match=msg): + astype_overflowsafe(arr, dtype, copy=False) + + def test_astype_overflowsafe(self): + dtype = np.dtype("M8[ns]") + + dt = np.datetime64("2262-04-05", "D") + arr = dt + np.arange(10, dtype="m8[D]") + + # arr.astype silently overflows, so this round-trip does not match + wrong = arr.astype(dtype) + roundtrip = wrong.astype(arr.dtype) + assert not (wrong == roundtrip).all() + + msg = "Out of bounds nanosecond timestamp" + with pytest.raises(OutOfBoundsDatetime, match=msg): + astype_overflowsafe(arr, dtype) + + # But converting to microseconds is fine, and we match numpy's results. + dtype2 = np.dtype("M8[us]") + result = astype_overflowsafe(arr, dtype2) + expected = arr.astype(dtype2) + tm.assert_numpy_array_equal(result, expected)