From 97eeba9e7d4f82c2e0e794f97bdee38fa440d56f Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 18 Jul 2022 14:21:25 -0700 Subject: [PATCH 1/3] BUG: PeriodIndex-with-Nat + TimedeltaArray --- pandas/_libs/tslibs/dtypes.pxd | 2 +- pandas/_libs/tslibs/dtypes.pyi | 1 + pandas/_libs/tslibs/dtypes.pyx | 2 +- pandas/_libs/tslibs/np_datetime.pxd | 1 + pandas/_libs/tslibs/np_datetime.pyi | 5 +- pandas/_libs/tslibs/np_datetime.pyx | 72 +++++++++++++++++++++---- pandas/core/arrays/period.py | 33 ++++++++---- pandas/tests/arithmetic/test_period.py | 15 ++++++ pandas/tests/tslibs/test_np_datetime.py | 12 +++++ 9 files changed, 119 insertions(+), 24 deletions(-) diff --git a/pandas/_libs/tslibs/dtypes.pxd b/pandas/_libs/tslibs/dtypes.pxd index 6e41a55f30929..352680143113d 100644 --- a/pandas/_libs/tslibs/dtypes.pxd +++ b/pandas/_libs/tslibs/dtypes.pxd @@ -3,7 +3,7 @@ from numpy cimport int64_t from pandas._libs.tslibs.np_datetime cimport NPY_DATETIMEUNIT -cdef str npy_unit_to_abbrev(NPY_DATETIMEUNIT unit) +cpdef str npy_unit_to_abbrev(NPY_DATETIMEUNIT unit) cdef NPY_DATETIMEUNIT abbrev_to_npy_unit(str abbrev) cdef NPY_DATETIMEUNIT freq_group_code_to_npy_unit(int freq) nogil cpdef int64_t periods_per_day(NPY_DATETIMEUNIT reso=*) except? -1 diff --git a/pandas/_libs/tslibs/dtypes.pyi b/pandas/_libs/tslibs/dtypes.pyi index 041c51533d8da..82f62e16c4205 100644 --- a/pandas/_libs/tslibs/dtypes.pyi +++ b/pandas/_libs/tslibs/dtypes.pyi @@ -8,6 +8,7 @@ _period_code_map: dict[str, int] def periods_per_day(reso: int) -> int: ... def periods_per_second(reso: int) -> int: ... def is_supported_unit(reso: int) -> bool: ... +def npy_unit_to_abbrev(reso: int) -> str: ... class PeriodDtypeBase: _dtype_code: int # PeriodDtypeCode diff --git a/pandas/_libs/tslibs/dtypes.pyx b/pandas/_libs/tslibs/dtypes.pyx index a0a7ab90ebb30..c09ac2a686d5c 100644 --- a/pandas/_libs/tslibs/dtypes.pyx +++ b/pandas/_libs/tslibs/dtypes.pyx @@ -289,7 +289,7 @@ def is_supported_unit(NPY_DATETIMEUNIT reso): ) -cdef str npy_unit_to_abbrev(NPY_DATETIMEUNIT unit): +cpdef str npy_unit_to_abbrev(NPY_DATETIMEUNIT unit): if unit == NPY_DATETIMEUNIT.NPY_FR_ns or unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC: # generic -> default to nanoseconds return "ns" diff --git a/pandas/_libs/tslibs/np_datetime.pxd b/pandas/_libs/tslibs/np_datetime.pxd index 420d83909a78d..bfccedba9431e 100644 --- a/pandas/_libs/tslibs/np_datetime.pxd +++ b/pandas/_libs/tslibs/np_datetime.pxd @@ -101,6 +101,7 @@ cpdef cnp.ndarray astype_overflowsafe( cnp.ndarray values, # ndarray[datetime64[anyunit]] cnp.dtype dtype, # ndarray[datetime64[anyunit]] bint copy=*, + bint round_ok=*, ) cdef int64_t get_conversion_factor(NPY_DATETIMEUNIT from_unit, NPY_DATETIMEUNIT to_unit) except? -1 diff --git a/pandas/_libs/tslibs/np_datetime.pyi b/pandas/_libs/tslibs/np_datetime.pyi index 757165fbad268..d80d26375412b 100644 --- a/pandas/_libs/tslibs/np_datetime.pyi +++ b/pandas/_libs/tslibs/np_datetime.pyi @@ -9,7 +9,10 @@ class OutOfBoundsTimedelta(ValueError): ... def py_get_unit_from_dtype(dtype: np.dtype): ... def py_td64_to_tdstruct(td64: int, unit: int) -> dict: ... def astype_overflowsafe( - arr: np.ndarray, dtype: np.dtype, copy: bool = ... + arr: np.ndarray, + dtype: np.dtype, + copy: bool = ..., + round_ok: bool = ..., ) -> np.ndarray: ... def is_unitless(dtype: np.dtype) -> bool: ... def compare_mismatched_resolutions( diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index 494eb5da7e107..87d845d03a5a8 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -282,6 +282,7 @@ cpdef ndarray astype_overflowsafe( ndarray values, cnp.dtype dtype, bint copy=True, + bint round_ok=True, ): """ Convert an ndarray with datetime64[X] to datetime64[Y] @@ -314,10 +315,6 @@ cpdef ndarray astype_overflowsafe( "datetime64/timedelta64 values and dtype must have a unit specified" ) - if (values).dtype.byteorder == ">": - # GH#29684 we incorrectly get OutOfBoundsDatetime if we dont swap - values = values.astype(values.dtype.newbyteorder("<")) - if from_unit == to_unit: # Check this before allocating result for perf, might save some memory if copy: @@ -325,9 +322,17 @@ cpdef ndarray astype_overflowsafe( return values elif from_unit > to_unit: - # e.g. ns -> us, so there is no risk of overflow, so we can use - # numpy's astype safely. Note there _is_ risk of truncation. - return values.astype(dtype) + if round_ok: + # e.g. ns -> us, so there is no risk of overflow, so we can use + # numpy's astype safely. Note there _is_ risk of truncation. + return values.astype(dtype) + else: + iresult2 = astype_round_check(values.view("i8"), from_unit, to_unit) + return iresult2.view(dtype) + + if (values).dtype.byteorder == ">": + # GH#29684 we incorrectly get OutOfBoundsDatetime if we dont swap + values = values.astype(values.dtype.newbyteorder("<")) cdef: ndarray i8values = values.view("i8") @@ -356,10 +361,11 @@ cpdef ndarray astype_overflowsafe( check_dts_bounds(&dts, to_unit) except OutOfBoundsDatetime as err: if is_td: - tdval = np.timedelta64(value).view(values.dtype) + from_abbrev = str(values.dtype).split("[")[-1][:-1] + np_val = np.timedelta64(value, from_abbrev) msg = ( - "Cannot convert {tdval} to {dtype} without overflow" - .format(tdval=str(tdval), dtype=str(dtype)) + "Cannot convert {np_val} to {dtype} without overflow" + .format(np_val=str(np_val), dtype=str(dtype)) ) raise OutOfBoundsTimedelta(msg) from err else: @@ -453,6 +459,52 @@ cdef int op_to_op_code(op): return Py_GT +cdef ndarray astype_round_check( + ndarray i8values, + NPY_DATETIMEUNIT from_unit, + NPY_DATETIMEUNIT to_unit +): + # cases with from_unit > to_unit, e.g. ns->us, raise if the conversion + # involves truncation, e.g. 1500ns->1us + cdef: + Py_ssize_t i, N = i8values.size + + # equiv: iresult = np.empty((i8values).shape, dtype="i8") + ndarray iresult = cnp.PyArray_EMPTY( + i8values.ndim, i8values.shape, cnp.NPY_INT64, 0 + ) + cnp.broadcast mi = cnp.PyArray_MultiIterNew2(iresult, i8values) + + # Note the arguments to_unit, from unit are swapped vs how they + # are passed when going to a higher-frequency reso. + int64_t mult = get_conversion_factor(to_unit, from_unit) + int64_t value, mod + + for i in range(N): + # Analogous to: item = i8values[i] + value = (cnp.PyArray_MultiIter_DATA(mi, 1))[0] + + if value == NPY_DATETIME_NAT: + new_value = NPY_DATETIME_NAT + else: + new_value, mod = divmod(value, mult) + if mod != 0: + # TODO: avoid runtime import + from pandas._libs.tslibs.dtypes import npy_unit_to_abbrev + from_abbrev = npy_unit_to_abbrev(from_unit) + to_abbrev = npy_unit_to_abbrev(to_unit) + raise ValueError( + f"Cannot losslessly cast '{value} {from_abbrev}' to {to_abbrev}" + ) + + # Analogous to: iresult[i] = new_value + (cnp.PyArray_MultiIter_DATA(mi, 0))[0] = new_value + + cnp.PyArray_MultiIter_NEXT(mi) + + return iresult + + @cython.overflowcheck(True) cdef int64_t get_conversion_factor(NPY_DATETIMEUNIT from_unit, NPY_DATETIMEUNIT to_unit) except? -1: """ diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 6e6de8399cc38..8a1ed871900d2 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -72,10 +72,7 @@ ABCSeries, ABCTimedeltaArray, ) -from pandas.core.dtypes.missing import ( - isna, - notna, -) +from pandas.core.dtypes.missing import notna import pandas.core.algorithms as algos from pandas.core.arrays import datetimelike as dtl @@ -798,14 +795,28 @@ def _add_timedelta_arraylike( f"Cannot add or subtract timedelta64[ns] dtype from {self.dtype}" ) - if not np.all(isna(other)): - delta = self._check_timedeltalike_freq_compat(other) - else: - # all-NaT TimedeltaIndex is equivalent to a single scalar td64 NaT - return self + np.timedelta64("NaT") + # FIXME: kludge + abbrev = self.freq._prefix + abbrev = {"H": "h", "T": "m", "S": "s", "L": "ms", "U": "us", "N": "ns"}.get( + abbrev, abbrev + ) + dtype = np.dtype(f"m8[{abbrev}]") + + try: + delta = astype_overflowsafe( + np.asarray(other), dtype=dtype, copy=False, round_ok=False + ) + except ValueError as err: + # TODO: not actually a great exception message in this case + raise raise_on_incompatible(self, other) from err + + b_mask = np.isnat(delta) - ordinals = self._addsub_int_array_or_scalar(delta, operator.add).asi8 - return type(self)(ordinals, dtype=self.dtype) + res_values = algos.checked_add_with_arr( + self.asi8, delta.view("i8"), arr_mask=self._isnan, b_mask=b_mask + ) + np.putmask(res_values, self._isnan | b_mask, iNaT) + return type(self)(res_values, freq=self.freq) def _check_timedeltalike_freq_compat(self, other): """ diff --git a/pandas/tests/arithmetic/test_period.py b/pandas/tests/arithmetic/test_period.py index 7adc407fd5de1..50f5ab8aee9dd 100644 --- a/pandas/tests/arithmetic/test_period.py +++ b/pandas/tests/arithmetic/test_period.py @@ -1243,6 +1243,21 @@ def test_parr_add_sub_tdt64_nat_array(self, box_with_array, other): with pytest.raises(TypeError, match=msg): other - obj + # some but not *all* NaT + other = other.copy() + other[0] = np.timedelta64(0, "ns") + expected = PeriodIndex([pi[0]] + ["NaT"] * 8, freq="19D") + expected = tm.box_expected(expected, box_with_array) + + result = obj + other + tm.assert_equal(result, expected) + result = other + obj + tm.assert_equal(result, expected) + result = obj - other + tm.assert_equal(result, expected) + with pytest.raises(TypeError, match=msg): + other - obj + # --------------------------------------------------------------- # Unsorted diff --git a/pandas/tests/tslibs/test_np_datetime.py b/pandas/tests/tslibs/test_np_datetime.py index cc09f0fc77039..02edf1a093877 100644 --- a/pandas/tests/tslibs/test_np_datetime.py +++ b/pandas/tests/tslibs/test_np_datetime.py @@ -208,3 +208,15 @@ def test_astype_overflowsafe_td64(self): result = astype_overflowsafe(arr, dtype2) expected = arr.astype(dtype2) tm.assert_numpy_array_equal(result, expected) + + def test_astype_overflowsafe_disallow_rounding(self): + arr = np.array([-1500, 1500], dtype="M8[ns]") + dtype = np.dtype("M8[us]") + + msg = "Cannot losslessly cast '-1500 ns' to us" + with pytest.raises(ValueError, match=msg): + astype_overflowsafe(arr, dtype, round_ok=False) + + result = astype_overflowsafe(arr, dtype, round_ok=True) + expected = arr.astype(dtype) + tm.assert_numpy_array_equal(result, expected) From 1532a46723d23107bcef6da4ebd2829fab04f0ca Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 18 Jul 2022 15:10:54 -0700 Subject: [PATCH 2/3] mypy fixup --- pandas/core/arrays/period.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 8a1ed871900d2..baf2f04feb4e0 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -789,14 +789,16 @@ def _add_timedelta_arraylike( ------- result : ndarray[int64] """ - if not isinstance(self.freq, Tick): + freq = self.freq + if not isinstance(freq, Tick): # We cannot add timedelta-like to non-tick PeriodArray raise TypeError( f"Cannot add or subtract timedelta64[ns] dtype from {self.dtype}" ) # FIXME: kludge - abbrev = self.freq._prefix + # error: "Tick" has no attribute "_prefix" + abbrev = freq._prefix # type: ignore[attr-defined] abbrev = {"H": "h", "T": "m", "S": "s", "L": "ms", "U": "us", "N": "ns"}.get( abbrev, abbrev ) From e8d329af59893a2ff0b8d7e8092a0e32a73c4ad8 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 19 Jul 2022 11:21:35 -0700 Subject: [PATCH 3/3] de-kludge --- pandas/_libs/tslibs/np_datetime.pyx | 2 +- pandas/_libs/tslibs/offsets.pyi | 2 ++ pandas/_libs/tslibs/offsets.pyx | 8 ++++++++ pandas/core/arrays/period.py | 8 +------- 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index 87d845d03a5a8..5f8bad8398076 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -361,7 +361,7 @@ cpdef ndarray astype_overflowsafe( check_dts_bounds(&dts, to_unit) except OutOfBoundsDatetime as err: if is_td: - from_abbrev = str(values.dtype).split("[")[-1][:-1] + from_abbrev = np.datetime_data(values.dtype)[0] np_val = np.timedelta64(value, from_abbrev) msg = ( "Cannot convert {np_val} to {dtype} without overflow" diff --git a/pandas/_libs/tslibs/offsets.pyi b/pandas/_libs/tslibs/offsets.pyi index ed6ddf7b02be1..c885b869f983a 100644 --- a/pandas/_libs/tslibs/offsets.pyi +++ b/pandas/_libs/tslibs/offsets.pyi @@ -111,6 +111,8 @@ def to_offset(freq: timedelta | str) -> BaseOffset: ... class Tick(SingleConstructorOffset): _reso: int + _prefix: str + _td64_unit: str def __init__(self, n: int = ..., normalize: bool = ...) -> None: ... @property def delta(self) -> Timedelta: ... diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 81b59db6f0e18..5f4f6b998a60a 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -796,6 +796,7 @@ cdef class SingleConstructorOffset(BaseOffset): cdef class Tick(SingleConstructorOffset): _adjust_dst = False _prefix = "undefined" + _td64_unit = "undefined" _attributes = tuple(["n", "normalize"]) def __init__(self, n=1, normalize=False): @@ -968,6 +969,7 @@ cdef class Tick(SingleConstructorOffset): cdef class Day(Tick): _nanos_inc = 24 * 3600 * 1_000_000_000 _prefix = "D" + _td64_unit = "D" _period_dtype_code = PeriodDtypeCode.D _reso = NPY_DATETIMEUNIT.NPY_FR_D @@ -975,6 +977,7 @@ cdef class Day(Tick): cdef class Hour(Tick): _nanos_inc = 3600 * 1_000_000_000 _prefix = "H" + _td64_unit = "h" _period_dtype_code = PeriodDtypeCode.H _reso = NPY_DATETIMEUNIT.NPY_FR_h @@ -982,6 +985,7 @@ cdef class Hour(Tick): cdef class Minute(Tick): _nanos_inc = 60 * 1_000_000_000 _prefix = "T" + _td64_unit = "m" _period_dtype_code = PeriodDtypeCode.T _reso = NPY_DATETIMEUNIT.NPY_FR_m @@ -989,6 +993,7 @@ cdef class Minute(Tick): cdef class Second(Tick): _nanos_inc = 1_000_000_000 _prefix = "S" + _td64_unit = "s" _period_dtype_code = PeriodDtypeCode.S _reso = NPY_DATETIMEUNIT.NPY_FR_s @@ -996,6 +1001,7 @@ cdef class Second(Tick): cdef class Milli(Tick): _nanos_inc = 1_000_000 _prefix = "L" + _td64_unit = "ms" _period_dtype_code = PeriodDtypeCode.L _reso = NPY_DATETIMEUNIT.NPY_FR_ms @@ -1003,6 +1009,7 @@ cdef class Milli(Tick): cdef class Micro(Tick): _nanos_inc = 1000 _prefix = "U" + _td64_unit = "us" _period_dtype_code = PeriodDtypeCode.U _reso = NPY_DATETIMEUNIT.NPY_FR_us @@ -1010,6 +1017,7 @@ cdef class Micro(Tick): cdef class Nano(Tick): _nanos_inc = 1 _prefix = "N" + _td64_unit = "ns" _period_dtype_code = PeriodDtypeCode.N _reso = NPY_DATETIMEUNIT.NPY_FR_ns diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index baf2f04feb4e0..2d676f94c6a64 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -796,13 +796,7 @@ def _add_timedelta_arraylike( f"Cannot add or subtract timedelta64[ns] dtype from {self.dtype}" ) - # FIXME: kludge - # error: "Tick" has no attribute "_prefix" - abbrev = freq._prefix # type: ignore[attr-defined] - abbrev = {"H": "h", "T": "m", "S": "s", "L": "ms", "U": "us", "N": "ns"}.get( - abbrev, abbrev - ) - dtype = np.dtype(f"m8[{abbrev}]") + dtype = np.dtype(f"m8[{freq._td64_unit}]") try: delta = astype_overflowsafe(