From c89e4223a2d1cc549609fd0687aaa2209ba952db Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 10 Jan 2023 12:29:41 -0800 Subject: [PATCH 1/2] REF: re-use convert_str_to_tsobj --- pandas/_libs/tslib.pyx | 98 ++++++++----------- pandas/_libs/tslibs/conversion.pxd | 4 + pandas/_libs/tslibs/conversion.pyx | 8 +- .../indexes/datetimes/test_scalar_compat.py | 5 +- pandas/tests/tools/test_to_datetime.py | 2 +- 5 files changed, 55 insertions(+), 62 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 10bcf6c9eabbf..916cdabca314a 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -51,7 +51,7 @@ from pandas._libs.tslibs.parsing import parse_datetime_string from pandas._libs.tslibs.conversion cimport ( _TSObject, cast_from_unit, - convert_datetime_to_tsobject, + convert_str_to_tsobject, convert_timezone, get_datetime64_nanos, parse_pydatetime, @@ -482,7 +482,6 @@ cpdef array_to_datetime( object val, tz ndarray[int64_t] iresult npy_datetimestruct dts - NPY_DATETIMEUNIT out_bestunit bint utc_convert = bool(utc) bint seen_datetime_offset = False bint is_raise = errors=="raise" @@ -490,12 +489,8 @@ cpdef array_to_datetime( bint is_coerce = errors=="coerce" bint is_same_offsets _TSObject _ts - int64_t value - int out_local = 0, out_tzoffset = 0 float tz_offset set out_tzoffset_vals = set() - bint string_to_dts_failed - datetime py_dt tzinfo tz_out = None bint found_tz = False, found_naive = False cnp.broadcast mi @@ -557,61 +552,45 @@ cpdef array_to_datetime( # GH#32264 np.str_ object val = str(val) - if len(val) == 0 or val in nat_strings: - iresult[i] = NPY_NAT + if parse_today_now(val, &iresult[i], utc): + # We can't _quite_ dispatch this to convert_str_to_tsobject + # bc there isn't a nice way to pass "utc" cnp.PyArray_MultiIter_NEXT(mi) continue - string_to_dts_failed = string_to_dts( - val, &dts, &out_bestunit, &out_local, - &out_tzoffset, False, None, False + _ts = convert_str_to_tsobject( + val, None, unit="ns", dayfirst=dayfirst, yearfirst=yearfirst ) - if string_to_dts_failed: - # An error at this point is a _parsing_ error - # specifically _not_ OutOfBoundsDatetime - if parse_today_now(val, &iresult[i], utc): - cnp.PyArray_MultiIter_NEXT(mi) - continue - - py_dt = parse_datetime_string(val, - dayfirst=dayfirst, - yearfirst=yearfirst) - # If the dateutil parser returned tzinfo, capture it - # to check if all arguments have the same tzinfo - tz = py_dt.utcoffset() - - if tz is not None: - seen_datetime_offset = True - # dateutil timezone objects cannot be hashed, so - # store the UTC offsets in seconds instead - out_tzoffset_vals.add(tz.total_seconds()) - else: - # Add a marker for naive string, to track if we are - # parsing mixed naive and aware strings - out_tzoffset_vals.add("naive") - - _ts = convert_datetime_to_tsobject(py_dt, None) - iresult[i] = _ts.value + try: + _ts.ensure_reso(NPY_FR_ns) + except OutOfBoundsDatetime as err: + # re-raise with better exception message + raise OutOfBoundsDatetime( + f"Out of bounds nanosecond timestamp: {val}" + ) from err + + iresult[i] = _ts.value + + tz = _ts.tzinfo + if tz is not None: + # dateutil timezone objects cannot be hashed, so + # store the UTC offsets in seconds instead + nsecs = tz.utcoffset(None).total_seconds() + if not (-24 * 3600 < nsecs < 24 * 3600): + # calling timezone(...) below will raise, + # raise here so that we can possibly catch if + # errors=="coerce" + timezone(timedelta(seconds=nsecs)) + out_tzoffset_vals.add(nsecs) + # need to set seen_datetime_offset *after* the + # potentially-raising timezone(timedelta(...)) call, + # otherwise we can go down the is_same_offsets path + # bc len(out_tzoffset_vals) == 0 + seen_datetime_offset = True else: - # No error reported by string_to_dts, pick back up - # where we left off - value = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts) - if out_local == 1: - seen_datetime_offset = True - # Store the out_tzoffset in seconds - # since we store the total_seconds of - # dateutil.tz.tzoffset objects - out_tzoffset_vals.add(out_tzoffset * 60.) - tz = timezone(timedelta(minutes=out_tzoffset)) - value = tz_localize_to_utc_single(value, tz) - out_local = 0 - out_tzoffset = 0 - else: - # Add a marker for naive string, to track if we are - # parsing mixed naive and aware strings - out_tzoffset_vals.add("naive") - iresult[i] = value - check_dts_bounds(&dts) + # Add a marker for naive string, to track if we are + # parsing mixed naive and aware strings + out_tzoffset_vals.add("naive") else: raise TypeError(f"{type(val)} is not convertible to datetime") @@ -775,6 +754,13 @@ cdef _array_to_datetime_object( yearfirst=yearfirst) pydatetime_to_dt64(oresult[i], &dts) check_dts_bounds(&dts) + + if oresult[i].tzinfo is not None: + # if we have an invalid tzoffset + # (less than -24H or more than 24H), + # calling utcoffset() will raise ValueError + oresult[i].utcoffset() + except (ValueError, OverflowError) as ex: ex.args = (f"{ex}, at position {i}", ) if is_coerce: diff --git a/pandas/_libs/tslibs/conversion.pxd b/pandas/_libs/tslibs/conversion.pxd index 332ff1522ccf5..756ab67aa7084 100644 --- a/pandas/_libs/tslibs/conversion.pxd +++ b/pandas/_libs/tslibs/conversion.pxd @@ -35,6 +35,10 @@ cdef _TSObject convert_datetime_to_tsobject(datetime ts, tzinfo tz, int32_t nanos=*, NPY_DATETIMEUNIT reso=*) +cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz, str unit, + bint dayfirst=*, + bint yearfirst=*) + cdef int64_t get_datetime64_nanos(object val, NPY_DATETIMEUNIT reso) except? -1 cpdef datetime localize_pydatetime(datetime dt, tzinfo tz) diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 7cff269d2191e..aacb06fe36037 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -246,7 +246,7 @@ cdef _TSObject convert_to_tsobject(object ts, tzinfo tz, str unit, obj = _TSObject() if isinstance(ts, str): - return _convert_str_to_tsobject(ts, tz, unit, dayfirst, yearfirst) + return convert_str_to_tsobject(ts, tz, unit, dayfirst, yearfirst) if ts is None or ts is NaT: obj.value = NPY_NAT @@ -463,9 +463,9 @@ cdef _TSObject _create_tsobject_tz_using_offset(npy_datetimestruct dts, return obj -cdef _TSObject _convert_str_to_tsobject(str ts, tzinfo tz, str unit, - bint dayfirst=False, - bint yearfirst=False): +cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz, str unit, + bint dayfirst=False, + bint yearfirst=False): """ Convert a string input `ts`, along with optional timezone object`tz` to a _TSObject. diff --git a/pandas/tests/indexes/datetimes/test_scalar_compat.py b/pandas/tests/indexes/datetimes/test_scalar_compat.py index be05a649ec0b6..622f41236edb9 100644 --- a/pandas/tests/indexes/datetimes/test_scalar_compat.py +++ b/pandas/tests/indexes/datetimes/test_scalar_compat.py @@ -38,7 +38,10 @@ def test_dti_date(self): @pytest.mark.parametrize("data", [["1400-01-01"], [datetime(1400, 1, 1)]]) def test_dti_date_out_of_range(self, data): # GH#1475 - msg = "^Out of bounds nanosecond timestamp: 1400-01-01 00:00:00, at position 0$" + msg = ( + "^Out of bounds nanosecond timestamp: " + "1400-01-01( 00:00:00)?, at position 0$" + ) with pytest.raises(OutOfBoundsDatetime, match=msg): DatetimeIndex(data) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index a6e40c30d5b82..bf0db0da1c3e3 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -2783,7 +2783,7 @@ def test_day_not_in_month_coerce(self, cache, arg, format, warning): assert isna(to_datetime(arg, errors="coerce", format=format, cache=cache)) def test_day_not_in_month_raise(self, cache): - msg = "day is out of range for month" + msg = "could not convert string to Timestamp" with pytest.raises(ValueError, match=msg): with tm.assert_produces_warning( UserWarning, match="Could not infer format" From e4e7ac7b7509da21219eb15a9ecf88f8254b1099 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 12 Jan 2023 17:55:12 -0800 Subject: [PATCH 2/2] revert utcoffset checks --- pandas/_libs/tslib.pyx | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 916cdabca314a..36001248d664b 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -576,11 +576,6 @@ cpdef array_to_datetime( # dateutil timezone objects cannot be hashed, so # store the UTC offsets in seconds instead nsecs = tz.utcoffset(None).total_seconds() - if not (-24 * 3600 < nsecs < 24 * 3600): - # calling timezone(...) below will raise, - # raise here so that we can possibly catch if - # errors=="coerce" - timezone(timedelta(seconds=nsecs)) out_tzoffset_vals.add(nsecs) # need to set seen_datetime_offset *after* the # potentially-raising timezone(timedelta(...)) call, @@ -754,13 +749,6 @@ cdef _array_to_datetime_object( yearfirst=yearfirst) pydatetime_to_dt64(oresult[i], &dts) check_dts_bounds(&dts) - - if oresult[i].tzinfo is not None: - # if we have an invalid tzoffset - # (less than -24H or more than 24H), - # calling utcoffset() will raise ValueError - oresult[i].utcoffset() - except (ValueError, OverflowError) as ex: ex.args = (f"{ex}, at position {i}", ) if is_coerce: