diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index a33322aebab34..014e0c7a0a92e 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -358,6 +358,8 @@ Datetimelike - Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond :class:`DatetimeTZDtype` and inputs that would be out of bounds with nanosecond resolution incorrectly raising ``OutOfBoundsDatetime`` (:issue:`54620`) - Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond ``datetime64`` (or :class:`DatetimeTZDtype`) from mixed-numeric inputs treating those as nanoseconds instead of as multiples of the dtype's unit (which would happen with non-mixed numeric inputs) (:issue:`56004`) - Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond ``datetime64`` dtype and inputs that would be out of bounds for a ``datetime64[ns]`` incorrectly raising ``OutOfBoundsDatetime`` (:issue:`55756`) +- Bug in parsing datetime strings with nanosecond resolution with non-ISO8601 formats incorrectly truncating sub-microsecond components (:issue:`56051`) +- Bug in parsing datetime strings with sub-second resolution and trailing zeros incorrectly inferring second or millisecond resolution (:issue:`55737`) - Timedelta diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 84aceecb09a33..222ff2cde0ede 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -508,7 +508,7 @@ cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz, npy_datetimestruct dts int out_local = 0, out_tzoffset = 0, string_to_dts_failed datetime dt - int64_t ival + int64_t ival, nanos = 0 NPY_DATETIMEUNIT out_bestunit, reso _TSObject obj @@ -560,10 +560,14 @@ cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz, return obj dt = parse_datetime_string( - ts, dayfirst=dayfirst, yearfirst=yearfirst, out_bestunit=&out_bestunit + ts, + dayfirst=dayfirst, + yearfirst=yearfirst, + out_bestunit=&out_bestunit, + nanos=&nanos, ) reso = get_supported_reso(out_bestunit) - return convert_datetime_to_tsobject(dt, tz, nanos=0, reso=reso) + return convert_datetime_to_tsobject(dt, tz, nanos=nanos, reso=reso) return convert_datetime_to_tsobject(dt, tz) diff --git a/pandas/_libs/tslibs/parsing.pxd b/pandas/_libs/tslibs/parsing.pxd index 8809c81b530d0..fbe07e68f5adf 100644 --- a/pandas/_libs/tslibs/parsing.pxd +++ b/pandas/_libs/tslibs/parsing.pxd @@ -1,4 +1,5 @@ from cpython.datetime cimport datetime +from numpy cimport int64_t from pandas._libs.tslibs.np_datetime cimport NPY_DATETIMEUNIT @@ -10,5 +11,6 @@ cdef datetime parse_datetime_string( str date_string, bint dayfirst, bint yearfirst, - NPY_DATETIMEUNIT* out_bestunit + NPY_DATETIMEUNIT* out_bestunit, + int64_t* nanos, ) diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 4918de5497c4b..d0872a509c440 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -34,6 +34,7 @@ from numpy cimport ( PyArray_IterNew, flatiter, float64_t, + int64_t, ) cnp.import_array() @@ -272,8 +273,11 @@ def py_parse_datetime_string( # parse_datetime_string cpdef bc it has a pointer argument) cdef: NPY_DATETIMEUNIT out_bestunit + int64_t nanos - return parse_datetime_string(date_string, dayfirst, yearfirst, &out_bestunit) + return parse_datetime_string( + date_string, dayfirst, yearfirst, &out_bestunit, &nanos + ) cdef datetime parse_datetime_string( @@ -283,7 +287,8 @@ cdef datetime parse_datetime_string( str date_string, bint dayfirst, bint yearfirst, - NPY_DATETIMEUNIT* out_bestunit + NPY_DATETIMEUNIT* out_bestunit, + int64_t* nanos, ): """ Parse datetime string, only returns datetime. @@ -311,7 +316,7 @@ cdef datetime parse_datetime_string( default = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0) dt = dateutil_parse(date_string, default=default, dayfirst=dayfirst, yearfirst=yearfirst, - ignoretz=False, out_bestunit=out_bestunit) + ignoretz=False, out_bestunit=out_bestunit, nanos=nanos) return dt dt = _parse_delimited_date(date_string, dayfirst, out_bestunit) @@ -330,7 +335,7 @@ cdef datetime parse_datetime_string( dt = dateutil_parse(date_string, default=_DEFAULT_DATETIME, dayfirst=dayfirst, yearfirst=yearfirst, - ignoretz=False, out_bestunit=out_bestunit) + ignoretz=False, out_bestunit=out_bestunit, nanos=nanos) return dt @@ -436,7 +441,7 @@ def parse_datetime_string_with_reso( parsed = dateutil_parse(date_string, _DEFAULT_DATETIME, dayfirst=dayfirst, yearfirst=yearfirst, - ignoretz=False, out_bestunit=&out_bestunit) + ignoretz=False, out_bestunit=&out_bestunit, nanos=NULL) reso = npy_unit_to_attrname[out_bestunit] return parsed, reso @@ -639,7 +644,8 @@ cdef datetime dateutil_parse( bint ignoretz, bint dayfirst, bint yearfirst, - NPY_DATETIMEUNIT* out_bestunit + NPY_DATETIMEUNIT* out_bestunit, + int64_t* nanos, ): """ lifted from dateutil to get resolution""" @@ -671,11 +677,8 @@ cdef datetime dateutil_parse( if reso is None: raise DateParseError(f"Unable to parse datetime string: {timestr}") - if reso == "microsecond": - if repl["microsecond"] == 0: - reso = "second" - elif repl["microsecond"] % 1000 == 0: - reso = "millisecond" + if reso == "microsecond" and repl["microsecond"] % 1000 == 0: + reso = _find_subsecond_reso(timestr, nanos=nanos) try: ret = default.replace(**repl) @@ -745,6 +748,38 @@ cdef datetime dateutil_parse( return ret +cdef object _reso_pattern = re.compile(r"\d:\d{2}:\d{2}\.(?P\d+)") + +cdef _find_subsecond_reso(str timestr, int64_t* nanos): + # GH#55737 + # Check for trailing zeros in a H:M:S.f pattern + match = _reso_pattern.search(timestr) + if not match: + reso = "second" + else: + frac = match.groupdict()["frac"] + if len(frac) <= 3: + reso = "millisecond" + elif len(frac) > 6: + if frac[6:] == "0" * len(frac[6:]): + # corner case where we haven't lost any data + reso = "nanosecond" + elif len(frac) <= 9: + reso = "nanosecond" + if nanos is not NULL: + if len(frac) < 9: + frac = frac + "0" * (9 - len(frac)) + nanos[0] = int(frac[6:]) + else: + # TODO: should we warn/raise in higher-than-nano cases? + reso = "nanosecond" + if nanos is not NULL: + nanos[0] = int(frac[6:9]) + else: + reso = "microsecond" + return reso + + # ---------------------------------------------------------------------- # Parsing for type-inference @@ -916,6 +951,7 @@ def guess_datetime_format(dt_str: str, bint dayfirst=False) -> str | None: yearfirst=False, ignoretz=False, out_bestunit=&out_bestunit, + nanos=NULL, ) except (ValueError, OverflowError, InvalidOperation): # In case the datetime can't be parsed, its format cannot be guessed diff --git a/pandas/tests/scalar/timestamp/test_constructors.py b/pandas/tests/scalar/timestamp/test_constructors.py index 8798a8904e161..0201e5d9af2ee 100644 --- a/pandas/tests/scalar/timestamp/test_constructors.py +++ b/pandas/tests/scalar/timestamp/test_constructors.py @@ -445,6 +445,18 @@ def test_constructor_str_infer_reso(self): ts = Timestamp("300 June 1:30:01.300") assert ts.unit == "ms" + # dateutil path -> don't drop trailing zeros + ts = Timestamp("01-01-2013T00:00:00.000000000+0000") + assert ts.unit == "ns" + + ts = Timestamp("2016/01/02 03:04:05.001000 UTC") + assert ts.unit == "us" + + # higher-than-nanosecond -> we drop the trailing bits + ts = Timestamp("01-01-2013T00:00:00.000000002100+0000") + assert ts == Timestamp("01-01-2013T00:00:00.000000002+0000") + assert ts.unit == "ns" + class TestTimestampConstructors: def test_weekday_but_no_day_raises(self):