From 975e31325469f746a20100960d2983f7dd69256a Mon Sep 17 00:00:00 2001 From: Marco Gorelli <> Date: Fri, 21 Oct 2022 17:09:10 +0200 Subject: [PATCH 1/6] :art: --- pandas/_libs/tslib.pyi | 2 + pandas/_libs/tslib.pyx | 38 ++++++- pandas/_libs/tslibs/conversion.pyx | 4 +- pandas/_libs/tslibs/np_datetime.pxd | 2 + pandas/_libs/tslibs/np_datetime.pyx | 8 +- pandas/_libs/tslibs/parsing.pyx | 4 +- .../tslibs/src/datetime/np_datetime_strings.c | 65 +++++++++++- .../tslibs/src/datetime/np_datetime_strings.h | 4 +- pandas/core/arrays/datetimes.py | 4 + pandas/core/tools/datetimes.py | 13 +-- pandas/tests/tools/test_to_datetime.py | 100 +++++++++++++++++- 11 files changed, 224 insertions(+), 20 deletions(-) diff --git a/pandas/_libs/tslib.pyi b/pandas/_libs/tslib.pyi index 8fec9ecf27f30..c40cb380058c9 100644 --- a/pandas/_libs/tslib.pyi +++ b/pandas/_libs/tslib.pyi @@ -25,6 +25,8 @@ def array_to_datetime( utc: bool = ..., require_iso8601: bool = ..., allow_mixed: bool = ..., + format: str | None = ..., + exact: bool = ..., ) -> tuple[np.ndarray, tzinfo | None]: ... # returned ndarray may be object dtype or datetime64[ns] diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 03331f54db892..09407a0746870 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -1,3 +1,4 @@ +import re import warnings cimport cython @@ -85,6 +86,8 @@ def _test_parse_iso8601(ts: str): _TSObject obj int out_local = 0, out_tzoffset = 0 NPY_DATETIMEUNIT out_bestunit + char inferred_format + int format_len obj = _TSObject() @@ -93,7 +96,7 @@ def _test_parse_iso8601(ts: str): elif ts == 'today': return Timestamp.now().normalize() - string_to_dts(ts, &obj.dts, &out_bestunit, &out_local, &out_tzoffset, True) + string_to_dts(ts, &obj.dts, &out_bestunit, &out_local, &out_tzoffset, True, &inferred_format, &format_len) obj.value = npy_datetimestruct_to_datetime(NPY_FR_ns, &obj.dts) check_dts_bounds(&obj.dts) if out_local == 1: @@ -449,6 +452,8 @@ cpdef array_to_datetime( bint utc=False, bint require_iso8601=False, bint allow_mixed=False, + str format=None, + bint exact=False, ): """ Converts a 1D array of date-like values to a numpy array of either: @@ -509,6 +514,8 @@ cpdef array_to_datetime( datetime py_dt tzinfo tz_out = None bint found_tz = False, found_naive = False + char inferred_format[100] + int format_len # specify error conditions assert is_raise or is_ignore or is_coerce @@ -568,6 +575,15 @@ cpdef array_to_datetime( iresult[i] = get_datetime64_nanos(val, NPY_FR_ns) elif is_integer_object(val) or is_float_object(val): + if require_iso8601: + if is_coerce: + iresult[i] = NPY_NAT + continue + elif is_raise: + raise ValueError( + f"time data \"{val}\" at position {i} doesn't match format {format}" + ) + return values, tz_out # these must be ns unit by-definition seen_integer = True @@ -598,7 +614,8 @@ cpdef array_to_datetime( string_to_dts_failed = string_to_dts( val, &dts, &out_bestunit, &out_local, - &out_tzoffset, False + &out_tzoffset, False, inferred_format, + &format_len, ) if string_to_dts_failed: # An error at this point is a _parsing_ error @@ -613,7 +630,7 @@ cpdef array_to_datetime( continue elif is_raise: raise ValueError( - f"time data \"{val}\" at position {i} doesn't match format specified" + f"time data \"{val}\" at position {i} doesn't match {format}" ) return values, tz_out @@ -644,6 +661,21 @@ cpdef array_to_datetime( _ts = convert_datetime_to_tsobject(py_dt, None) iresult[i] = _ts.value if not string_to_dts_failed: + if require_iso8601: + guess = inferred_format[:format_len].decode('utf-8') + if ( + (exact and format != guess) + or (not exact and re.search(format, guess) is None) + ): + if is_coerce: + iresult[i] = NPY_NAT + continue + elif is_raise: + raise ValueError( + f"time data \"{val}\" at position {i} doesn't " + f"match format {format}" + ) + return values, tz_out # No error reported by string_to_dts, pick back up # where we left off value = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts) diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 923dfa3c54d26..6533bc22348d4 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -469,6 +469,8 @@ cdef _TSObject _convert_str_to_tsobject(object ts, tzinfo tz, str unit, datetime dt int64_t ival NPY_DATETIMEUNIT out_bestunit + char inferred_format + int format_len if len(ts) == 0 or ts in nat_strings: ts = NaT @@ -488,7 +490,7 @@ cdef _TSObject _convert_str_to_tsobject(object ts, tzinfo tz, str unit, else: string_to_dts_failed = string_to_dts( ts, &dts, &out_bestunit, &out_local, - &out_tzoffset, False + &out_tzoffset, False, &inferred_format, &format_len ) if not string_to_dts_failed: try: diff --git a/pandas/_libs/tslibs/np_datetime.pxd b/pandas/_libs/tslibs/np_datetime.pxd index e51bbd4e074e1..2a40e78ff7901 100644 --- a/pandas/_libs/tslibs/np_datetime.pxd +++ b/pandas/_libs/tslibs/np_datetime.pxd @@ -95,6 +95,8 @@ cdef int string_to_dts( int* out_local, int* out_tzoffset, bint want_exc, + char *inferred_format, + int *format_len, ) except? -1 cdef NPY_DATETIMEUNIT get_unit_from_dtype(cnp.dtype dtype) diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index 07872050dc822..c5d5f284dc59a 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -52,7 +52,8 @@ cdef extern from "src/datetime/np_datetime_strings.h": int parse_iso_8601_datetime(const char *str, int len, int want_exc, npy_datetimestruct *out, NPY_DATETIMEUNIT *out_bestunit, - int *out_local, int *out_tzoffset) + int *out_local, int *out_tzoffset, + char *inferred_format, int *format_len) # ---------------------------------------------------------------------- @@ -273,6 +274,8 @@ cdef inline int string_to_dts( int* out_local, int* out_tzoffset, bint want_exc, + char *inferred_format, + int *format_len ) except? -1: cdef: Py_ssize_t length @@ -280,7 +283,8 @@ cdef inline int string_to_dts( buf = get_c_string_buf_and_size(val, &length) return parse_iso_8601_datetime(buf, length, want_exc, - dts, out_bestunit, out_local, out_tzoffset) + dts, out_bestunit, out_local, out_tzoffset, + inferred_format, format_len) cpdef ndarray astype_overflowsafe( diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 469e0721f1207..230c4ab8dbd02 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -397,6 +397,8 @@ cdef parse_datetime_string_with_reso( NPY_DATETIMEUNIT out_bestunit int out_local int out_tzoffset + char inferred_format + int format_len if not _does_string_look_like_datetime(date_string): raise ValueError(f'Given date string {date_string} not likely a datetime') @@ -409,7 +411,7 @@ cdef parse_datetime_string_with_reso( # TODO: does this render some/all of parse_delimited_date redundant? string_to_dts_failed = string_to_dts( date_string, &dts, &out_bestunit, &out_local, - &out_tzoffset, False + &out_tzoffset, False, &inferred_format, &format_len ) if not string_to_dts_failed: if dts.ps != 0 or out_local: diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c index cfbaed01b57c9..7b56c68b4c525 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c @@ -69,7 +69,9 @@ This file implements string parsing and creation for NumPy datetime. int parse_iso_8601_datetime(const char *str, int len, int want_exc, npy_datetimestruct *out, NPY_DATETIMEUNIT *out_bestunit, - int *out_local, int *out_tzoffset) { + int *out_local, int *out_tzoffset, + char *inferred_format, int *format_len) { + int fmt_idx = 0; int year_leap = 0; int i, numdigits; const char *substr; @@ -104,6 +106,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, while (sublen > 0 && isspace(*substr)) { ++substr; --sublen; + inferred_format[fmt_idx] = ' '; + ++fmt_idx; } /* Leading '-' sign for negative year */ @@ -125,6 +129,10 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, substr += 4; sublen -= 4; + inferred_format[fmt_idx] = '%'; + ++fmt_idx; + inferred_format[fmt_idx] = 'Y'; + ++fmt_idx; } /* Negate the year if necessary */ @@ -156,6 +164,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, ymd_sep = valid_ymd_sep[i]; ++substr; --sublen; + inferred_format[fmt_idx] = ymd_sep; + ++fmt_idx; /* Cannot have trailing separator */ if (sublen == 0 || !isdigit(*substr)) { goto parse_error; @@ -183,6 +193,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, goto error; } + inferred_format[fmt_idx] = '%'; + ++fmt_idx; + inferred_format[fmt_idx] = 'm'; + ++fmt_idx; + /* Next character must be the separator, start of day, or end of string */ if (sublen == 0) { bestunit = NPY_FR_M; @@ -201,6 +216,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if (*substr != ymd_sep || sublen == 1) { goto parse_error; } + inferred_format[fmt_idx] = *substr; + ++fmt_idx; ++substr; --sublen; } @@ -230,6 +247,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, goto error; } + inferred_format[fmt_idx] = '%'; + ++fmt_idx; + inferred_format[fmt_idx] = 'd'; + ++fmt_idx; + /* Next character must be a 'T', ' ', or end of string */ if (sublen == 0) { if (out_local != NULL) { @@ -242,6 +264,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if ((*substr != 'T' && *substr != ' ') || sublen == 1) { goto parse_error; } + inferred_format[fmt_idx] = *substr; + ++fmt_idx; ++substr; --sublen; @@ -269,6 +293,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } } + inferred_format[fmt_idx] = '%'; + ++fmt_idx; + inferred_format[fmt_idx] = 'H'; + ++fmt_idx; + /* Next character must be a ':' or the end of the string */ if (sublen == 0) { if (!hour_was_2_digits) { @@ -279,6 +308,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } if (*substr == ':') { + inferred_format[fmt_idx] = ':'; + ++fmt_idx; has_hms_sep = 1; ++substr; --sublen; @@ -315,6 +346,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, goto parse_error; } + inferred_format[fmt_idx] = '%'; + ++fmt_idx; + inferred_format[fmt_idx] = 'M'; + ++fmt_idx; + if (sublen == 0) { bestunit = NPY_FR_m; goto finish; @@ -323,6 +359,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, /* If we make it through this condition block, then the next * character is a digit. */ if (has_hms_sep && *substr == ':') { + inferred_format[fmt_idx] = ':'; + ++fmt_idx; ++substr; --sublen; /* Cannot have a trailing ':' */ @@ -356,15 +394,27 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, goto parse_error; } + inferred_format[fmt_idx] = '%'; + ++fmt_idx; + inferred_format[fmt_idx] = 'S'; + ++fmt_idx; + /* Next character may be a '.' indicating fractional seconds */ if (sublen > 0 && *substr == '.') { ++substr; --sublen; + inferred_format[fmt_idx] = '.'; + ++fmt_idx; } else { bestunit = NPY_FR_s; goto parse_timezone; } + inferred_format[fmt_idx] = '%'; + ++fmt_idx; + inferred_format[fmt_idx] = 'f'; + ++fmt_idx; + /* PARSE THE MICROSECONDS (0 to 6 digits) */ numdigits = 0; for (i = 0; i < 6; ++i) { @@ -430,6 +480,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, while (sublen > 0 && isspace(*substr)) { ++substr; --sublen; + inferred_format[fmt_idx] = ' '; + ++fmt_idx; } if (sublen == 0) { @@ -439,6 +491,10 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, /* UTC specifier */ if (*substr == 'Z') { + inferred_format[fmt_idx] = '%'; + ++fmt_idx; + inferred_format[fmt_idx] = 'Z'; + ++fmt_idx; /* "Z" should be equivalent to tz offset "+00:00" */ if (out_local != NULL) { *out_local = 1; @@ -455,6 +511,10 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, --sublen; } } else if (*substr == '-' || *substr == '+') { + inferred_format[fmt_idx] = '%'; + ++fmt_idx; + inferred_format[fmt_idx] = 'z'; + ++fmt_idx; /* Time zone offset */ int offset_neg = 0, offset_hour = 0, offset_minute = 0; @@ -538,6 +598,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, while (sublen > 0 && isspace(*substr)) { ++substr; --sublen; + inferred_format[fmt_idx] = ' '; + ++fmt_idx; } if (sublen != 0) { @@ -548,6 +610,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if (out_bestunit != NULL) { *out_bestunit = bestunit; } + *format_len = fmt_idx; return 0; parse_error: diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h index 511d9a401fed2..1c374ca4d6e4c 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h @@ -58,7 +58,9 @@ parse_iso_8601_datetime(const char *str, int len, int want_exc, npy_datetimestruct *out, NPY_DATETIMEUNIT *out_bestunit, int *out_local, - int *out_tzoffset); + int *out_tzoffset, + char *inferred_format, + int *format_len); /* * Provides a string length to use for converting datetime diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index ca0a745c180e9..d526275b7c5e3 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -2180,6 +2180,8 @@ def objects_to_datetime64ns( require_iso8601: bool = False, allow_object: bool = False, allow_mixed: bool = False, + format: str = "", + exact: bool = False, ): """ Convert data to array of timestamps. @@ -2227,6 +2229,8 @@ def objects_to_datetime64ns( yearfirst=yearfirst, require_iso8601=require_iso8601, allow_mixed=allow_mixed, + format=format, + exact=exact, ) result = result.reshape(data.shape, order=order) except OverflowError as err: diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 7791ea804a52a..4a33620dc1988 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -426,16 +426,14 @@ def _convert_listlike_datetimes( format_is_iso8601 = format_is_iso(format) if format_is_iso8601: require_iso8601 = not infer_datetime_format - format = None - if format is not None: + if format is not None and not require_iso8601: res = _to_datetime_with_format( arg, orig_arg, name, tz, format, exact, errors, infer_datetime_format ) if res is not None: return res - assert format is None or infer_datetime_format utc = tz == "utc" result, tz_parsed = objects_to_datetime64ns( arg, @@ -445,6 +443,8 @@ def _convert_listlike_datetimes( errors=errors, require_iso8601=require_iso8601, allow_object=True, + format=format, + exact=exact, ) if tz_parsed is not None: @@ -969,13 +969,6 @@ def to_datetime( ... format='%Y-%m-%d %H:%M:%S.%f') Timestamp('2018-10-26 12:00:00.000000001') - :const:`"%S"` without :const:`"%f"` will capture all the way - up to nanoseconds if present as decimal places. - - >>> pd.to_datetime('2017-03-22 15:16:45.433502912', - ... format='%Y-%m-%d %H:%M:%S') - Timestamp('2017-03-22 15:16:45.433502912') - **Non-convertible date/times** If a date does not meet the `timestamp limitations diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index f524bc18793d8..bb2588521ac18 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1751,6 +1751,104 @@ def test_to_datetime_iso8601(self, cache, arg, exp_str): exp = Timestamp(exp_str) assert result[0] == exp + @pytest.mark.parametrize( + "input, format", + [ + ("2012", "%Y-%m"), + ("2012-01", "%Y-%m-%d"), + ("2012-01-01", "%Y-%m-%d %H"), + ("2012-01-01 10", "%Y-%m-%d %H:%M"), + ("2012-01-01 10:00", "%Y-%m-%d %H:%M:%S"), + (0, "%Y-%m-%d"), + ], + ) + @pytest.mark.parametrize("exact", [True, False]) + def test_to_datetime_iso8601_fails(self, input, format, exact): + # https://github.com/pandas-dev/pandas/issues/12649 + with pytest.raises( + ValueError, + match=rf"time data \"{input}\" at position 0 doesn't match format {format}", + ): + to_datetime(input, format=format, exact=exact) + + @pytest.mark.parametrize( + "input, format", + [ + ("2012-01-01", "%Y-%m"), + ("2012-01-01 10", "%Y-%m-%d"), + ("2012-01-01 10:00", "%Y-%m-%d %H"), + ("2012-01-01 10:00:00", "%Y-%m-%d %H:%M"), + (0, "%Y-%m-%d"), + ], + ) + def test_to_datetime_iso8601_exact_fails(self, input, format): + # https://github.com/pandas-dev/pandas/issues/12649 + with pytest.raises( + ValueError, + match=rf"time data \"{input}\" at position 0 doesn't match format {format}", + ): + to_datetime(input, format=format) + + @pytest.mark.parametrize( + "input, format", + [ + ("2012-01-01", "%Y-%m"), + ("2012-01-01 00", "%Y-%m-%d"), + ("2012-01-01 00:00", "%Y-%m-%d %H"), + ("2012-01-01 00:00:00", "%Y-%m-%d %H:%M"), + ], + ) + def test_to_datetime_iso8601_non_exact(self, input, format): + # https://github.com/pandas-dev/pandas/issues/12649 + expected = Timestamp(2012, 1, 1) + result = to_datetime(input, format=format, exact=False) + assert result == expected + + @pytest.mark.parametrize( + "input, format", + [ + ("2020-01", "%Y/%m"), + ("2020-01-01", "%Y/%m/%d"), + ("2020-01-01 00", "%Y/%m/%dT%H"), + ("2020-01-01T00", "%Y/%m/%d %H"), + ("2020-01-01 00:00", "%Y/%m/%dT%H:%M"), + ("2020-01-01T00:00", "%Y/%m/%d %H:%M"), + ("2020-01-01 00:00:00", "%Y/%m/%dT%H:%M:%S"), + ("2020-01-01T00:00:00", "%Y/%m/%d %H:%M:%S"), + ], + ) + def test_to_datetime_iso8601_separator(self, input, format): + # https://github.com/pandas-dev/pandas/issues/12649 + with pytest.raises( + ValueError, + match=( + rf"time data \"{input}\" at position 0 doesn\'t match format {format}" + ), + ): + to_datetime(input, format=format) + + @pytest.mark.parametrize( + "input, format", + [ + ("2020-01", "%Y-%m"), + ("2020-01-01", "%Y-%m-%d"), + ("2020-01-01 00", "%Y-%m-%d %H"), + ("2020-01-01T00", "%Y-%m-%dT%H"), + ("2020-01-01 00:00", "%Y-%m-%d %H:%M"), + ("2020-01-01T00:00", "%Y-%m-%dT%H:%M"), + ("2020-01-01 00:00:00", "%Y-%m-%d %H:%M:%S"), + ("2020-01-01T00:00:00", "%Y-%m-%dT%H:%M:%S"), + ("2020-01-01T00:00:00.000", "%Y-%m-%dT%H:%M:%S.%f"), + ("2020-01-01T00:00:00.000000", "%Y-%m-%dT%H:%M:%S.%f"), + ("2020-01-01T00:00:00.000000000", "%Y-%m-%dT%H:%M:%S.%f"), + ], + ) + def test_to_datetime_iso8601_valid(self, input, format): + # https://github.com/pandas-dev/pandas/issues/12649 + expected = Timestamp(2020, 1, 1) + result = to_datetime(input, format=format) + assert result == expected + def test_to_datetime_default(self, cache): rs = to_datetime("2001", cache=cache) xp = datetime(2001, 1, 1) @@ -2264,7 +2362,7 @@ def test_day_not_in_month_raise(self, cache): @pytest.mark.parametrize("arg", ["2015-02-29", "2015-02-32", "2015-04-31"]) def test_day_not_in_month_raise_value(self, cache, arg): - msg = f'time data "{arg}" at position 0 doesn\'t match format specified' + msg = f'time data "{arg}" at position 0 doesn\'t match format %Y-%m-%d' with pytest.raises(ValueError, match=msg): to_datetime(arg, errors="raise", format="%Y-%m-%d", cache=cache) From c8e9e97a7840764ac8f4fd9754f6b227a7b69626 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <> Date: Fri, 21 Oct 2022 18:57:58 +0200 Subject: [PATCH 2/6] rename --- pandas/_libs/tslib.pyx | 10 +- pandas/_libs/tslibs/conversion.pyx | 4 +- pandas/_libs/tslibs/np_datetime.pxd | 2 +- pandas/_libs/tslibs/np_datetime.pyx | 6 +- pandas/_libs/tslibs/parsing.pyx | 4 +- .../tslibs/src/datetime/np_datetime_strings.c | 114 +++++++++--------- .../tslibs/src/datetime/np_datetime_strings.h | 2 +- 7 files changed, 71 insertions(+), 71 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 09407a0746870..b685491e708ae 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -87,7 +87,7 @@ def _test_parse_iso8601(ts: str): int out_local = 0, out_tzoffset = 0 NPY_DATETIMEUNIT out_bestunit char inferred_format - int format_len + int inferred_format_len obj = _TSObject() @@ -96,7 +96,7 @@ def _test_parse_iso8601(ts: str): elif ts == 'today': return Timestamp.now().normalize() - string_to_dts(ts, &obj.dts, &out_bestunit, &out_local, &out_tzoffset, True, &inferred_format, &format_len) + string_to_dts(ts, &obj.dts, &out_bestunit, &out_local, &out_tzoffset, True, &inferred_format, &inferred_format_len) obj.value = npy_datetimestruct_to_datetime(NPY_FR_ns, &obj.dts) check_dts_bounds(&obj.dts) if out_local == 1: @@ -515,7 +515,7 @@ cpdef array_to_datetime( tzinfo tz_out = None bint found_tz = False, found_naive = False char inferred_format[100] - int format_len + int inferred_format_len # specify error conditions assert is_raise or is_ignore or is_coerce @@ -615,7 +615,7 @@ cpdef array_to_datetime( string_to_dts_failed = string_to_dts( val, &dts, &out_bestunit, &out_local, &out_tzoffset, False, inferred_format, - &format_len, + &inferred_format_len, ) if string_to_dts_failed: # An error at this point is a _parsing_ error @@ -662,7 +662,7 @@ cpdef array_to_datetime( iresult[i] = _ts.value if not string_to_dts_failed: if require_iso8601: - guess = inferred_format[:format_len].decode('utf-8') + guess = inferred_format[:inferred_format_len].decode('utf-8') if ( (exact and format != guess) or (not exact and re.search(format, guess) is None) diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 6533bc22348d4..d3e8df8ab1aa4 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -470,7 +470,7 @@ cdef _TSObject _convert_str_to_tsobject(object ts, tzinfo tz, str unit, int64_t ival NPY_DATETIMEUNIT out_bestunit char inferred_format - int format_len + int inferred_format_len if len(ts) == 0 or ts in nat_strings: ts = NaT @@ -490,7 +490,7 @@ cdef _TSObject _convert_str_to_tsobject(object ts, tzinfo tz, str unit, else: string_to_dts_failed = string_to_dts( ts, &dts, &out_bestunit, &out_local, - &out_tzoffset, False, &inferred_format, &format_len + &out_tzoffset, False, &inferred_format, &inferred_format_len ) if not string_to_dts_failed: try: diff --git a/pandas/_libs/tslibs/np_datetime.pxd b/pandas/_libs/tslibs/np_datetime.pxd index 2a40e78ff7901..a877f691e6471 100644 --- a/pandas/_libs/tslibs/np_datetime.pxd +++ b/pandas/_libs/tslibs/np_datetime.pxd @@ -96,7 +96,7 @@ cdef int string_to_dts( int* out_tzoffset, bint want_exc, char *inferred_format, - int *format_len, + int *inferred_format_len, ) except? -1 cdef NPY_DATETIMEUNIT get_unit_from_dtype(cnp.dtype dtype) diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index c5d5f284dc59a..3c1a13a864f46 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -53,7 +53,7 @@ cdef extern from "src/datetime/np_datetime_strings.h": npy_datetimestruct *out, NPY_DATETIMEUNIT *out_bestunit, int *out_local, int *out_tzoffset, - char *inferred_format, int *format_len) + char *inferred_format, int *inferred_format_len) # ---------------------------------------------------------------------- @@ -275,7 +275,7 @@ cdef inline int string_to_dts( int* out_tzoffset, bint want_exc, char *inferred_format, - int *format_len + int *inferred_format_len ) except? -1: cdef: Py_ssize_t length @@ -284,7 +284,7 @@ cdef inline int string_to_dts( buf = get_c_string_buf_and_size(val, &length) return parse_iso_8601_datetime(buf, length, want_exc, dts, out_bestunit, out_local, out_tzoffset, - inferred_format, format_len) + inferred_format, inferred_format_len) cpdef ndarray astype_overflowsafe( diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 230c4ab8dbd02..98d50251af0ca 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -398,7 +398,7 @@ cdef parse_datetime_string_with_reso( int out_local int out_tzoffset char inferred_format - int format_len + int inferred_format_len if not _does_string_look_like_datetime(date_string): raise ValueError(f'Given date string {date_string} not likely a datetime') @@ -411,7 +411,7 @@ cdef parse_datetime_string_with_reso( # TODO: does this render some/all of parse_delimited_date redundant? string_to_dts_failed = string_to_dts( date_string, &dts, &out_bestunit, &out_local, - &out_tzoffset, False, &inferred_format, &format_len + &out_tzoffset, False, &inferred_format, &inferred_format_len ) if not string_to_dts_failed: if dts.ps != 0 or out_local: diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c index 7b56c68b4c525..0847d15d1e8f1 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c @@ -70,8 +70,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, npy_datetimestruct *out, NPY_DATETIMEUNIT *out_bestunit, int *out_local, int *out_tzoffset, - char *inferred_format, int *format_len) { - int fmt_idx = 0; + char *inferred_format, int *inferred_format_len) { + int inferred_format_idx = 0; int year_leap = 0; int i, numdigits; const char *substr; @@ -106,8 +106,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, while (sublen > 0 && isspace(*substr)) { ++substr; --sublen; - inferred_format[fmt_idx] = ' '; - ++fmt_idx; + inferred_format[inferred_format_idx] = ' '; + ++inferred_format_idx; } /* Leading '-' sign for negative year */ @@ -129,10 +129,10 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, substr += 4; sublen -= 4; - inferred_format[fmt_idx] = '%'; - ++fmt_idx; - inferred_format[fmt_idx] = 'Y'; - ++fmt_idx; + inferred_format[inferred_format_idx] = '%'; + ++inferred_format_idx; + inferred_format[inferred_format_idx] = 'Y'; + ++inferred_format_idx; } /* Negate the year if necessary */ @@ -164,8 +164,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, ymd_sep = valid_ymd_sep[i]; ++substr; --sublen; - inferred_format[fmt_idx] = ymd_sep; - ++fmt_idx; + inferred_format[inferred_format_idx] = ymd_sep; + ++inferred_format_idx; /* Cannot have trailing separator */ if (sublen == 0 || !isdigit(*substr)) { goto parse_error; @@ -193,10 +193,10 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, goto error; } - inferred_format[fmt_idx] = '%'; - ++fmt_idx; - inferred_format[fmt_idx] = 'm'; - ++fmt_idx; + inferred_format[inferred_format_idx] = '%'; + ++inferred_format_idx; + inferred_format[inferred_format_idx] = 'm'; + ++inferred_format_idx; /* Next character must be the separator, start of day, or end of string */ if (sublen == 0) { @@ -216,8 +216,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if (*substr != ymd_sep || sublen == 1) { goto parse_error; } - inferred_format[fmt_idx] = *substr; - ++fmt_idx; + inferred_format[inferred_format_idx] = *substr; + ++inferred_format_idx; ++substr; --sublen; } @@ -247,10 +247,10 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, goto error; } - inferred_format[fmt_idx] = '%'; - ++fmt_idx; - inferred_format[fmt_idx] = 'd'; - ++fmt_idx; + inferred_format[inferred_format_idx] = '%'; + ++inferred_format_idx; + inferred_format[inferred_format_idx] = 'd'; + ++inferred_format_idx; /* Next character must be a 'T', ' ', or end of string */ if (sublen == 0) { @@ -264,8 +264,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if ((*substr != 'T' && *substr != ' ') || sublen == 1) { goto parse_error; } - inferred_format[fmt_idx] = *substr; - ++fmt_idx; + inferred_format[inferred_format_idx] = *substr; + ++inferred_format_idx; ++substr; --sublen; @@ -293,10 +293,10 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } } - inferred_format[fmt_idx] = '%'; - ++fmt_idx; - inferred_format[fmt_idx] = 'H'; - ++fmt_idx; + inferred_format[inferred_format_idx] = '%'; + ++inferred_format_idx; + inferred_format[inferred_format_idx] = 'H'; + ++inferred_format_idx; /* Next character must be a ':' or the end of the string */ if (sublen == 0) { @@ -308,8 +308,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } if (*substr == ':') { - inferred_format[fmt_idx] = ':'; - ++fmt_idx; + inferred_format[inferred_format_idx] = ':'; + ++inferred_format_idx; has_hms_sep = 1; ++substr; --sublen; @@ -346,10 +346,10 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, goto parse_error; } - inferred_format[fmt_idx] = '%'; - ++fmt_idx; - inferred_format[fmt_idx] = 'M'; - ++fmt_idx; + inferred_format[inferred_format_idx] = '%'; + ++inferred_format_idx; + inferred_format[inferred_format_idx] = 'M'; + ++inferred_format_idx; if (sublen == 0) { bestunit = NPY_FR_m; @@ -359,8 +359,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, /* If we make it through this condition block, then the next * character is a digit. */ if (has_hms_sep && *substr == ':') { - inferred_format[fmt_idx] = ':'; - ++fmt_idx; + inferred_format[inferred_format_idx] = ':'; + ++inferred_format_idx; ++substr; --sublen; /* Cannot have a trailing ':' */ @@ -394,26 +394,26 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, goto parse_error; } - inferred_format[fmt_idx] = '%'; - ++fmt_idx; - inferred_format[fmt_idx] = 'S'; - ++fmt_idx; + inferred_format[inferred_format_idx] = '%'; + ++inferred_format_idx; + inferred_format[inferred_format_idx] = 'S'; + ++inferred_format_idx; /* Next character may be a '.' indicating fractional seconds */ if (sublen > 0 && *substr == '.') { ++substr; --sublen; - inferred_format[fmt_idx] = '.'; - ++fmt_idx; + inferred_format[inferred_format_idx] = '.'; + ++inferred_format_idx; } else { bestunit = NPY_FR_s; goto parse_timezone; } - inferred_format[fmt_idx] = '%'; - ++fmt_idx; - inferred_format[fmt_idx] = 'f'; - ++fmt_idx; + inferred_format[inferred_format_idx] = '%'; + ++inferred_format_idx; + inferred_format[inferred_format_idx] = 'f'; + ++inferred_format_idx; /* PARSE THE MICROSECONDS (0 to 6 digits) */ numdigits = 0; @@ -480,8 +480,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, while (sublen > 0 && isspace(*substr)) { ++substr; --sublen; - inferred_format[fmt_idx] = ' '; - ++fmt_idx; + inferred_format[inferred_format_idx] = ' '; + ++inferred_format_idx; } if (sublen == 0) { @@ -491,10 +491,10 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, /* UTC specifier */ if (*substr == 'Z') { - inferred_format[fmt_idx] = '%'; - ++fmt_idx; - inferred_format[fmt_idx] = 'Z'; - ++fmt_idx; + inferred_format[inferred_format_idx] = '%'; + ++inferred_format_idx; + inferred_format[inferred_format_idx] = 'Z'; + ++inferred_format_idx; /* "Z" should be equivalent to tz offset "+00:00" */ if (out_local != NULL) { *out_local = 1; @@ -511,10 +511,10 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, --sublen; } } else if (*substr == '-' || *substr == '+') { - inferred_format[fmt_idx] = '%'; - ++fmt_idx; - inferred_format[fmt_idx] = 'z'; - ++fmt_idx; + inferred_format[inferred_format_idx] = '%'; + ++inferred_format_idx; + inferred_format[inferred_format_idx] = 'z'; + ++inferred_format_idx; /* Time zone offset */ int offset_neg = 0, offset_hour = 0, offset_minute = 0; @@ -598,8 +598,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, while (sublen > 0 && isspace(*substr)) { ++substr; --sublen; - inferred_format[fmt_idx] = ' '; - ++fmt_idx; + inferred_format[inferred_format_idx] = ' '; + ++inferred_format_idx; } if (sublen != 0) { @@ -610,7 +610,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if (out_bestunit != NULL) { *out_bestunit = bestunit; } - *format_len = fmt_idx; + *inferred_format_len = inferred_format_idx; return 0; parse_error: diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h index 1c374ca4d6e4c..9238749db8190 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h @@ -60,7 +60,7 @@ parse_iso_8601_datetime(const char *str, int len, int want_exc, int *out_local, int *out_tzoffset, char *inferred_format, - int *format_len); + int *inferred_format_len); /* * Provides a string length to use for converting datetime From 403d7e013f2bd7e0b7c66b80c84e6e13c6829f66 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <> Date: Fri, 21 Oct 2022 20:07:07 +0200 Subject: [PATCH 3/6] :label: --- pandas/core/arrays/datetimes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index d526275b7c5e3..ea84d8700c447 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -2180,7 +2180,7 @@ def objects_to_datetime64ns( require_iso8601: bool = False, allow_object: bool = False, allow_mixed: bool = False, - format: str = "", + format: str | None = None, exact: bool = False, ): """ @@ -2229,7 +2229,7 @@ def objects_to_datetime64ns( yearfirst=yearfirst, require_iso8601=require_iso8601, allow_mixed=allow_mixed, - format=format, + format=format or "", exact=exact, ) result = result.reshape(data.shape, order=order) From 4ac7bd44167732748b961ddc8f06056b298ac843 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <> Date: Sat, 22 Oct 2022 00:04:15 +0200 Subject: [PATCH 4/6] make bigger --- pandas/_libs/tslib.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index b685491e708ae..fa183eaa55b67 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -514,7 +514,7 @@ cpdef array_to_datetime( datetime py_dt tzinfo tz_out = None bint found_tz = False, found_naive = False - char inferred_format[100] + char inferred_format[1000] int inferred_format_len # specify error conditions From 6f48bd5986ee7c1b22abaebe5b4ba482873901ae Mon Sep 17 00:00:00 2001 From: Marco Gorelli <> Date: Sat, 22 Oct 2022 11:08:23 +0200 Subject: [PATCH 5/6] longer array in _test_parse_iso8601 --- pandas/_libs/tslib.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index fa183eaa55b67..a8dafa3121460 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -86,7 +86,7 @@ def _test_parse_iso8601(ts: str): _TSObject obj int out_local = 0, out_tzoffset = 0 NPY_DATETIMEUNIT out_bestunit - char inferred_format + char inferred_format[1000] int inferred_format_len obj = _TSObject() @@ -96,7 +96,7 @@ def _test_parse_iso8601(ts: str): elif ts == 'today': return Timestamp.now().normalize() - string_to_dts(ts, &obj.dts, &out_bestunit, &out_local, &out_tzoffset, True, &inferred_format, &inferred_format_len) + string_to_dts(ts, &obj.dts, &out_bestunit, &out_local, &out_tzoffset, True, inferred_format, &inferred_format_len) obj.value = npy_datetimestruct_to_datetime(NPY_FR_ns, &obj.dts) check_dts_bounds(&obj.dts) if out_local == 1: From de0f30ca062c4a521494261e1c5fe342bb06b988 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <> Date: Sat, 22 Oct 2022 11:56:28 +0200 Subject: [PATCH 6/6] test message --- pandas/tests/tools/test_to_datetime.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index bb2588521ac18..73212ad815688 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -2362,7 +2362,7 @@ def test_day_not_in_month_raise(self, cache): @pytest.mark.parametrize("arg", ["2015-02-29", "2015-02-32", "2015-04-31"]) def test_day_not_in_month_raise_value(self, cache, arg): - msg = f'time data "{arg}" at position 0 doesn\'t match format %Y-%m-%d' + msg = f'time data "{arg}" at position 0 doesn\'t match %Y-%m-%d' with pytest.raises(ValueError, match=msg): to_datetime(arg, errors="raise", format="%Y-%m-%d", cache=cache)