-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
BUG: pandas.to_datetime() does not respect exact format string with ISO8601 #49333
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 13 commits
45f82c3
e473adb
a6ea6d0
8fede1f
2e21e71
afc4d96
12721b0
0531967
a571753
e814a2e
19c34f8
70fb820
eb50dfb
ac61ac5
0dd7407
4d35ea7
3acfdf6
f3060c9
7310e13
b18ade7
3ceb1ee
bde5ef9
080f018
031e0e3
01e8bd1
c1e6bc2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -52,7 +52,8 @@ cdef extern from "src/datetime/np_datetime_strings.h": | |
int parse_iso_8601_datetime(const char *str, int len, int want_exc, | ||
npy_datetimestruct *out, | ||
NPY_DATETIMEUNIT *out_bestunit, | ||
int *out_local, int *out_tzoffset) | ||
int *out_local, int *out_tzoffset, | ||
const char *format, int format_len, int exact) | ||
|
||
|
||
# ---------------------------------------------------------------------- | ||
|
@@ -273,14 +274,25 @@ cdef inline int string_to_dts( | |
int* out_local, | ||
int* out_tzoffset, | ||
bint want_exc, | ||
format: str | None = None, | ||
bint exact = True, | ||
) except? -1: | ||
cdef: | ||
Py_ssize_t length | ||
const char* buf | ||
Py_ssize_t format_length | ||
const char* format_buf | ||
|
||
buf = get_c_string_buf_and_size(val, &length) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Orthogonal to this but if you want to work more with the C codebase I think we should just replace |
||
if format is None: | ||
format_buf = b'' | ||
format_length = 0 | ||
exact = False | ||
MarcoGorelli marked this conversation as resolved.
Show resolved
Hide resolved
|
||
else: | ||
format_buf = get_c_string_buf_and_size(format, &format_length) | ||
return parse_iso_8601_datetime(buf, length, want_exc, | ||
dts, out_bestunit, out_local, out_tzoffset) | ||
dts, out_bestunit, out_local, out_tzoffset, | ||
format_buf, format_length, exact) | ||
|
||
|
||
cpdef ndarray astype_overflowsafe( | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -66,10 +66,25 @@ This file implements string parsing and creation for NumPy datetime. | |
* | ||
* Returns 0 on success, -1 on failure. | ||
*/ | ||
|
||
inline int format_startswith(char ch, int format_len, char format, int exact) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK I think I understand a bit more now what you are trying to do. I think it would be best if you just kept a local variable for characters consumed, which you can increment every time you increment the format pointer (you are kind of doing this anyway). You don't really need a function but can rather just do The order of operations will do the postfix increment after assignment, so this consolidates what you are trying to do There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just checking I've understood, is the suggestion to get rid of the function and each time do something like
? I presume I've misunderstood because to me this looks more complicated and loses the docstring 😄 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is possible to even do everything in one single if statement There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Its not a matter of inlining as much as code organization. For instance, this block: if (!format_startswith('%', format_len, *format, exact)) goto parse_error;
if (format_len) {++format; --format_len;}
if (!format_startswith('Y', format_len, *format, exact)) goto parse_error;
if (format_len) {++format; --format_len;} Would seem more naturally expressed as: if (format_len < 2 && exact) {
goto parse_error;
}
if (*format++ != '%') {
goto parse_error;
}
if (*format++ != 'Y') {
goto parse_error;
} There's some code golf you can do therein to shorten it but that's not really what I'm after. Moreso we should refactor this to avoid trying to cram a lot into a function with side effects, as we don't use that pattern much if at all in our code base, and a lot of the character by character checks being done are overkill There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The other advantage of this is you could provide an actual error message as to what went wrong at what position of the format, though not critical to scope for this PR There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree. Can't we just refactor everything and replace it with something like
? Unless doing everything in one go is a performance decision... There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If you see a better way to refactor I am definitely open to it. I don't think the performance here will be that critical. |
||
/* Check if the current character in `format` is `ch`. | ||
|
||
Always error on character mismatch conditioned on non-exhausted format, | ||
or when format is exhausted in the exact case. | ||
Note that if `format` hasn't been exhausted, it should be advanced | ||
outside of this function. */ | ||
if ((format_len && format != ch) || (exact && !format_len)) { | ||
nikitaved marked this conversation as resolved.
Show resolved
Hide resolved
|
||
return 0; | ||
} | ||
return 1; | ||
} | ||
|
||
int parse_iso_8601_datetime(const char *str, int len, int want_exc, | ||
npy_datetimestruct *out, | ||
NPY_DATETIMEUNIT *out_bestunit, | ||
int *out_local, int *out_tzoffset) { | ||
int *out_local, int *out_tzoffset, | ||
const char* format, int format_len, int exact) { | ||
WillAyd marked this conversation as resolved.
Show resolved
Hide resolved
|
||
int year_leap = 0; | ||
int i, numdigits; | ||
const char *substr; | ||
|
@@ -104,19 +119,28 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |
while (sublen > 0 && isspace(*substr)) { | ||
++substr; | ||
--sublen; | ||
if (!format_startswith(' ', format_len, *format, exact)) goto parse_error; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Minor nit but can you run |
||
if (format_len) {++format; --format_len;} | ||
} | ||
|
||
/* Leading '-' sign for negative year */ | ||
if (*substr == '-') { | ||
++substr; | ||
--sublen; | ||
if (!format_startswith('-', format_len, *format, exact)) goto parse_error; | ||
if (format_len) {++format; --format_len;} | ||
} | ||
|
||
if (sublen == 0) { | ||
goto parse_error; | ||
} | ||
|
||
/* PARSE THE YEAR (4 digits) */ | ||
if (!format_startswith('%', format_len, *format, exact)) goto parse_error; | ||
if (format_len) {++format; --format_len;} | ||
if (!format_startswith('Y', format_len, *format, exact)) goto parse_error; | ||
if (format_len) {++format; --format_len;} | ||
|
||
out->year = 0; | ||
if (sublen >= 4 && isdigit(substr[0]) && isdigit(substr[1]) && | ||
isdigit(substr[2]) && isdigit(substr[3])) { | ||
|
@@ -139,6 +163,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |
if (out_local != NULL) { | ||
*out_local = 0; | ||
} | ||
if (format_len) { | ||
goto parse_error; | ||
} | ||
bestunit = NPY_FR_Y; | ||
goto finish; | ||
} | ||
|
@@ -156,13 +183,19 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |
ymd_sep = valid_ymd_sep[i]; | ||
++substr; | ||
--sublen; | ||
if (!format_startswith(ymd_sep, format_len, *format, exact)) goto parse_error; | ||
if (format_len) {++format; --format_len;} | ||
/* Cannot have trailing separator */ | ||
if (sublen == 0 || !isdigit(*substr)) { | ||
goto parse_error; | ||
} | ||
} | ||
|
||
/* PARSE THE MONTH */ | ||
if (!format_startswith('%', format_len, *format, exact)) goto parse_error; | ||
if (format_len) {++format; --format_len;} | ||
if (!format_startswith('m', format_len, *format, exact)) goto parse_error; | ||
if (format_len) {++format; --format_len;} | ||
/* First digit required */ | ||
out->month = (*substr - '0'); | ||
++substr; | ||
|
@@ -190,6 +223,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |
if (!has_ymd_sep) { | ||
goto parse_error; | ||
} | ||
if (format_len) { | ||
goto parse_error; | ||
} | ||
if (out_local != NULL) { | ||
*out_local = 0; | ||
} | ||
|
@@ -203,9 +239,15 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |
} | ||
++substr; | ||
--sublen; | ||
if (!format_startswith(ymd_sep, format_len, *format, exact)) goto parse_error; | ||
if (format_len) {++format; --format_len;} | ||
} | ||
|
||
/* PARSE THE DAY */ | ||
if (!format_startswith('%', format_len, *format, exact)) goto parse_error; | ||
if (format_len) {++format; --format_len;} | ||
if (!format_startswith('d', format_len, *format, exact)) goto parse_error; | ||
if (format_len) {++format; --format_len;} | ||
/* First digit required */ | ||
if (!isdigit(*substr)) { | ||
goto parse_error; | ||
|
@@ -235,17 +277,26 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |
if (out_local != NULL) { | ||
*out_local = 0; | ||
} | ||
if (format_len) { | ||
goto parse_error; | ||
} | ||
MarcoGorelli marked this conversation as resolved.
Show resolved
Hide resolved
|
||
bestunit = NPY_FR_D; | ||
goto finish; | ||
} | ||
|
||
if ((*substr != 'T' && *substr != ' ') || sublen == 1) { | ||
goto parse_error; | ||
} | ||
if (!format_startswith(*substr, format_len, *format, exact)) goto parse_error; | ||
if (format_len) {++format; --format_len;} | ||
++substr; | ||
--sublen; | ||
|
||
/* PARSE THE HOURS */ | ||
if (!format_startswith('%', format_len, *format, exact)) goto parse_error; | ||
if (format_len) {++format; --format_len;} | ||
if (!format_startswith('H', format_len, *format, exact)) goto parse_error; | ||
if (format_len) {++format; --format_len;} | ||
/* First digit required */ | ||
if (!isdigit(*substr)) { | ||
goto parse_error; | ||
|
@@ -274,6 +325,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |
if (!hour_was_2_digits) { | ||
goto parse_error; | ||
} | ||
if (format_len) { | ||
goto parse_error; | ||
} | ||
bestunit = NPY_FR_h; | ||
goto finish; | ||
} | ||
|
@@ -286,6 +340,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |
if (sublen == 0 || !isdigit(*substr)) { | ||
goto parse_error; | ||
} | ||
if (!format_startswith(':', format_len, *format, exact)) goto parse_error; | ||
if (format_len) {++format; --format_len;} | ||
} else if (!isdigit(*substr)) { | ||
if (!hour_was_2_digits) { | ||
goto parse_error; | ||
|
@@ -294,6 +350,10 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |
} | ||
|
||
/* PARSE THE MINUTES */ | ||
if (!format_startswith('%', format_len, *format, exact)) goto parse_error; | ||
if (format_len) {++format; --format_len;} | ||
if (!format_startswith('M', format_len, *format, exact)) goto parse_error; | ||
if (format_len) {++format; --format_len;} | ||
/* First digit required */ | ||
out->min = (*substr - '0'); | ||
++substr; | ||
|
@@ -317,12 +377,17 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |
|
||
if (sublen == 0) { | ||
bestunit = NPY_FR_m; | ||
if (format_len) { | ||
goto parse_error; | ||
} | ||
goto finish; | ||
} | ||
|
||
/* If we make it through this condition block, then the next | ||
* character is a digit. */ | ||
if (has_hms_sep && *substr == ':') { | ||
if (!format_startswith(':', format_len, *format, exact)) goto parse_error; | ||
if (format_len) {++format; --format_len;} | ||
++substr; | ||
--sublen; | ||
/* Cannot have a trailing ':' */ | ||
|
@@ -335,6 +400,10 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |
} | ||
|
||
/* PARSE THE SECONDS */ | ||
if (!format_startswith('%', format_len, *format, exact)) goto parse_error; | ||
if (format_len) {++format; --format_len;} | ||
if (!format_startswith('S', format_len, *format, exact)) goto parse_error; | ||
if (format_len) {++format; --format_len;} | ||
/* First digit required */ | ||
out->sec = (*substr - '0'); | ||
++substr; | ||
|
@@ -360,12 +429,18 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |
if (sublen > 0 && *substr == '.') { | ||
++substr; | ||
--sublen; | ||
if (!format_startswith('.', format_len, *format, exact)) goto parse_error; | ||
if (format_len) {++format; --format_len;} | ||
} else { | ||
bestunit = NPY_FR_s; | ||
goto parse_timezone; | ||
} | ||
|
||
/* PARSE THE MICROSECONDS (0 to 6 digits) */ | ||
if (!format_startswith('%', format_len, *format, exact)) goto parse_error; | ||
if (format_len) {++format; --format_len;} | ||
if (!format_startswith('f', format_len, *format, exact)) goto parse_error; | ||
if (format_len) {++format; --format_len;} | ||
numdigits = 0; | ||
for (i = 0; i < 6; ++i) { | ||
out->us *= 10; | ||
|
@@ -430,15 +505,24 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |
while (sublen > 0 && isspace(*substr)) { | ||
++substr; | ||
--sublen; | ||
if (!format_startswith(' ', format_len, *format, exact)) goto parse_error; | ||
if (format_len) {++format; --format_len;} | ||
} | ||
|
||
if (sublen == 0) { | ||
// Unlike NumPy, treating no time zone as naive | ||
if (format_len > 0) { | ||
goto parse_error; | ||
} | ||
goto finish; | ||
} | ||
|
||
/* UTC specifier */ | ||
if (*substr == 'Z') { | ||
if (!format_startswith('%', format_len, *format, exact)) goto parse_error; | ||
if (format_len) {++format; --format_len;} | ||
if (!format_startswith('Z', format_len, *format, exact)) goto parse_error; | ||
if (format_len) {++format; --format_len;} | ||
/* "Z" should be equivalent to tz offset "+00:00" */ | ||
if (out_local != NULL) { | ||
*out_local = 1; | ||
|
@@ -449,12 +533,19 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |
} | ||
|
||
if (sublen == 1) { | ||
if (format_len > 0) { | ||
goto parse_error; | ||
} | ||
goto finish; | ||
} else { | ||
++substr; | ||
--sublen; | ||
} | ||
} else if (*substr == '-' || *substr == '+') { | ||
if (!format_startswith('%', format_len, *format, exact)) goto parse_error; | ||
if (format_len) {++format; --format_len;} | ||
if (!format_startswith('z', format_len, *format, exact)) goto parse_error; | ||
if (format_len) {++format; --format_len;} | ||
/* Time zone offset */ | ||
int offset_neg = 0, offset_hour = 0, offset_minute = 0; | ||
|
||
|
@@ -538,9 +629,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |
while (sublen > 0 && isspace(*substr)) { | ||
++substr; | ||
--sublen; | ||
if (!format_startswith(' ', format_len, *format, exact)) goto parse_error; | ||
if (format_len) {++format; --format_len;} | ||
} | ||
|
||
if (sublen != 0) { | ||
if ((sublen != 0) || (format_len != 0)) { | ||
goto parse_error; | ||
} | ||
|
||
|
Uh oh!
There was an error while loading. Please reload this page.