Skip to content

Commit 3490468

Browse files
author
MarcoGorelli
committed
share paths and fix bugs
1 parent dd8b718 commit 3490468

File tree

10 files changed

+248
-169
lines changed

10 files changed

+248
-169
lines changed

doc/source/whatsnew/v2.0.0.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -779,6 +779,7 @@ Performance improvements
779779
- Performance improvement in :func:`merge` when not merging on the index - the new index will now be :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`49478`)
780780
- Performance improvement in :meth:`DataFrame.to_dict` and :meth:`Series.to_dict` when using any non-object dtypes (:issue:`46470`)
781781
- Performance improvement in :func:`read_html` when there are multiple tables (:issue:`49929`)
782+
- Performance improvement in :func:`to_datetime` when using ``'%Y%m%d'`` format (:issue:`17410`)
782783

783784
.. ---------------------------------------------------------------------------
784785
.. _whatsnew_200.bug_fixes:
@@ -808,6 +809,13 @@ Datetimelike
808809
- Bug in :meth:`Timestamp.round` when the ``freq`` argument has zero-duration (e.g. "0ns") returning incorrect results instead of raising (:issue:`49737`)
809810
- Bug in :func:`to_datetime` was not raising ``ValueError`` when invalid format was passed and ``errors`` was ``'ignore'`` or ``'coerce'`` (:issue:`50266`)
810811
- Bug in :class:`DateOffset` was throwing ``TypeError`` when constructing with milliseconds and another super-daily argument (:issue:`49897`)
812+
- Bug in :func:`to_datetime` was not raising ``ValueError`` when parsing a string containing a decimal date with format ``'%Y%m%d'`` (:issue:`50051`)
813+
- Bug in :func:`to_datetime` was not converting ``None`` to ``NaT`` when parsing mixed-offset date strings with ISO8601 format (:issue:`50071`)
814+
- Bug in :func:`to_datetime` was not returning the input unchanged when parsing an out-of-bounds date string with ``errors='ignore'`` and ``format='%Y%m%d'`` (:issue:`14487`)
815+
- Bug in :func:`to_datetime` was converting timezone-naive ``datetime.datetime`` to timezone-aware when parsing with timezone-aware strings, ISO8601 format, and ``utc=False`` (:issue:`50254`)
816+
- Bug in :func:`to_datetime` was throwing ``ValueError`` when parsing dates with ISO8601 format where some values were not zero-padded (:issue:`21422`)
817+
- Bug in :func:`to_datetime` was giving incorrect results when using ``format='%Y%m%d'`` and ``errors='ignore'`` (:issue:`26493`)
818+
- Bug in :func:`to_datetime` was failing to parse date strings ``'today'`` and ``'now'`` if ``format`` was not ISO8601 (:issue:`50359`)
811819
-
812820

813821
Timedelta

pandas/_libs/tslib.pyi

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,6 @@ def array_to_datetime(
2323
dayfirst: bool = ...,
2424
yearfirst: bool = ...,
2525
utc: bool = ...,
26-
require_iso8601: bool = ...,
27-
format: str | None = ...,
28-
exact: bool = ...,
2926
) -> tuple[np.ndarray, tzinfo | None]: ...
3027

3128
# returned ndarray may be object dtype or datetime64[ns]

pandas/_libs/tslib.pyx

Lines changed: 3 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ from pandas._libs.tslibs.np_datetime cimport (
3838
pydatetime_to_dt64,
3939
string_to_dts,
4040
)
41+
from pandas._libs.tslibs.strptime cimport parse_today_now
4142
from pandas._libs.util cimport (
4243
is_datetime64_object,
4344
is_float_object,
@@ -401,9 +402,6 @@ cpdef array_to_datetime(
401402
bint dayfirst=False,
402403
bint yearfirst=False,
403404
bint utc=False,
404-
bint require_iso8601=False,
405-
format: str | None=None,
406-
bint exact=True,
407405
):
408406
"""
409407
Converts a 1D array of date-like values to a numpy array of either:
@@ -430,8 +428,6 @@ cpdef array_to_datetime(
430428
yearfirst parsing behavior when encountering datetime strings
431429
utc : bool, default False
432430
indicator whether the dates should be UTC
433-
require_iso8601 : bool, default False
434-
indicator whether the datetime string should be iso8601
435431
436432
Returns
437433
-------
@@ -502,16 +498,6 @@ cpdef array_to_datetime(
502498
iresult[i] = get_datetime64_nanos(val, NPY_FR_ns)
503499

504500
elif is_integer_object(val) or is_float_object(val):
505-
if require_iso8601:
506-
if is_coerce:
507-
iresult[i] = NPY_NAT
508-
continue
509-
elif is_raise:
510-
raise ValueError(
511-
f"time data \"{val}\" at position {i} doesn't "
512-
f"match format \"{format}\""
513-
)
514-
return values, tz_out
515501
# these must be ns unit by-definition
516502
seen_integer = True
517503

@@ -542,25 +528,13 @@ cpdef array_to_datetime(
542528

543529
string_to_dts_failed = string_to_dts(
544530
val, &dts, &out_bestunit, &out_local,
545-
&out_tzoffset, False, format, exact
531+
&out_tzoffset, False, None, False
546532
)
547533
if string_to_dts_failed:
548534
# An error at this point is a _parsing_ error
549535
# specifically _not_ OutOfBoundsDatetime
550-
if _parse_today_now(val, &iresult[i], utc):
536+
if parse_today_now(val, &iresult[i], utc):
551537
continue
552-
elif require_iso8601:
553-
# if requiring iso8601 strings, skip trying
554-
# other formats
555-
if is_coerce:
556-
iresult[i] = NPY_NAT
557-
continue
558-
elif is_raise:
559-
raise ValueError(
560-
f"time data \"{val}\" at position {i} doesn't "
561-
f"match format \"{format}\""
562-
)
563-
return values, tz_out
564538

565539
try:
566540
py_dt = parse_datetime_string(val,
@@ -623,18 +597,6 @@ cpdef array_to_datetime(
623597
if is_coerce:
624598
iresult[i] = NPY_NAT
625599
continue
626-
elif require_iso8601 and isinstance(val, str):
627-
# GH#19382 for just-barely-OutOfBounds falling back to
628-
# dateutil parser will return incorrect result because
629-
# it will ignore nanoseconds
630-
if is_raise:
631-
632-
# Still raise OutOfBoundsDatetime,
633-
# as error message is informative.
634-
raise
635-
636-
assert is_ignore
637-
return values, tz_out
638600
raise
639601

640602
except OutOfBoundsDatetime:
@@ -793,26 +755,6 @@ cdef _array_to_datetime_object(
793755
return oresult, None
794756

795757

796-
cdef bint _parse_today_now(str val, int64_t* iresult, bint utc):
797-
# We delay this check for as long as possible
798-
# because it catches relatively rare cases
799-
800-
# Multiply by 1000 to convert to nanos, since these methods naturally have
801-
# microsecond resolution
802-
if val == "now":
803-
if utc:
804-
iresult[0] = Timestamp.utcnow().value * 1000
805-
else:
806-
# GH#18705 make sure to_datetime("now") matches Timestamp("now")
807-
# Note using Timestamp.now() is faster than Timestamp("now")
808-
iresult[0] = Timestamp.now().value * 1000
809-
return True
810-
elif val == "today":
811-
iresult[0] = Timestamp.today().value * 1000
812-
return True
813-
return False
814-
815-
816758
def array_to_datetime_with_tz(ndarray values, tzinfo tz):
817759
"""
818760
Vectorized analogue to pd.Timestamp(value, tz=tz)

pandas/_libs/tslibs/parsing.pyx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -825,7 +825,7 @@ def format_is_iso(f: str) -> bint:
825825
but must be consistent. Leading 0s in dates and times are optional.
826826
"""
827827
iso_template = "%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M:%S{micro_or_tz}".format
828-
excluded_formats = ["%Y%m%d", "%Y%m", "%Y"]
828+
excluded_formats = ["%Y%m"]
829829

830830
for date_sep in [" ", "/", "\\", "-", ".", ""]:
831831
for time_sep in [" ", "T"]:

pandas/_libs/tslibs/strptime.pxd

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
from numpy cimport int64_t
2+
3+
4+
cdef bint parse_today_now(str val, int64_t* iresult, bint utc)

pandas/_libs/tslibs/strptime.pyx

Lines changed: 76 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -34,12 +34,14 @@ from pandas._libs.tslibs.nattype cimport (
3434
c_nat_strings as nat_strings,
3535
)
3636
from pandas._libs.tslibs.np_datetime cimport (
37+
NPY_DATETIMEUNIT,
3738
NPY_FR_ns,
3839
check_dts_bounds,
3940
npy_datetimestruct,
4041
npy_datetimestruct_to_datetime,
4142
pydate_to_dt64,
4243
pydatetime_to_dt64,
44+
string_to_dts,
4345
)
4446
from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime
4547
from pandas._libs.tslibs.timestamps cimport _Timestamp
@@ -48,9 +50,28 @@ from pandas._libs.util cimport (
4850
is_float_object,
4951
is_integer_object,
5052
)
53+
from pandas._libs.tslibs.timestamps import Timestamp
5154

5255
cnp.import_array()
5356

57+
cdef bint parse_today_now(str val, int64_t* iresult, bint utc):
58+
# We delay this check for as long as possible
59+
# because it catches relatively rare cases
60+
61+
# Multiply by 1000 to convert to nanos, since these methods naturally have
62+
# microsecond resolution
63+
if val == "now":
64+
if utc:
65+
iresult[0] = Timestamp.utcnow().value * 1000
66+
else:
67+
# GH#18705 make sure to_datetime("now") matches Timestamp("now")
68+
# Note using Timestamp.now() is faster than Timestamp("now")
69+
iresult[0] = Timestamp.now().value * 1000
70+
return True
71+
elif val == "today":
72+
iresult[0] = Timestamp.today().value * 1000
73+
return True
74+
return False
5475

5576
cdef dict _parse_code_table = {"y": 0,
5677
"Y": 1,
@@ -94,6 +115,7 @@ def array_strptime(
94115
exact : matches must be exact if True, search if False
95116
errors : string specifying error handling, {'raise', 'ignore', 'coerce'}
96117
"""
118+
from pandas._libs.tslibs.parsing import format_is_iso
97119

98120
cdef:
99121
Py_ssize_t i, n = len(values)
@@ -111,6 +133,9 @@ def array_strptime(
111133
bint found_naive = False
112134
bint found_tz = False
113135
tzinfo tz_out = None
136+
bint iso_format = fmt is not None and format_is_iso(fmt)
137+
NPY_DATETIMEUNIT out_bestunit
138+
int out_local = 0, out_tzoffset = 0
114139

115140
assert is_raise or is_ignore or is_coerce
116141

@@ -232,17 +257,57 @@ def array_strptime(
232257
else:
233258
val = str(val)
234259

235-
# exact matching
236-
if exact:
237-
found = format_regex.match(val)
238-
if not found:
239-
raise ValueError(f"time data \"{val}\" at position {i} doesn't "
240-
f"match format \"{fmt}\"")
241-
if len(val) != found.end():
242-
raise ValueError(
243-
f"unconverted data remains at position {i}: "
244-
f'"{val[found.end():]}"'
245-
)
260+
if iso_format:
261+
string_to_dts_failed = string_to_dts(
262+
val, &dts, &out_bestunit, &out_local,
263+
&out_tzoffset, False, fmt, exact
264+
)
265+
if not string_to_dts_failed:
266+
# No error reported by string_to_dts, pick back up
267+
# where we left off
268+
value = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts)
269+
if out_local == 1:
270+
# out_tzoffset is an offset in minutes (it is passed
271+
# to timedelta(minutes=...) below); record it as a
272+
# fixed-offset timezone for this element
273+
tz = timezone(timedelta(minutes=out_tzoffset))
274+
result_timezone[i] = tz
275+
out_local = 0
276+
out_tzoffset = 0
277+
iresult[i] = value
278+
try:
279+
check_dts_bounds(&dts)
280+
except ValueError:
281+
if is_coerce:
282+
iresult[i] = NPY_NAT
283+
continue
284+
raise
285+
continue
286+
287+
if parse_today_now(val, &iresult[i], utc):
288+
continue
289+
290+
# Some ISO formats can't be parsed by string_to_dts
291+
# For example, 6-digit YYYYMD. So, if there's an error,
292+
# try the string-matching code below.
293+
294+
# exact matching
295+
if exact:
296+
found = format_regex.match(val)
297+
if not found:
298+
if is_coerce:
299+
iresult[i] = NPY_NAT
300+
continue
301+
raise ValueError(f"time data \"{val}\" at position {i} doesn't "
302+
f"match format \"{fmt}\"")
303+
if len(val) != found.end():
304+
if is_coerce:
305+
iresult[i] = NPY_NAT
306+
continue
307+
raise ValueError(
308+
f"unconverted data remains at position {i}: "
309+
f'"{val[found.end():]}"'
310+
)
246311

247312
# search
248313
else:

pandas/core/arrays/datetimes.py

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2118,10 +2118,7 @@ def objects_to_datetime64ns(
21182118
yearfirst,
21192119
utc: bool = False,
21202120
errors: DateTimeErrorChoices = "raise",
2121-
require_iso8601: bool = False,
21222121
allow_object: bool = False,
2123-
format: str | None = None,
2124-
exact: bool = True,
21252122
):
21262123
"""
21272124
Convert data to array of timestamps.
@@ -2134,7 +2131,6 @@ def objects_to_datetime64ns(
21342131
utc : bool, default False
21352132
Whether to convert/localize timestamps to UTC.
21362133
errors : {'raise', 'ignore', 'coerce'}
2137-
require_iso8601 : bool, default False
21382134
allow_object : bool
21392135
Whether to return an object-dtype ndarray instead of raising if the
21402136
data contains more than one timezone.
@@ -2165,9 +2161,6 @@ def objects_to_datetime64ns(
21652161
utc=utc,
21662162
dayfirst=dayfirst,
21672163
yearfirst=yearfirst,
2168-
require_iso8601=require_iso8601,
2169-
format=format,
2170-
exact=exact,
21712164
)
21722165
result = result.reshape(data.shape, order=order)
21732166
except OverflowError as err:

0 commit comments

Comments
 (0)