From 0443b2cc355b83278515830a33c454398c0fdd31 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Fri, 29 Jul 2022 13:22:07 +0200 Subject: [PATCH] parse 8 or 9 digit delimited dates --- doc/source/whatsnew/v1.5.0.rst | 1 + pandas/_libs/tslibs/parsing.pyx | 39 ++++++++++++++++++---- pandas/tests/io/parser/test_parse_dates.py | 33 ++++++++++++++++++ 3 files changed, 66 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index e57166f7a4861..b053986bd5899 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -770,6 +770,7 @@ Other Deprecations - Deprecated the argument ``na_sentinel`` in :func:`factorize`, :meth:`Index.factorize`, and :meth:`.ExtensionArray.factorize`; pass ``use_na_sentinel=True`` instead to use the sentinel ``-1`` for NaN values and ``use_na_sentinel=False`` instead of ``na_sentinel=None`` to encode NaN values (:issue:`46910`) - Deprecated :meth:`DataFrameGroupBy.transform` not aligning the result when the UDF returned DataFrame (:issue:`45648`) - Clarified warning from :func:`to_datetime` when delimited dates can't be parsed in accordance to specified ``dayfirst`` argument (:issue:`46210`) +- Emit warning from :func:`to_datetime` when delimited dates can't be parsed in accordance to specified ``dayfirst`` argument even for dates where leading zero is omitted (e.g. ``31/1/2001``) (:issue:`47880`) - Deprecated :class:`Series` and :class:`Resampler` reducers (e.g. ``min``, ``max``, ``sum``, ``mean``) raising a ``NotImplementedError`` when the dtype is non-numric and ``numeric_only=True`` is provided; this will raise a ``TypeError`` in a future version (:issue:`47500`) - Deprecated :meth:`Series.rank` returning an empty result when the dtype is non-numeric and ``numeric_only=True`` is provided; this will raise a ``TypeError`` in a future version (:issue:`47500`) - Deprecated argument ``errors`` for :meth:`Series.mask`, :meth:`Series.where`, :meth:`DataFrame.mask`, and :meth:`DataFrame.where` as ``errors`` had no effect on this methods (:issue:`47728`) diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 5cb11436f6f45..97a8f81094a8f 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -99,8 +99,14 @@ cdef: int MAX_DAYS_IN_MONTH = 31, MAX_MONTH = 12 -cdef inline bint _is_not_delimiter(const char ch): - return strchr(delimiters, ch) == NULL +cdef inline bint _is_delimiter(const char ch): + return strchr(delimiters, ch) != NULL + + +cdef inline int _parse_1digit(const char* s): + cdef int result = 0 + result += getdigit_ascii(s[0], -10) * 1 + return result cdef inline int _parse_2digit(const char* s): @@ -151,18 +157,37 @@ cdef inline object _parse_delimited_date(str date_string, bint dayfirst): bint can_swap = 0 buf = get_c_string_buf_and_size(date_string, &length) - if length == 10: + if length == 10 and _is_delimiter(buf[2]) and _is_delimiter(buf[5]): # parsing MM?DD?YYYY and DD?MM?YYYY dates - if _is_not_delimiter(buf[2]) or _is_not_delimiter(buf[5]): - return None, None month = _parse_2digit(buf) day = _parse_2digit(buf + 3) year = _parse_4digit(buf + 6) reso = 'day' can_swap = 1 - elif length == 7: + elif length == 9 and _is_delimiter(buf[1]) and _is_delimiter(buf[4]): + # parsing M?DD?YYYY and D?MM?YYYY dates + month = _parse_1digit(buf) + day = _parse_2digit(buf + 2) + year = _parse_4digit(buf + 5) + reso = 'day' + can_swap = 1 + elif length == 9 and _is_delimiter(buf[2]) and _is_delimiter(buf[4]): + # parsing MM?D?YYYY and DD?M?YYYY dates + month = _parse_2digit(buf) + day = _parse_1digit(buf + 3) + year = _parse_4digit(buf + 5) + reso = 'day' + can_swap = 1 + elif length == 8 and _is_delimiter(buf[1]) and _is_delimiter(buf[3]): + # parsing M?D?YYYY and D?M?YYYY dates + month = _parse_1digit(buf) + day = _parse_1digit(buf + 2) + year = _parse_4digit(buf + 4) + reso = 'day' + can_swap = 1 + elif length == 7 and _is_delimiter(buf[2]): # parsing MM?YYYY dates - if buf[2] == b'.' or _is_not_delimiter(buf[2]): + if buf[2] == b'.': # we cannot reliably tell whether e.g. 10.2010 is a float # or a date, thus we refuse to parse it here return None, None diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index d05961b702c51..5d2e5bccd9762 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -1948,6 +1948,39 @@ def test_dayfirst_warnings(): tm.assert_index_equal(expected, res8) +@pytest.mark.parametrize( + "date_string, dayfirst", + [ + pytest.param( + "31/1/2014", + False, + id="second date is single-digit", + ), + pytest.param( + "1/31/2014", + True, + id="first date is single-digit", + ), + ], +) +def test_dayfirst_warnings_no_leading_zero(date_string, dayfirst): + # GH47880 + initial_value = f"date\n{date_string}" + expected = DatetimeIndex( + ["2014-01-31"], dtype="datetime64[ns]", freq=None, name="date" + ) + with tm.assert_produces_warning( + UserWarning, match=r"may lead to inconsistently parsed dates" + ): + res = read_csv( + StringIO(initial_value), + parse_dates=["date"], + index_col="date", + dayfirst=dayfirst, + ).index + tm.assert_index_equal(expected, res) + + @skip_pyarrow def test_infer_first_column_as_index(all_parsers): # GH#11019