From 0443b2cc355b83278515830a33c454398c0fdd31 Mon Sep 17 00:00:00 2001
From: Marco Gorelli <marcogorelli@protonmail.com>
Date: Fri, 29 Jul 2022 13:22:07 +0200
Subject: [PATCH] parse 8 or 9 character delimited dates

---
 doc/source/whatsnew/v1.5.0.rst             |  1 +
 pandas/_libs/tslibs/parsing.pyx            | 39 ++++++++++++++++++----
 pandas/tests/io/parser/test_parse_dates.py | 33 ++++++++++++++++++
 3 files changed, 66 insertions(+), 7 deletions(-)

diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index e57166f7a4861..b053986bd5899 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -770,6 +770,7 @@ Other Deprecations
 - Deprecated the argument ``na_sentinel`` in :func:`factorize`, :meth:`Index.factorize`, and :meth:`.ExtensionArray.factorize`; pass ``use_na_sentinel=True`` instead to use the sentinel ``-1`` for NaN values and ``use_na_sentinel=False`` instead of ``na_sentinel=None`` to encode NaN values (:issue:`46910`)
 - Deprecated :meth:`DataFrameGroupBy.transform` not aligning the result when the UDF returned DataFrame (:issue:`45648`)
 - Clarified warning from :func:`to_datetime` when delimited dates can't be parsed in accordance to specified ``dayfirst`` argument (:issue:`46210`)
+- Emit a warning from :func:`to_datetime` when delimited dates can't be parsed in accordance with the specified ``dayfirst`` argument, even for dates where the leading zero is omitted (e.g. ``31/1/2001``) (:issue:`47880`)
 - Deprecated :class:`Series` and :class:`Resampler` reducers (e.g. ``min``, ``max``, ``sum``, ``mean``) raising a ``NotImplementedError`` when the dtype is non-numric and ``numeric_only=True`` is provided; this will raise a ``TypeError`` in a future version (:issue:`47500`)
 - Deprecated :meth:`Series.rank` returning an empty result when the dtype is non-numeric and ``numeric_only=True`` is provided; this will raise a ``TypeError`` in a future version (:issue:`47500`)
 - Deprecated argument ``errors`` for :meth:`Series.mask`, :meth:`Series.where`, :meth:`DataFrame.mask`, and :meth:`DataFrame.where` as ``errors`` had no effect on this methods (:issue:`47728`)
diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx
index 5cb11436f6f45..97a8f81094a8f 100644
--- a/pandas/_libs/tslibs/parsing.pyx
+++ b/pandas/_libs/tslibs/parsing.pyx
@@ -99,8 +99,14 @@ cdef:
     int MAX_DAYS_IN_MONTH = 31, MAX_MONTH = 12
 
 
-cdef inline bint _is_not_delimiter(const char ch):
-    return strchr(delimiters, ch) == NULL
+cdef inline bint _is_delimiter(const char ch):  # True if ch occurs in `delimiters`
+    return ch != 0 and strchr(delimiters, ch) != NULL  # strchr matches NUL too
+
+
+cdef inline int _parse_1digit(const char* s):
+    # Reads only s[0]; -10 is the fallback argument given to getdigit_ascii.
+    cdef int digit = getdigit_ascii(s[0], -10)
+    return digit
 
 
 cdef inline int _parse_2digit(const char* s):
@@ -151,18 +157,37 @@ cdef inline object _parse_delimited_date(str date_string, bint dayfirst):
         bint can_swap = 0
 
     buf = get_c_string_buf_and_size(date_string, &length)
-    if length == 10:
+    if length == 10 and _is_delimiter(buf[2]) and _is_delimiter(buf[5]):
         # parsing MM?DD?YYYY and DD?MM?YYYY dates
-        if _is_not_delimiter(buf[2]) or _is_not_delimiter(buf[5]):
-            return None, None
         month = _parse_2digit(buf)
         day = _parse_2digit(buf + 3)
         year = _parse_4digit(buf + 6)
         reso = 'day'
         can_swap = 1
-    elif length == 7:
+    elif length == 9 and _is_delimiter(buf[1]) and _is_delimiter(buf[4]):
+        # parsing M?DD?YYYY and D?MM?YYYY dates
+        month = _parse_1digit(buf)
+        day = _parse_2digit(buf + 2)
+        year = _parse_4digit(buf + 5)
+        reso = 'day'
+        can_swap = 1
+    elif length == 9 and _is_delimiter(buf[2]) and _is_delimiter(buf[4]):
+        # parsing MM?D?YYYY and DD?M?YYYY dates
+        month = _parse_2digit(buf)
+        day = _parse_1digit(buf + 3)
+        year = _parse_4digit(buf + 5)
+        reso = 'day'
+        can_swap = 1
+    elif length == 8 and _is_delimiter(buf[1]) and _is_delimiter(buf[3]):
+        # parsing M?D?YYYY and D?M?YYYY dates
+        month = _parse_1digit(buf)
+        day = _parse_1digit(buf + 2)
+        year = _parse_4digit(buf + 4)
+        reso = 'day'
+        can_swap = 1
+    elif length == 7 and _is_delimiter(buf[2]):
         # parsing MM?YYYY dates
-        if buf[2] == b'.' or _is_not_delimiter(buf[2]):
+        if buf[2] == b'.':
             # we cannot reliably tell whether e.g. 10.2010 is a float
             # or a date, thus we refuse to parse it here
             return None, None
diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py
index d05961b702c51..5d2e5bccd9762 100644
--- a/pandas/tests/io/parser/test_parse_dates.py
+++ b/pandas/tests/io/parser/test_parse_dates.py
@@ -1948,6 +1948,39 @@ def test_dayfirst_warnings():
     tm.assert_index_equal(expected, res8)
 
 
+@pytest.mark.parametrize(
+    "date_string, dayfirst",
+    [
+        pytest.param(
+            "31/1/2014",  # DD/M/YYYY text, so dayfirst=False cannot be honored
+            False,
+            id="second date is single-digit",
+        ),
+        pytest.param(
+            "1/31/2014",  # M/DD/YYYY text, so dayfirst=True cannot be honored
+            True,
+            id="first date is single-digit",
+        ),
+    ],
+)
+def test_dayfirst_warnings_no_leading_zero(date_string, dayfirst):
+    # GH47880: a dayfirst mismatch must still warn when a leading zero is omitted
+    initial_value = f"date\n{date_string}"  # CSV text: header "date" plus one row
+    expected = DatetimeIndex(
+        ["2014-01-31"], dtype="datetime64[ns]", freq=None, name="date"
+    )
+    with tm.assert_produces_warning(
+        UserWarning, match=r"may lead to inconsistently parsed dates"
+    ):
+        res = read_csv(
+            StringIO(initial_value),
+            parse_dates=["date"],
+            index_col="date",
+            dayfirst=dayfirst,
+        ).index
+    tm.assert_index_equal(expected, res)  # both inputs must resolve to 2014-01-31
+
+
 @skip_pyarrow
 def test_infer_first_column_as_index(all_parsers):
     # GH#11019