Skip to content

ENH: parse 8 or 9 digit delimited dates #47894

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Aug 1, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.5.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -770,6 +770,7 @@ Other Deprecations
- Deprecated the argument ``na_sentinel`` in :func:`factorize`, :meth:`Index.factorize`, and :meth:`.ExtensionArray.factorize`; pass ``use_na_sentinel=True`` instead to use the sentinel ``-1`` for NaN values and ``use_na_sentinel=False`` instead of ``na_sentinel=None`` to encode NaN values (:issue:`46910`)
- Deprecated :meth:`DataFrameGroupBy.transform` not aligning the result when the UDF returned DataFrame (:issue:`45648`)
- Clarified warning from :func:`to_datetime` when delimited dates can't be parsed in accordance with the specified ``dayfirst`` argument (:issue:`46210`)
- Emit warning from :func:`to_datetime` when delimited dates can't be parsed in accordance with the specified ``dayfirst`` argument, even for dates where the leading zero is omitted (e.g. ``31/1/2001``) (:issue:`47880`)
- Deprecated :class:`Series` and :class:`Resampler` reducers (e.g. ``min``, ``max``, ``sum``, ``mean``) raising a ``NotImplementedError`` when the dtype is non-numeric and ``numeric_only=True`` is provided; this will raise a ``TypeError`` in a future version (:issue:`47500`)
- Deprecated :meth:`Series.rank` returning an empty result when the dtype is non-numeric and ``numeric_only=True`` is provided; this will raise a ``TypeError`` in a future version (:issue:`47500`)
- Deprecated argument ``errors`` for :meth:`Series.mask`, :meth:`Series.where`, :meth:`DataFrame.mask`, and :meth:`DataFrame.where` as ``errors`` had no effect on these methods (:issue:`47728`)
Expand Down
39 changes: 32 additions & 7 deletions pandas/_libs/tslibs/parsing.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,14 @@ cdef:
int MAX_DAYS_IN_MONTH = 31, MAX_MONTH = 12


# Return true when `ch` is NOT found by strchr() in `delimiters`
# (`delimiters` is declared outside this excerpt — presumably the set of
# accepted date separator characters; confirm against its definition).
cdef inline bint _is_not_delimiter(const char ch):
return strchr(delimiters, ch) == NULL
# Return true when strchr() finds `ch` in `delimiters`
# (`delimiters` is declared outside this excerpt — presumably the set of
# accepted date separator characters; confirm against its definition).
cdef inline bint _is_delimiter(const char ch):
return strchr(delimiters, ch) != NULL


cdef inline int _parse_1digit(const char* s):
    # Convert the single character at s[0] to its integer value.
    # getdigit_ascii is defined outside this excerpt; presumably it hands
    # back the fallback (-10) when s[0] is not an ASCII digit — confirm
    # against its definition.
    cdef int digit = getdigit_ascii(s[0], -10)
    return digit


cdef inline int _parse_2digit(const char* s):
Expand Down Expand Up @@ -151,18 +157,37 @@ cdef inline object _parse_delimited_date(str date_string, bint dayfirst):
bint can_swap = 0

buf = get_c_string_buf_and_size(date_string, &length)
if length == 10:
if length == 10 and _is_delimiter(buf[2]) and _is_delimiter(buf[5]):
# parsing MM?DD?YYYY and DD?MM?YYYY dates
if _is_not_delimiter(buf[2]) or _is_not_delimiter(buf[5]):
return None, None
month = _parse_2digit(buf)
day = _parse_2digit(buf + 3)
year = _parse_4digit(buf + 6)
reso = 'day'
can_swap = 1
elif length == 7:
elif length == 9 and _is_delimiter(buf[1]) and _is_delimiter(buf[4]):
# parsing M?DD?YYYY and D?MM?YYYY dates
month = _parse_1digit(buf)
day = _parse_2digit(buf + 2)
year = _parse_4digit(buf + 5)
reso = 'day'
can_swap = 1
elif length == 9 and _is_delimiter(buf[2]) and _is_delimiter(buf[4]):
# parsing MM?D?YYYY and DD?M?YYYY dates
month = _parse_2digit(buf)
day = _parse_1digit(buf + 3)
year = _parse_4digit(buf + 5)
reso = 'day'
can_swap = 1
elif length == 8 and _is_delimiter(buf[1]) and _is_delimiter(buf[3]):
# parsing M?D?YYYY and D?M?YYYY dates
month = _parse_1digit(buf)
day = _parse_1digit(buf + 2)
year = _parse_4digit(buf + 4)
reso = 'day'
can_swap = 1
elif length == 7 and _is_delimiter(buf[2]):
# parsing MM?YYYY dates
if buf[2] == b'.' or _is_not_delimiter(buf[2]):
if buf[2] == b'.':
# we cannot reliably tell whether e.g. 10.2010 is a float
# or a date, thus we refuse to parse it here
return None, None
Expand Down
33 changes: 33 additions & 0 deletions pandas/tests/io/parser/test_parse_dates.py
Original file line number Diff line number Diff line change
Expand Up @@ -1948,6 +1948,39 @@ def test_dayfirst_warnings():
tm.assert_index_equal(expected, res8)


@pytest.mark.parametrize(
    "date_string, dayfirst",
    [
        pytest.param("31/1/2014", False, id="second date is single-digit"),
        pytest.param("1/31/2014", True, id="first date is single-digit"),
    ],
)
def test_dayfirst_warnings_no_leading_zero(date_string, dayfirst):
    # GH47880
    # Each case pairs a date lacking a leading zero with a ``dayfirst``
    # value it cannot satisfy (31 is not a valid month), so parsing is
    # expected to warn and still yield 2014-01-31.
    csv_text = f"date\n{date_string}"
    expected = DatetimeIndex(
        ["2014-01-31"], dtype="datetime64[ns]", freq=None, name="date"
    )

    with tm.assert_produces_warning(
        UserWarning, match=r"may lead to inconsistently parsed dates"
    ):
        parsed = read_csv(
            StringIO(csv_text),
            parse_dates=["date"],
            index_col="date",
            dayfirst=dayfirst,
        )
        res = parsed.index

    tm.assert_index_equal(expected, res)


@skip_pyarrow
def test_infer_first_column_as_index(all_parsers):
# GH#11019
Expand Down