From a94fd287f35d0a50a5e0c1455a7bff276eb35edf Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Wed, 11 Aug 2021 14:41:26 +0530 Subject: [PATCH 1/6] BUG: pd.to_datetime with format doesn't work with pd.NA --- pandas/core/tools/datetimes.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 8eac5f76fd455..c8ef419aeac26 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -56,7 +56,10 @@ ABCDataFrame, ABCSeries, ) -from pandas.core.dtypes.missing import notna +from pandas.core.dtypes.missing import ( + isna, + notna, +) from pandas.arrays import ( DatetimeArray, @@ -390,11 +393,17 @@ def _convert_listlike_datetimes( format = None if format is not None: - res = _to_datetime_with_format( - arg, orig_arg, name, tz, format, exact, errors, infer_datetime_format - ) - if res is not None: - return res + try: + res = _to_datetime_with_format( + arg, orig_arg, name, tz, format, exact, errors, infer_datetime_format + ) + if res is not None: + return res + except ValueError: + # GH#42957: ValueError: time data '' + # does not match format '%Y%m%d%H%M%S' + if isna(arg): + format = None assert format is None or infer_datetime_format utc = tz == "utc" From f65f0defb8e565ac7e390abdb3578e11c38cf09d Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Wed, 11 Aug 2021 17:25:49 +0530 Subject: [PATCH 2/6] raised if no pd.NA --- pandas/core/tools/datetimes.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index c8ef419aeac26..e2ae61859bd5d 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -399,11 +399,13 @@ def _convert_listlike_datetimes( ) if res is not None: return res - except ValueError: + except ValueError as err: # GH#42957: ValueError: time data '' # does not match format '%Y%m%d%H%M%S' - if isna(arg): + if any(isna(arg)): format = None + else: + raise err assert format is None or infer_datetime_format utc = tz == "utc" From a1736a9f548748ce3e7b96793e31425f2e3d4c5c Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Wed, 11 Aug 2021 19:02:20 +0530 Subject: [PATCH 3/6] included pd.na check --- pandas/_libs/tslibs/strptime.pyx | 4 ++-- pandas/core/tools/datetimes.py | 23 ++++++----------------- 2 files changed, 8 insertions(+), 19 deletions(-) diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index e7fb38db2aa17..d214694fb659d 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -20,10 +20,10 @@ from numpy cimport ( ndarray, ) +from pandas._libs.missing cimport checknull_with_nat_and_na from pandas._libs.tslibs.nattype cimport ( NPY_NAT, c_nat_strings as nat_strings, - checknull_with_nat, ) from pandas._libs.tslibs.np_datetime cimport ( check_dts_bounds, @@ -134,7 +134,7 @@ def array_strptime(ndarray[object] values, object fmt, bint exact=True, errors=' iresult[i] = NPY_NAT continue else: - if checknull_with_nat(val): + if checknull_with_nat_and_na(val): iresult[i] = NPY_NAT continue else: diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index e2ae61859bd5d..8eac5f76fd455 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -56,10 +56,7 @@ ABCDataFrame, ABCSeries, ) -from pandas.core.dtypes.missing import ( - isna, - notna, -) +from pandas.core.dtypes.missing import notna from pandas.arrays import ( DatetimeArray, @@ -393,19 +390,11 @@ def _convert_listlike_datetimes( format = None if format is not None: - try: - res = _to_datetime_with_format( - arg, orig_arg, name, tz, format, exact, errors, infer_datetime_format - ) - if res is not None: - return res - except ValueError as err: - # GH#42957: ValueError: time data '' - # does not match format '%Y%m%d%H%M%S' - if any(isna(arg)): - format = None - else: - raise err + res = _to_datetime_with_format( + arg, orig_arg, name, tz, format, exact, errors, infer_datetime_format + ) + if res is not None: + return res assert format is None or infer_datetime_format utc = tz == "utc" From a0f9e74171596547441f9c295b995834ba0fa2f0 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Wed, 11 Aug 2021 21:23:18 +0530 Subject: [PATCH 4/6] added whatsnew and tests --- doc/source/whatsnew/v1.4.0.rst | 2 +- pandas/tests/tools/test_to_datetime.py | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 7c7763f82e5c7..ef5991ea96fb0 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -198,7 +198,7 @@ Categorical Datetimelike ^^^^^^^^^^^^ - Bug in :class:`DataFrame` constructor unnecessarily copying non-datetimelike 2D object arrays (:issue:`39272`) -- +- Bug in :func:`to_datetime` with ``format`` and ``pandas.NA`` was raising ``ValueError`` (:issue:`42957`) Timedelta ^^^^^^^^^ diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 7351f50aea8c1..fac535a912752 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -383,6 +383,24 @@ def test_to_datetime_parse_timezone_keeps_name(self): expected = DatetimeIndex(["2010-01-01 12:00:00"], tz="UTC", name="foo") tm.assert_index_equal(result, expected) + @pytest.mark.parametrize( + "data, format, expected", + [ + ([pd.NA], "%Y%m%d%H%M%S", DatetimeIndex(["NaT"])), + ( + [pd.NA, "20210202202020"], + "%Y%m%d%H%M%S", + DatetimeIndex(["NaT", "2021-02-02 20:20:20"]), + ), + (["201010", pd.NA], "%y%m%d", DatetimeIndex(["2020-10-10", "NaT"])), + (["201010", pd.NA], "%d%m%y", DatetimeIndex(["2010-10-20", "NaT"])), + ], + ) + def test_to_datetime_with_pdNA(self, data, format, expected): + # GH#42957 + result = to_datetime(data, format=format) + tm.assert_index_equal(result, expected) + class TestToDatetime: @pytest.mark.parametrize( From cf4119dbc0ea110f2e71c031143198b06be7a1f5 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Wed, 11 Aug 2021 22:06:50 +0530 Subject: [PATCH 5/6] added couple more test cases --- pandas/tests/tools/test_to_datetime.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index fac535a912752..1ec710d9725aa 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -387,6 +387,7 @@ def test_to_datetime_parse_timezone_keeps_name(self): "data, format, expected", [ ([pd.NA], "%Y%m%d%H%M%S", DatetimeIndex(["NaT"])), + ([pd.NA], None, DatetimeIndex(["NaT"])), ( [pd.NA, "20210202202020"], "%Y%m%d%H%M%S", @@ -394,6 +395,7 @@ def test_to_datetime_parse_timezone_keeps_name(self): ), (["201010", pd.NA], "%y%m%d", DatetimeIndex(["2020-10-10", "NaT"])), (["201010", pd.NA], "%d%m%y", DatetimeIndex(["2010-10-20", "NaT"])), + (["201010", pd.NA], None, DatetimeIndex(["2010-10-20", "NaT"])), ], ) def test_to_datetime_with_pdNA(self, data, format, expected): From 74a8e95bb6ad01ebc83859dadb499f92bbf8386b Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Thu, 12 Aug 2021 00:13:15 +0530 Subject: [PATCH 6/6] added suggested testcases --- pandas/tests/tools/test_to_datetime.py | 42 ++++++++++++++------------ 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 1ec710d9725aa..e3e20c334ea63 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -177,6 +177,28 @@ def test_to_datetime_format_YYYYMMDD_overflow(self, input_s, expected): result = to_datetime(input_s, format="%Y%m%d", errors="coerce") tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( + "data, format, expected", + [ + ([pd.NA], "%Y%m%d%H%M%S", DatetimeIndex(["NaT"])), + ([pd.NA], None, DatetimeIndex(["NaT"])), + ( + [pd.NA, "20210202202020"], + "%Y%m%d%H%M%S", + DatetimeIndex(["NaT", "2021-02-02 20:20:20"]), + ), + (["201010", pd.NA], "%y%m%d", DatetimeIndex(["2020-10-10", "NaT"])), + (["201010", pd.NA], "%d%m%y", DatetimeIndex(["2010-10-20", "NaT"])), + (["201010", pd.NA], None, DatetimeIndex(["2010-10-20", "NaT"])), + ([None, np.nan, pd.NA], None, DatetimeIndex(["NaT", "NaT", "NaT"])), + ([None, np.nan, pd.NA], "%Y%m%d", DatetimeIndex(["NaT", "NaT", "NaT"])), + ], + ) + def test_to_datetime_with_NA(self, data, format, expected): + # GH#42957 + result = to_datetime(data, format=format) + tm.assert_index_equal(result, expected) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_format_integer(self, cache): # GH 10178 @@ -383,26 +405,6 @@ def test_to_datetime_parse_timezone_keeps_name(self): expected = DatetimeIndex(["2010-01-01 12:00:00"], tz="UTC", name="foo") tm.assert_index_equal(result, expected) - @pytest.mark.parametrize( - "data, format, expected", - [ - ([pd.NA], "%Y%m%d%H%M%S", DatetimeIndex(["NaT"])), - ([pd.NA], None, DatetimeIndex(["NaT"])), - ( - [pd.NA, "20210202202020"], - "%Y%m%d%H%M%S", - DatetimeIndex(["NaT", "2021-02-02 20:20:20"]), - ), - (["201010", pd.NA], "%y%m%d", DatetimeIndex(["2020-10-10", "NaT"])), - (["201010", pd.NA], "%d%m%y", DatetimeIndex(["2010-10-20", "NaT"])), - (["201010", pd.NA], None, DatetimeIndex(["2010-10-20", "NaT"])), - ], - ) - def test_to_datetime_with_pdNA(self, data, format, expected): - # GH#42957 - result = to_datetime(data, format=format) - tm.assert_index_equal(result, expected) - class TestToDatetime: @pytest.mark.parametrize(