From 7c8ec89f6b67875f0240e080dfec8fd10bb80a94 Mon Sep 17 00:00:00 2001 From: Antonio Ossa Guerra Date: Wed, 26 Oct 2022 13:16:33 -0300 Subject: [PATCH 01/15] Parse `datetime` properly in `pd.to_datetime` When applying `pd.to_datetime` on array-like structure that contain a `datetime.datetime` object, while using the `format` argument, a `ValueError` is raised because the `datetime.datetime` object does not match the expected format. The implemented solution looks for `datetime.datetime` instances in the `array_strptime` method. If an instance of this type is found, it's properly handled by the new `_parse_python_datetime_object`, which returns the expected Numpy datetime object. Signed-off-by: Antonio Ossa Guerra --- pandas/_libs/tslibs/strptime.pyx | 47 ++++++++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index 6287c2fbc5d34..1d6e059f56a2f 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -2,6 +2,7 @@ """ from cpython.datetime cimport ( date, + datetime, tzinfo, ) @@ -129,12 +130,22 @@ def array_strptime(ndarray[object] values, str fmt, bint exact=True, errors='rai if val in nat_strings: iresult[i] = NPY_NAT continue + elif checknull_with_nat_and_na(val): + iresult[i] = NPY_NAT + continue + elif isinstance(val, datetime): + iresult[i] = _parse_python_datetime_object(val, &dts) + try: + check_dts_bounds(&dts) + except ValueError: + if is_coerce: + iresult[i] = NPY_NAT + continue + raise + result_timezone[i] = val.tzname() + continue else: - if checknull_with_nat_and_na(val): - iresult[i] = NPY_NAT - continue - else: - val = str(val) + val = str(val) # exact matching if exact: @@ -532,3 +543,29 @@ cdef tzinfo parse_timezone_directive(str z): (microseconds // 60_000_000)) total_minutes = -total_minutes if z.startswith("-") else total_minutes return pytz.FixedOffset(total_minutes) + +cdef int64_t 
_parse_python_datetime_object(datetime dt, npy_datetimestruct *dts): + """ + Parse a native datetime.datetime object and return a numpy datetime object + + Parameters + ---------- + dt : datetime.datetime instance + dts: numpy datetime struct + + Returns + ------- + int64_t + the numpy datetime object + """ + dts.year = dt.year + dts.month = dt.month + dts.day = dt.day + dts.hour = dt.hour + dts.min = dt.minute + dts.sec = dt.second + dts.us = dt.microsecond + dts.ps = 0 # Not enough precision in datetime objects (https://github.com/python/cpython/issues/59648) + + npy_datetime = npy_datetimestruct_to_datetime(NPY_FR_ns, dts) + return npy_datetime From f275564fb05be03280ddbaac14538def76d393b1 Mon Sep 17 00:00:00 2001 From: Antonio Ossa Guerra Date: Wed, 26 Oct 2022 13:29:50 -0300 Subject: [PATCH 02/15] Test `pd.to_datetime` in presence of `datetime` Two tests are introduced to check that `datetime.datetime` objects are skipped while parsing array-like structures to `pd.to_datetime` and the `format` argument is being used. Before the fix, all of these tests and their test cases failed, except `test_to_datetime_arraylike_contains_pydatetime_utc` when cache was `True` and the `init_constructor` was `Index`. The test creates an input of data that must be formatted and a `datetime.datetime` object. Then, the values are passed to the array-like `init_constructor` to be converted to datetime using the `pd.to_datetime` method. The test must not fail, and the result must match the expected data.
Signed-off-by: Antonio Ossa Guerra --- pandas/tests/tools/test_to_datetime.py | 50 ++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index f524bc18793d8..f6543bd5c42ee 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -609,6 +609,56 @@ def test_to_datetime_dtarr(self, tz): result = to_datetime(arr) assert result is arr + @pytest.mark.parametrize( + "init_constructor, end_constructor", + [ + (Index, DatetimeIndex), + (list, DatetimeIndex), + (np.array, DatetimeIndex), + (Series, Series), + ], + ) + def test_to_datetime_arraylike_contains_pydatetime( + self, init_constructor, end_constructor + ): + # GH 49298 + data = ["01/02/01 12:00", datetime(2001, 2, 2, 12, 30)] + expected_data = [ + Timestamp("2001-02-01 12:00:00"), + Timestamp("2001-02-02 12:30:00"), + ] + result = to_datetime(init_constructor(data), format="%d/%m/%y %H:%M") + expected = end_constructor(expected_data) + tm.assert_equal(result, expected) + + @pytest.mark.parametrize( + "init_constructor, end_constructor", + [ + (Index, DatetimeIndex), + (list, DatetimeIndex), + (np.array, DatetimeIndex), + (Series, Series), + ], + ) + def test_to_datetime_arraylike_contains_pydatetime_utc( + self, cache, init_constructor, end_constructor + ): + # GH 49298 + dt = datetime(2010, 1, 2, 12, 13, 16) + dt = dt.replace(tzinfo=timezone.utc) + data = ["20100102 121314", "20100102 121315", dt] + expected_data = [ + Timestamp("2010-01-02 12:13:14", tz="utc"), + Timestamp("2010-01-02 12:13:15", tz="utc"), + Timestamp("2010-01-02 12:13:16", tz="utc"), + ] + + result = to_datetime( + init_constructor(data), format="%Y%m%d %H%M%S", utc=True, cache=cache + ) + expected = end_constructor(expected_data) + tm.assert_equal(result, expected) + def test_to_datetime_pydatetime(self): actual = to_datetime(datetime(2008, 1, 15)) assert actual == datetime(2008, 1, 15) From 
86df782c31f7ad56e3ae1a6dd75ab60b47da65c0 Mon Sep 17 00:00:00 2001 From: Antonio Ossa Guerra Date: Wed, 26 Oct 2022 13:52:54 -0300 Subject: [PATCH 03/15] Add whatsnew on `to_datetime` handling `datetime` The introduced enhancement of handling `datetime.datetime` objects in the input of `pd.to_datetime` when using the `format` argument is now included in the whatsnew Signed-off-by: Antonio Ossa Guerra --- doc/source/whatsnew/v2.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 4c85b3d5dc745..8b839bfc807f2 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -49,6 +49,7 @@ Other enhancements - Fix ``test`` optional_extra by adding missing test package ``pytest-asyncio`` (:issue:`48361`) - :func:`DataFrame.astype` exception message thrown improved to include column name when type conversion is not possible. (:issue:`47571`) - :meth:`DataFrame.to_json` now supports a ``mode`` keyword with supported inputs 'w' and 'a'. Defaulting to 'w', 'a' can be used when lines=True and orient='records' to append record oriented json lines to an existing json file. (:issue:`35849`) +- :func:`to_datetime` now correctly parses ``datetime.datetime`` objects in the input when using the ``format`` argument instead of raising a ``ValueError``. (:issue:`49298`) .. --------------------------------------------------------------------------- .. _whatsnew_200.notable_bug_fixes: From bd41e79da95486c1c0c4e8079b7fae135bf34f4e Mon Sep 17 00:00:00 2001 From: Antonio Ossa Guerra Date: Fri, 28 Oct 2022 14:31:01 -0300 Subject: [PATCH 04/15] Apply `fmt` on `Timestamp` and `datetime` objects Instead of manually handling the `datetime.datetime` and `Timestamp` objects by skipping formatting and leaving them untouched, apply the passed format (`fmt`). 
This allows us to return a consistent output while reusing existing parsing code Signed-off-by: Antonio Ossa Guerra --- pandas/_libs/tslibs/strptime.pyx | 37 +------------------------------------- 1 file changed, 1 insertion(+), 36 deletions(-) diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index 1d6e059f56a2f..5ec8494f9f354 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -134,16 +134,7 @@ def array_strptime(ndarray[object] values, str fmt, bint exact=True, errors='rai iresult[i] = NPY_NAT continue elif isinstance(val, datetime): - iresult[i] = _parse_python_datetime_object(val, &dts) - try: - check_dts_bounds(&dts) - except ValueError: - if is_coerce: - iresult[i] = NPY_NAT - continue - raise - result_timezone[i] = val.tzname() - continue + val = val.strftime(fmt) else: val = str(val) @@ -543,29 +534,3 @@ cdef tzinfo parse_timezone_directive(str z): (microseconds // 60_000_000)) total_minutes = -total_minutes if z.startswith("-") else total_minutes return pytz.FixedOffset(total_minutes) - -cdef int64_t _parse_python_datetime_object(datetime dt, npy_datetimestruct *dts): - """ - Parse a native datetime.datetime object and return a numpy datetime object - - Parameters - ---------- - dt : datetime.datetime instance - dts: numpy datetime struct - - Returns - ------- - int64_t - the numpy datetime object - """ - dts.year = dt.year - dts.month = dt.month - dts.day = dt.day - dts.hour = dt.hour - dts.min = dt.minute - dts.sec = dt.second - dts.us = dt.microsecond - dts.ps = 0 # Not enough precision in datetime objects (https://github.com/python/cpython/issues/59648) - - npy_datetime = npy_datetimestruct_to_datetime(NPY_FR_ns, dts) - return npy_datetime From a5cb6fe55392b5f543db618ea566ef5db0803a56 Mon Sep 17 00:00:00 2001 From: Antonio Ossa Guerra Date: Fri, 28 Oct 2022 14:35:14 -0300 Subject: [PATCH 05/15] Update tests to reflect new expected behavior The new expected behavior is to apply the format (`fmt`)
on `datetime.datetime` and `Timestamp` objects to create a consistent output. This test includes some unexpected cases, like the limitation of resolution on `datetime.datetime` objects (Python does not support nanosecond resolution [1]) and also when formatting `Timestamp` objects (same limitation [2]) [1]: https://github.com/python/cpython/issues/59648 [2]: https://github.com/pandas-dev/pandas/issues/29461 Signed-off-by: Antonio Ossa Guerra --- pandas/tests/tools/test_to_datetime.py | 85 +++++++++++++++++++++++--- 1 file changed, 75 insertions(+), 10 deletions(-) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index f6543bd5c42ee..fce21da316308 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -609,6 +609,29 @@ def test_to_datetime_dtarr(self, tz): result = to_datetime(arr) assert result is arr + @pytest.mark.parametrize( + "data, expected", + [ + ( + Timestamp("2001-10-01 12:00:01.123456789"), + Timestamp("2001-10-01 12:00:01.123456"), + ), + ( + datetime(2002, 10, 1, 12, 00, 1, 123456), + Timestamp("2002-10-01 12:00:01.123456"), + ), + ("10/01/03 12:00:01.123456789", Timestamp("2003-10-01 12:00:01.123456789")), + ], + ) + def test_to_datetime_preserves_resolution_when_possible(self, data, expected): + # GH 49298 + if not isinstance(data, str): + result = to_datetime([data]) + tm.assert_equal(result, DatetimeIndex([data])) + + result = to_datetime([data], format="%m/%d/%y %H:%M:%S.%f") + tm.assert_equal(result, DatetimeIndex([expected])) + @pytest.mark.parametrize( "init_constructor, end_constructor", [ @@ -618,16 +641,52 @@ def test_to_datetime_dtarr(self, tz): (Series, Series), ], ) - def test_to_datetime_arraylike_contains_pydatetime( + def test_to_datetime_arraylike_contains_pydatetime_and_timestamp( self, init_constructor, end_constructor ): # GH 49298 - data = ["01/02/01 12:00", datetime(2001, 2, 2, 12, 30)] + # Timestamp/datetime have more resolution than str + 
case1 = [ + Timestamp("2001-10-01 12:00:01.123456789"), + datetime(2001, 10, 2, 12, 30, 1, 123456), + "10/03/01", + ] + result = to_datetime(init_constructor(case1), format="%m/%d/%y") + expected_data = [ + Timestamp("2001-10-01"), + Timestamp("2001-10-02"), + Timestamp("2001-10-03"), + ] + expected = end_constructor(expected_data) + tm.assert_equal(result, expected) + + # Timestamp/datetime have the same resolution than str (nanosecond) + case2 = [ + Timestamp("2001-10-01 12:00:01.123456789"), + datetime(2001, 10, 2, 12, 30, 1, 123456), + "10/03/01 13:00:01.123456789", + ] + result = to_datetime(init_constructor(case2), format="%m/%d/%y %H:%M:%S.%f") expected_data = [ - Timestamp("2001-02-01 12:00:00"), - Timestamp("2001-02-02 12:30:00"), + Timestamp("2001-10-01 12:00:01.123456"), + Timestamp("2001-10-02 12:30:01.123456"), + Timestamp("2001-10-03 13:00:01.123456789"), + ] + expected = end_constructor(expected_data) + tm.assert_equal(result, expected) + + # Timestamp/datetime have less resolution than str + case3 = [ + Timestamp("2001-10-01"), + datetime(2001, 10, 2), + "10/03/01 12:00:01", + ] + result = to_datetime(init_constructor(case3), format="%m/%d/%y %H:%M:%S") + expected_data = [ + Timestamp("2001-10-01 00:00:00"), + Timestamp("2001-10-02 00:00:00"), + Timestamp("2001-10-03 12:00:01"), ] - result = to_datetime(init_constructor(data), format="%d/%m/%y %H:%M") expected = end_constructor(expected_data) tm.assert_equal(result, expected) @@ -640,22 +699,28 @@ def test_to_datetime_arraylike_contains_pydatetime( (Series, Series), ], ) - def test_to_datetime_arraylike_contains_pydatetime_utc( + def test_to_datetime_arraylike_contains_pydatetime_and_timestamp_utc( self, cache, init_constructor, end_constructor ): # GH 49298 dt = datetime(2010, 1, 2, 12, 13, 16) dt = dt.replace(tzinfo=timezone.utc) - data = ["20100102 121314", "20100102 121315", dt] + data = [ + "20100102 121314", + Timestamp("2010-01-02 12:13:15", tz="utc"), + dt, + ] expected_data = [ 
Timestamp("2010-01-02 12:13:14", tz="utc"), Timestamp("2010-01-02 12:13:15", tz="utc"), Timestamp("2010-01-02 12:13:16", tz="utc"), ] - result = to_datetime( - init_constructor(data), format="%Y%m%d %H%M%S", utc=True, cache=cache - ) + if init_constructor is Series: + input_data = init_constructor(data, dtype="datetime64[ns, UTC]") + else: + input_data = init_constructor(data) + result = to_datetime(input_data, format="%Y%m%d %H%M%S", utc=True, cache=cache) expected = end_constructor(expected_data) tm.assert_equal(result, expected) From d00c8041c9ed102c9d97ca01de8637f6759bf2a2 Mon Sep 17 00:00:00 2001 From: Antonio Ossa Guerra Date: Fri, 28 Oct 2022 14:40:23 -0300 Subject: [PATCH 06/15] Update whatsnew to describe new expected behavior As described in the previous commit, the new expected behavior when handling `datetime.datetime` and `Timestamp` objects changed, by applying the passed format (`fmt`) instead of raising `ValueError` Signed-off-by: Antonio Ossa Guerra --- doc/source/whatsnew/v2.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 8b839bfc807f2..4e9a45b513e4d 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -49,7 +49,7 @@ Other enhancements - Fix ``test`` optional_extra by adding missing test package ``pytest-asyncio`` (:issue:`48361`) - :func:`DataFrame.astype` exception message thrown improved to include column name when type conversion is not possible. (:issue:`47571`) - :meth:`DataFrame.to_json` now supports a ``mode`` keyword with supported inputs 'w' and 'a'. Defaulting to 'w', 'a' can be used when lines=True and orient='records' to append record oriented json lines to an existing json file. (:issue:`35849`) -- :func:`to_datetime` now correctly parses ``datetime.datetime`` objects in the input when using the ``format`` argument instead of raising a ``ValueError``. 
(:issue:`49298`) +- :func:`to_datetime` now handles ``datetime.datetime`` and :class:`Timestamp` and applies the ``format`` argument on them instead of raising a ``ValueError``. (:issue:`49298`) .. --------------------------------------------------------------------------- .. _whatsnew_200.notable_bug_fixes: From 6cc376867481e7d598ade9bffe4f28798aab8594 Mon Sep 17 00:00:00 2001 From: Antonio Ossa Guerra Date: Wed, 2 Nov 2022 09:20:31 -0300 Subject: [PATCH 07/15] Use `PyDateTime_Check` instead of `isinstance` The `PyDateTime_Check` function is used in other parts of the source to properly check if an object is an instance of type `PyDateTime_DateTimeType` or a subtype of the same type. Signed-off-by: Antonio Ossa Guerra --- pandas/_libs/tslibs/strptime.pyx | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index 5ec8494f9f354..2d2a57bc4b743 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -1,11 +1,14 @@ """Strptime-related classes and functions.
""" from cpython.datetime cimport ( + PyDateTime_Check, date, - datetime, + import_datetime, tzinfo, ) +import_datetime() + from _thread import allocate_lock as _thread_allocate_lock import numpy as np @@ -133,7 +136,7 @@ def array_strptime(ndarray[object] values, str fmt, bint exact=True, errors='rai elif checknull_with_nat_and_na(val): iresult[i] = NPY_NAT continue - elif isinstance(val, datetime): + elif PyDateTime_Check(val): val = val.strftime(fmt) else: val = str(val) From 859969675458ea917721b66510515123b6e9102d Mon Sep 17 00:00:00 2001 From: Antonio Ossa Guerra Date: Wed, 2 Nov 2022 09:28:07 -0300 Subject: [PATCH 08/15] Simplify tests by removing unnecesary parameters Parametrized constructors add no information that is relevant in the modified test, so they can be safely removed Signed-off-by: Antonio Ossa Guerra --- pandas/tests/tools/test_to_datetime.py | 57 ++++++-------------------- 1 file changed, 12 insertions(+), 45 deletions(-) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index fce21da316308..c393894441916 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -625,25 +625,12 @@ def test_to_datetime_dtarr(self, tz): ) def test_to_datetime_preserves_resolution_when_possible(self, data, expected): # GH 49298 - if not isinstance(data, str): - result = to_datetime([data]) - tm.assert_equal(result, DatetimeIndex([data])) - + result = to_datetime([data]) + tm.assert_equal(result, DatetimeIndex([data])) result = to_datetime([data], format="%m/%d/%y %H:%M:%S.%f") tm.assert_equal(result, DatetimeIndex([expected])) - @pytest.mark.parametrize( - "init_constructor, end_constructor", - [ - (Index, DatetimeIndex), - (list, DatetimeIndex), - (np.array, DatetimeIndex), - (Series, Series), - ], - ) - def test_to_datetime_arraylike_contains_pydatetime_and_timestamp( - self, init_constructor, end_constructor - ): + def 
test_to_datetime_arraylike_contains_pydatetime_and_timestamp(self): # GH 49298 # Timestamp/datetime have more resolution than str case1 = [ @@ -651,14 +638,13 @@ def test_to_datetime_arraylike_contains_pydatetime_and_timestamp( datetime(2001, 10, 2, 12, 30, 1, 123456), "10/03/01", ] - result = to_datetime(init_constructor(case1), format="%m/%d/%y") + result = to_datetime(case1, format="%m/%d/%y") expected_data = [ Timestamp("2001-10-01"), Timestamp("2001-10-02"), Timestamp("2001-10-03"), ] - expected = end_constructor(expected_data) - tm.assert_equal(result, expected) + tm.assert_equal(result, DatetimeIndex(expected_data)) # Timestamp/datetime have the same resolution than str (nanosecond) case2 = [ @@ -666,14 +652,13 @@ def test_to_datetime_arraylike_contains_pydatetime_and_timestamp( datetime(2001, 10, 2, 12, 30, 1, 123456), "10/03/01 13:00:01.123456789", ] - result = to_datetime(init_constructor(case2), format="%m/%d/%y %H:%M:%S.%f") + result = to_datetime(case2, format="%m/%d/%y %H:%M:%S.%f") expected_data = [ Timestamp("2001-10-01 12:00:01.123456"), Timestamp("2001-10-02 12:30:01.123456"), Timestamp("2001-10-03 13:00:01.123456789"), ] - expected = end_constructor(expected_data) - tm.assert_equal(result, expected) + tm.assert_equal(result, DatetimeIndex(expected_data)) # Timestamp/datetime have less resolution than str case3 = [ @@ -681,27 +666,15 @@ def test_to_datetime_arraylike_contains_pydatetime_and_timestamp( datetime(2001, 10, 2), "10/03/01 12:00:01", ] - result = to_datetime(init_constructor(case3), format="%m/%d/%y %H:%M:%S") + result = to_datetime(case3, format="%m/%d/%y %H:%M:%S") expected_data = [ Timestamp("2001-10-01 00:00:00"), Timestamp("2001-10-02 00:00:00"), Timestamp("2001-10-03 12:00:01"), ] - expected = end_constructor(expected_data) - tm.assert_equal(result, expected) + tm.assert_equal(result, DatetimeIndex(expected_data)) - @pytest.mark.parametrize( - "init_constructor, end_constructor", - [ - (Index, DatetimeIndex), - (list, 
DatetimeIndex), - (np.array, DatetimeIndex), - (Series, Series), - ], - ) - def test_to_datetime_arraylike_contains_pydatetime_and_timestamp_utc( - self, cache, init_constructor, end_constructor - ): + def test_to_datetime_arraylike_contains_pydatetime_and_timestamp_utc(self): # GH 49298 dt = datetime(2010, 1, 2, 12, 13, 16) dt = dt.replace(tzinfo=timezone.utc) @@ -715,14 +688,8 @@ def test_to_datetime_arraylike_contains_pydatetime_and_timestamp_utc( Timestamp("2010-01-02 12:13:15", tz="utc"), Timestamp("2010-01-02 12:13:16", tz="utc"), ] - - if init_constructor is Series: - input_data = init_constructor(data, dtype="datetime64[ns, UTC]") - else: - input_data = init_constructor(data) - result = to_datetime(input_data, format="%Y%m%d %H%M%S", utc=True, cache=cache) - expected = end_constructor(expected_data) - tm.assert_equal(result, expected) + result = to_datetime(data, format="%Y%m%d %H%M%S", utc=True) + tm.assert_equal(result, DatetimeIndex(expected_data)) def test_to_datetime_pydatetime(self): actual = to_datetime(datetime(2008, 1, 15)) From 7cd241b5e1d978c770df51ac8dd470615ab9e675 Mon Sep 17 00:00:00 2001 From: Antonio Ossa Guerra Date: Wed, 2 Nov 2022 10:49:00 -0300 Subject: [PATCH 09/15] Add ISO8601 test case When using ISO8601 the format is inferred, so no format has to be explicitly defined. 
This test case includes input strings in ISO8601 to assert the behavior when the input includes `Timestamp` and `datetime.datetime` objects Signed-off-by: Antonio Ossa Guerra --- pandas/tests/tools/test_to_datetime.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index c393894441916..e9b393d7ae9f3 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -674,6 +674,22 @@ def test_to_datetime_arraylike_contains_pydatetime_and_timestamp(self): ] tm.assert_equal(result, DatetimeIndex(expected_data)) + # Test ISO8601 format + case4 = [ + Timestamp("2001-10-01 13:18:05"), + datetime(2001, 10, 2, 13, 18, 5), + "2001-10-03T13:18:05", + "20011004", + ] + result = to_datetime(case4) + expected_data = [ + Timestamp("2001-10-01 13:18:05"), + Timestamp("2001-10-02 13:18:05"), + Timestamp("2001-10-03 13:18:05"), + Timestamp("2001-10-04 00:00:00"), + ] + tm.assert_equal(result, DatetimeIndex(expected_data)) + def test_to_datetime_arraylike_contains_pydatetime_and_timestamp_utc(self): # GH 49298 dt = datetime(2010, 1, 2, 12, 13, 16) From 3464068b1c14d68d0f03f126b5426b20dc7e473f Mon Sep 17 00:00:00 2001 From: Antonio Ossa Guerra Date: Wed, 2 Nov 2022 13:07:20 -0300 Subject: [PATCH 10/15] Update whatsnew to explain expected result Improve description by explaining the change of behavior in the result Signed-off-by: Antonio Ossa Guerra --- doc/source/whatsnew/v2.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 4e9a45b513e4d..cb3da08af22ca 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -49,7 +49,7 @@ Other enhancements - Fix ``test`` optional_extra by adding missing test package ``pytest-asyncio`` (:issue:`48361`) - :func:`DataFrame.astype` exception message thrown improved to include column name when type 
conversion is not possible. (:issue:`47571`) - :meth:`DataFrame.to_json` now supports a ``mode`` keyword with supported inputs 'w' and 'a'. Defaulting to 'w', 'a' can be used when lines=True and orient='records' to append record oriented json lines to an existing json file. (:issue:`35849`) -- :func:`to_datetime` now handles ``datetime.datetime`` and :class:`Timestamp` and applies the ``format`` argument on them instead of raising a ``ValueError``. (:issue:`49298`) +- :func:`to_datetime` now formats ``datetime.datetime`` and :class:`Timestamp` objects to return a format-consistent output instead of raising a ``ValueError``. (:issue:`49298`) .. --------------------------------------------------------------------------- .. _whatsnew_200.notable_bug_fixes: From dcd74eb7a7eca9af96d7cc92eab58852cfdaa222 Mon Sep 17 00:00:00 2001 From: Antonio Ossa Guerra Date: Fri, 4 Nov 2022 17:58:27 -0300 Subject: [PATCH 11/15] Do not modify `Timestamp` and `datetime` objects To avoid introducing an unexpected behavior, we'll skip objects that are already `pd.Timestamp` or `datetime.datetime` Signed-off-by: Antonio Ossa Guerra --- pandas/_libs/tslibs/strptime.pyx | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index 2d2a57bc4b743..ac34675dea4df 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -29,7 +29,9 @@ from pandas._libs.tslibs.np_datetime cimport ( check_dts_bounds, npy_datetimestruct, npy_datetimestruct_to_datetime, + pydatetime_to_dt64, ) +from pandas._libs.tslibs.timestamps cimport _Timestamp cdef dict _parse_code_table = {'y': 0, @@ -137,7 +139,13 @@ def array_strptime(ndarray[object] values, str fmt, bint exact=True, errors='rai iresult[i] = NPY_NAT continue elif PyDateTime_Check(val): - val = val.strftime(fmt) + if isinstance(val, _Timestamp): + iresult[i] = val.tz_localize(None)._as_unit("ns").value + else: + iresult[i] = 
pydatetime_to_dt64(val, &dts) + check_dts_bounds(&dts) + result_timezone[i] = val.tzinfo + continue else: val = str(val) From d3382ce664ebbfafe43b159a9aa43145b5c8d58c Mon Sep 17 00:00:00 2001 From: Antonio Ossa Guerra Date: Fri, 4 Nov 2022 18:01:19 -0300 Subject: [PATCH 12/15] Update and simplify tests Simplify the tests further by testing specific cases that are actually relevant to the introduced behavior. The first test includes cases for both custom and ISO8601, while the second tests timezone-aware objects while forcing UTC and while leaving the offsets alone Signed-off-by: Antonio Ossa Guerra --- pandas/tests/tools/test_to_datetime.py | 88 ++++++++------------------ 1 file changed, 25 insertions(+), 63 deletions(-) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index e9b393d7ae9f3..4553895295fb6 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -609,30 +609,9 @@ def test_to_datetime_dtarr(self, tz): result = to_datetime(arr) assert result is arr - @pytest.mark.parametrize( - "data, expected", - [ - ( - Timestamp("2001-10-01 12:00:01.123456789"), - Timestamp("2001-10-01 12:00:01.123456"), - ), - ( - datetime(2002, 10, 1, 12, 00, 1, 123456), - Timestamp("2002-10-01 12:00:01.123456"), - ), - ("10/01/03 12:00:01.123456789", Timestamp("2003-10-01 12:00:01.123456789")), - ], - ) - def test_to_datetime_preserves_resolution_when_possible(self, data, expected): - # GH 49298 - result = to_datetime([data]) - tm.assert_equal(result, DatetimeIndex([data])) - result = to_datetime([data], format="%m/%d/%y %H:%M:%S.%f") - tm.assert_equal(result, DatetimeIndex([expected])) - def test_to_datetime_arraylike_contains_pydatetime_and_timestamp(self): # GH 49298 - # Timestamp/datetime have more resolution than str + # Test explicit custom format case1 = [ Timestamp("2001-10-01 12:00:01.123456789"), datetime(2001, 10, 2, 12, 30, 1, 123456), @@ -640,48 +619,20 @@ def 
test_to_datetime_arraylike_contains_pydatetime_and_timestamp(self): ] result = to_datetime(case1, format="%m/%d/%y") expected_data = [ - Timestamp("2001-10-01"), - Timestamp("2001-10-02"), - Timestamp("2001-10-03"), - ] - tm.assert_equal(result, DatetimeIndex(expected_data)) - - # Timestamp/datetime have the same resolution than str (nanosecond) - case2 = [ Timestamp("2001-10-01 12:00:01.123456789"), - datetime(2001, 10, 2, 12, 30, 1, 123456), - "10/03/01 13:00:01.123456789", - ] - result = to_datetime(case2, format="%m/%d/%y %H:%M:%S.%f") - expected_data = [ - Timestamp("2001-10-01 12:00:01.123456"), Timestamp("2001-10-02 12:30:01.123456"), - Timestamp("2001-10-03 13:00:01.123456789"), - ] - tm.assert_equal(result, DatetimeIndex(expected_data)) - - # Timestamp/datetime have less resolution than str - case3 = [ - Timestamp("2001-10-01"), - datetime(2001, 10, 2), - "10/03/01 12:00:01", - ] - result = to_datetime(case3, format="%m/%d/%y %H:%M:%S") - expected_data = [ - Timestamp("2001-10-01 00:00:00"), - Timestamp("2001-10-02 00:00:00"), - Timestamp("2001-10-03 12:00:01"), + Timestamp("2001-10-03 00:00:00"), ] tm.assert_equal(result, DatetimeIndex(expected_data)) # Test ISO8601 format - case4 = [ + case2 = [ Timestamp("2001-10-01 13:18:05"), datetime(2001, 10, 2, 13, 18, 5), "2001-10-03T13:18:05", "20011004", ] - result = to_datetime(case4) + result = to_datetime(case2) expected_data = [ Timestamp("2001-10-01 13:18:05"), Timestamp("2001-10-02 13:18:05"), @@ -690,23 +641,34 @@ def test_to_datetime_arraylike_contains_pydatetime_and_timestamp(self): ] tm.assert_equal(result, DatetimeIndex(expected_data)) - def test_to_datetime_arraylike_contains_pydatetime_and_timestamp_utc(self): + def test_to_datetime_arraylike_contains_pydatetime_and_timestamp_with_tz(self): # GH 49298 - dt = datetime(2010, 1, 2, 12, 13, 16) - dt = dt.replace(tzinfo=timezone.utc) + # Different offsets when utc=True data = [ - "20100102 121314", - Timestamp("2010-01-02 12:13:15", tz="utc"), - dt, + 
"20100102 121314 +01:00", + "20100102 121315 -05:00", + pytz.timezone("Europe/Berlin").localize(datetime(2010, 1, 2, 12, 13, 16)), + pytz.timezone("US/Eastern").localize(Timestamp("2010-01-02 12:13:17")), ] expected_data = [ - Timestamp("2010-01-02 12:13:14", tz="utc"), - Timestamp("2010-01-02 12:13:15", tz="utc"), - Timestamp("2010-01-02 12:13:16", tz="utc"), + Timestamp("2010-01-02 11:13:14", tz="utc"), + Timestamp("2010-01-02 17:13:15", tz="utc"), + Timestamp("2010-01-02 11:13:16", tz="utc"), + Timestamp("2010-01-02 17:13:17", tz="utc"), ] - result = to_datetime(data, format="%Y%m%d %H%M%S", utc=True) + result = to_datetime(data, format="%Y%m%d %H%M%S %z", utc=True) tm.assert_equal(result, DatetimeIndex(expected_data)) + # Different offsets when utc=False + expected_data = [ + Timestamp("2010-01-02 12:13:14 +01:00"), + Timestamp("2010-01-02 12:13:15 -05:00"), + Timestamp("2010-01-02 12:13:16 +01:00"), + Timestamp("2010-01-02 12:13:17 -05:00"), + ] + result = to_datetime(data, format="%Y%m%d %H%M%S %z", utc=False) + tm.assert_equal(result, Index(expected_data)) + def test_to_datetime_pydatetime(self): actual = to_datetime(datetime(2008, 1, 15)) assert actual == datetime(2008, 1, 15) From fe6ad66e6cf17dc253dec7dcc5eff356c8845c3f Mon Sep 17 00:00:00 2001 From: Antonio Ossa Guerra Date: Fri, 4 Nov 2022 18:24:38 -0300 Subject: [PATCH 13/15] Update whatsnew describing new behavior `pd.to_datetime` now simply skips `datetime.datetime` and `Timestamp` objects when the `format` argument is present, instead of trying to use the same format in the output Signed-off-by: Antonio Ossa Guerra --- doc/source/whatsnew/v2.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index cb3da08af22ca..337d1df20488a 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -49,7 +49,7 @@ Other enhancements - Fix ``test`` optional_extra by adding missing test package 
``pytest-asyncio`` (:issue:`48361`) - :func:`DataFrame.astype` exception message thrown improved to include column name when type conversion is not possible. (:issue:`47571`) - :meth:`DataFrame.to_json` now supports a ``mode`` keyword with supported inputs 'w' and 'a'. Defaulting to 'w', 'a' can be used when lines=True and orient='records' to append record oriented json lines to an existing json file. (:issue:`35849`) -- :func:`to_datetime` now formats ``datetime.datetime`` and :class:`Timestamp` objects to return a format-consistent output instead of raising a ``ValueError``. (:issue:`49298`) +- :func:`to_datetime` now skips ``datetime.datetime`` and :class:`Timestamp` objects when passing ``format`` argument instead of raising a ``ValueError``. (:issue:`49298`) .. --------------------------------------------------------------------------- .. _whatsnew_200.notable_bug_fixes: From f64964326b9beca43df9afc59dca0ebc82e38f64 Mon Sep 17 00:00:00 2001 From: Antonio Ossa Guerra Date: Mon, 7 Nov 2022 13:52:53 -0300 Subject: [PATCH 14/15] Return localized object if a timezone is present When using the `format` argument on `pd.to_datetime`, and the input contains localized `pd.Timestamp` or `datetime.datetime` objects, the result should be localized too. A bug was triggered when `format` did not contain `%z` or `%Z`, because the `_array_strptime_with_fallback` function did not expect any type other than string. Now this assumption is no longer valid, since we now let `pd.Timestamp` and `datetime.datetime` objects pass unmodified. The fix is to change the condition that checks if returning a localized output is necessary: instead of looking at the passed `format`, return a localized result when any input object contains a timezone. Also, a new test is added to check this case. A case is included for a `pd.Timestamp` and another for a `datetime.datetime` object, and each compares the actual result with the expected result when the input is localized and when it is not.
Signed-off-by: Antonio Ossa Guerra --- pandas/core/tools/datetimes.py | 2 +- pandas/tests/tools/test_to_datetime.py | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 7791ea804a52a..84178f9203ec0 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -499,7 +499,7 @@ def _array_strptime_with_fallback( # Indicates to the caller to fallback to objects_to_datetime64ns return None else: - if "%Z" in fmt or "%z" in fmt: + if any(timezones): return _return_parsed_timezone_results(result, timezones, tz, name) return _box_as_indexlike(result, utc=utc, name=name) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 4553895295fb6..5d3b7ecbc9d0d 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -669,6 +669,21 @@ def test_to_datetime_arraylike_contains_pydatetime_and_timestamp_with_tz(self): result = to_datetime(data, format="%Y%m%d %H%M%S %z", utc=False) tm.assert_equal(result, Index(expected_data)) + @pytest.mark.parametrize("value", [datetime(2010, 1, 2, 12, 13, 16), Timestamp("2010-01-02 12:13:17")]) + def test_to_datetime_includes_tz_dtype_on_pydatetime_and_timestamp(self, value): + # GH 49298 + # No timezone + result_no_format = to_datetime([value]) + result_with_format = to_datetime([value], format="%m-%d-%Y") + tm.assert_equal(result_no_format, result_with_format) + + # Localized value + america_santiago = pytz.timezone("America/Santiago") + result_no_format = to_datetime([america_santiago.localize(value)]) + result_with_format = to_datetime([america_santiago.localize(value)], format="%m-%d-%Y") + tm.assert_equal(result_with_format.dtype.tz, america_santiago) + tm.assert_equal(result_no_format, result_with_format) + def test_to_datetime_pydatetime(self): actual = to_datetime(datetime(2008, 1, 15)) assert actual == datetime(2008, 1, 15) From 
d915cf7f147c410ce00aadada13ece9d9b5017cc Mon Sep 17 00:00:00 2001 From: Antonio Ossa Guerra Date: Tue, 8 Nov 2022 12:50:44 -0300 Subject: [PATCH 15/15] Raise `ValueError` when tz-aware differs in input Just like `to_datetime` when `format` is not passed, we should raise a `ValueError` if the input contains some tz-aware objects and some naive objects. The change is to identify if the `format` expects tz-aware objects (check for `"%z"` and `"%Z"`) and then compare this expectation with the `tzinfo` of each object. This modification includes relevant tests. Signed-off-by: Antonio Ossa Guerra --- pandas/_libs/tslibs/strptime.pyx | 5 +++++ pandas/tests/tools/test_to_datetime.py | 19 ++++++++++++++++++- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index ac34675dea4df..7de1a5ab15b7c 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -128,6 +128,7 @@ def array_strptime(ndarray[object] values, str fmt, bint exact=True, errors='rai result_timezone = np.empty(n, dtype='object') dts.us = dts.ps = dts.as = 0 + expect_tz_aware = "%z" in fmt or "%Z" in fmt for i in range(n): val = values[i] @@ -144,6 +145,10 @@ def array_strptime(ndarray[object] values, str fmt, bint exact=True, errors='rai else: iresult[i] = pydatetime_to_dt64(val, &dts) check_dts_bounds(&dts) + if val.tzinfo is None and expect_tz_aware: + raise ValueError("Cannot mix tz-aware with tz-naive values") + elif val.tzinfo is not None and not expect_tz_aware: + raise ValueError("Cannot mix tz-aware with tz-naive values") result_timezone[i] = val.tzinfo continue else: diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 5d3b7ecbc9d0d..6d4cf9dd240c8 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -680,10 +680,27 @@ def test_to_datetime_includes_tz_dtype_on_pydatetime_and_timestamp(self, value): # 
Localized value america_santiago = pytz.timezone("America/Santiago") result_no_format = to_datetime([america_santiago.localize(value)]) - result_with_format = to_datetime([america_santiago.localize(value)], format="%m-%d-%Y") + result_with_format = to_datetime([america_santiago.localize(value)], format="%m-%d-%Y %z") tm.assert_equal(result_with_format.dtype.tz, america_santiago) tm.assert_equal(result_no_format, result_with_format) + @pytest.mark.parametrize("value", [datetime(2010, 1, 2, 12, 13, 16), Timestamp("2010-01-02 12:13:17")]) + def test_to_datetime_mixing_naive_tzaware_raises(self, value): + # GH 49298 + msg = "Cannot mix tz-aware with tz-naive values" + america_santiago = pytz.timezone("America/Santiago") + # Fail if format expects tz but input is not localized + with pytest.raises(ValueError, match=msg): + to_datetime([value], format="%m-%d-%Y %z") + # Fail if format does not expect tz but input is localized + with pytest.raises(ValueError, match=msg): + to_datetime([america_santiago.localize(value)], format="%m-%d-%Y") + # Mixed input should fail in both cases + with pytest.raises(ValueError, match=msg): + to_datetime([value, america_santiago.localize(value)], format="%m-%d-%Y %z") + with pytest.raises(ValueError, match=msg): + to_datetime([value, america_santiago.localize(value)], format="%m-%d-%Y") + def test_to_datetime_pydatetime(self): actual = to_datetime(datetime(2008, 1, 15)) assert actual == datetime(2008, 1, 15)