From 65f5c41ab65d768cf6ad9def9aede3aad73821c6 Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Thu, 12 Dec 2019 22:07:52 +0300 Subject: [PATCH 1/3] BUG: close #30050 - initial solution --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/core/tools/datetimes.py | 17 ++++++++++++++++- pandas/tests/indexes/datetimes/test_tools.py | 18 ++++++++++++++++++ 3 files changed, 35 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 3810ab37822cc..3f944b8862417 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -796,6 +796,7 @@ Datetimelike - Bug in :meth:`DataFrame.append` would remove the timezone-awareness of new data (:issue:`30238`) - Bug in :meth:`Series.cummin` and :meth:`Series.cummax` with timezone-aware dtype incorrectly dropping its timezone (:issue:`15553`) - Bug in :class:`DatetimeArray`, :class:`TimedeltaArray`, and :class:`PeriodArray` where inplace addition and subtraction did not actually operate inplace (:issue:`24115`) +- Bug in :func:`pandas.to_datetime` when called with ``Series`` storing ``IntegerArray`` raising ``TypeError`` instead of returning ``Series`` (:issue:`30050`) Timedelta ^^^^^^^^^ diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index f193865d90b71..6c3119f2f6e1c 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -38,6 +38,8 @@ ) from pandas.core.dtypes.missing import notna +from pandas._typing import ArrayLike +from pandas.arrays import IntegerArray from pandas.core import algorithms from pandas.core.algorithms import unique @@ -317,7 +319,20 @@ def _convert_listlike_datetimes( if format is not None: raise ValueError("cannot specify both format and unit") arg = getattr(arg, "values", arg) - result, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors) + # GH 30050 pass an ndarray to tslib.array_with_unit_to_datetime + # because it expects an ndarray argument + if 
isinstance(arg, IntegerArray): + arg_np = np.array(arg[np.logical_not(arg._mask)], dtype=type(arg[0])) + result_np, tz_parsed = tslib.array_with_unit_to_datetime( + arg_np, unit, errors=errors + ) + result = np.empty(arg.shape[0], dtype="datetime64[" + unit + "]") + result[arg._mask] = np.datetime64("nat") + result[np.logical_not(arg._mask)] = result_np + else: + result, tz_parsed = tslib.array_with_unit_to_datetime( + arg, unit, errors=errors + ) if errors == "ignore": from pandas import Index diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index 1aaacfc0949c3..32eefb6c6c470 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -2291,3 +2291,21 @@ def test_should_cache_errors(unique_share, check_count, err_message): with pytest.raises(AssertionError, match=err_message): tools.should_cache(arg, unique_share, check_count) + +def test_intarray_to_datetime(): + # Test for #30050 + ser = pd.Series([1, 2, None, 2 ** 61, None]) + ser = ser.astype("Int64") + + res = pd.to_datetime(ser, unit="ns") + + expected = pd.Series( + [ + np.datetime64("1970-01-01 00:00:00.000000001"), + np.datetime64("1970-01-01 00:00:00.000000002"), + np.datetime64("NaT"), + np.datetime64("2043-01-25 23:56:49.213693952"), + np.datetime64("NaT"), + ] + ) + tm.assert_series_equal(res, expected) \ No newline at end of file From f925f839ba5f20cab02194460bf12e9ce5f458f3 Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Fri, 13 Dec 2019 22:00:10 +0300 Subject: [PATCH 2/3] CLN: switch to generic functions Stop relying on inner implementations of IntegerArray and numpy arrays. Use public functions as much as possible. 
--- pandas/core/tools/datetimes.py | 14 ++++++++------ pandas/tests/indexes/datetimes/test_tools.py | 3 ++- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 6c3119f2f6e1c..2c5dd9a3660e2 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -318,17 +318,19 @@ def _convert_listlike_datetimes( elif unit is not None: if format is not None: raise ValueError("cannot specify both format and unit") - arg = getattr(arg, "values", arg) + arg = getattr(arg, "_values", arg) # GH 30050 pass an ndarray to tslib.array_with_unit_to_datetime # because it expects an ndarray argument if isinstance(arg, IntegerArray): - arg_np = np.array(arg[np.logical_not(arg._mask)], dtype=type(arg[0])) + # Send only non-na values to array_with_unit_to_datetime + mask_na = arg.isna() result_np, tz_parsed = tslib.array_with_unit_to_datetime( - arg_np, unit, errors=errors + np.compress(np.logical_not(mask_na), arg), unit, errors=errors ) - result = np.empty(arg.shape[0], dtype="datetime64[" + unit + "]") - result[arg._mask] = np.datetime64("nat") - result[np.logical_not(arg._mask)] = result_np + # Insert na values back in proper positions + ins_index = np.ravel(np.argwhere(mask_na)) + ins_index -= range(ins_index.shape[0]) + result = np.insert(result_np, ins_index, None) else: result, tz_parsed = tslib.array_with_unit_to_datetime( arg, unit, errors=errors diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index 32eefb6c6c470..3afde7db90e9d 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -2292,6 +2292,7 @@ def test_should_cache_errors(unique_share, check_count, err_message): with pytest.raises(AssertionError, match=err_message): tools.should_cache(arg, unique_share, check_count) + def test_intarray_to_datetime(): # Test for #30050 ser = pd.Series([1, 2, None, 2 ** 61, None]) 
@@ -2308,4 +2309,4 @@ def test_intarray_to_datetime(): np.datetime64("NaT"), ] ) - tm.assert_series_equal(res, expected) \ No newline at end of file + tm.assert_series_equal(res, expected) From 620506d1857b02fb9ab58f29baa16bd3ebac97e2 Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Wed, 1 Jan 2020 10:55:52 +0300 Subject: [PATCH 3/3] BUG: move nan-handling to tslib and clean up --- pandas/_libs/tslib.pyx | 36 ++++++++++++++++---- pandas/core/tools/datetimes.py | 23 ++++++------- pandas/tests/indexes/datetimes/test_tools.py | 5 ++- 3 files changed, 43 insertions(+), 21 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index cbe6dd6c2322d..e0a2b987c98d5 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -296,10 +296,15 @@ def format_array_from_datetime(ndarray[int64_t] values, object tz=None, return result -def array_with_unit_to_datetime(ndarray values, object unit, +def array_with_unit_to_datetime(ndarray values, ndarray mask, object unit, str errors='coerce'): """ - convert the ndarray according to the unit + Convert the ndarray to datetime according to the time unit. + + This function converts an array of objects into a numpy array of + datetime64[ns]. 
It returns the converted array + and also returns the timezone offset + if errors: - raise: return converted values or raise OutOfBoundsDatetime if out of range on the conversion or @@ -307,6 +312,18 @@ def array_with_unit_to_datetime(ndarray values, object unit, - ignore: return non-convertible values as the same unit - coerce: NaT for non-convertibles + Parameters + ---------- + values : ndarray of object + Date-like objects to convert + mask : ndarray of bool + Not-a-time mask for non-nullable integer types conversion, + can be None + unit : object + Time unit to use during conversion + errors : str, default 'coerce' + Error behavior when parsing + Returns ------- result : ndarray of m8 values @@ -316,7 +333,6 @@ def array_with_unit_to_datetime(ndarray values, object unit, Py_ssize_t i, j, n=len(values) int64_t m ndarray[float64_t] fvalues - ndarray mask bint is_ignore = errors=='ignore' bint is_coerce = errors=='coerce' bint is_raise = errors=='raise' @@ -329,9 +345,13 @@ def array_with_unit_to_datetime(ndarray values, object unit, if unit == 'ns': if issubclass(values.dtype.type, np.integer): - return values.astype('M8[ns]'), tz - # This will return a tz - return array_to_datetime(values.astype(object), errors=errors) + result = values.astype('M8[ns]') + else: + result, tz = array_to_datetime(values.astype(object), errors=errors) + if mask is not None: + iresult = result.view('i8') + iresult[mask] = NPY_NAT + return result, tz m = cast_from_unit(None, unit) @@ -343,7 +363,9 @@ def array_with_unit_to_datetime(ndarray values, object unit, if values.dtype.kind == "i": # Note: this condition makes the casting="same_kind" redundant iresult = values.astype('i8', casting='same_kind', copy=False) - mask = iresult == NPY_NAT + # If no mask, fill mask by comparing to NPY_NAT constant + if mask is None: + mask = iresult == NPY_NAT iresult[mask] = 0 fvalues = iresult.astype('f8') * m need_to_iterate = False diff --git a/pandas/core/tools/datetimes.py 
b/pandas/core/tools/datetimes.py index 2c5dd9a3660e2..85094ce741134 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -38,7 +38,6 @@ ) from pandas.core.dtypes.missing import notna -from pandas._typing import ArrayLike from pandas.arrays import IntegerArray from pandas.core import algorithms from pandas.core.algorithms import unique @@ -319,22 +318,20 @@ def _convert_listlike_datetimes( if format is not None: raise ValueError("cannot specify both format and unit") arg = getattr(arg, "_values", arg) + # GH 30050 pass an ndarray to tslib.array_with_unit_to_datetime # because it expects an ndarray argument if isinstance(arg, IntegerArray): - # Send only non-na values to array_with_unit_to_datetime - mask_na = arg.isna() - result_np, tz_parsed = tslib.array_with_unit_to_datetime( - np.compress(np.logical_not(mask_na), arg), unit, errors=errors - ) - # Insert na values back in proper positions - ins_index = np.ravel(np.argwhere(mask_na)) - ins_index -= range(ins_index.shape[0]) - result = np.insert(result_np, ins_index, None) + # Explicitly pass NaT mask to array_with_unit_to_datetime + mask = arg.isna() + arg = arg._ndarray_values else: - result, tz_parsed = tslib.array_with_unit_to_datetime( - arg, unit, errors=errors - ) + mask = None + + result, tz_parsed = tslib.array_with_unit_to_datetime( + arg, mask, unit, errors=errors + ) + if errors == "ignore": from pandas import Index diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index 3afde7db90e9d..807d0b05e8d13 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -2293,10 +2293,11 @@ def test_should_cache_errors(unique_share, check_count, err_message): tools.should_cache(arg, unique_share, check_count) -def test_intarray_to_datetime(): +def test_nullable_integer_to_datetime(): # Test for #30050 ser = pd.Series([1, 2, None, 2 ** 61, None]) ser = ser.astype("Int64") + 
ser_copy = ser.copy() res = pd.to_datetime(ser, unit="ns") @@ -2310,3 +2311,5 @@ def test_intarray_to_datetime(): ] ) tm.assert_series_equal(res, expected) + # Check that ser isn't mutated + tm.assert_series_equal(ser, ser_copy)