From 47288e9e6e1e5f829e920623d2cdb977256833bf Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 12 Oct 2022 09:38:36 -0700 Subject: [PATCH 1/5] API: retain non-nano dtype in DatetimeArray constructor --- pandas/_libs/tslibs/__init__.py | 4 ++ pandas/core/arrays/datetimes.py | 58 ++++++++++++++----- pandas/core/dtypes/cast.py | 3 + pandas/tests/arrays/test_array.py | 4 +- pandas/tests/base/test_constructors.py | 11 ++-- .../frame/constructors/test_from_records.py | 1 + pandas/tests/frame/indexing/test_setitem.py | 15 +++-- pandas/tests/frame/methods/test_astype.py | 5 +- pandas/tests/frame/test_block_internals.py | 4 +- .../indexes/datetimes/test_constructors.py | 6 +- pandas/tests/reshape/merge/test_merge.py | 16 ++++- pandas/tests/series/test_constructors.py | 55 +++++++++++++----- pandas/tests/tools/test_to_datetime.py | 10 +++- pandas/tests/tslibs/test_api.py | 2 + pandas/tests/window/test_online.py | 2 +- 15 files changed, 147 insertions(+), 49 deletions(-) diff --git a/pandas/_libs/tslibs/__init__.py b/pandas/_libs/tslibs/__init__.py index 47143b32d6dbe..42f84619ddbe5 100644 --- a/pandas/_libs/tslibs/__init__.py +++ b/pandas/_libs/tslibs/__init__.py @@ -31,13 +31,17 @@ "periods_per_day", "periods_per_second", "is_supported_unit", + "npy_unit_to_abbrev", + "get_supported_reso", ] from pandas._libs.tslibs import dtypes from pandas._libs.tslibs.conversion import localize_pydatetime from pandas._libs.tslibs.dtypes import ( Resolution, + get_supported_reso, is_supported_unit, + npy_unit_to_abbrev, periods_per_day, periods_per_second, ) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 79e8d2d90ab1e..4c11f78f864ea 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -30,12 +30,14 @@ astype_overflowsafe, fields, get_resolution, + get_supported_reso, get_unit_from_dtype, ints_to_pydatetime, is_date_array_normalized, is_supported_unit, is_unitless, normalize_i8_timestamps, + npy_unit_to_abbrev, timezones, to_offset, tz_convert_from_utc, @@ -322,6 +324,14 @@ def _from_sequence_not_strict( # if dtype has an embedded tz, capture it tz = validate_tz_from_dtype(dtype, tz, explicit_tz_none) + unit = None + if dtype is not None: + if isinstance(dtype, np.dtype): + unit = np.datetime_data(dtype)[0] + else: + # DatetimeTZDtype + unit = dtype.unit + subarr, tz, inferred_freq = _sequence_to_dt64ns( data, copy=copy, @@ -342,8 +352,12 @@ def _from_sequence_not_strict( if explicit_none: freq = None - dtype = tz_to_dtype(tz) - result = cls._simple_new(subarr, freq=freq, dtype=dtype) + data_unit = np.datetime_data(subarr.dtype)[0] + data_dtype = tz_to_dtype(tz, data_unit) + result = cls._simple_new(subarr, freq=freq, dtype=data_dtype) + if unit is not None and unit != result._unit: + # If unit was specified in user-passed dtype, cast to it here + result = result._as_unit(unit) if inferred_freq is None and freq is not None: # this condition precludes `freq_infer` @@ -2005,7 +2019,8 @@ def sequence_to_datetimes(data, require_iso8601: bool = False) -> DatetimeArray: require_iso8601=require_iso8601, ) - dtype = tz_to_dtype(tz) + unit = np.datetime_data(result.dtype)[0] + dtype = tz_to_dtype(tz, unit) dta = DatetimeArray._simple_new(result, freq=freq, dtype=dtype) return dta @@ -2111,8 +2126,21 @@ def _sequence_to_dt64ns( elif is_datetime64_dtype(data_dtype): # tz-naive DatetimeArray or ndarray[datetime64] data = getattr(data, "_ndarray", data) - if data.dtype != DT64NS_DTYPE: - data = astype_overflowsafe(data, dtype=DT64NS_DTYPE) + new_dtype = data.dtype + data_unit = get_unit_from_dtype(new_dtype) + if not is_supported_unit(data_unit): + # Cast to the nearest supported unit, generally "s" + new_reso = get_supported_reso(data_unit) + new_unit = npy_unit_to_abbrev(new_reso) + new_dtype = np.dtype(f"M8[{new_unit}]") + data = astype_overflowsafe(data, dtype=new_dtype, copy=False) + copy = False + + if data.dtype.byteorder == ">": + # TODO: better way to handle this? non-copying alternative? + # without this, test_constructor_datetime64_bigendian fails + data = data.astype(data.dtype.newbyteorder("<")) + new_dtype = data.dtype copy = False if tz is not None: @@ -2120,11 +2148,11 @@ def _sequence_to_dt64ns( # TODO: if tz is UTC, are there situations where we *don't* want a # copy? tz_localize_to_utc always makes one. data = tzconversion.tz_localize_to_utc( - data.view("i8"), tz, ambiguous=ambiguous + data.view("i8"), tz, ambiguous=ambiguous, reso=data_unit ) - data = data.view(DT64NS_DTYPE) + data = data.view(new_dtype) - assert data.dtype == DT64NS_DTYPE, data.dtype + assert data.dtype == new_dtype, data.dtype result = data else: @@ -2138,7 +2166,9 @@ def _sequence_to_dt64ns( result = result.copy() assert isinstance(result, np.ndarray), type(result) - assert result.dtype == "M8[ns]", result.dtype + assert result.dtype.kind == "M" + assert result.dtype != "M8" + assert is_supported_unit(get_unit_from_dtype(result.dtype)) return result, tz, inferred_freq @@ -2359,12 +2389,14 @@ def _validate_dt64_dtype(dtype): ) raise ValueError(msg) - if (isinstance(dtype, np.dtype) and dtype != DT64NS_DTYPE) or not isinstance( - dtype, (np.dtype, DatetimeTZDtype) - ): + if ( + isinstance(dtype, np.dtype) + and (dtype.kind != "M" or not is_supported_unit(get_unit_from_dtype(dtype))) + ) or not isinstance(dtype, (np.dtype, DatetimeTZDtype)): raise ValueError( f"Unexpected value for 'dtype': '{dtype}'. " - "Must be 'datetime64[ns]' or DatetimeTZDtype'." + "Must be 'datetime64[s]', 'datetime64[ms]', 'datetime64[us]', " + "'datetime64[ns]' or DatetimeTZDtype'." ) if getattr(dtype, "tz", None): diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 6b890f98e8cac..dd57ee72912e2 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1370,6 +1370,9 @@ def maybe_cast_to_datetime( # Note: NOT equivalent to dta.astype(dtype) dta = dta.tz_localize(None) + # TODO(2.0): Do this astype in sequence_to_datetimes to + # avoid potential extra copy? + dta = dta.astype(dtype, copy=False) value = dta elif is_datetime64tz: dtype = cast(DatetimeTZDtype, dtype) diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index 6666b613fb50a..7c813d7e129d4 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -242,7 +242,9 @@ def test_array_copy(): ), ( np.array([1, 2], dtype="M8[us]"), - DatetimeArray(np.array([1000, 2000], dtype="M8[ns]")), + DatetimeArray._simple_new( + np.array([1, 2], dtype="M8[us]"), dtype=np.dtype("M8[us]") + ), ), # datetimetz ( diff --git a/pandas/tests/base/test_constructors.py b/pandas/tests/base/test_constructors.py index f66ebc451c239..c8b923031b9e8 100644 --- a/pandas/tests/base/test_constructors.py +++ b/pandas/tests/base/test_constructors.py @@ -146,9 +146,9 @@ def test_constructor_datetime_outofbound(self, a, constructor): # datetime64[non-ns] raise error, other cases result in object dtype # and preserve original data if a.dtype.kind == "M": - msg = "Out of bounds" - with pytest.raises(pd.errors.OutOfBoundsDatetime, match=msg): - constructor(a) + # Can't fit in nanosecond bounds -> get the nearest supported unit + result = constructor(a) + assert result.dtype == "M8[s]" else: result = constructor(a) assert result.dtype == "object" @@ -162,7 +162,10 @@ def test_constructor_datetime_outofbound(self, a, constructor): def test_constructor_datetime_nonns(self, constructor): arr = np.array(["2020-01-01T00:00:00.000000"], dtype="datetime64[us]") - expected = constructor(pd.to_datetime(["2020-01-01"])) + dta = pd.core.arrays.DatetimeArray._simple_new(arr, dtype=arr.dtype) + expected = constructor(dta) + assert expected.dtype == arr.dtype + result = constructor(arr) tm.assert_equal(result, expected) diff --git a/pandas/tests/frame/constructors/test_from_records.py b/pandas/tests/frame/constructors/test_from_records.py index 3b5a0c5a32bc4..b2efa0713b513 100644 --- a/pandas/tests/frame/constructors/test_from_records.py +++ b/pandas/tests/frame/constructors/test_from_records.py @@ -44,6 +44,7 @@ def test_from_records_with_datetimes(self): dtypes = [("EXPIRY", " Date: Wed, 12 Oct 2022 13:53:56 -0700 Subject: [PATCH 2/5] update test --- pandas/tests/tools/test_to_datetime.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index f9ed98c0bc88d..4ef78e100cc2f 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -717,10 +717,9 @@ def test_to_datetime_array_of_dt64s(self, cache, unit): ] * 30 # Assuming all datetimes are in bounds, to_datetime() returns # an array that is equal to Timestamp() parsing - tm.assert_index_equal( - to_datetime(dts, cache=cache), - DatetimeIndex([Timestamp(x).asm8 for x in dts]), - ) + result = to_datetime(dts, cache=cache) + expected = DatetimeIndex([Timestamp(x).asm8 for x in dts], dtype="M8[ns]") + tm.assert_index_equal(result, expected) # A list of datetimes where the last one is out of bounds dts_with_oob = dts + [np.datetime64("9999-01-01")] From 975bf48d6d2494d2b837e8fd1516b12a46aa550c Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 12 Oct 2022 17:56:50 -0700 Subject: [PATCH 3/5] un-xfail --- pandas/tests/series/methods/test_isin.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/tests/series/methods/test_isin.py b/pandas/tests/series/methods/test_isin.py index 0eca3e0512849..449724508fcaa 100644 --- a/pandas/tests/series/methods/test_isin.py +++ b/pandas/tests/series/methods/test_isin.py @@ -57,10 +57,6 @@ def test_isin_datetimelike_mismatched_reso(self): result = ser.isin(dta) tm.assert_series_equal(result, expected) - @pytest.mark.xfail( - reason="DTA._from_sequence incorrectly treats Timestamp[s].value as " - "nanoseconds." - ) def test_isin_datetimelike_mismatched_reso_list(self): expected = Series([True, True, False, False, False]) From 1ed2caeec5cd5b9d7f52ca0617404725730a8447 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 12 Oct 2022 20:13:32 -0700 Subject: [PATCH 4/5] un-xfail --- pandas/tests/arrays/categorical/test_constructors.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index 633a763dab80a..d11f4648ec632 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -6,11 +6,6 @@ import numpy as np import pytest -from pandas.compat import ( - IS64, - is_platform_windows, -) - from pandas.core.dtypes.common import ( is_float_dtype, is_integer_dtype, @@ -749,10 +744,6 @@ def test_from_sequence_copy(self): assert not tm.shares_memory(result, cat) - @pytest.mark.xfail( - not IS64 or is_platform_windows(), - reason="Incorrectly raising in astype_overflowsafe", - ) def test_constructor_datetime64_non_nano(self): categories = np.arange(10).view("M8[D]") values = categories[::2].copy() From d7257846563e6b3dedd2e4b4f0e85404b6d0fbdc Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 14 Oct 2022 18:39:45 -0700 Subject: [PATCH 5/5] un-xfail --- pandas/tests/series/test_constructors.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 4c0501f721191..d05eccae922aa 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -925,10 +925,6 @@ def test_constructor_dtype_datetime64(self): assert isna(s[1]) assert s.dtype == "M8[ns]" - @pytest.mark.xfail( - reason="Series(dates, dtype='M8[ms]') does not yet respect non-nano " - "dtype keyword" - ) def test_constructor_dtype_datetime64_10(self): # GH3416 pydates = [datetime(2013, 1, 1), datetime(2013, 1, 2), datetime(2013, 1, 3)]