Skip to content

API: retain non-nano dtype in DatetimeArray constructor #49058

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Oct 18, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 45 additions & 13 deletions pandas/core/arrays/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,14 @@
astype_overflowsafe,
fields,
get_resolution,
get_supported_reso,
get_unit_from_dtype,
ints_to_pydatetime,
is_date_array_normalized,
is_supported_unit,
is_unitless,
normalize_i8_timestamps,
npy_unit_to_abbrev,
timezones,
to_offset,
tz_convert_from_utc,
Expand Down Expand Up @@ -321,6 +323,14 @@ def _from_sequence_not_strict(
# if dtype has an embedded tz, capture it
tz = validate_tz_from_dtype(dtype, tz, explicit_tz_none)

unit = None
if dtype is not None:
if isinstance(dtype, np.dtype):
unit = np.datetime_data(dtype)[0]
else:
# DatetimeTZDtype
unit = dtype.unit

subarr, tz, inferred_freq = _sequence_to_dt64ns(
data,
copy=copy,
Expand All @@ -341,8 +351,12 @@ def _from_sequence_not_strict(
if explicit_none:
freq = None

dtype = tz_to_dtype(tz)
result = cls._simple_new(subarr, freq=freq, dtype=dtype)
data_unit = np.datetime_data(subarr.dtype)[0]
data_dtype = tz_to_dtype(tz, data_unit)
result = cls._simple_new(subarr, freq=freq, dtype=data_dtype)
if unit is not None and unit != result._unit:
# If unit was specified in user-passed dtype, cast to it here
result = result._as_unit(unit)

if inferred_freq is None and freq is not None:
# this condition precludes `freq_infer`
Expand Down Expand Up @@ -2004,7 +2018,8 @@ def sequence_to_datetimes(data, require_iso8601: bool = False) -> DatetimeArray:
require_iso8601=require_iso8601,
)

dtype = tz_to_dtype(tz)
unit = np.datetime_data(result.dtype)[0]
dtype = tz_to_dtype(tz, unit)
dta = DatetimeArray._simple_new(result, freq=freq, dtype=dtype)
return dta

Expand Down Expand Up @@ -2110,20 +2125,33 @@ def _sequence_to_dt64ns(
elif is_datetime64_dtype(data_dtype):
# tz-naive DatetimeArray or ndarray[datetime64]
data = getattr(data, "_ndarray", data)
if data.dtype != DT64NS_DTYPE:
data = astype_overflowsafe(data, dtype=DT64NS_DTYPE)
new_dtype = data.dtype
data_unit = get_unit_from_dtype(new_dtype)
if not is_supported_unit(data_unit):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it generally not possible to get a dt64tz type with an unsupported unit?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

correct

# Cast to the nearest supported unit, generally "s"
new_reso = get_supported_reso(data_unit)
new_unit = npy_unit_to_abbrev(new_reso)
new_dtype = np.dtype(f"M8[{new_unit}]")
data = astype_overflowsafe(data, dtype=new_dtype, copy=False)
copy = False

if data.dtype.byteorder == ">":
# TODO: better way to handle this? non-copying alternative?
# without this, test_constructor_datetime64_bigendian fails
data = data.astype(data.dtype.newbyteorder("<"))
new_dtype = data.dtype
copy = False

if tz is not None:
# Convert tz-naive to UTC
# TODO: if tz is UTC, are there situations where we *don't* want a
# copy? tz_localize_to_utc always makes one.
data = tzconversion.tz_localize_to_utc(
data.view("i8"), tz, ambiguous=ambiguous
data.view("i8"), tz, ambiguous=ambiguous, reso=data_unit
)
data = data.view(DT64NS_DTYPE)
data = data.view(new_dtype)

assert data.dtype == DT64NS_DTYPE, data.dtype
assert data.dtype == new_dtype, data.dtype
result = data

else:
Expand All @@ -2137,7 +2165,9 @@ def _sequence_to_dt64ns(
result = result.copy()

assert isinstance(result, np.ndarray), type(result)
assert result.dtype == "M8[ns]", result.dtype
assert result.dtype.kind == "M"
assert result.dtype != "M8"
assert is_supported_unit(get_unit_from_dtype(result.dtype))
return result, tz, inferred_freq


Expand Down Expand Up @@ -2358,12 +2388,14 @@ def _validate_dt64_dtype(dtype):
)
raise ValueError(msg)

if (isinstance(dtype, np.dtype) and dtype != DT64NS_DTYPE) or not isinstance(
dtype, (np.dtype, DatetimeTZDtype)
):
if (
isinstance(dtype, np.dtype)
and (dtype.kind != "M" or not is_supported_unit(get_unit_from_dtype(dtype)))
) or not isinstance(dtype, (np.dtype, DatetimeTZDtype)):
raise ValueError(
f"Unexpected value for 'dtype': '{dtype}'. "
"Must be 'datetime64[ns]' or DatetimeTZDtype'."
"Must be 'datetime64[s]', 'datetime64[ms]', 'datetime64[us]', "
"'datetime64[ns]' or DatetimeTZDtype'."
)

if getattr(dtype, "tz", None):
Expand Down
3 changes: 3 additions & 0 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -1373,6 +1373,9 @@ def maybe_cast_to_datetime(
# Note: NOT equivalent to dta.astype(dtype)
dta = dta.tz_localize(None)

# TODO(2.0): Do this astype in sequence_to_datetimes to
# avoid potential extra copy?
dta = dta.astype(dtype, copy=False)
value = dta
elif is_datetime64tz:
dtype = cast(DatetimeTZDtype, dtype)
Expand Down
9 changes: 0 additions & 9 deletions pandas/tests/arrays/categorical/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,6 @@
import numpy as np
import pytest

from pandas.compat import (
IS64,
is_platform_windows,
)

from pandas.core.dtypes.common import (
is_float_dtype,
is_integer_dtype,
Expand Down Expand Up @@ -749,10 +744,6 @@ def test_from_sequence_copy(self):

assert not tm.shares_memory(result, cat)

@pytest.mark.xfail(
not IS64 or is_platform_windows(),
reason="Incorrectly raising in astype_overflowsafe",
)
def test_constructor_datetime64_non_nano(self):
categories = np.arange(10).view("M8[D]")
values = categories[::2].copy()
Expand Down
4 changes: 3 additions & 1 deletion pandas/tests/arrays/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,9 @@ def test_array_copy():
),
(
np.array([1, 2], dtype="M8[us]"),
DatetimeArray(np.array([1000, 2000], dtype="M8[ns]")),
DatetimeArray._simple_new(
np.array([1, 2], dtype="M8[us]"), dtype=np.dtype("M8[us]")
),
),
# datetimetz
(
Expand Down
11 changes: 7 additions & 4 deletions pandas/tests/base/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,9 +146,9 @@ def test_constructor_datetime_outofbound(self, a, constructor):
# datetime64[non-ns] raise error, other cases result in object dtype
# and preserve original data
if a.dtype.kind == "M":
msg = "Out of bounds"
with pytest.raises(pd.errors.OutOfBoundsDatetime, match=msg):
constructor(a)
# Can't fit in nanosecond bounds -> get the nearest supported unit
result = constructor(a)
assert result.dtype == "M8[s]"
else:
result = constructor(a)
assert result.dtype == "object"
Expand All @@ -162,7 +162,10 @@ def test_constructor_datetime_outofbound(self, a, constructor):

def test_constructor_datetime_nonns(self, constructor):
arr = np.array(["2020-01-01T00:00:00.000000"], dtype="datetime64[us]")
expected = constructor(pd.to_datetime(["2020-01-01"]))
dta = pd.core.arrays.DatetimeArray._simple_new(arr, dtype=arr.dtype)
expected = constructor(dta)
assert expected.dtype == arr.dtype

result = constructor(arr)
tm.assert_equal(result, expected)

Expand Down
1 change: 1 addition & 0 deletions pandas/tests/frame/constructors/test_from_records.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ def test_from_records_with_datetimes(self):
dtypes = [("EXPIRY", "<M8[m]")]
recarray = np.core.records.fromarrays(arrdata, dtype=dtypes)
result = DataFrame.from_records(recarray)
expected["EXPIRY"] = expected["EXPIRY"].astype("M8[m]")
tm.assert_frame_equal(result, expected)

def test_from_records_sequencelike(self):
Expand Down
15 changes: 10 additions & 5 deletions pandas/tests/frame/indexing/test_setitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,11 +277,11 @@ def test_setitem_dt64_ndarray_with_NaT_and_diff_time_units(self):
expected = DataFrame({0: [1, None], "new": [1, None]}, dtype="datetime64[ns]")
tm.assert_frame_equal(result, expected)

# OutOfBoundsDatetime error shouldn't occur
# OutOfBoundsDatetime error shouldn't occur; as of 2.0 we preserve "M8[s]"
data_s = np.array([1, "nat"], dtype="datetime64[s]")
result["new"] = data_s
expected = DataFrame({0: [1, None], "new": [1e9, None]}, dtype="datetime64[ns]")
tm.assert_frame_equal(result, expected)
tm.assert_series_equal(result[0], expected[0])
tm.assert_numpy_array_equal(result["new"].to_numpy(), data_s)

@pytest.mark.parametrize("unit", ["h", "m", "s", "ms", "D", "M", "Y"])
def test_frame_setitem_datetime64_col_other_units(self, unit):
Expand All @@ -291,12 +291,17 @@ def test_frame_setitem_datetime64_col_other_units(self, unit):

dtype = np.dtype(f"M8[{unit}]")
vals = np.arange(n, dtype=np.int64).view(dtype)
ex_vals = vals.astype("datetime64[ns]")
if unit in ["s", "ms"]:
# supported unit
ex_vals = vals
else:
# we get the nearest supported unit, i.e. "s"
ex_vals = vals.astype("datetime64[s]")

df = DataFrame({"ints": np.arange(n)}, index=np.arange(n))
df[unit] = vals

assert df[unit].dtype == np.dtype("M8[ns]")
assert df[unit].dtype == ex_vals.dtype
assert (df[unit].values == ex_vals).all()

@pytest.mark.parametrize("unit", ["h", "m", "s", "ms", "D", "M", "Y"])
Expand Down
5 changes: 2 additions & 3 deletions pandas/tests/frame/methods/test_astype.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,9 +424,8 @@ def test_astype_to_datetime_unit(self, unit):
# GH#48928
exp_dtype = dtype
else:
# TODO(2.0): use the nearest supported dtype (i.e. M8[s]) instead
# of nanos
exp_dtype = "M8[ns]"
# we use the nearest supported dtype (i.e. M8[s])
exp_dtype = "M8[s]"
# TODO(2.0): once DataFrame constructor doesn't cast ndarray inputs.
# can simplify this
exp_values = arr.astype(exp_dtype)
Expand Down
4 changes: 3 additions & 1 deletion pandas/tests/frame/test_block_internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,11 +228,13 @@ def test_construction_with_conversions(self):
expected = DataFrame(
{
"dt1": Timestamp("20130101"),
"dt2": date_range("20130101", periods=3),
"dt2": date_range("20130101", periods=3).astype("M8[s]"),
# 'dt3' : date_range('20130101 00:00:01',periods=3,freq='s'),
},
index=range(3),
)
assert expected.dtypes["dt1"] == "M8[ns]"
assert expected.dtypes["dt2"] == "M8[s]"

df = DataFrame(index=range(3))
df["dt1"] = np.datetime64("2013-01-01")
Expand Down
6 changes: 3 additions & 3 deletions pandas/tests/indexes/datetimes/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -913,9 +913,9 @@ def test_constructor_no_precision_raises(self):
Index(["2000"], dtype="datetime64")

def test_constructor_wrong_precision_raises(self):
msg = "Unexpected value for 'dtype': 'datetime64\\[us\\]'"
with pytest.raises(ValueError, match=msg):
DatetimeIndex(["2000"], dtype="datetime64[us]")
dti = DatetimeIndex(["2000"], dtype="datetime64[us]")
assert dti.dtype == "M8[us]"
assert dti[0] == Timestamp(2000, 1, 1)

def test_index_constructor_with_numpy_object_array_and_timestamp_tz_with_nan(self):
# GH 27011
Expand Down
16 changes: 13 additions & 3 deletions pandas/tests/reshape/merge/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -731,18 +731,28 @@ def test_other_datetime_unit(self, unit):

dtype = f"datetime64[{unit}]"
df2 = ser.astype(dtype).to_frame("days")
# coerces to datetime64[ns], thus should not be affected
assert df2["days"].dtype == "datetime64[ns]"

if unit in ["D", "h", "m"]:
# not supported so we cast to the nearest supported unit, seconds
# (this previously coerced to datetime64[ns]; it now casts to the
# nearest supported unit, so the expected dtype is datetime64[s])
exp_dtype = "datetime64[s]"
else:
exp_dtype = dtype
assert df2["days"].dtype == exp_dtype

result = df1.merge(df2, left_on="entity_id", right_index=True)

days = np.array(["nat", "nat"], dtype=exp_dtype)
days = pd.core.arrays.DatetimeArray._simple_new(days, dtype=days.dtype)
exp = DataFrame(
{
"entity_id": [101, 102],
"days": np.array(["nat", "nat"], dtype="datetime64[ns]"),
"days": days,
},
columns=["entity_id", "days"],
)
assert exp["days"].dtype == exp_dtype
tm.assert_frame_equal(result, exp)

@pytest.mark.parametrize("unit", ["D", "h", "m", "s", "ms", "us", "ns"])
Expand Down
Loading