Skip to content

BUG: Series constructor drops nanoseconds of Timedelta scalar #38040

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 16 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -575,6 +575,7 @@ Datetimelike
- Bug in :meth:`Series.isin` with ``datetime64[ns]`` dtype and :meth:`.DatetimeIndex.isin` incorrectly casting integers to datetimes (:issue:`36621`)
- Bug in :meth:`Series.isin` with ``datetime64[ns]`` dtype and :meth:`.DatetimeIndex.isin` failing to consider timezone-aware and timezone-naive datetimes as always different (:issue:`35728`)
- Bug in :meth:`Series.isin` with ``PeriodDtype`` dtype and :meth:`PeriodIndex.isin` failing to consider arguments with different ``PeriodDtype`` as always different (:issue:`37528`)
- Bug in :class:`DataFrame` and :class:`Series` constructors sometimes dropping nanoseconds from :class:`Timestamp` (resp. :class:`Timedelta`) ``data``, with ``dtype=datetime64[ns]`` (resp. ``timedelta64[ns]``) (:issue:`38032`)

Timedelta
^^^^^^^^^
Expand Down
1 change: 1 addition & 0 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -712,6 +712,7 @@ def float_frame():
DatetimeTZDtype(tz="US/Eastern"),
),
(Timedelta(seconds=500), "timedelta64[ns]"),
(Timedelta(nanoseconds=1), "timedelta64[ns]"), # GH38032
]
)
def ea_scalar_and_dtype(request):
Expand Down
55 changes: 33 additions & 22 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -1611,13 +1611,42 @@ def cast_scalar_to_array(
ndarray of shape, filled with value, of specified / inferred dtype

"""
# that's what the type annotation indicates
assert isinstance(dtype, (type(None), str, np.dtype))

if dtype is None:
dtype, fill_value = infer_dtype_from_scalar(value)
dtype, value = infer_dtype_from_scalar(value)
else:
fill_value = value
if not isinstance(dtype, np.dtype):
dtype = np.dtype(dtype)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is there no risk of getting here with an EADtype?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As it is, no, cast_scalar_to_array is called after having checked dtype is not EADtype. It must have been written with that in mind.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add an assert isinstance(dtype, (type(None), str, np.dtype)) and a comment (that's what the type annotation indicates) (at the top of the function)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

empty = shape and not any(shape)
# dtype coercion when empty: sometimes yes, sometimes no?

if not empty and is_integer_dtype(dtype) and isna(value):
# coerce if we have nan for an integer dtype
dtype = np.dtype("float64")
elif isinstance(dtype, np.dtype) and dtype.kind in ("U", "S"):
# we need to coerce to object dtype to avoid
# to allow numpy to take our string as a scalar value
dtype = np.dtype("object")
if not isna(value):
value = ensure_str(value)
elif dtype.kind == "m":
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

could share the "m" and "M" code and for the is_valid_nat_for_dtype case use value = dtype.type("NaT", "ns")

# GH38032: filling in Timedelta/Timestamp drops nanoseconds
if isinstance(value, Timedelta):
value = value.to_numpy()
# GH36541: filling datetime-like array directly with pd.NaT
# raises ValueError: cannot convert float NaN to integer
elif is_valid_nat_for_dtype(value, dtype):
value = np.timedelta64("NaT")
elif dtype.kind == "M":
if isinstance(value, Timestamp):
value = value.to_numpy()
elif is_valid_nat_for_dtype(value, dtype):
value = np.datetime64("NaT")

values = np.empty(shape, dtype=dtype)
values.fill(fill_value)
values.fill(value)

return values

Expand All @@ -1643,26 +1672,8 @@ def construct_1d_arraylike_from_scalar(
if is_extension_array_dtype(dtype):
cls = dtype.construct_array_type()
subarr = cls._from_sequence([value] * length, dtype=dtype)

else:

if length and is_integer_dtype(dtype) and isna(value):
# coerce if we have nan for an integer dtype
dtype = np.dtype("float64")
elif isinstance(dtype, np.dtype) and dtype.kind in ("U", "S"):
# we need to coerce to object dtype to avoid
# to allow numpy to take our string as a scalar value
dtype = np.dtype("object")
if not isna(value):
value = ensure_str(value)
elif dtype.kind in ["M", "m"] and is_valid_nat_for_dtype(value, dtype):
# GH36541: can't fill array directly with pd.NaT
# > np.empty(10, dtype="datetime64[64]").fill(pd.NaT)
# ValueError: cannot convert float NaN to integer
value = np.datetime64("NaT")

subarr = np.empty(length, dtype=dtype)
subarr.fill(value)
subarr = cast_scalar_to_array((length,), value, dtype)

return subarr

Expand Down
7 changes: 1 addition & 6 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3923,12 +3923,7 @@ def reindexer(value):
value, len(self.index), infer_dtype
)
else:
# pandas\core\frame.py:3827: error: Argument 1 to
# "cast_scalar_to_array" has incompatible type "int"; expected
# "Tuple[Any, ...]" [arg-type]
value = cast_scalar_to_array(
len(self.index), value # type: ignore[arg-type]
)
value = cast_scalar_to_array((len(self.index),), value)

value = maybe_cast_to_datetime(value, infer_dtype)

Expand Down
30 changes: 26 additions & 4 deletions pandas/tests/dtypes/cast/test_infer_dtype.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from pandas import (
Categorical,
Interval,
NaT,
Period,
Series,
Timedelta,
Expand Down Expand Up @@ -188,11 +189,32 @@ def test_infer_dtype_from_array(arr, expected, pandas_dtype):
(Period("2011-01-01", freq="D"), object),
],
)
def test_cast_scalar_to_array(obj, dtype):
shape = (3, 2)

@pytest.mark.parametrize("shape", [(), (5,), (3, 2)])
def test_cast_scalar_to_array(obj, dtype, shape):
exp = np.empty(shape, dtype=dtype)
exp.fill(obj)

arr = cast_scalar_to_array(shape, obj, dtype=dtype)
arr = cast_scalar_to_array(shape, obj, dtype=np.dtype(dtype))
tm.assert_numpy_array_equal(arr, exp)


@pytest.mark.parametrize(
"obj_in,dtype_in,obj_out,dtype_out",
[
(NaT, "timedelta64[ns]", np.timedelta64("NaT"), "timedelta64[ns]"),
(Timedelta(1), "timedelta64[ns]", 1, "timedelta64[ns]"),
(NaT, "datetime64[ns]", np.datetime64("NaT"), "datetime64[ns]"),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pls add a Nat that is timedelta64[ns]

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

(Timestamp(1), "datetime64[ns]", 1, "datetime64[ns]"),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add a tz-aware Timestamp case
can you add Period, Interval as well

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you can probably use this fixture: ea_scalar_and_dtype

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

to be sure: as we cast to np.dtype, Period and Interval should be left aside, right?

(Timestamp(1, tz="US/Eastern"), "datetime64[ns]", 1, "datetime64[ns]"),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is a deeply weird behavior that we should be trying to deprecate

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, true, it can be counterintuitive that pd.Timestamp("1970-01-01", tz="US/Pacific") != pd.Timestamp(0, tz="US/Pacific")

(np.nan, np.int64, np.nan, np.float64),
("hello", "U", "hello", object),
("hello", "S", "hello", object),
],
)
@pytest.mark.parametrize("shape", [(), (5,), (3, 2)])
def test_cast_scalar_to_array_conversion_needed(
obj_in, dtype_in, obj_out, dtype_out, shape
):
result = cast_scalar_to_array(shape, obj_in, dtype=np.dtype(dtype_in))
expected = np.full(shape, obj_out, dtype=dtype_out)
tm.assert_numpy_array_equal(result, expected)
15 changes: 15 additions & 0 deletions pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -1934,6 +1934,21 @@ def test_constructor_datetimes_with_nulls(self, arr):
expected = Series([np.dtype("datetime64[ns]")])
tm.assert_series_equal(result, expected)

@pytest.mark.parametrize(
"scalar,dtype",
[
(Timedelta(1), "timedelta64[ns]"),
(Timestamp(1), "datetime64[ns]"),
(Timestamp(1, tz="US/Eastern"), "datetime64[ns]"),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same weird behavior

],
)
def test_constructor_timelike_nanoseconds(self, scalar, dtype):
# GH38032
df = DataFrame(scalar, index=[0], columns=[0], dtype=dtype)
result = df.at[0, 0].value
expected = scalar.value
assert result == expected

def test_constructor_for_list_with_dtypes(self):
# test list of lists/ndarrays
df = DataFrame([np.arange(5) for x in range(5)])
Expand Down
16 changes: 16 additions & 0 deletions pandas/tests/series/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
Period,
RangeIndex,
Series,
Timedelta,
Timestamp,
date_range,
isna,
Expand Down Expand Up @@ -1319,6 +1320,21 @@ def test_constructor_dtype_timedelta64(self):
s = Series([pd.NaT, np.nan, "1 Day"])
assert s.dtype == "timedelta64[ns]"

@pytest.mark.parametrize(
"scalar,dtype",
[
(Timedelta(1), "timedelta64[ns]"),
(Timestamp(1), "datetime64[ns]"),
(Timestamp(1, tz="US/Eastern"), "datetime64[ns]"),
],
)
def test_constructor_timelike_nanoseconds(self, scalar, dtype):
# GH38032
ser = Series(scalar, index=[0], dtype=dtype)
result = ser[0].value
expected = scalar.value
assert result == expected

# GH 16406
def test_constructor_mixed_tz(self):
s = Series([Timestamp("20130101"), Timestamp("20130101", tz="US/Eastern")])
Expand Down