BUG: DataFrame.append with timedelta64 #39574
Changes from 6 commits
@@ -1,9 +1,8 @@
from __future__ import annotations

from collections import defaultdict
import copy
import itertools
from typing import TYPE_CHECKING, Dict, List, Sequence, cast
from typing import TYPE_CHECKING, Dict, List, Sequence

import numpy as np
@@ -15,26 +14,21 @@
from pandas.core.dtypes.common import (
    get_dtype,
    is_categorical_dtype,
    is_datetime64_dtype,
    is_datetime64tz_dtype,
    is_extension_array_dtype,
    is_float_dtype,
    is_numeric_dtype,
    is_sparse,
    is_timedelta64_dtype,
)
from pandas.core.dtypes.concat import concat_compat
from pandas.core.dtypes.missing import isna_all
from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna_all

import pandas.core.algorithms as algos
from pandas.core.arrays import DatetimeArray, ExtensionArray
from pandas.core.arrays import DatetimeArray, ExtensionArray, TimedeltaArray
from pandas.core.internals.array_manager import ArrayManager
from pandas.core.internals.blocks import make_block
from pandas.core.internals.managers import BlockManager

if TYPE_CHECKING:
    from pandas import Index
    from pandas.core.arrays.sparse.dtype import SparseDtype


def concatenate_block_managers(
@@ -233,6 +227,24 @@ def dtype(self):
        else:
            return get_dtype(maybe_promote(self.block.dtype, self.block.fill_value)[0])

    def is_valid_na_for(self, dtype: DtypeObj) -> bool:
        """
        Check that we are all-NA of a type/dtype that is compatible with this dtype.
        """
        if not self.is_na:
            return False
        if self.block is None:
            return True

        if self.dtype == object:
            values = self.block.values
            return all(
                is_valid_nat_for_dtype(x, dtype) for x in values.ravel(order="K")
            )
Review discussion on this method:

- "This is not only required for object dtype, I think. Float NaN is also considered 'all NaN' when it comes to ignoring the dtype in concatenating DataFrames (and other dtypes as well, I think):"
- "The non-object case is handled below on L245-246. Or do you have something else in mind?"
- "Does my snippet above work with this PR?"
- "Yes, it does. I think that's driven by something sketchy-looking in get_reindexed_values; I'll see whether that can be addressed."
- "better?"
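For context, here is a snippet of the kind being discussed (hypothetical; the reviewer's original snippet is not preserved on this page). It checks that an all-NaN float column does not override the other frame's dtype when concatenating:

import numpy as np
import pandas as pd

df1 = pd.DataFrame({"a": pd.array([1, 2], dtype="Int64")})
df2 = pd.DataFrame({"a": [np.nan, np.nan]})  # float64 column that is all-NaN

# The all-NA float block is treated as missing data rather than as a dtype
# vote, so the result keeps Int64 instead of falling back to float64/object.
print(pd.concat([df1, df2], ignore_index=True).dtypes)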
        na_value = self.block.fill_value
        return is_valid_nat_for_dtype(na_value, dtype)

    @cache_readonly
    def is_na(self) -> bool:
        if self.block is None:
@@ -263,7 +275,7 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
        else:
            fill_value = upcasted_na

        if self.is_na:
        if self.is_valid_na_for(empty_dtype):
            blk_dtype = getattr(self.block, "dtype", None)

            if blk_dtype == np.dtype(object):
@@ -296,6 +308,8 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
                    empty_arr, allow_fill=True, fill_value=fill_value
                )
            else:
                # NB: we should never get here with empty_dtype integer or bool;
                # if we did, the missing_arr.fill would cast to gibberish
                missing_arr = np.empty(self.shape, dtype=empty_dtype)
                missing_arr.fill(fill_value)
                return missing_arr
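A minimal sketch of the failure mode that comment guards against: ndarray.fill casts its argument to the array's dtype, so filling an integer array with NaN silently writes an arbitrary integer (and True for bool) instead of raising:

import numpy as np

missing_arr = np.empty((1, 3), dtype=np.int64)
missing_arr.fill(np.nan)  # NaN is cast to int64; no error is raised
print(missing_arr)  # typically [[-9223372036854775808 ...]] -- the "gibberish"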
@@ -363,9 +377,11 @@ def _concatenate_join_units(
        # concatting with at least one EA means we are concatting a single column
        # the non-EA values are 2D arrays with shape (1, n)
        to_concat = [t if isinstance(t, ExtensionArray) else t[0, :] for t in to_concat]
        concat_values = concat_compat(to_concat, axis=0)
        if not isinstance(concat_values, ExtensionArray) or (
            isinstance(concat_values, DatetimeArray) and concat_values.tz is None
        concat_values = concat_compat(to_concat, axis=0, ea_compat_axis=True)
        if (
            not isinstance(concat_values, ExtensionArray)
            or (isinstance(concat_values, DatetimeArray) and concat_values.tz is None)
            or isinstance(concat_values, TimedeltaArray)
        ):
            # if the result of concat is not an EA but an ndarray, reshape to
            # 2D to put it in a non-EA Block
@@ -420,108 +436,23 @@ def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> DtypeObj:
        return empty_dtype

    has_none_blocks = any(unit.block is None for unit in join_units)
    dtypes = [None if unit.block is None else unit.dtype for unit in join_units]

    filtered_dtypes = [
    dtypes = [
        unit.dtype for unit in join_units if unit.block is not None and not unit.is_na
    ]
    if not len(filtered_dtypes):
        filtered_dtypes = [unit.dtype for unit in join_units if unit.block is not None]
    dtype_alt = find_common_type(filtered_dtypes)

    upcast_classes = _get_upcast_classes(join_units, dtypes)

    if is_extension_array_dtype(dtype_alt):
        return dtype_alt
    elif dtype_alt == object:
        return dtype_alt

    # TODO: de-duplicate with maybe_promote?
    # create the result
    if "extension" in upcast_classes:
        return np.dtype("object")
    elif "bool" in upcast_classes:
        if has_none_blocks:
            return np.dtype(np.object_)
        else:
            return np.dtype(np.bool_)
    elif "datetimetz" in upcast_classes:
        # GH-25014. We use NaT instead of iNaT, since this eventually
        # ends up in DatetimeArray.take, which does not allow iNaT.
        dtype = upcast_classes["datetimetz"]
        return dtype[0]
    elif "datetime" in upcast_classes:
        return np.dtype("M8[ns]")
    elif "timedelta" in upcast_classes:
        return np.dtype("m8[ns]")
    else:
        try:
            common_dtype = np.find_common_type(upcast_classes, [])
        except TypeError:
            # At least one is an ExtensionArray
            return np.dtype(np.object_)
        else:
            if is_float_dtype(common_dtype):
                return common_dtype
            elif is_numeric_dtype(common_dtype):
                if has_none_blocks:
                    return np.dtype(np.float64)
                else:
                    return common_dtype

    msg = "invalid dtype determination in get_concat_dtype"
    raise AssertionError(msg)


def _get_upcast_classes(
    join_units: Sequence[JoinUnit],
    dtypes: Sequence[DtypeObj],
) -> Dict[str, List[DtypeObj]]:
    """Create mapping between upcast class names and lists of dtypes."""
    upcast_classes: Dict[str, List[DtypeObj]] = defaultdict(list)
    null_upcast_classes: Dict[str, List[DtypeObj]] = defaultdict(list)
    for dtype, unit in zip(dtypes, join_units):
        if dtype is None:
            continue

        upcast_cls = _select_upcast_cls_from_dtype(dtype)
        # Null blocks should not influence upcast class selection, unless there
        # are only null blocks, when same upcasting rules must be applied to
        # null upcast classes.
        if unit.is_na:
            null_upcast_classes[upcast_cls].append(dtype)
        else:
            upcast_classes[upcast_cls].append(dtype)

    if not upcast_classes:
        upcast_classes = null_upcast_classes

    return upcast_classes


def _select_upcast_cls_from_dtype(dtype: DtypeObj) -> str:
    """Select upcast class name based on dtype."""
    if is_categorical_dtype(dtype):
        return "extension"
    elif is_datetime64tz_dtype(dtype):
        return "datetimetz"
    elif is_extension_array_dtype(dtype):
        return "extension"
    elif issubclass(dtype.type, np.bool_):
        return "bool"
    elif issubclass(dtype.type, np.object_):
        return "object"
    elif is_datetime64_dtype(dtype):
        return "datetime"
    elif is_timedelta64_dtype(dtype):
        return "timedelta"
    elif is_sparse(dtype):
        dtype = cast("SparseDtype", dtype)
        return dtype.subtype.name
    elif is_float_dtype(dtype) or is_numeric_dtype(dtype):
        return dtype.name
    else:
        return "float"
    if not len(dtypes):
        dtypes = [unit.dtype for unit in join_units if unit.block is not None]

    dtype = find_common_type(dtypes)
    if has_none_blocks:
        if not isinstance(dtype, np.dtype):
            # EA dtype
            pass
        elif dtype.kind in ["i", "u"]:
            dtype = np.dtype(np.float64)
        elif dtype.kind == "b":
            dtype = np.dtype(object)
    return dtype


def _is_uniform_join_units(join_units: List[JoinUnit]) -> bool:
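A hedged illustration of the simplified rules via public concat behavior (the helpers above are internal): when some join units have no block at all, i.e. a column is missing from one of the frames, an integer common dtype is upcast to float64 and a boolean one to object, matching the has_none_blocks branch:

import pandas as pd

left = pd.DataFrame({"a": [1, 2]})          # int64
right = pd.DataFrame({"b": [True, False]})  # bool; column "a" is absent here

out = pd.concat([left, right], ignore_index=True)
print(out.dtypes)  # a -> float64 (int + missing), b -> object (bool + missing)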
@@ -334,29 +334,34 @@ def test_append_missing_column_proper_upcast(self, sort):
    def test_append_empty_frame_to_series_with_dateutil_tz(self):
        # GH 23682
        date = Timestamp("2018-10-24 07:30:00", tz=dateutil.tz.tzutc())
        s = Series({"date": date, "a": 1.0, "b": 2.0})
        ser = Series({"date": date, "a": 1.0, "b": 2.0})
        df = DataFrame(columns=["c", "d"])
        result_a = df.append(s, ignore_index=True)
        result_a = df.append(ser, ignore_index=True)
        expected = DataFrame(
            [[np.nan, np.nan, 1.0, 2.0, date]], columns=["c", "d", "a", "b", "date"]
        )
        # These columns get cast to object after append
        expected["c"] = expected["c"].astype(object)
        expected["d"] = expected["d"].astype(object)
        expected["date"] = expected["date"].astype(object)
        # TODO: "date" might make sense to keep as dt64tz
        tm.assert_frame_equal(result_a, expected)

        expected = DataFrame(
            [[np.nan, np.nan, 1.0, 2.0, date]] * 2, columns=["c", "d", "a", "b", "date"]
        )
        expected["c"] = expected["c"].astype(object)
        expected["d"] = expected["d"].astype(object)

        result_b = result_a.append(s, ignore_index=True)
        expected["date"] = expected["date"].astype(object)
        # TODO: "date" might make sense to keep as dt64tz
        result_b = result_a.append(ser, ignore_index=True)
        tm.assert_frame_equal(result_b, expected)

        # column order is different
        expected = expected[["c", "d", "date", "a", "b"]]
        result = df.append([s, s], ignore_index=True)
        dtype = Series([date]).dtype
        expected["date"] = expected["date"].astype(dtype)
Review discussion on these two lines:

- "Is this still needed? (might be a left-over from astyping it to object before)"
- "You're right; updated."
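For reference, the dtype those lines look up (a quick check, not part of the PR itself): a Series constructed from a tz-aware Timestamp infers a tz-aware dtype, which is what the multi-append expectation was being cast back to:

import dateutil.tz
import pandas as pd

date = pd.Timestamp("2018-10-24 07:30:00", tz=dateutil.tz.tzutc())
print(pd.Series([date]).dtype)  # datetime64[ns, tzutc()]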
        result = df.append([ser, ser], ignore_index=True)
        tm.assert_frame_equal(result, expected)

    def test_append_empty_tz_frame_with_datetime64ns(self):
@@ -378,12 +383,27 @@ def test_append_empty_tz_frame_with_datetime64ns(self):
    @pytest.mark.parametrize(
        "dtype_str", ["datetime64[ns, UTC]", "datetime64[ns]", "Int64", "int64"]
    )
    def test_append_empty_frame_with_timedelta64ns_nat(self, dtype_str):
    @pytest.mark.parametrize("val", [1, "NaT"])
    def test_append_empty_frame_with_timedelta64ns_nat(self, dtype_str, val):
        # https://github.com/pandas-dev/pandas/issues/35460
        df = DataFrame(columns=["a"]).astype(dtype_str)

        other = DataFrame({"a": [np.timedelta64("NaT", "ns")]})
        other = DataFrame({"a": [np.timedelta64(val, "ns")]})
        result = df.append(other, ignore_index=True)

        expected = other.astype(object)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "dtype_str", ["datetime64[ns, UTC]", "datetime64[ns]", "Int64", "int64"]
    )
    @pytest.mark.parametrize("val", [1, "NaT"])
    def test_append_frame_with_timedelta64ns_nat(self, dtype_str, val):
        # https://github.com/pandas-dev/pandas/issues/35460
        df = DataFrame({"a": pd.array([1], dtype=dtype_str)})

        other = DataFrame({"a": [np.timedelta64(val, "ns")]})
        result = df.append(other, ignore_index=True)

        expected = DataFrame({"a": [df.iloc[0, 0], other.iloc[0, 0]]}, dtype=object)
        tm.assert_frame_equal(result, expected)
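Putting the first new test in user terms (a sketch using the pandas 1.x DataFrame.append API, since deprecated in favor of pd.concat): appending a timedelta64 value or NaT to an empty but dtyped column no longer blindly adopts that column's dtype; the incompatible combination lands on object:

import numpy as np
import pandas as pd

df = pd.DataFrame(columns=["a"]).astype("datetime64[ns, UTC]")  # empty, dtyped
other = pd.DataFrame({"a": [np.timedelta64("NaT", "ns")]})

result = df.append(other, ignore_index=True)
print(result.dtypes)  # object -- matches expected = other.astype(object)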