Skip to content

BUG: Fix mixed datetime dtype inference #33749

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 21 commits into from
Jun 1, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -759,6 +759,7 @@ Datetimelike
- Bug in :meth:`DatetimeIndex.to_period` not inferring the frequency when called with no arguments (:issue:`33358`)
- Bug in :meth:`DatetimeIndex.tz_localize` incorrectly retaining ``freq`` in some cases where the original freq is no longer valid (:issue:`30511`)
- Bug in :meth:`DatetimeIndex.intersection` losing ``freq`` and timezone in some cases (:issue:`33604`)
- Bug in :meth:`DatetimeIndex.get_indexer` where incorrect output would be returned for mixed datetime-like targets (:issue:`33741`)
- Bug in :class:`DatetimeIndex` addition and subtraction with some types of :class:`DateOffset` objects incorrectly retaining an invalid ``freq`` attribute (:issue:`33779`)
- Bug in :class:`DatetimeIndex` where setting the ``freq`` attribute on an index could silently change the ``freq`` attribute on another index viewing the same data (:issue:`33552`)
- :meth:`DataFrame.min`/:meth:`DataFrame.max` not returning consistent results with :meth:`Series.min`/:meth:`Series.max` when called on objects initialized with empty :func:`pd.to_datetime`
Expand Down
8 changes: 5 additions & 3 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1380,8 +1380,10 @@ def infer_dtype(value: object, skipna: bool = True) -> str:
return "mixed-integer"

elif PyDateTime_Check(val):
if is_datetime_array(values):
if is_datetime_array(values, skipna=skipna):
return "datetime"
elif is_date_array(values, skipna=skipna):
return "date"

elif PyDate_Check(val):
if is_date_array(values, skipna=skipna):
Expand Down Expand Up @@ -1752,10 +1754,10 @@ cdef class DatetimeValidator(TemporalValidator):
return is_null_datetime64(value)


cpdef bint is_datetime_array(ndarray values):
cpdef bint is_datetime_array(ndarray values, bint skipna=True):
cdef:
DatetimeValidator validator = DatetimeValidator(len(values),
skipna=True)
skipna=skipna)
return validator.validate(values)


Expand Down
5 changes: 4 additions & 1 deletion pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4701,7 +4701,10 @@ def _maybe_promote(self, other: "Index"):
"""

if self.inferred_type == "date" and isinstance(other, ABCDatetimeIndex):
return type(other)(self), other
try:
return type(other)(self), other
except OutOfBoundsDatetime:
return self, other
elif self.inferred_type == "timedelta" and isinstance(other, ABCTimedeltaIndex):
# TODO: we don't have tests that get here
return type(other)(self), other
Expand Down
15 changes: 15 additions & 0 deletions pandas/tests/dtypes/test_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -1106,6 +1106,21 @@ def test_date(self):
result = lib.infer_dtype(dates, skipna=True)
assert result == "date"

@pytest.mark.parametrize(
    "values",
    [
        [date(2020, 1, 1), pd.Timestamp("2020-01-01")],
        [pd.Timestamp("2020-01-01"), date(2020, 1, 1)],
        [date(2020, 1, 1), pd.NaT],
        [pd.NaT, date(2020, 1, 1)],
    ],
)
@pytest.mark.parametrize("skipna", [True, False])
def test_infer_dtype_date_order_invariant(self, values, skipna):
    """Check that ``lib.infer_dtype`` reports "date" whatever the element order.

    Each parametrized list pairs a ``date`` with either a ``pd.Timestamp``
    or ``pd.NaT``, in both orders, and is run with ``skipna`` True and False.
    """
    # https://github.com/pandas-dev/pandas/issues/33741
    assert lib.infer_dtype(values, skipna=skipna) == "date"

def test_is_numeric_array(self):

assert lib.is_float_array(np.array([1, 2.0]))
Expand Down
34 changes: 33 additions & 1 deletion pandas/tests/indexes/datetimes/test_indexing.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from datetime import datetime, time, timedelta
from datetime import date, datetime, time, timedelta

import numpy as np
import pytest
Expand Down Expand Up @@ -575,6 +575,38 @@ def test_get_indexer(self):
with pytest.raises(ValueError, match="abbreviation w/o a number"):
idx.get_indexer(idx[[0]], method="nearest", tolerance="foo")

@pytest.mark.parametrize(
    "target",
    [
        [date(2020, 1, 1), pd.Timestamp("2020-01-02")],
        [pd.Timestamp("2020-01-01"), date(2020, 1, 2)],
    ],
)
def test_get_indexer_mixed_dtypes(self, target):
    """Check ``get_indexer`` with a target mixing ``date`` and ``Timestamp``.

    The two-element ``DatetimeIndex`` holds 2020-01-01 and 2020-01-02; each
    parametrized target names those same two days, so positions 0 and 1
    are expected.
    """
    # https://github.com/pandas-dev/pandas/issues/33741
    stamps = [pd.Timestamp("2020-01-01"), pd.Timestamp("2020-01-02")]
    index = pd.DatetimeIndex(stamps)
    tm.assert_numpy_array_equal(
        index.get_indexer(target),
        np.array([0, 1], dtype=np.intp),
    )

@pytest.mark.parametrize(
    "target, positions",
    [
        ([date(9999, 1, 1), pd.Timestamp("2020-01-01")], [-1, 0]),
        ([pd.Timestamp("2020-01-01"), date(9999, 1, 1)], [0, -1]),
        ([date(9999, 1, 1), date(9999, 1, 1)], [-1, -1]),
    ],
)
def test_get_indexer_out_of_bounds_date(self, target, positions):
    """Check ``get_indexer`` when the target contains ``date(9999, 1, 1)``.

    The index holds 2020-01-01 and 2020-01-02. Every year-9999 entry is
    expected at position -1, while a matching ``Timestamp`` still resolves
    to its real position.
    """
    stamps = [pd.Timestamp("2020-01-01"), pd.Timestamp("2020-01-02")]
    index = pd.DatetimeIndex(stamps)
    tm.assert_numpy_array_equal(
        index.get_indexer(target),
        np.array(positions, dtype=np.intp),
    )


class TestMaybeCastSliceBound:
def test_maybe_cast_slice_bounds_empty(self):
Expand Down