Skip to content

BUG: lib.infer_dtype with mixed-freq Periods #41526

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
May 18, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ Other enhancements
- Constructing a :class:`DataFrame` or :class:`Series` with the ``data`` argument being a Python iterable that is *not* a NumPy ``ndarray`` consisting of NumPy scalars will now result in a dtype with a precision the maximum of the NumPy scalars; this was already the case when ``data`` is a NumPy ``ndarray`` (:issue:`40908`)
- Add keyword ``sort`` to :func:`pivot_table` to allow non-sorting of the result (:issue:`39143`)
- Add keyword ``dropna`` to :meth:`DataFrame.value_counts` to allow counting rows that include ``NA`` values (:issue:`41325`)
-
- :meth:`Series.replace` will now cast results to ``PeriodDtype`` where possible instead of ``object`` dtype (:issue:`41526`)

.. ---------------------------------------------------------------------------

Expand Down
19 changes: 14 additions & 5 deletions pandas/_libs/lib.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ def is_integer(val: object) -> bool: ...
def is_float(val: object) -> bool: ...

def is_interval_array(values: np.ndarray) -> bool: ...
def is_period_array(values: np.ndarray) -> bool: ...
def is_datetime64_array(values: np.ndarray) -> bool: ...
def is_timedelta_or_timedelta64_array(values: np.ndarray) -> bool: ...
def is_datetime_with_singletz_array(values: np.ndarray) -> bool: ...
Expand All @@ -67,50 +66,60 @@ def map_infer(
@overload # both convert_datetime and convert_to_nullable_integer False -> np.ndarray
def maybe_convert_objects(
objects: np.ndarray, # np.ndarray[object]
*,
try_float: bool = ...,
safe: bool = ...,
convert_datetime: Literal[False] = ...,
convert_timedelta: bool = ...,
convert_period: Literal[False] = ...,
convert_to_nullable_integer: Literal[False] = ...,
) -> np.ndarray: ...

@overload
def maybe_convert_objects(
objects: np.ndarray, # np.ndarray[object]
*,
try_float: bool = ...,
safe: bool = ...,
convert_datetime: Literal[False] = False,
convert_datetime: bool = ...,
convert_timedelta: bool = ...,
convert_period: bool = ...,
convert_to_nullable_integer: Literal[True] = ...,
) -> ArrayLike: ...

@overload
def maybe_convert_objects(
objects: np.ndarray, # np.ndarray[object]
*,
try_float: bool = ...,
safe: bool = ...,
convert_datetime: Literal[True] = ...,
convert_timedelta: bool = ...,
convert_to_nullable_integer: Literal[False] = ...,
convert_period: bool = ...,
convert_to_nullable_integer: bool = ...,
) -> ArrayLike: ...

@overload
def maybe_convert_objects(
objects: np.ndarray, # np.ndarray[object]
*,
try_float: bool = ...,
safe: bool = ...,
convert_datetime: Literal[True] = ...,
convert_datetime: bool = ...,
convert_timedelta: bool = ...,
convert_to_nullable_integer: Literal[True] = ...,
convert_period: Literal[True] = ...,
convert_to_nullable_integer: bool = ...,
) -> ArrayLike: ...

@overload
def maybe_convert_objects(
objects: np.ndarray, # np.ndarray[object]
*,
try_float: bool = ...,
safe: bool = ...,
convert_datetime: bool = ...,
convert_timedelta: bool = ...,
convert_period: bool = ...,
convert_to_nullable_integer: bool = ...,
) -> ArrayLike: ...

Expand Down
77 changes: 54 additions & 23 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1186,6 +1186,7 @@ cdef class Seen:
bint coerce_numeric # coerce data to numeric
bint timedelta_ # seen_timedelta
bint datetimetz_ # seen_datetimetz
bint period_ # seen_period

def __cinit__(self, bint coerce_numeric=False):
"""
Expand All @@ -1210,6 +1211,7 @@ cdef class Seen:
self.datetime_ = False
self.timedelta_ = False
self.datetimetz_ = False
self.period_ = False
self.coerce_numeric = coerce_numeric

cdef inline bint check_uint64_conflict(self) except -1:
Expand Down Expand Up @@ -1996,18 +1998,35 @@ cpdef bint is_time_array(ndarray values, bint skipna=False):
return validator.validate(values)


cdef class PeriodValidator(TemporalValidator):
cdef inline bint is_value_typed(self, object value) except -1:
return is_period_object(value)
cdef bint is_period_array(ndarray[object] values):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you keep the current PeriodValidator format instead (e.g. just put this logic in `validate`)? This change breaks the existing pattern.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the same pattern we use for `is_datetime_with_singletz_array` (and will end up using for `is_interval_array` in an upcoming PR).

"""
Is this an ndarray of Period objects (or NaT) with a single `freq`?
"""
cdef:
Py_ssize_t i, n = len(values)
int dtype_code = -10000 # i.e. c_FreqGroup.FR_UND
object val

cdef inline bint is_valid_null(self, object value) except -1:
return checknull_with_nat(value)
if len(values) == 0:
return False

for val in values:
if is_period_object(val):
if dtype_code == -10000:
dtype_code = val._dtype._dtype_code
elif dtype_code != val._dtype._dtype_code:
# mismatched freqs
return False
elif checknull_with_nat(val):
pass
else:
# Not a Period or NaT-like
return False

cpdef bint is_period_array(ndarray values):
cdef:
PeriodValidator validator = PeriodValidator(len(values), skipna=True)
return validator.validate(values)
if dtype_code == -10000:
# we saw all-NaTs, no actual Periods
return False
return True


cdef class IntervalValidator(Validator):
Expand Down Expand Up @@ -2249,9 +2268,13 @@ def maybe_convert_numeric(

@cython.boundscheck(False)
@cython.wraparound(False)
def maybe_convert_objects(ndarray[object] objects, bint try_float=False,
bint safe=False, bint convert_datetime=False,
def maybe_convert_objects(ndarray[object] objects,
*,
bint try_float=False,
bint safe=False,
bint convert_datetime=False,
bint convert_timedelta=False,
bint convert_period=False,
bint convert_to_nullable_integer=False) -> "ArrayLike":
"""
Type inference function-- convert object array to proper dtype
Expand All @@ -2272,6 +2295,9 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False,
convert_timedelta : bool, default False
If an array-like object contains only timedelta values or NaT is
encountered, whether to convert and return an array of m8[ns] dtype.
convert_period : bool, default False
If an array-like object contains only (homogeneous-freq) Period values
or NaT, whether to convert and return a PeriodArray.
convert_to_nullable_integer : bool, default False
If an array-like object contains only integer values (and NaN) is
encountered, whether to convert and return an IntegerArray.
Expand All @@ -2292,7 +2318,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False,
int64_t[:] itimedeltas
Seen seen = Seen()
object val
float64_t fval, fnan
float64_t fval, fnan = np.nan

n = len(objects)

Expand All @@ -2311,8 +2337,6 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False,
timedeltas = np.empty(n, dtype='m8[ns]')
itimedeltas = timedeltas.view(np.int64)

fnan = np.nan

for i in range(n):
val = objects[i]
if itemsize_max != -1:
Expand All @@ -2330,7 +2354,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False,
idatetimes[i] = NPY_NAT
if convert_timedelta:
itimedeltas[i] = NPY_NAT
if not (convert_datetime or convert_timedelta):
if not (convert_datetime or convert_timedelta or convert_period):
seen.object_ = True
break
elif val is np.nan:
Expand All @@ -2343,14 +2367,6 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False,
elif util.is_float_object(val):
floats[i] = complexes[i] = val
seen.float_ = True
elif util.is_datetime64_object(val):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unrelated to the rest of the PR: this chunk of code is redundant with a chunk below.

if convert_datetime:
idatetimes[i] = convert_to_tsobject(
val, None, None, 0, 0).value
seen.datetime_ = True
else:
seen.object_ = True
break
elif is_timedelta(val):
if convert_timedelta:
itimedeltas[i] = convert_to_timedelta64(val, "ns").view("i8")
Expand Down Expand Up @@ -2396,6 +2412,13 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False,
else:
seen.object_ = True
break
elif is_period_object(val):
if convert_period:
seen.period_ = True
break
else:
seen.object_ = True
break
elif try_float and not isinstance(val, str):
# this will convert Decimal objects
try:
Expand All @@ -2419,6 +2442,14 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False,
return dti._data
seen.object_ = True

if seen.period_:
if is_period_array(objects):
from pandas import PeriodIndex
pi = PeriodIndex(objects)

# unbox to PeriodArray
return pi._data

if not seen.object_:
result = None
if not safe:
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/arrays/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,7 +261,7 @@ def _box_values(self, values) -> np.ndarray:
"""
apply box func to passed values
"""
return lib.map_infer(values, self._box_func)
return lib.map_infer(values, self._box_func, convert=False)

def __iter__(self):
if self.ndim > 1:
Expand Down
7 changes: 6 additions & 1 deletion pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -1315,6 +1315,7 @@ def soft_convert_objects(
datetime: bool = True,
numeric: bool = True,
timedelta: bool = True,
period: bool = True,
copy: bool = True,
) -> ArrayLike:
"""
Expand All @@ -1327,6 +1328,7 @@ def soft_convert_objects(
datetime : bool, default True
numeric: bool, default True
timedelta : bool, default True
period : bool, default True
copy : bool, default True

Returns
Expand All @@ -1348,7 +1350,10 @@ def soft_convert_objects(
# bound of nanosecond-resolution 64-bit integers.
try:
converted = lib.maybe_convert_objects(
values, convert_datetime=datetime, convert_timedelta=timedelta
values,
convert_datetime=datetime,
convert_timedelta=timedelta,
convert_period=period,
)
except (OutOfBoundsDatetime, ValueError):
return values
Expand Down
16 changes: 14 additions & 2 deletions pandas/tests/dtypes/test_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -1105,8 +1105,9 @@ def test_infer_dtype_period(self):
arr = np.array([Period("2011-01", freq="D"), Period("2011-02", freq="D")])
assert lib.infer_dtype(arr, skipna=True) == "period"

# non-homogeneous freqs -> mixed
arr = np.array([Period("2011-01", freq="D"), Period("2011-02", freq="M")])
assert lib.infer_dtype(arr, skipna=True) == "period"
assert lib.infer_dtype(arr, skipna=True) == "mixed"

@pytest.mark.parametrize("klass", [pd.array, Series, Index])
@pytest.mark.parametrize("skipna", [True, False])
Expand All @@ -1121,6 +1122,18 @@ def test_infer_dtype_period_array(self, klass, skipna):
)
assert lib.infer_dtype(values, skipna=skipna) == "period"

# periods but mixed freq
values = klass(
[
Period("2011-01-01", freq="D"),
Period("2011-01-02", freq="M"),
pd.NaT,
]
)
# with pd.array this becomes PandasArray which ends up as "unknown-array"
exp = "unknown-array" if klass is pd.array else "mixed"
assert lib.infer_dtype(values, skipna=skipna) == exp

def test_infer_dtype_period_mixed(self):
arr = np.array(
[Period("2011-01", freq="M"), np.datetime64("nat")], dtype=object
Expand Down Expand Up @@ -1319,7 +1332,6 @@ def test_is_datetimelike_array_all_nan_nat_like(self):
"is_date_array",
"is_time_array",
"is_interval_array",
"is_period_array",
],
)
def test_other_dtypes_for_array(self, func):
Expand Down
8 changes: 3 additions & 5 deletions pandas/tests/frame/methods/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -1021,11 +1021,9 @@ def test_replace_period(self):
columns=["fname"],
)
assert set(df.fname.values) == set(d["fname"].keys())
# We don't support converting object -> specialized EA in
# replace yet.
expected = DataFrame(
{"fname": [d["fname"][k] for k in df.fname.values]}, dtype=object
)

expected = DataFrame({"fname": [d["fname"][k] for k in df.fname.values]})
assert expected.dtypes[0] == "Period[M]"
result = df.replace(d)
tm.assert_frame_equal(result, expected)

Expand Down
Loading