From f6a0f2870a09a3672ddb308658e872706b320f72 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 17 May 2021 13:16:20 -0700 Subject: [PATCH 1/2] BUG: lib.infer_dtype with mixed-freq Periods --- pandas/_libs/lib.pyi | 19 +++-- pandas/_libs/lib.pyx | 77 ++++++++++++++------ pandas/core/arrays/datetimelike.py | 2 +- pandas/core/dtypes/cast.py | 7 +- pandas/tests/dtypes/test_inference.py | 16 +++- pandas/tests/frame/methods/test_replace.py | 8 +- pandas/tests/series/methods/test_describe.py | 33 +++++---- 7 files changed, 111 insertions(+), 51 deletions(-) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 9dbc47f1d40f7..5e1cc612bed57 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -40,7 +40,6 @@ def is_integer(val: object) -> bool: ... def is_float(val: object) -> bool: ... def is_interval_array(values: np.ndarray) -> bool: ... -def is_period_array(values: np.ndarray) -> bool: ... def is_datetime64_array(values: np.ndarray) -> bool: ... def is_timedelta_or_timedelta64_array(values: np.ndarray) -> bool: ... def is_datetime_with_singletz_array(values: np.ndarray) -> bool: ... @@ -67,50 +66,60 @@ def map_infer( @overload # both convert_datetime and convert_to_nullable_integer False -> np.ndarray def maybe_convert_objects( objects: np.ndarray, # np.ndarray[object] + *, try_float: bool = ..., safe: bool = ..., convert_datetime: Literal[False] = ..., convert_timedelta: bool = ..., + convert_period: Literal[False] = ..., convert_to_nullable_integer: Literal[False] = ..., ) -> np.ndarray: ... @overload def maybe_convert_objects( objects: np.ndarray, # np.ndarray[object] + *, try_float: bool = ..., safe: bool = ..., - convert_datetime: Literal[False] = False, + convert_datetime: bool = ..., convert_timedelta: bool = ..., + convert_period: bool = ..., convert_to_nullable_integer: Literal[True] = ..., ) -> ArrayLike: ... @overload def maybe_convert_objects( objects: np.ndarray, # np.ndarray[object] + *, try_float: bool = ..., safe: bool = ..., convert_datetime: Literal[True] = ..., convert_timedelta: bool = ..., - convert_to_nullable_integer: Literal[False] = ..., + convert_period: bool = ..., + convert_to_nullable_integer: bool = ..., ) -> ArrayLike: ... @overload def maybe_convert_objects( objects: np.ndarray, # np.ndarray[object] + *, try_float: bool = ..., safe: bool = ..., - convert_datetime: Literal[True] = ..., + convert_datetime: bool = ..., convert_timedelta: bool = ..., - convert_to_nullable_integer: Literal[True] = ..., + convert_period: Literal[True] = ..., + convert_to_nullable_integer: bool = ..., ) -> ArrayLike: ... @overload def maybe_convert_objects( objects: np.ndarray, # np.ndarray[object] + *, try_float: bool = ..., safe: bool = ..., convert_datetime: bool = ..., convert_timedelta: bool = ..., + convert_period: bool = ..., convert_to_nullable_integer: bool = ..., ) -> ArrayLike: ... diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index e1cb744c7033c..cbef4ed44dc06 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1186,6 +1186,7 @@ cdef class Seen: bint coerce_numeric # coerce data to numeric bint timedelta_ # seen_timedelta bint datetimetz_ # seen_datetimetz + bint period_ # seen_period def __cinit__(self, bint coerce_numeric=False): """ @@ -1210,6 +1211,7 @@ cdef class Seen: self.datetime_ = False self.timedelta_ = False self.datetimetz_ = False + self.period_ = False self.coerce_numeric = coerce_numeric cdef inline bint check_uint64_conflict(self) except -1: @@ -1996,18 +1998,35 @@ cpdef bint is_time_array(ndarray values, bint skipna=False): return validator.validate(values) -cdef class PeriodValidator(TemporalValidator): - cdef inline bint is_value_typed(self, object value) except -1: - return is_period_object(value) +cdef bint is_period_array(ndarray[object] values): + """ + Is this an ndarray of Period objects (or NaT) with a single `freq`? + """ + cdef: + Py_ssize_t i, n = len(values) + int dtype_code = -10000 # i.e. c_FreqGroup.FR_UND + object val - cdef inline bint is_valid_null(self, object value) except -1: - return checknull_with_nat(value) + if len(values) == 0: + return False + for val in values: + if is_period_object(val): + if dtype_code == -10000: + dtype_code = val._dtype._dtype_code + elif dtype_code != val._dtype._dtype_code: + # mismatched freqs + return False + elif checknull_with_nat(val): + pass + else: + # Not a Period or NaT-like + return False -cpdef bint is_period_array(ndarray values): - cdef: - PeriodValidator validator = PeriodValidator(len(values), skipna=True) - return validator.validate(values) + if dtype_code == -10000: + # we saw all-NaTs, no actual Periods + return False + return True cdef class IntervalValidator(Validator): @@ -2249,9 +2268,13 @@ def maybe_convert_numeric( @cython.boundscheck(False) @cython.wraparound(False) -def maybe_convert_objects(ndarray[object] objects, bint try_float=False, - bint safe=False, bint convert_datetime=False, +def maybe_convert_objects(ndarray[object] objects, + *, + bint try_float=False, + bint safe=False, + bint convert_datetime=False, bint convert_timedelta=False, + bint convert_period=False, bint convert_to_nullable_integer=False) -> "ArrayLike": """ Type inference function-- convert object array to proper dtype @@ -2272,6 +2295,9 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False, convert_timedelta : bool, default False If an array-like object contains only timedelta values or NaT is encountered, whether to convert and return an array of m8[ns] dtype. + convert_period : bool, default False + If an array-like object contains only (homogeneous-freq) Period values + or NaT, whether to convert and return a PeriodArray. convert_to_nullable_integer : bool, default False If an array-like object contains only integer values (and NaN) is encountered, whether to convert and return an IntegerArray. @@ -2292,7 +2318,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False, int64_t[:] itimedeltas Seen seen = Seen() object val - float64_t fval, fnan + float64_t fval, fnan = np.nan n = len(objects) @@ -2311,8 +2337,6 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False, timedeltas = np.empty(n, dtype='m8[ns]') itimedeltas = timedeltas.view(np.int64) - fnan = np.nan - for i in range(n): val = objects[i] if itemsize_max != -1: @@ -2330,7 +2354,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False, idatetimes[i] = NPY_NAT if convert_timedelta: itimedeltas[i] = NPY_NAT - if not (convert_datetime or convert_timedelta): + if not (convert_datetime or convert_timedelta or convert_period): seen.object_ = True break elif val is np.nan: @@ -2343,14 +2367,6 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False, elif util.is_float_object(val): floats[i] = complexes[i] = val seen.float_ = True - elif util.is_datetime64_object(val): - if convert_datetime: - idatetimes[i] = convert_to_tsobject( - val, None, None, 0, 0).value - seen.datetime_ = True - else: - seen.object_ = True - break elif is_timedelta(val): if convert_timedelta: itimedeltas[i] = convert_to_timedelta64(val, "ns").view("i8") @@ -2396,6 +2412,13 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False, else: seen.object_ = True break + elif is_period_object(val): + if convert_period: + seen.period_ = True + break + else: + seen.object_ = True + break elif try_float and not isinstance(val, str): # this will convert Decimal objects try: @@ -2419,6 +2442,14 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False, return dti._data seen.object_ = True + if seen.period_: + if is_period_array(objects): + from pandas import PeriodIndex + pi = PeriodIndex(objects) + + # unbox to PeriodArray + return pi._data + if not seen.object_: result = None if not safe: diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 286fd8bf8ba4a..a3c58b6c6ae15 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -261,7 +261,7 @@ def _box_values(self, values) -> np.ndarray: """ apply box func to passed values """ - return lib.map_infer(values, self._box_func) + return lib.map_infer(values, self._box_func, convert=False) def __iter__(self): if self.ndim > 1: diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 46dc97214e2f6..783474c53f304 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1315,6 +1315,7 @@ def soft_convert_objects( datetime: bool = True, numeric: bool = True, timedelta: bool = True, + period: bool = True, copy: bool = True, ) -> ArrayLike: """ @@ -1327,6 +1328,7 @@ def soft_convert_objects( datetime : bool, default True numeric: bool, default True timedelta : bool, default True + period : bool, default True copy : bool, default True Returns @@ -1348,7 +1350,10 @@ def soft_convert_objects( # bound of nanosecond-resolution 64-bit integers. try: converted = lib.maybe_convert_objects( - values, convert_datetime=datetime, convert_timedelta=timedelta + values, + convert_datetime=datetime, + convert_timedelta=timedelta, + convert_period=period, ) except (OutOfBoundsDatetime, ValueError): return values diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index d16dda370c498..09efa97871fae 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -1105,8 +1105,9 @@ def test_infer_dtype_period(self): arr = np.array([Period("2011-01", freq="D"), Period("2011-02", freq="D")]) assert lib.infer_dtype(arr, skipna=True) == "period" + # non-homogeneous freqs -> mixed arr = np.array([Period("2011-01", freq="D"), Period("2011-02", freq="M")]) - assert lib.infer_dtype(arr, skipna=True) == "period" + assert lib.infer_dtype(arr, skipna=True) == "mixed" @pytest.mark.parametrize("klass", [pd.array, Series, Index]) @pytest.mark.parametrize("skipna", [True, False]) @@ -1121,6 +1122,18 @@ def test_infer_dtype_period_array(self, klass, skipna): ) assert lib.infer_dtype(values, skipna=skipna) == "period" + # periods but mixed freq + values = klass( + [ + Period("2011-01-01", freq="D"), + Period("2011-01-02", freq="M"), + pd.NaT, + ] + ) + # with pd.array this becomes PandasArray which ends up as "unknown-array" + exp = "unknown-array" if klass is pd.array else "mixed" + assert lib.infer_dtype(values, skipna=skipna) == exp + def test_infer_dtype_period_mixed(self): arr = np.array( [Period("2011-01", freq="M"), np.datetime64("nat")], dtype=object @@ -1319,7 +1332,6 @@ def test_is_datetimelike_array_all_nan_nat_like(self): "is_date_array", "is_time_array", "is_interval_array", - "is_period_array", ], ) def test_other_dtypes_for_array(self, func): diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index e6e992d37fd5d..645de6f193750 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -1021,11 +1021,9 @@ def test_replace_period(self): columns=["fname"], ) assert set(df.fname.values) == set(d["fname"].keys()) - # We don't support converting object -> specialized EA in - # replace yet. - expected = DataFrame( - {"fname": [d["fname"][k] for k in df.fname.values]}, dtype=object - ) + + expected = DataFrame({"fname": [d["fname"][k] for k in df.fname.values]}) + assert expected.dtypes[0] == "Period[M]" result = df.replace(d) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/methods/test_describe.py b/pandas/tests/series/methods/test_describe.py index bdb308ddbfd58..e6c6016d2b3a1 100644 --- a/pandas/tests/series/methods/test_describe.py +++ b/pandas/tests/series/methods/test_describe.py @@ -11,31 +11,35 @@ class TestSeriesDescribe: - def test_describe(self): - s = Series([0, 1, 2, 3, 4], name="int_data") - result = s.describe() + def test_describe_ints(self): + ser = Series([0, 1, 2, 3, 4], name="int_data") + result = ser.describe() expected = Series( - [5, 2, s.std(), 0, 1, 2, 3, 4], + [5, 2, ser.std(), 0, 1, 2, 3, 4], name="int_data", index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], ) tm.assert_series_equal(result, expected) - s = Series([True, True, False, False, False], name="bool_data") - result = s.describe() + def test_describe_bools(self): + ser = Series([True, True, False, False, False], name="bool_data") + result = ser.describe() expected = Series( [5, 2, False, 3], name="bool_data", index=["count", "unique", "top", "freq"] ) tm.assert_series_equal(result, expected) - s = Series(["a", "a", "b", "c", "d"], name="str_data") - result = s.describe() + def test_describe_strs(self): + + ser = Series(["a", "a", "b", "c", "d"], name="str_data") + result = ser.describe() expected = Series( [5, 4, "a", 2], name="str_data", index=["count", "unique", "top", "freq"] ) tm.assert_series_equal(result, expected) - s = Series( + def test_describe_timedelta64(self): + ser = Series( [ Timedelta("1 days"), Timedelta("2 days"), @@ -45,21 +49,22 @@ def test_describe(self): ], name="timedelta_data", ) - result = s.describe() + result = ser.describe() expected = Series( - [5, s[2], s.std(), s[0], s[1], s[2], s[3], s[4]], + [5, ser[2], ser.std(), ser[0], ser[1], ser[2], ser[3], ser[4]], name="timedelta_data", index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], ) tm.assert_series_equal(result, expected) - s = Series( + def test_describe_period(self): + ser = Series( [Period("2020-01", "M"), Period("2020-01", "M"), Period("2019-12", "M")], name="period_data", ) - result = s.describe() + result = ser.describe() expected = Series( - [3, 2, s[0], 2], + [3, 2, ser[0], 2], name="period_data", index=["count", "unique", "top", "freq"], ) From fbd82511c4974d826fec82568d48a38b0994b98f Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 17 May 2021 18:02:51 -0700 Subject: [PATCH 2/2] whatsnew --- doc/source/whatsnew/v1.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 9298bc6a61bae..25e482fea60ee 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -228,7 +228,7 @@ Other enhancements - Constructing a :class:`DataFrame` or :class:`Series` with the ``data`` argument being a Python iterable that is *not* a NumPy ``ndarray`` consisting of NumPy scalars will now result in a dtype with a precision the maximum of the NumPy scalars; this was already the case when ``data`` is a NumPy ``ndarray`` (:issue:`40908`) - Add keyword ``sort`` to :func:`pivot_table` to allow non-sorting of the result (:issue:`39143`) - Add keyword ``dropna`` to :meth:`DataFrame.value_counts` to allow counting rows that include ``NA`` values (:issue:`41325`) -- +- :meth:`Series.replace` will now cast results to ``PeriodDtype`` where possible instead of ``object`` dtype (:issue:`41526`) .. ---------------------------------------------------------------------------