
Use ea interface to calculate accumulator functions for datetimelike #50297


Merged
15 commits merged on Jan 13, 2023
72 changes: 72 additions & 0 deletions pandas/core/array_algos/datetimelike_accumulations.py
@@ -0,0 +1,72 @@
"""
datetimelike_accumulations.py is for accumulations of datetimelike extension arrays
"""

from __future__ import annotations

from typing import Callable

import numpy as np

from pandas._libs import iNaT

from pandas.core.dtypes.missing import isna


def _cum_func(
func: Callable,
values: np.ndarray,
*,
skipna: bool = True,
):
"""
Accumulations for 1D datetimelike arrays.

Parameters
----------
func : np.cumsum, np.cumprod, np.maximum.accumulate, np.minimum.accumulate
Member
Suggested change:
- func : np.cumsum, np.cumprod, np.maximum.accumulate, np.minimum.accumulate
+ func : np.cumsum, np.maximum.accumulate, np.minimum.accumulate

Member Author
thx removed

values : np.ndarray
Numpy array with the values (can be of any dtype that supports the
operation).
skipna : bool, default True
Whether to skip NA.
"""
try:
fill_value = {
np.cumprod: 1,
Member
is np.cumprod relevant?

Member Author
Hm I guess we can remove it

np.maximum.accumulate: np.iinfo(np.int64).min,
np.cumsum: 0,
np.minimum.accumulate: np.iinfo(np.int64).max,
}[func]
except KeyError:
raise ValueError(f"No accumulation for {func} implemented on BaseMaskedArray")

mask = isna(values)
y = values.view("i8")
y[mask] = fill_value

if not skipna:
mask = np.maximum.accumulate(mask)

result = func(y)
result[mask] = iNaT

if values.dtype.kind in ["m", "M"]:
return result.view(values.dtype.base)
return result


def cumsum(values: np.ndarray, *, skipna: bool = True):
Member
-> np.ndarray?

Member Author
Yep good point, this one works. The others aren't that easy...

return _cum_func(np.cumsum, values, skipna=skipna)


def cumprod(values: np.ndarray, *, skipna: bool = True):
return _cum_func(np.cumprod, values, skipna=skipna)


def cummin(values: np.ndarray, *, skipna: bool = True):
return _cum_func(np.minimum.accumulate, values, skipna=skipna)


def cummax(values: np.ndarray, *, skipna: bool = True):
return _cum_func(np.maximum.accumulate, values, skipna=skipna)
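
For context, a minimal sketch (not part of the diff) of how these helpers behave, assuming a pandas build that includes this module. The output comments are approximate; note that _cum_func writes the fill value into an int64 view of its input, so callers pass a copy.

import numpy as np

from pandas.core.array_algos import datetimelike_accumulations

# NaT positions are filled with a neutral value, the accumulation runs on the
# int64 view, and NaT is restored afterwards.
values = np.array(["2016-01-02", "NaT", "2016-01-01"], dtype="M8[ns]")

print(datetimelike_accumulations.cummin(values.copy(), skipna=True))
# -> [2016-01-02, NaT, 2016-01-01]; NaT is skipped, the running minimum continues

print(datetimelike_accumulations.cummin(values.copy(), skipna=False))
# -> [2016-01-02, NaT, NaT]; everything from the first NaT on stays NaT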
25 changes: 8 additions & 17 deletions pandas/core/arrays/datetimelike.py
Original file line number Diff line number Diff line change
@@ -122,6 +122,7 @@
isin,
unique1d,
)
from pandas.core.array_algos import datetimelike_accumulations
from pandas.core.arraylike import OpsMixin
from pandas.core.arrays._mixins import (
NDArrayBackedExtensionArray,
@@ -1380,25 +1381,15 @@ def _addsub_object_array(self, other: np.ndarray, op):
return result

def _accumulate(self, name: str, *, skipna: bool = True, **kwargs):
if name not in {"cummin", "cummax"}:
raise TypeError(f"Accumulation {name} not supported for {type(self)}")

if is_period_dtype(self.dtype):
data = self
else:
# Incompatible types in assignment (expression has type
# "ndarray[Any, Any]", variable has type "DatetimeLikeArrayMixin"
data = self._ndarray.copy() # type: ignore[assignment]

if name in {"cummin", "cummax"}:
func = np.minimum.accumulate if name == "cummin" else np.maximum.accumulate
result = cast(np.ndarray, nanops.na_accum_func(data, func, skipna=skipna))

# error: Unexpected keyword argument "freq" for
# "_simple_new" of "NDArrayBacked" [call-arg]
return type(self)._simple_new(
result, freq=self.freq, dtype=self.dtype # type: ignore[call-arg]
)
op = getattr(datetimelike_accumulations, name)
result = op(self.copy(), skipna=skipna, **kwargs)

raise TypeError(f"Accumulation {name} not supported for {type(self)}")
return type(self)._simple_new(
result, freq=self.freq, dtype=self.dtype # type: ignore[call-arg]
Member
i dont think retaining self.freq is going to be correct in general

Member Author
Can you elaborate when we should set freq to None?

Member
Pretty much anything that isn't a no-op should be not-freq-preserving. e.g. pd.date_range("2016-01-01", periods=3)._data.cummin()

Member Author
Thx, makes sense. Should we retain the freq in cases where we only have one element or not retain in general?

Not easy to hit btw :) But added a test now

Member
id just not-retain in general

Member Author
Sounds good, that's how I implemented it right now

)

@unpack_zerodim_and_defer("__add__")
def __add__(self, other):
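To illustrate the freq discussion above, a small sketch (not part of the diff), assuming current Series behavior: a cumulative minimum over an increasing daily range is constant, so the result is no longer "D"-spaced and the input's freq cannot be carried over.

import pandas as pd

ser = pd.Series(pd.date_range("2016-01-01", periods=3, freq="D"))
print(ser.cummin())
# 0   2016-01-01
# 1   2016-01-01
# 2   2016-01-01
# dtype: datetime64[ns]
# The result is not "D"-spaced, so the underlying array should not keep freq="D".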
5 changes: 2 additions & 3 deletions pandas/core/arrays/timedeltas.py
@@ -414,10 +414,9 @@ def std(
# Accumulations

def _accumulate(self, name: str, *, skipna: bool = True, **kwargs):

data = self._ndarray.copy()

if name == "cumsum":
data = self._ndarray.copy()

func = np.cumsum
result = cast(np.ndarray, nanops.na_accum_func(data, func, skipna=skipna))

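For reference, a small sketch (not part of the diff) of the cumsum behavior this method serves, assuming current Series semantics for skipna:

import pandas as pd

ser = pd.Series(pd.to_timedelta(["1 days", "2 days", pd.NaT, "3 days"]))

print(ser.cumsum())  # skipna=True by default: NaT stays NaT, the sum keeps running
# -> 1 days, 3 days, NaT, 6 days

print(ser.cumsum(skipna=False))  # everything from the first NaT on is NaT
# -> 1 days, 3 days, NaT, NaT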
47 changes: 1 addition & 46 deletions pandas/core/nanops.py
@@ -1712,52 +1712,7 @@ def na_accum_func(values: ArrayLike, accum_func, *, skipna: bool) -> ArrayLike:
}[accum_func]

# We will be applying this function to block values
if values.dtype.kind in ["m", "M"]:
# GH#30460, GH#29058
# numpy 1.18 started sorting NaTs at the end instead of beginning,
# so we need to work around to maintain backwards-consistency.
orig_dtype = values.dtype

# We need to define mask before masking NaTs
mask = isna(values)

y = values.view("i8")
# Note: the accum_func comparison fails as an "is" comparison
changed = accum_func == np.minimum.accumulate

try:
if changed:
y[mask] = lib.i8max

result = accum_func(y, axis=0)
finally:
if changed:
# restore NaT elements
y[mask] = iNaT

if skipna:
result[mask] = iNaT
elif accum_func == np.minimum.accumulate:
# Restore NaTs that we masked previously
nz = (~np.asarray(mask)).nonzero()[0]
if len(nz):
# everything up to the first non-na entry stays NaT
result[: nz[0]] = iNaT

if isinstance(values.dtype, np.dtype):
result = result.view(orig_dtype)
else:
# DatetimeArray/TimedeltaArray
# TODO: have this case go through a DTA method?
# For DatetimeTZDtype, view result as M8[ns]
npdtype = orig_dtype if isinstance(orig_dtype, np.dtype) else "M8[ns]"
# Item "type" of "Union[Type[ExtensionArray], Type[ndarray[Any, Any]]]"
# has no attribute "_simple_new"
result = type(values)._simple_new( # type: ignore[union-attr]
result.view(npdtype), dtype=orig_dtype
)

elif skipna and not issubclass(values.dtype.type, (np.integer, np.bool_)):
if skipna and not issubclass(values.dtype.type, (np.integer, np.bool_)):
Member
maybe a comment/check that "mM" cases should not get here?

Member
(OK for follow-up, i can add this into my next "assorted" branch)

Member Author
Added an assert

vals = values.copy()
mask = isna(vals)
vals[mask] = mask_a
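The "Added an assert" resolution above presumably boils down to a guard of roughly this shape near the top of na_accum_func (a sketch, not quoted from the final diff): datetime64/timedelta64 values are now handled by the EA _accumulate methods and should never reach this path.

# Hypothetical sketch of the guard; the exact wording in the merged code may differ.
assert values.dtype.kind not in "mM", (
    "datetime64/timedelta64 should go through DatetimeLikeArrayMixin._accumulate"
)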
24 changes: 22 additions & 2 deletions pandas/tests/series/test_cumulative.py
@@ -70,12 +70,12 @@ def test_cummin_cummax(self, datetime_series, method):
[
"cummax",
False,
["NaT", "2 days", "2 days", "2 days", "2 days", "3 days"],
["NaT", "NaT", "NaT", "NaT", "NaT", "NaT"],
],
[
"cummin",
False,
["NaT", "2 days", "2 days", "1 days", "1 days", "1 days"],
["NaT", "NaT", "NaT", "NaT", "NaT", "NaT"],
],
],
)
@@ -91,6 +91,26 @@ def test_cummin_cummax_datetimelike(self, ts, method, skipna, exp_tdi):
result = getattr(ser, method)(skipna=skipna)
tm.assert_series_equal(expected, result)

@pytest.mark.parametrize(
"func, exp",
[
("cummin", pd.Period("2012-1-1", freq="D")),
("cummax", pd.Period("2012-1-2", freq="D")),
],
)
def test_cummin_cummax_period(self, func, exp):
Member
thanks for checking this

# GH#28385
ser = pd.Series(
[pd.Period("2012-1-1", freq="D"), pd.NaT, pd.Period("2012-1-2", freq="D")]
)
result = getattr(ser, func)(skipna=False)
expected = pd.Series([pd.Period("2012-1-1", freq="D"), pd.NaT, pd.NaT])
tm.assert_series_equal(result, expected)

result = getattr(ser, func)(skipna=True)
expected = pd.Series([pd.Period("2012-1-1", freq="D"), pd.NaT, exp])
tm.assert_series_equal(result, expected)

@pytest.mark.parametrize(
"arg",
[