Skip to content

PERF: make DTA/TDA/PA _ndarray the attribute, _data the property #40007

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Feb 24, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 19 additions & 4 deletions pandas/core/arrays/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray):
_infer_matches: Tuple[str, ...]
_is_recognized_dtype: Callable[[DtypeObj], bool]
_recognized_scalars: Tuple[Type, ...]
_data: np.ndarray
_ndarray: np.ndarray

def __init__(self, data, dtype: Optional[Dtype] = None, freq=None, copy=False):
raise AbstractMethodError(self)
Expand Down Expand Up @@ -253,9 +253,24 @@ def _check_compatible_with(
# ------------------------------------------------------------------
# NDArrayBackedExtensionArray compat

def __setstate__(self, state):
if isinstance(state, dict):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Whoa, but OK. Can we create an issue to track when we should remove things like this (e.g. call it "pickle compatibility" and point to this PR)? It will be a while for sure, but we should track it.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure. Yeah, pickle backwards compatibility is a PITA.

if "_data" in state and "_ndarray" not in state:
# backward compat, changed what is property vs attribute
state["_ndarray"] = state.pop("_data")
for key, value in state.items():
setattr(self, key, value)
else:
# PeriodArray, bc it mixes in a cython class
if isinstance(state, tuple) and len(state) == 1:
state = state[0]
self.__setstate__(state)
else:
raise TypeError(state)

@cache_readonly
def _ndarray(self) -> np.ndarray:
return self._data
def _data(self) -> np.ndarray:
return self._ndarray

def _from_backing_data(
self: DatetimeLikeArrayT, arr: np.ndarray
Expand Down Expand Up @@ -294,7 +309,7 @@ def asi8(self) -> np.ndarray:
An ndarray with int64 dtype.
"""
# do not cache or you'll create a memory leak
return self._data.view("i8")
return self._ndarray.view("i8")

# ----------------------------------------------------------------
# Rendering Methods
Expand Down
10 changes: 5 additions & 5 deletions pandas/core/arrays/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,7 +261,7 @@ def __init__(self, values, dtype=DT64NS_DTYPE, freq=None, copy=False):

if freq is None:
freq = values.freq
values = values._data
values = values._ndarray

if not isinstance(values, np.ndarray):
raise ValueError(
Expand Down Expand Up @@ -303,7 +303,7 @@ def __init__(self, values, dtype=DT64NS_DTYPE, freq=None, copy=False):
# be incorrect(ish?) for the array as a whole
dtype = DatetimeTZDtype(tz=timezones.tz_standardize(dtype.tz))

self._data = values
self._ndarray = values
self._dtype = dtype
self._freq = freq

Expand All @@ -320,7 +320,7 @@ def _simple_new(
values = values.view(DT64NS_DTYPE)

result = object.__new__(cls)
result._data = values
result._ndarray = values
result._freq = freq
result._dtype = dtype
return result
Expand Down Expand Up @@ -618,7 +618,7 @@ def astype(self, dtype, copy=True):

elif self.tz is None and is_datetime64_dtype(dtype) and dtype != self.dtype:
# unit conversion e.g. datetime64[s]
return self._data.astype(dtype)
return self._ndarray.astype(dtype)

elif is_period_dtype(dtype):
return self.to_period(freq=dtype.freq)
Expand Down Expand Up @@ -1138,7 +1138,7 @@ def to_period(self, freq=None):

freq = res

return PeriodArray._from_datetime64(self._data, freq, tz=self.tz)
return PeriodArray._from_datetime64(self._ndarray, freq, tz=self.tz)

def to_perioddelta(self, freq):
"""
Expand Down
10 changes: 6 additions & 4 deletions pandas/core/arrays/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,8 @@ class PeriodArray(PeriodMixin, dtl.DatelikeOps):
_datetimelike_ops = _field_ops + _object_ops + _bool_ops
_datetimelike_methods = ["strftime", "to_timestamp", "asfreq"]

__setstate__ = dtl.DatelikeOps.__setstate__

# --------------------------------------------------------------------
# Constructors

Expand All @@ -201,10 +203,10 @@ def __init__(self, values, dtype: Optional[Dtype] = None, freq=None, copy=False)
if isinstance(values, type(self)):
if freq is not None and freq != values.freq:
raise raise_on_incompatible(values, freq)
values, freq = values._data, values.freq
values, freq = values._ndarray, values.freq

values = np.array(values, dtype="int64", copy=copy)
self._data = values
self._ndarray = values
if freq is None:
raise ValueError("freq is not specified and cannot be inferred")
self._dtype = PeriodDtype(freq)
Expand Down Expand Up @@ -347,7 +349,7 @@ def __arrow_array__(self, type=None):

if type is not None:
if pyarrow.types.is_integer(type):
return pyarrow.array(self._data, mask=self.isna(), type=type)
return pyarrow.array(self._ndarray, mask=self.isna(), type=type)
elif isinstance(type, ArrowPeriodType):
# ensure we have the same freq
if self.freqstr != type.freq:
Expand All @@ -361,7 +363,7 @@ def __arrow_array__(self, type=None):
)

period_type = ArrowPeriodType(self.freqstr)
storage_array = pyarrow.array(self._data, mask=self.isna(), type="int64")
storage_array = pyarrow.array(self._ndarray, mask=self.isna(), type="int64")
return pyarrow.ExtensionArray.from_storage(period_type, storage_array)

# --------------------------------------------------------------------
Expand Down
51 changes: 28 additions & 23 deletions pandas/core/arrays/timedeltas.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,8 @@ def dtype(self) -> np.dtype:
# ----------------------------------------------------------------
# Constructors

_freq = None

def __init__(self, values, dtype=TD64NS_DTYPE, freq=lib.no_default, copy=False):
values = extract_array(values)

Expand All @@ -179,7 +181,7 @@ def __init__(self, values, dtype=TD64NS_DTYPE, freq=lib.no_default, copy=False):
elif freq and values.freq:
freq = to_offset(freq)
freq, _ = dtl.validate_inferred_freq(freq, values.freq, False)
values = values._data
values = values._ndarray

if not isinstance(values, np.ndarray):
msg = (
Expand Down Expand Up @@ -211,7 +213,7 @@ def __init__(self, values, dtype=TD64NS_DTYPE, freq=lib.no_default, copy=False):
if freq:
freq = to_offset(freq)

self._data = values
self._ndarray = values
self._dtype = dtype
self._freq = freq

Expand All @@ -229,7 +231,7 @@ def _simple_new(
values = values.view(TD64NS_DTYPE)

result = object.__new__(cls)
result._data = values
result._ndarray = values
result._freq = to_offset(freq)
result._dtype = TD64NS_DTYPE
return result
Expand Down Expand Up @@ -341,7 +343,7 @@ def astype(self, dtype, copy: bool = True):
dtype = pandas_dtype(dtype)

if dtype.kind == "m":
return astype_td64_unit_conversion(self._data, dtype, copy=copy)
return astype_td64_unit_conversion(self._ndarray, dtype, copy=copy)

return dtl.DatetimeLikeArrayMixin.astype(self, dtype, copy=copy)

Expand Down Expand Up @@ -415,8 +417,8 @@ def _formatter(self, boxed=False):
def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs):
from pandas.io.formats.format import get_format_timedelta64

formatter = get_format_timedelta64(self._data, na_rep)
return np.array([formatter(x) for x in self._data])
formatter = get_format_timedelta64(self._ndarray, na_rep)
return np.array([formatter(x) for x in self._ndarray])

# ----------------------------------------------------------------
# Arithmetic Methods
Expand Down Expand Up @@ -485,7 +487,7 @@ def _addsub_object_array(self, other, op):
def __mul__(self, other) -> TimedeltaArray:
if is_scalar(other):
# numpy will accept float and int, raise TypeError for others
result = self._data * other
result = self._ndarray * other
freq = None
if self.freq is not None and not isna(other):
freq = self.freq * other
Expand All @@ -508,7 +510,7 @@ def __mul__(self, other) -> TimedeltaArray:
return type(self)(result)

# numpy will accept float or int dtype, raise TypeError for others
result = self._data * other
result = self._ndarray * other
return type(self)(result)

__rmul__ = __mul__
Expand All @@ -526,11 +528,11 @@ def __truediv__(self, other):
return result

# otherwise, dispatch to Timedelta implementation
return self._data / other
return self._ndarray / other

elif lib.is_scalar(other):
# assume it is numeric
result = self._data / other
result = self._ndarray / other
freq = None
if self.freq is not None:
# Tick division is not implemented, so operate on Timedelta
Expand All @@ -546,7 +548,7 @@ def __truediv__(self, other):

elif is_timedelta64_dtype(other.dtype):
# let numpy handle it
return self._data / other
return self._ndarray / other

elif is_object_dtype(other.dtype):
# We operate on raveled arrays to avoid problems in inference
Expand All @@ -568,7 +570,7 @@ def __truediv__(self, other):
return result

else:
result = self._data / other
result = self._ndarray / other
return type(self)(result)

@unpack_zerodim_and_defer("__rtruediv__")
Expand All @@ -583,7 +585,7 @@ def __rtruediv__(self, other):
return result

# otherwise, dispatch to Timedelta implementation
return other / self._data
return other / self._ndarray

elif lib.is_scalar(other):
raise TypeError(
Expand All @@ -599,7 +601,7 @@ def __rtruediv__(self, other):

elif is_timedelta64_dtype(other.dtype):
# let numpy handle it
return other / self._data
return other / self._ndarray

elif is_object_dtype(other.dtype):
# Note: unlike in __truediv__, we do not _need_ to do type
Expand All @@ -626,7 +628,7 @@ def __floordiv__(self, other):
return result

# dispatch to Timedelta implementation
result = other.__rfloordiv__(self._data)
result = other.__rfloordiv__(self._ndarray)
return result

# at this point we should only have numeric scalars; anything
Expand Down Expand Up @@ -670,7 +672,7 @@ def __floordiv__(self, other):
return result

elif is_integer_dtype(other.dtype) or is_float_dtype(other.dtype):
result = self._data // other
result = self._ndarray // other
return type(self)(result)

else:
Expand All @@ -690,7 +692,7 @@ def __rfloordiv__(self, other):
return result

# dispatch to Timedelta implementation
result = other.__floordiv__(self._data)
result = other.__floordiv__(self._ndarray)
return result

raise TypeError(
Expand Down Expand Up @@ -760,15 +762,15 @@ def __rdivmod__(self, other):

def __neg__(self) -> TimedeltaArray:
if self.freq is not None:
return type(self)(-self._data, freq=-self.freq)
return type(self)(-self._data)
return type(self)(-self._ndarray, freq=-self.freq)
return type(self)(-self._ndarray)

def __pos__(self) -> TimedeltaArray:
return type(self)(self._data, freq=self.freq)
return type(self)(self._ndarray, freq=self.freq)

def __abs__(self) -> TimedeltaArray:
# Note: freq is not preserved
return type(self)(np.abs(self._data))
return type(self)(np.abs(self._ndarray))

# ----------------------------------------------------------------
# Conversion Methods - Vectorized analogues of Timedelta methods
Expand Down Expand Up @@ -946,9 +948,12 @@ def sequence_to_td64ns(data, copy=False, unit=None, errors="raise"):
data = np.array(data, copy=False)
elif isinstance(data, ABCSeries):
data = data._values
elif isinstance(data, (ABCTimedeltaIndex, TimedeltaArray)):
elif isinstance(data, ABCTimedeltaIndex):
inferred_freq = data.freq
data = data._data._ndarray
elif isinstance(data, TimedeltaArray):
inferred_freq = data.freq
data = data._data
data = data._ndarray
elif isinstance(data, IntegerArray):
data = data.to_numpy("int64", na_value=tslibs.iNaT)
elif is_categorical_dtype(data.dtype):
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/indexes/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ def _simple_new(
result._cache = {}

# For groupby perf. See note in indexes/base about _index_data
result._index_data = values._data
result._index_data = values._ndarray

result._reset_identity()
return result
Expand All @@ -165,7 +165,7 @@ def _is_all_dates(self) -> bool:
@property
def values(self) -> np.ndarray:
# Note: PeriodArray overrides this to return an ndarray of objects.
return self._data._data
return self._data._ndarray

def __array_wrap__(self, result, context=None):
"""
Expand Down