From e48f9aabf9899925880cff764fdbcff71b58f94c Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Thu, 27 Dec 2018 16:45:47 -0800 Subject: [PATCH 1/5] implement searchsorted, repeat, datetimelike portions of #24024 --- pandas/core/arrays/datetimelike.py | 135 +++++++++++++++++++++++ pandas/core/arrays/datetimes.py | 19 ++++ pandas/core/arrays/period.py | 20 ++++ pandas/core/arrays/timedeltas.py | 23 +++- pandas/core/indexes/datetimelike.py | 1 + pandas/core/indexes/datetimes.py | 5 + pandas/core/indexes/period.py | 4 + pandas/core/indexes/timedeltas.py | 4 + pandas/tests/arrays/test_datetimelike.py | 42 +++++++ 9 files changed, 250 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 149bb07d23254..f4f8e4e8277bb 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -12,6 +12,7 @@ from pandas._libs.tslibs.timestamps import ( RoundTo, maybe_integer_op_deprecated, round_nsint64) import pandas.compat as compat +from pandas.compat.numpy import function as nv from pandas.errors import ( AbstractMethodError, NullFrequencyError, PerformanceWarning) from pandas.util._decorators import Appender, Substitution, deprecate_kwarg @@ -80,6 +81,79 @@ def _get_attributes_dict(self): """ return {k: getattr(self, k, None) for k in self._attributes} + @property + def _scalar_type(self): + # type: () -> Union[type, Tuple[type]] + """The scalar associated with this datelike + + * PeriodArray : Period + * DatetimeArray : Timestamp + * TimedeltaArray : Timedelta + """ + raise AbstractMethodError(self) + + def _scalar_from_string(self, value): + # type: (str) -> Union[Period, Timestamp, Timedelta, NaTType] + """ + Construct a scalar type from a string. + + Parameters + ---------- + value : str + + Returns + ------- + Period, Timestamp, or Timedelta, or NaT + Whatever the type of ``self._scalar_type`` is. + + Notes + ----- + This should call ``self._check_compatible_with`` before + unboxing the result. + """ + raise AbstractMethodError(self) + + def _unbox_scalar(self, value): + # type: (Union[Period, Timestamp, Timedelta, NaTType]) -> int + """ + Unbox the integer value of a scalar `value`. + + Parameters + ---------- + value : Union[Period, Timestamp, Timedelta] + + Returns + ------- + int + + Examples + -------- + >>> self._unbox_scalar(Timedelta('10s')) # DOCTEST: +SKIP + 10000000000 + """ + raise AbstractMethodError(self) + + def _check_compatible_with(self, other): + # TODO: choose a type for other + # Can it be NaT? + # Scalar, array, or both? + """ + Verify that `self` and `other` are compatible. + + * DatetimeArray verifies that the timezones (if any) match + * PeriodArray verifies that the freq matches + * Timedelta has no verification + + Parameters + ---------- + other + + Raises + ------ + Exception + """ + raise AbstractMethodError(self) + class DatelikeOps(object): """ @@ -468,6 +542,67 @@ def _values_for_factorize(self): def _from_factorized(cls, values, original): return cls(values, dtype=original.dtype) + def _values_for_argsort(self): + return self._data + + # ------------------------------------------------------------------ + # Additional array methods + # These are not part of the EA API, but we implement them because + # pandas assumes they're there. + + def searchsorted(self, value, side='left', sorter=None): + """ + Find indices where elements should be inserted to maintain order. + + Find the indices into a sorted array `self` such that, if the + corresponding elements in `value` were inserted before the indices, + the order of `self` would be preserved. + + Parameters + ---------- + value : array_like + Values to insert into `self`. + side : {'left', 'right'}, optional + If 'left', the index of the first suitable location found is given. + If 'right', return the last such index. If there is no suitable + index, return either 0 or N (where N is the length of `self`). + sorter : 1-D array_like, optional + Optional array of integer indices that sort `self` into ascending + order. They are typically the result of ``np.argsort``. + + Returns + ------- + indices : array of ints + Array of insertion points with the same shape as `value`. + """ + if isinstance(value, compat.string_types): + value = self._scalar_from_string(value) + + if not (isinstance(value, (self._scalar_type, type(self))) + or isna(value)): + msg = "Unexpected type for 'value': {}".format(type(value)) + raise ValueError(msg) + + self._check_compatible_with(value) + if isinstance(value, type(self)): + value = value.asi8 + else: + value = self._unbox_scalar(value) + + return self.asi8.searchsorted(value, side=side, sorter=sorter) + + def repeat(self, repeats, *args, **kwargs): + """ + Repeat elements of an array. + + See Also + -------- + numpy.ndarray.repeat + """ + nv.validate_repeat(args, kwargs) + values = self._data.repeat(repeats) + return type(self)(values, dtype=self.dtype) + # ------------------------------------------------------------------ # Null Handling diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index f7a8bdb201bfd..22d9f987743b4 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -170,6 +170,7 @@ class DatetimeArrayMixin(dtl.DatetimeLikeArrayMixin, _data """ _typ = "datetimearray" + _scalar_type = Timestamp # define my properties & methods for delegation _bool_ops = ['is_month_start', 'is_month_end', @@ -346,6 +347,24 @@ def _generate_range(cls, start, end, periods, freq, tz=None, return cls._simple_new(index.asi8, freq=freq, tz=tz) + # ----------------------------------------------------------------- + # DatetimeLike Interface + + def _unbox_scalar(self, value): + if not isinstance(value, self._scalar_type) and value is not NaT: + raise ValueError("'value' should be a Timestamp.") + if not isna(value): + self._check_compatible_with(value) + return value.value + + def _scalar_from_string(self, value): + return Timestamp(value, tz=self.tz) + + def _check_compatible_with(self, other): + if not timezones.tz_compare(self.tz, other.tz): + raise ValueError("Timezones don't match. '{own} != {other}'" + .format(own=self.tz, other=other.tz)) + # ----------------------------------------------------------------- # Descriptive Properties diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 2fcc6ab4cc3bf..5acdbf2fad419 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -139,6 +139,7 @@ class PeriodArray(dtl.DatetimeLikeArrayMixin, __array_priority__ = 1000 _attributes = ["freq"] _typ = "periodarray" # ABCPeriodArray + _scalar_type = Period # Names others delegate to us _other_ops = [] @@ -242,6 +243,25 @@ def _generate_range(cls, start, end, periods, freq, fields): return subarr, freq + # ----------------------------------------------------------------- + # DatetimeLike Interface + + def _unbox_scalar(self, value): + # type: (Union[Period, NaTType]) -> int + if value is NaT: + return value.value + elif isinstance(value, self._scalar_type): + if not isna(value): + self._check_compatible_with(value) + return value.ordinal + else: + raise ValueError("'value' should be a Period. Got '{val}' instead." + .format(val=value)) + + def _scalar_from_string(self, value): + # type: (str) -> Period + return Period(value, freq=self.freq) + def _check_compatible_with(self, other): if self.freqstr != other.freqstr: _raise_on_incompatible(self, other) diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 8721e0ce3ace5..dbb1627628308 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -115,6 +115,7 @@ def wrapper(self, other): class TimedeltaArrayMixin(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps): _typ = "timedeltaarray" + _scalar_type = Timedelta __array_priority__ = 1000 # define my properties & methods for delegation _other_ops = [] @@ -220,6 +221,22 @@ def _generate_range(cls, start, end, periods, freq, closed=None): return cls._simple_new(index, freq=freq) + # ---------------------------------------------------------------- + # DatetimeLike Interface + + def _unbox_scalar(self, value): + if not isinstance(value, self._scalar_type) and value is not NaT: + raise ValueError("'value' should be a Timedelta.") + self._check_compatible_with(value) + return value.value + + def _scalar_from_string(self, value): + return Timedelta(value) + + def _check_compatible_with(self, other): + # we don't have anything to validate. + pass + # ---------------------------------------------------------------- # Array-Like / EA-Interface Methods @@ -237,13 +254,13 @@ def _validate_fill_value(self, fill_value): # ---------------------------------------------------------------- # Rendering Methods + def _format_native_types(self): + return self.astype(object) + def _formatter(self, boxed=False): from pandas.io.formats.format import _get_format_timedelta64 return _get_format_timedelta64(self, box=True) - def _format_native_types(self): - return self.astype(object) - # ---------------------------------------------------------------- # Arithmetic Methods diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index e3d24bfbed7c3..f6aef8c8e6223 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -455,6 +455,7 @@ def repeat(self, repeats, axis=None): nv.validate_repeat(tuple(), dict(axis=axis)) freq = self.freq if is_period_dtype(self) else None return self._shallow_copy(self.asi8.repeat(repeats), freq=freq) + # TODO: dispatch to _eadata @Appender(_index_shared_docs['where'] % _index_doc_kwargs) def where(self, cond, other=None): diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 09e741af363da..0c412c291d259 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1089,6 +1089,11 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None): # -------------------------------------------------------------------- # Wrapping DatetimeArray + @property + def _eadata(self): + return DatetimeArray._simple_new(self._data, + tz=self.tz, freq=self.freq) + # Compat for frequency inference, see GH#23789 _is_monotonic_increasing = Index.is_monotonic_increasing _is_monotonic_decreasing = Index.is_monotonic_decreasing diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index b15604a57fb81..05df87365a411 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -247,6 +247,10 @@ def _simple_new(cls, values, name=None, freq=None, **kwargs): # ------------------------------------------------------------------------ # Data + @property + def _eadata(self): + return self._data + @property def _ndarray_values(self): return self._data._ndarray_values diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 47f7f7cf860fc..df22964cdee5e 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -237,6 +237,10 @@ def _format_native_types(self, na_rep=u'NaT', date_format=None, **kwargs): # ------------------------------------------------------------------- # Wrapping TimedeltaArray + @property + def _eadata(self): + return TimedeltaArray._simple_new(self._data, freq=self.freq) + __mul__ = _make_wrapped_arith_op("__mul__") __rmul__ = _make_wrapped_arith_op("__rmul__") __floordiv__ = _make_wrapped_arith_op("__floordiv__") diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index ebe84232d7f6d..3f1babe78751e 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -2,6 +2,8 @@ import numpy as np import pytest +import pandas.compat as compat + import pandas as pd from pandas.core.arrays import ( DatetimeArrayMixin as DatetimeArray, PeriodArray, @@ -129,6 +131,46 @@ def test_concat_same_type(self): tm.assert_index_equal(self.index_cls(result), expected) + def test_unbox_scalar(self): + data = np.arange(10, dtype='i8') * 24 * 3600 * 10**9 + arr = self.array_cls(data, freq='D') + result = arr._unbox_scalar(arr[0]) + assert isinstance(result, (int, compat.long)) + + result = arr._unbox_scalar(pd.NaT) + assert isinstance(result, (int, compat.long)) + + def test_scalar_from_string(self): + data = np.arange(10, dtype='i8') * 24 * 3600 * 10**9 + arr = self.array_cls(data, freq='D') + result = arr._scalar_from_string(str(arr[0])) + assert result == arr[0] + + def test_searchsorted(self): + data = np.arange(10, dtype='i8') * 24 * 3600 * 10**9 + arr = self.array_cls(data, freq='D') + + # scalar + result = arr.searchsorted(arr[1]) + assert result == 1 + + result = arr.searchsorted(arr[2], side="right") + assert result == 3 + + # own-type + result = arr.searchsorted(arr[1:3]) + expected = np.array([1, 2]) + tm.assert_numpy_array_equal(result, expected) + + result = arr.searchsorted(arr[1:3], side="right") + expected = np.array([2, 3]) + tm.assert_numpy_array_equal(result, expected) + + # FIXME: this fails for different reasons for all three classes; + # need to check that this is in fact the desired behavior + # with pytest.raises(ValueError): + # arr.searchsorted(pd.NaT) + class TestDatetimeArray(SharedTests): index_cls = pd.DatetimeIndex From 1c7b1a419a964005f0de3075c5721a694bb84d0b Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Thu, 27 Dec 2018 17:35:57 -0800 Subject: [PATCH 2/5] make check_compatible_with accept pd.NaT, test --- pandas/core/arrays/datetimes.py | 2 ++ pandas/core/arrays/period.py | 2 ++ pandas/tests/arrays/test_datetimelike.py | 19 +++++++++++++++---- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 22d9f987743b4..35cc67939a71f 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -361,6 +361,8 @@ def _scalar_from_string(self, value): return Timestamp(value, tz=self.tz) def _check_compatible_with(self, other): + if other is NaT: + return if not timezones.tz_compare(self.tz, other.tz): raise ValueError("Timezones don't match. '{own} != {other}'" .format(own=self.tz, other=other.tz)) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 5acdbf2fad419..9ded7de8fdf12 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -263,6 +263,8 @@ def _scalar_from_string(self, value): return Period(value, freq=self.freq) def _check_compatible_with(self, other): + if other is NaT: + return if self.freqstr != other.freqstr: _raise_on_incompatible(self, other) diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 3f1babe78751e..8b2ba25d2e208 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -140,6 +140,17 @@ def test_unbox_scalar(self): result = arr._unbox_scalar(pd.NaT) assert isinstance(result, (int, compat.long)) + with pytest.raises(ValueError): + arr._unbox_scalar('foo') + + def test_check_compatible_with(self): + data = np.arange(10, dtype='i8') * 24 * 3600 * 10**9 + arr = self.array_cls(data, freq='D') + + arr._check_compatible_with(arr[0]) + arr._check_compatible_with(arr[:1]) + arr._check_compatible_with(pd.NaT) + def test_scalar_from_string(self): data = np.arange(10, dtype='i8') * 24 * 3600 * 10**9 arr = self.array_cls(data, freq='D') @@ -166,10 +177,10 @@ def test_searchsorted(self): expected = np.array([2, 3]) tm.assert_numpy_array_equal(result, expected) - # FIXME: this fails for different reasons for all three classes; - # need to check that this is in fact the desired behavior - # with pytest.raises(ValueError): - # arr.searchsorted(pd.NaT) + # Following numpy convention, NaT goes at the beginning + # (unlike NaN which goes at the end) + result = arr.searchsorted(pd.NaT) + assert result == 0 class TestDatetimeArray(SharedTests): From 9a0bb4af351324cb7ef83202826ec399305c93b9 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Thu, 27 Dec 2018 18:50:53 -0800 Subject: [PATCH 3/5] specify int64 to fix windows CI --- pandas/tests/arrays/test_datetimelike.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 8b2ba25d2e208..483f25513775e 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -170,11 +170,11 @@ def test_searchsorted(self): # own-type result = arr.searchsorted(arr[1:3]) - expected = np.array([1, 2]) + expected = np.array([1, 2], dtype=np.int64) tm.assert_numpy_array_equal(result, expected) result = arr.searchsorted(arr[1:3], side="right") - expected = np.array([2, 3]) + expected = np.array([2, 3], dtype=np.int64) tm.assert_numpy_array_equal(result, expected) # Following numpy convention, NaT goes at the beginning From b793665378a562b5c80752164fde187048c26fe3 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Fri, 28 Dec 2018 11:45:23 -0800 Subject: [PATCH 4/5] adjust for comments --- pandas/core/arrays/datetimelike.py | 31 +++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index f4f8e4e8277bb..1f2097fd22cb9 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -134,9 +134,8 @@ def _unbox_scalar(self, value): raise AbstractMethodError(self) def _check_compatible_with(self, other): - # TODO: choose a type for other - # Can it be NaT? - # Scalar, array, or both? + # type: (Union[Period, Timestamp, Timedelta, NaTType]) -> None + # TODO: Scalar, array, or both? """ Verify that `self` and `other` are compatible. @@ -144,6 +143,8 @@ def _check_compatible_with(self, other): * PeriodArray verifies that the freq matches * Timedelta has no verification + In each case, NaT is considered compatible. + Parameters ---------- other @@ -550,7 +551,7 @@ def _values_for_argsort(self): # These are not part of the EA API, but we implement them because # pandas assumes they're there. - def searchsorted(self, value, side='left', sorter=None): + def searchsorted(self, v, side='left', sorter=None): """ Find indices where elements should be inserted to maintain order. @@ -560,7 +561,7 @@ def searchsorted(self, value, side='left', sorter=None): Parameters ---------- - value : array_like + v : array_like Values to insert into `self`. side : {'left', 'right'}, optional If 'left', the index of the first suitable location found is given. @@ -575,19 +576,19 @@ def searchsorted(self, value, side='left', sorter=None): indices : array of ints Array of insertion points with the same shape as `value`. """ - if isinstance(value, compat.string_types): - value = self._scalar_from_string(value) + if isinstance(v, compat.string_types): + v = self._scalar_from_string(v) - if not (isinstance(value, (self._scalar_type, type(self))) - or isna(value)): - msg = "Unexpected type for 'value': {}".format(type(value)) - raise ValueError(msg) + if not (isinstance(v, (self._scalar_type, type(self))) + or isna(v)): + raise ValueError("Unexpected type for 'value': {valtype}" + .format(valtype=type(v))) - self._check_compatible_with(value) - if isinstance(value, type(self)): - value = value.asi8 + self._check_compatible_with(v) + if isinstance(v, type(self)): + value = v.asi8 else: - value = self._unbox_scalar(value) + value = self._unbox_scalar(v) return self.asi8.searchsorted(value, side=side, sorter=sorter) From 11886cba138069de5dd824fb6bc970652cb40e21 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Fri, 28 Dec 2018 11:58:36 -0800 Subject: [PATCH 5/5] revert per discussion --- pandas/core/arrays/datetimelike.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 1f2097fd22cb9..849b524b72f74 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -135,7 +135,6 @@ def _unbox_scalar(self, value): def _check_compatible_with(self, other): # type: (Union[Period, Timestamp, Timedelta, NaTType]) -> None - # TODO: Scalar, array, or both? """ Verify that `self` and `other` are compatible. @@ -551,7 +550,7 @@ def _values_for_argsort(self): # These are not part of the EA API, but we implement them because # pandas assumes they're there. - def searchsorted(self, v, side='left', sorter=None): + def searchsorted(self, value, side='left', sorter=None): """ Find indices where elements should be inserted to maintain order. @@ -561,7 +560,7 @@ def searchsorted(self, v, side='left', sorter=None): Parameters ---------- - v : array_like + value : array_like Values to insert into `self`. side : {'left', 'right'}, optional If 'left', the index of the first suitable location found is given. @@ -576,19 +575,19 @@ def searchsorted(self, v, side='left', sorter=None): indices : array of ints Array of insertion points with the same shape as `value`. """ - if isinstance(v, compat.string_types): - v = self._scalar_from_string(v) + if isinstance(value, compat.string_types): + value = self._scalar_from_string(value) - if not (isinstance(v, (self._scalar_type, type(self))) - or isna(v)): + if not (isinstance(value, (self._scalar_type, type(self))) + or isna(value)): raise ValueError("Unexpected type for 'value': {valtype}" - .format(valtype=type(v))) + .format(valtype=type(value))) - self._check_compatible_with(v) - if isinstance(v, type(self)): - value = v.asi8 + self._check_compatible_with(value) + if isinstance(value, type(self)): + value = value.asi8 else: - value = self._unbox_scalar(v) + value = self._unbox_scalar(value) return self.asi8.searchsorted(value, side=side, sorter=sorter)