Skip to content

ENH: partial string indexing on non-monotonic PeriodIndex #31096

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Jan 21, 2020
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,24 @@ including other versions of pandas.
Enhancements
~~~~~~~~~~~~

.. _whatsnew_110.period_index_partial_string_slicing:

Nonmonotonic PeriodIndex Partial String Slicing
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
:class:`PeriodIndex` now supports partial string slicing for non-monotonic indexes, mirroring :class:`DatetimeIndex` behavior (:issue:`31096`).
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.


For example:

.. ipython:: python

dti = pd.date_range("2014-01-01", periods=30, freq="30D")
pi = dti.to_period("D")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you show ser, maybe break this into 2 to make it more readable

ser_monotonic = pd.Series(np.arange(30), index=pi)
shuffler = list(range(0, 30, 2)) + list(range(1, 31, 2))
ser = ser_monotonic[shuffler]
ser["2014"]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you make a separate ipythonblock for these last 2

ser.loc["May 2015"]

.. _whatsnew_110.enhancements.other:

Other enhancements
Expand Down
65 changes: 43 additions & 22 deletions pandas/core/indexes/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -512,25 +512,20 @@ def get_value(self, series, key):
return series.iat[key]

if isinstance(key, str):
try:
loc = self._get_string_slice(key)
return series[loc]
except (TypeError, ValueError):
pass

asdt, reso = parse_time_string(key, self.freq)
grp = resolution.Resolution.get_freq_group(reso)
freqn = resolution.get_freq_group(self.freq)

vals = self._ndarray_values

# if our data is higher resolution than requested key, slice
if grp < freqn:
iv = Period(asdt, freq=(grp, 1))
ord1 = iv.asfreq(self.freq, how="S").ordinal
ord2 = iv.asfreq(self.freq, how="E").ordinal

if ord2 < vals[0] or ord1 > vals[-1]:
raise KeyError(key)
# _get_string_slice will handle cases where grp < freqn
assert grp >= freqn

pos = np.searchsorted(self._ndarray_values, [ord1, ord2])
key = slice(pos[0], pos[1] + 1)
return series[key]
elif grp == freqn:
if grp == freqn:
key = Period(asdt, freq=self.freq)
loc = self.get_loc(key)
return series.iloc[loc]
Expand Down Expand Up @@ -601,6 +596,11 @@ def get_loc(self, key, method=None, tolerance=None):
"""

if isinstance(key, str):
try:
return self._get_string_slice(key)
except (TypeError, KeyError, ValueError, OverflowError):
pass

try:
asdt, reso = parse_time_string(key, self.freq)
key = asdt
Expand Down Expand Up @@ -713,20 +713,41 @@ def _parsed_string_to_bounds(self, reso, parsed):
raise KeyError(reso)
return (t1.asfreq(self.freq, how="start"), t1.asfreq(self.freq, how="end"))

def _get_string_slice(self, key):
if not self.is_monotonic:
raise ValueError("Partial indexing only valid for ordered time series")
def _get_string_slice(self, key, use_lhs: bool = True, use_rhs: bool = True):

parsed, reso = parse_time_string(key, self.freq)
grp = resolution.Resolution.get_freq_group(reso)
freqn = resolution.get_freq_group(self.freq)
if reso in ["day", "hour", "minute", "second"] and not grp < freqn:
raise KeyError(key)
if not grp < freqn:
raise ValueError(key)

t1, t2 = self._parsed_string_to_bounds(reso, parsed)
return slice(
self.searchsorted(t1, side="left"), self.searchsorted(t2, side="right")
)
i8vals = self.asi8

if self.is_monotonic:

# we are out of range
if len(self) and (
(use_lhs and t1 < self[0] and t2 < self[0])
or ((use_rhs and t1 > self[-1] and t2 > self[-1]))
):
raise KeyError(key)

# TODO: does this depend on being monotonic _increasing_?
# If so, DTI will also be affected.

# a monotonic (sorted) series can be sliced
# Use asi8.searchsorted to avoid re-validating Periods
left = i8vals.searchsorted(t1.ordinal, side="left") if use_lhs else None
right = i8vals.searchsorted(t2.ordinal, side="right") if use_rhs else None
return slice(left, right)

else:
lhs_mask = (i8vals >= t1.ordinal) if use_lhs else True
rhs_mask = (i8vals <= t2.ordinal) if use_rhs else True

# try to find the dates
return (lhs_mask & rhs_mask).nonzero()[0]

def _convert_tolerance(self, tolerance, target):
tolerance = DatetimeIndexOpsMixin._convert_tolerance(self, tolerance, target)
Expand Down
53 changes: 50 additions & 3 deletions pandas/tests/indexes/period/test_partial_slicing.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,6 @@


class TestPeriodIndex:
def setup_method(self, method):
pass

def test_slice_with_negative_step(self):
ts = Series(np.arange(20), period_range("2014-01", periods=20, freq="M"))
SLC = pd.IndexSlice
Expand Down Expand Up @@ -133,3 +130,53 @@ def test_range_slice_outofbounds(self):
tm.assert_frame_equal(df["2013/10/15":"2013/10/17"], empty)
tm.assert_frame_equal(df["2013-06":"2013-09"], empty)
tm.assert_frame_equal(df["2013-11":"2013-12"], empty)

def test_partial_slice_doesnt_require_monotonicity(self):
    """Partial string indexing must work on a non-monotonic PeriodIndex.

    Builds a daily-frequency PeriodIndex with 30 entries spaced 30 days
    apart, reorders it (even positions first, then odd positions) so it is
    no longer sorted, and checks that ``get_loc``, ``get_value``, ``.loc``
    and ``[]`` all select the same rows for the partial strings ``"2014"``
    and ``"May 2015"``.
    """
    # See also: DatetimeIndex test of the same name
    dti = pd.date_range("2014-01-01", periods=30, freq="30D")
    pi = dti.to_period("D")

    ser_monotonic = pd.Series(np.arange(30), index=pi)

    # Positional reordering; .iloc makes the positional intent explicit
    # instead of relying on [] falling back to positions.
    shuffler = list(range(0, 30, 2)) + list(range(1, 31, 2))
    ser = ser_monotonic.iloc[shuffler]
    nidx = ser.index
    assert not nidx.is_monotonic_increasing

    # Manually identified locations of year==2014
    indexer_2014 = np.array(
        [0, 1, 2, 3, 4, 5, 6, 15, 16, 17, 18, 19, 20], dtype=np.intp
    )
    # Validate the hand-written positions against the full index.  Note
    # that ``~indexer_2014`` would be a bitwise NOT of integer positions
    # (i.e. negative positions), not the complement, so compare against the
    # positions derived from a boolean mask instead.
    tm.assert_numpy_array_equal(np.flatnonzero(nidx.year == 2014), indexer_2014)

    result = nidx.get_loc("2014")
    tm.assert_numpy_array_equal(result, indexer_2014)

    expected = ser.iloc[indexer_2014]

    result = nidx.get_value(ser, "2014")
    tm.assert_series_equal(result, expected)

    result = ser.loc["2014"]
    tm.assert_series_equal(result, expected)

    result = ser["2014"]
    tm.assert_series_equal(result, expected)

    # Manually identified locations where ser.index is within May 2015
    indexer_may2015 = np.array([23], dtype=np.intp)
    # Check that position 23 is the only entry falling in May 2015.
    in_may2015 = (nidx.year == 2015) & (nidx.month == 5)
    tm.assert_numpy_array_equal(np.flatnonzero(in_may2015), indexer_may2015)

    result = nidx.get_loc("May 2015")
    tm.assert_numpy_array_equal(result, indexer_may2015)

    expected = ser.iloc[indexer_may2015]

    result = nidx.get_value(ser, "May 2015")
    tm.assert_series_equal(result, expected)

    result = ser.loc["May 2015"]
    tm.assert_series_equal(result, expected)

    result = ser["May 2015"]
    tm.assert_series_equal(result, expected)