From 25cec3adf044982accb78384acab5df94c1435c7 Mon Sep 17 00:00:00 2001 From: Mark Roth Date: Fri, 26 Feb 2016 19:04:27 -0500 Subject: [PATCH] ENH: Partial string matching for timestamps with multiindex Includes implementation, unit tests, documentation fixes and whatsnew. --- doc/source/timeseries.rst | 14 +++++ doc/source/whatsnew/v0.18.1.txt | 18 +++++++ pandas/core/indexing.py | 25 +++++++++ pandas/tests/indexes/test_multi.py | 83 +++++++++++++++++++++++++++++- 4 files changed, 138 insertions(+), 2 deletions(-) diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index a986c3e1cb065..c262a1c505673 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -422,6 +422,20 @@ We are stopping on the included end-point as it is part of the index dft.loc['2013-1-15 12:30:00'] +DatetimeIndex Partial String Indexing also works on DataFrames with hierarchical indexing (``MultiIndex``). For +instance: + +.. ipython:: python + + dft2 = pd.DataFrame(randn(20,1), + columns=['A'], + index=pd.MultiIndex.from_product([date_range('20130101', + periods=10, + freq='12H'), + ['a', 'b']])) + dft2.loc['2013-01-05'] + dft2 = dft2.swaplevel(0, 1).sort_index() + dft2.loc[pd.IndexSlice[:,'2013-01-05'],:] Datetime Indexing ~~~~~~~~~~~~~~~~~ diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt index d9983759083ca..cdf38e95fd9e1 100644 --- a/doc/source/whatsnew/v0.18.1.txt +++ b/doc/source/whatsnew/v0.18.1.txt @@ -30,6 +30,24 @@ Enhancements ~~~~~~~~~~~~ +Partial string matches on ``DatetimeIndex`` when part of a ``MultiIndex`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Partial string matches on ``DatetimeIndex`` now work when part of a ``MultiIndex`` (:issue:`10331`) + +For example: + +.. 
ipython:: python + + dft2 = pd.DataFrame(randn(20,1), + columns=['A'], + index=pd.MultiIndex.from_product([date_range('20130101', + periods=10, + freq='12H'), + ['a', 'b']])) + dft2.loc['2013-01-05'] + dft2 = dft2.swaplevel(0, 1).sort_index() + dft2.loc[pd.IndexSlice[:,'2013-01-05'],:] + diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index b0dd2596fccd5..23de660fd23f4 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1392,8 +1392,33 @@ def error(): return True + def _get_partial_string_timestamp_match_key(self, key, labels): + """Translate any partial string timestamp matches in key, returning the + new key (GH 10331)""" + if isinstance(labels, MultiIndex): + if isinstance(key, compat.string_types) and \ + labels.levels[0].is_all_dates: + # Convert key '2016-01-01' to + # ('2016-01-01'[, slice(None, None, None)]+) + key = tuple([key] + [slice(None)] * (len(labels.levels) - 1)) + + if isinstance(key, tuple): + # Convert (..., '2016-01-01', ...) in tuple to + # (..., slice('2016-01-01', '2016-01-01', None), ...) 
+ new_key = [] + for i, component in enumerate(key): + if isinstance(component, compat.string_types) and \ + labels.levels[i].is_all_dates: + new_key.append(slice(component, component, None)) + else: + new_key.append(component) + key = tuple(new_key) + + return key + def _getitem_axis(self, key, axis=0): labels = self.obj._get_axis(axis) + key = self._get_partial_string_timestamp_match_key(key, labels) if isinstance(key, slice): self._has_valid_type(key, axis) diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 2c3c75cfa0431..f70ea49bd4c29 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -14,8 +14,8 @@ import numpy as np -from pandas.util.testing import (assert_almost_equal, assertRaisesRegexp, - assert_copy) +from pandas.util.testing import (assert_almost_equal, assertRaises, + assertRaisesRegexp, assert_copy) import pandas.util.testing as tm @@ -1970,3 +1970,82 @@ def test_index_name_retained(self): def test_equals_operator(self): # GH9785 self.assertTrue((self.index == self.index).all()) + + def test_partial_string_timestamp_multiindex(self): + # GH10331 + dr = pd.date_range('2016-01-01', '2016-01-03', freq='12H') + abc = ['a', 'b', 'c'] + ix = pd.MultiIndex.from_product([dr, abc]) + df = pd.DataFrame({'c1': range(0, 15)}, index=ix) + idx = pd.IndexSlice + + # c1 + # 2016-01-01 00:00:00 a 0 + # b 1 + # c 2 + # 2016-01-01 12:00:00 a 3 + # b 4 + # c 5 + # 2016-01-02 00:00:00 a 6 + # b 7 + # c 8 + # 2016-01-02 12:00:00 a 9 + # b 10 + # c 11 + # 2016-01-03 00:00:00 a 12 + # b 13 + # c 14 + + # partial string matching on a single index + df_swap = df.swaplevel(0, 1).sort_index() + just_a = df_swap.loc['a'] + result = just_a.loc['2016-01-01'] + expected = df.loc[idx[:, 'a'], :].iloc[0:2] + expected.index = expected.index.droplevel(1) + tm.assert_frame_equal(result, expected) + + # indexing with IndexSlice + result = df.loc[idx['2016-01-01':'2016-02-01', :], :] + expected = df + 
tm.assert_frame_equal(result, expected) + + # match on secondary index + result = df_swap.loc[idx[:, '2016-01-01':'2016-01-01'], :] + expected = df_swap.iloc[[0, 1, 5, 6, 10, 11]] + tm.assert_frame_equal(result, expected) + + # Even though this syntax works on a single index, this is somewhat + # ambiguous and we don't want to extend this behavior forward to work + # in multi-indexes. This would amount to selecting a scalar from a + # column. + with assertRaises(KeyError): + df['2016-01-01'] + + # partial string match on year only + result = df.loc['2016'] + expected = df + tm.assert_frame_equal(result, expected) + + # partial string match on date + result = df.loc['2016-01-01'] + expected = df.iloc[0:6] + tm.assert_frame_equal(result, expected) + + # partial string match on date and hour, from middle + result = df.loc['2016-01-02 12'] + expected = df.iloc[9:12] + tm.assert_frame_equal(result, expected) + + # partial string match on secondary index + result = df_swap.loc[idx[:, '2016-01-02'], :] + expected = df_swap.iloc[[2, 3, 7, 8, 12, 13]] + tm.assert_frame_equal(result, expected) + + # tuple selector with partial string match on date + result = df.loc[('2016-01-01', 'a'), :] + expected = df.iloc[[0, 3]] + tm.assert_frame_equal(result, expected) + + # Slicing date on first level should break (of course) + with assertRaises(KeyError): + df_swap.loc['2016-01-01']