From bbcfd929205ef79c00a403050f107c5e05e0b300 Mon Sep 17 00:00:00 2001 From: jreback Date: Mon, 15 Jul 2013 17:48:24 -0400 Subject: [PATCH] ENH: implement non-unique indexing in series (GH4246) DOC: release notes --- doc/source/release.rst | 1 + doc/source/v0.12.0.txt | 3 ++- pandas/core/index.py | 2 +- pandas/core/indexing.py | 10 +++++----- pandas/core/series.py | 11 ++++++++++- pandas/tests/test_series.py | 11 ++++++++++- 6 files changed, 29 insertions(+), 9 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index de4cea17f6d99..3b7d25789aa40 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -235,6 +235,7 @@ pandas 0.12 names (:issue:`3873`) - Bug in non-unique indexing via ``iloc`` (:issue:`4017`); added ``takeable`` argument to ``reindex`` for location-based taking + - Allow non-unique indexing in series via ``.ix/.loc`` and ``__getitem__`` (:issue:`4246`) - Fixed bug in groupby with empty series referencing a variable before assignment. (:issue:`3510`) - Allow index name to be used in groupby for non MultiIndex (:issue:`4014`) diff --git a/doc/source/v0.12.0.txt b/doc/source/v0.12.0.txt index 25813ae026f36..64e76076368bc 100644 --- a/doc/source/v0.12.0.txt +++ b/doc/source/v0.12.0.txt @@ -437,6 +437,7 @@ Bug Fixes names (:issue:`3873`) - Bug in non-unique indexing via ``iloc`` (:issue:`4017`); added ``takeable`` argument to ``reindex`` for location-based taking + - Allow non-unique indexing in series via ``.ix/.loc`` and ``__getitem__`` (:issue:`4246`) - ``DataFrame.from_records`` did not accept empty recarrays (:issue:`3682`) - ``read_html`` now correctly skips tests (:issue:`3741`) @@ -462,7 +463,7 @@ Bug Fixes (:issue:`4089`) - Fixed bug in ``DataFrame.replace`` where a nested dict wasn't being iterated over when regex=False (:issue:`4115`) - - Fixed bug in the parsing of microseconds when using the ``format`` + - Fixed bug in the parsing of microseconds when using the ``format`` argument in ``to_datetime`` 
(:issue:`4152`) - Fixed bug in ``PandasAutoDateLocator`` where ``invert_xaxis`` triggered incorrectly ``MilliSecondLocator`` (:issue:`3990`) diff --git a/pandas/core/index.py b/pandas/core/index.py index cb90dc9cb0cbb..3eb804d3a70e6 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -928,7 +928,7 @@ def reindex(self, target, method=None, level=None, limit=None, if method is not None or limit is not None: raise ValueError("cannot reindex a non-unique index " "with a method or limit") - indexer, _ = self.get_indexer_non_unique(target) + indexer, missing = self.get_indexer_non_unique(target) return target, indexer diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index fea7f3153b8a6..0237cfde3b561 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -481,12 +481,12 @@ def _reindex(keys, level=None): new_indexer = (Index(cur_indexer) + Index(missing_indexer)).values new_indexer[missing_indexer] = -1 - # need to reindex with an indexer on a specific axis - from pandas.core.frame import DataFrame - if not (type(self.obj) == DataFrame): - raise NotImplementedError("cannot handle non-unique indexing for non-DataFrame (yet)") + # reindex with the specified axis + ndim = self.obj.ndim + if axis+1 > ndim: + raise AssertionError("invalid indexing error with non-unique index") - args = [None] * 4 + args = [None] * (2*ndim) args[2*axis] = new_labels args[2*axis+1] = new_indexer diff --git a/pandas/core/series.py b/pandas/core/series.py index 7c9ae2bd3d94c..15a425fb3fd73 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -681,6 +681,10 @@ def _get_with(self, key): return self._get_values(key) else: try: + # handle the dup indexing case (GH 4246) + if isinstance(key, (list,tuple)): + return self.ix[key] + return self.reindex(key) except Exception: # [slice(0, 5, None)] will break if you convert to ndarray, @@ -2637,8 +2641,13 @@ def reindex(self, index=None, method=None, level=None, fill_value=pa.NA, new_index, indexer = 
self.index.reindex(index, method=method, level=level, limit=limit, takeable=takeable) + + # GH4246 (dispatch to a common method with frame to handle possibly duplicate index) + return self._reindex_with_indexers(new_index, indexer, copy=copy, fill_value=fill_value) + + def _reindex_with_indexers(self, index, indexer, copy, fill_value): new_values = com.take_1d(self.values, indexer, fill_value=fill_value) - return Series(new_values, index=new_index, name=self.name) + return Series(new_values, index=index, name=self.name) def reindex_axis(self, labels, axis=0, **kwargs): """ for compatibility with higher dims """ diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index b639ba0b2bb8a..cbf7fb070e97f 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -788,6 +788,15 @@ def test_getitem_unordered_dup(self): self.assert_(np.isscalar(obj['c'])) self.assert_(obj['c'] == 0) + def test_getitem_dups_with_missing(self): + + # breaks reindex, so need to use .ix internally + # GH 4246 + s = Series([1,2,3,4],['foo','bar','foo','bah']) + expected = s.ix[['foo','bar','bah','bam']] + result = s[['foo','bar','bah','bam']] + assert_series_equal(result,expected) + def test_setitem_ambiguous_keyerror(self): s = Series(range(10), index=range(0, 20, 2)) self.assertRaises(KeyError, s.__setitem__, 1, 5) @@ -1141,7 +1150,7 @@ def test_where(self): s = Series(np.arange(10)) mask = s > 5 self.assertRaises(ValueError, s.__setitem__, mask, ([0]*5,)) - + def test_where_broadcast(self): # Test a variety of differently sized series for size in range(2, 6):