Skip to content

API: implement non-unique indexing in series (GH4246) #4247

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jul 15, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,7 @@ pandas 0.12
names (:issue:`3873`)
- Bug in non-unique indexing via ``iloc`` (:issue:`4017`); added ``takeable`` argument to
``reindex`` for location-based taking
- Allow non-unique indexing in series via ``.ix/.loc`` and ``__getitem__`` (:issue:`4246`)

- Fixed bug in groupby with empty series referencing a variable before assignment. (:issue:`3510`)
- Allow index name to be used in groupby for non MultiIndex (:issue:`4014`)
Expand Down
3 changes: 2 additions & 1 deletion doc/source/v0.12.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -437,6 +437,7 @@ Bug Fixes
names (:issue:`3873`)
- Bug in non-unique indexing via ``iloc`` (:issue:`4017`); added ``takeable`` argument to
``reindex`` for location-based taking
- Allow non-unique indexing in series via ``.ix/.loc`` and ``__getitem__`` (:issue:`4246`)

- ``DataFrame.from_records`` did not accept empty recarrays (:issue:`3682`)
- ``read_html`` now correctly skips tests (:issue:`3741`)
Expand All @@ -462,7 +463,7 @@ Bug Fixes
(:issue:`4089`)
- Fixed bug in ``DataFrame.replace`` where a nested dict wasn't being
iterated over when regex=False (:issue:`4115`)
- Fixed bug in the parsing of microseconds when using the ``format``
- Fixed bug in the parsing of microseconds when using the ``format``
argument in ``to_datetime`` (:issue:`4152`)
- Fixed bug in ``PandasAutoDateLocator`` where ``invert_xaxis`` triggered
incorrectly ``MilliSecondLocator`` (:issue:`3990`)
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -928,7 +928,7 @@ def reindex(self, target, method=None, level=None, limit=None,
if method is not None or limit is not None:
raise ValueError("cannot reindex a non-unique index "
"with a method or limit")
indexer, _ = self.get_indexer_non_unique(target)
indexer, missing = self.get_indexer_non_unique(target)

return target, indexer

Expand Down
10 changes: 5 additions & 5 deletions pandas/core/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -481,12 +481,12 @@ def _reindex(keys, level=None):
new_indexer = (Index(cur_indexer) + Index(missing_indexer)).values
new_indexer[missing_indexer] = -1

# need to reindex with an indexer on a specific axis
from pandas.core.frame import DataFrame
if not (type(self.obj) == DataFrame):
raise NotImplementedError("cannot handle non-unique indexing for non-DataFrame (yet)")
# reindex with the specified axis
ndim = self.obj.ndim
if axis+1 > ndim:
raise AssertionError("invalid indexing error with non-unique index")

args = [None] * 4
args = [None] * (2*ndim)
args[2*axis] = new_labels
args[2*axis+1] = new_indexer

Expand Down
11 changes: 10 additions & 1 deletion pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -681,6 +681,10 @@ def _get_with(self, key):
return self._get_values(key)
else:
try:
# handle the dup indexing case (GH 4246)
if isinstance(key, (list,tuple)):
return self.ix[key]

return self.reindex(key)
except Exception:
# [slice(0, 5, None)] will break if you convert to ndarray,
Expand Down Expand Up @@ -2637,8 +2641,13 @@ def reindex(self, index=None, method=None, level=None, fill_value=pa.NA,
new_index, indexer = self.index.reindex(index, method=method,
level=level, limit=limit,
takeable=takeable)

# GH4246 (dispatch to a common method with frame to handle possibly duplicate index)
return self._reindex_with_indexers(new_index, indexer, copy=copy, fill_value=fill_value)

def _reindex_with_indexers(self, index, indexer, copy, fill_value):
new_values = com.take_1d(self.values, indexer, fill_value=fill_value)
return Series(new_values, index=new_index, name=self.name)
return Series(new_values, index=index, name=self.name)

def reindex_axis(self, labels, axis=0, **kwargs):
""" for compatibility with higher dims """
Expand Down
11 changes: 10 additions & 1 deletion pandas/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -788,6 +788,15 @@ def test_getitem_unordered_dup(self):
self.assert_(np.isscalar(obj['c']))
self.assert_(obj['c'] == 0)

def test_getitem_dups_with_missing(self):

# breaks reindex, so need to use .ix internally
# GH 4246
s = Series([1,2,3,4],['foo','bar','foo','bah'])
expected = s.ix[['foo','bar','bah','bam']]
result = s[['foo','bar','bah','bam']]
assert_series_equal(result,expected)

def test_setitem_ambiguous_keyerror(self):
s = Series(range(10), index=range(0, 20, 2))
self.assertRaises(KeyError, s.__setitem__, 1, 5)
Expand Down Expand Up @@ -1141,7 +1150,7 @@ def test_where(self):
s = Series(np.arange(10))
mask = s > 5
self.assertRaises(ValueError, s.__setitem__, mask, ([0]*5,))

def test_where_broadcast(self):
# Test a variety of differently sized series
for size in range(2, 6):
Expand Down