Skip to content

Commit 3b28ece

Browse files
committed
Merge pull request #4018 from jreback/iloc_bug
BUG: GH4017, efficiently support non-unique indices with iloc
2 parents 73de5de + 0890d07 commit 3b28ece

File tree

10 files changed

+117
-31
lines changed

10 files changed

+117
-31
lines changed

doc/source/release.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,8 @@ pandas 0.12
212212
- Extend ``reindex`` to correctly deal with non-unique indices (:issue:`3679`)
213213
- ``DataFrame.itertuples()`` now works with frames with duplicate column
214214
names (:issue:`3873`)
215+
- Bug in non-unique indexing via ``iloc`` (:issue:`4017`); added ``takeable`` argument to
216+
``reindex`` for location-based taking
215217

216218
- Fixed bug in groupby with empty series referencing a variable before assignment. (:issue:`3510`)
217219
- Allow index name to be used in groupby for non MultiIndex (:issue:`4014`)

doc/source/v0.12.0.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -410,6 +410,8 @@ Bug Fixes
410410
- Extend ``reindex`` to correctly deal with non-unique indices (:issue:`3679`)
411411
- ``DataFrame.itertuples()`` now works with frames with duplicate column
412412
names (:issue:`3873`)
413+
- Bug in non-unique indexing via ``iloc`` (:issue:`4017`); added ``takeable`` argument to
414+
``reindex`` for location-based taking
413415

414416
- ``DataFrame.from_records`` did not accept empty recarrays (:issue:`3682`)
415417
- ``read_html`` now correctly skips tests (:issue:`3741`)

pandas/core/frame.py

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1979,7 +1979,9 @@ def _ixs(self, i, axis=0, copy=False):
19791979
else:
19801980
label = self.index[i]
19811981
if isinstance(label, Index):
1982-
return self.reindex(label)
1982+
1983+
# a location index by definition
1984+
return self.reindex(label, takeable=True)
19831985
else:
19841986
try:
19851987
new_values = self._data.fast_2d_xs(i, copy=copy)
@@ -2590,7 +2592,7 @@ def _align_series(self, other, join='outer', axis=None, level=None,
25902592
return left_result, right_result
25912593

25922594
def reindex(self, index=None, columns=None, method=None, level=None,
2593-
fill_value=NA, limit=None, copy=True):
2595+
fill_value=NA, limit=None, copy=True, takeable=False):
25942596
"""Conform DataFrame to new index with optional filling logic, placing
25952597
NA/NaN in locations having no value in the previous index. A new object
25962598
is produced unless the new index is equivalent to the current one and
@@ -2617,6 +2619,7 @@ def reindex(self, index=None, columns=None, method=None, level=None,
26172619
"compatible" value
26182620
limit : int, default None
26192621
Maximum size gap to forward or backward fill
2622+
takeable : the labels are locations (and not labels)
26202623
26212624
Examples
26222625
--------
@@ -2636,11 +2639,11 @@ def reindex(self, index=None, columns=None, method=None, level=None,
26362639

26372640
if columns is not None:
26382641
frame = frame._reindex_columns(columns, copy, level,
2639-
fill_value, limit)
2642+
fill_value, limit, takeable)
26402643

26412644
if index is not None:
26422645
frame = frame._reindex_index(index, method, copy, level,
2643-
fill_value, limit)
2646+
fill_value, limit, takeable)
26442647

26452648
return frame
26462649

@@ -2717,16 +2720,18 @@ def _reindex_multi(self, new_index, new_columns, copy, fill_value):
27172720
return self.copy() if copy else self
27182721

27192722
def _reindex_index(self, new_index, method, copy, level, fill_value=NA,
2720-
limit=None):
2723+
limit=None, takeable=False):
27212724
new_index, indexer = self.index.reindex(new_index, method, level,
2722-
limit=limit, copy_if_needed=True)
2725+
limit=limit, copy_if_needed=True,
2726+
takeable=takeable)
27232727
return self._reindex_with_indexers(new_index, indexer, None, None,
27242728
copy, fill_value)
27252729

27262730
def _reindex_columns(self, new_columns, copy, level, fill_value=NA,
2727-
limit=None):
2731+
limit=None, takeable=False):
27282732
new_columns, indexer = self.columns.reindex(new_columns, level=level,
2729-
limit=limit, copy_if_needed=True)
2733+
limit=limit, copy_if_needed=True,
2734+
takeable=takeable)
27302735
return self._reindex_with_indexers(None, None, new_columns, indexer,
27312736
copy, fill_value)
27322737

pandas/core/index.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -920,7 +920,8 @@ def _get_method(self, method):
920920
}
921921
return aliases.get(method, method)
922922

923-
def reindex(self, target, method=None, level=None, limit=None, copy_if_needed=False):
923+
def reindex(self, target, method=None, level=None, limit=None,
924+
copy_if_needed=False, takeable=False):
924925
"""
925926
For Index, simply returns the new index and the results of
926927
get_indexer. Provided here to enable an interface that is amenable for
@@ -953,7 +954,11 @@ def reindex(self, target, method=None, level=None, limit=None, copy_if_needed=Fa
953954
if method is not None or limit is not None:
954955
raise ValueError("cannot reindex a non-unique index "
955956
"with a method or limit")
956-
indexer, missing = self.get_indexer_non_unique(target)
957+
if takeable:
958+
indexer = target
959+
missing = (target>=len(target)).nonzero()[0]
960+
else:
961+
indexer, missing = self.get_indexer_non_unique(target)
957962

958963
return target, indexer
959964

@@ -2202,7 +2207,8 @@ def get_indexer(self, target, method=None, limit=None):
22022207

22032208
return com._ensure_platform_int(indexer)
22042209

2205-
def reindex(self, target, method=None, level=None, limit=None, copy_if_needed=False):
2210+
def reindex(self, target, method=None, level=None, limit=None,
2211+
copy_if_needed=False, takeable=False):
22062212
"""
22072213
Performs any necessary conversion on the input index and calls
22082214
get_indexer. This method is here so MultiIndex and an Index of

pandas/core/indexing.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -476,10 +476,21 @@ def _reindex(keys, level=None):
476476
cur_indexer = com._ensure_int64(l[check])
477477

478478
new_labels = np.empty(tuple([len(indexer)]),dtype=object)
479-
new_labels[cur_indexer] = cur_labels
480-
new_labels[missing_indexer] = missing_labels
479+
new_labels[cur_indexer] = cur_labels
480+
new_labels[missing_indexer] = missing_labels
481+
new_indexer = (Index(cur_indexer) + Index(missing_indexer)).values
482+
new_indexer[missing_indexer] = -1
481483

482-
result = result.reindex_axis(new_labels,axis=axis)
484+
# need to reindex with an indexer on a specific axis
485+
from pandas.core.frame import DataFrame
486+
if not (type(self.obj) == DataFrame):
487+
raise NotImplementedError("cannot handle non-unique indexing for non-DataFrame (yet)")
488+
489+
args = [None] * 4
490+
args[2*axis] = new_labels
491+
args[2*axis+1] = new_indexer
492+
493+
result = result._reindex_with_indexers(*args, copy=False, fill_value=np.nan)
483494

484495
return result
485496

pandas/core/series.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -598,7 +598,7 @@ def _ixs(self, i, axis=0):
598598
else:
599599
label = self.index[i]
600600
if isinstance(label, Index):
601-
return self.reindex(label)
601+
return self.reindex(label, takeable=True)
602602
else:
603603
return _index.get_value_at(self, i)
604604

@@ -2618,7 +2618,7 @@ def _reindex_indexer(self, new_index, indexer, copy):
26182618
return self._constructor(new_values, new_index, name=self.name)
26192619

26202620
def reindex(self, index=None, method=None, level=None, fill_value=pa.NA,
2621-
limit=None, copy=True):
2621+
limit=None, copy=True, takeable=False):
26222622
"""Conform Series to new index with optional filling logic, placing
26232623
NA/NaN in locations having no value in the previous index. A new object
26242624
is produced unless the new index is equivalent to the current one and
@@ -2643,6 +2643,7 @@ def reindex(self, index=None, method=None, level=None, fill_value=pa.NA,
26432643
"compatible" value
26442644
limit : int, default None
26452645
Maximum size gap to forward or backward fill
2646+
takeable : the labels are locations (and not labels)
26462647
26472648
Returns
26482649
-------
@@ -2664,7 +2665,8 @@ def reindex(self, index=None, method=None, level=None, fill_value=pa.NA,
26642665
return Series(nan, index=index, name=self.name)
26652666

26662667
new_index, indexer = self.index.reindex(index, method=method,
2667-
level=level, limit=limit)
2668+
level=level, limit=limit,
2669+
takeable=takeable)
26682670
new_values = com.take_1d(self.values, indexer, fill_value=fill_value)
26692671
return Series(new_values, index=new_index, name=self.name)
26702672

pandas/index.pyx

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -272,33 +272,44 @@ cdef class IndexEngine:
272272
to the -1 indices in the results """
273273

274274
cdef:
275-
ndarray values
275+
ndarray values, x
276276
ndarray[int64_t] result, missing
277-
object v, val
277+
set stargets
278+
dict d = {}
279+
object val
278280
int count = 0, count_missing = 0
279-
Py_ssize_t i, j, n, found
281+
Py_ssize_t i, j, n, n_t
280282

281283
self._ensure_mapping_populated()
282284
values = self._get_index_values()
285+
stargets = set(targets)
283286
n = len(values)
284287
n_t = len(targets)
285-
result = np.empty(n+n_t, dtype=np.int64)
288+
result = np.empty(n*n_t, dtype=np.int64)
286289
missing = np.empty(n_t, dtype=np.int64)
287290

291+
# form the set of the results (like ismember)
292+
members = np.empty(n, dtype=np.uint8)
293+
for i in range(n):
294+
val = util.get_value_1d(values, i)
295+
if val in stargets:
296+
if val not in d:
297+
d[val] = []
298+
d[val].append(i)
299+
288300
for i in range(n_t):
289-
val = util.get_value_at(targets, i)
290-
found = 0
291301

292-
for j in range(n):
293-
v = util.get_value_at(values, j)
302+
val = util.get_value_1d(targets, i)
294303

295-
if v == val:
304+
# found
305+
if val in d:
306+
for j in d[val]:
296307
result[count] = j
297308
count += 1
298-
found = 1
299309

300310
# value not found
301-
if found == 0:
311+
else:
312+
302313
result[count] = -1
303314
count += 1
304315
missing[count_missing] = i

pandas/sparse/frame.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -583,7 +583,7 @@ def _combine_const(self, other, func):
583583
columns=self.columns)
584584

585585
def _reindex_index(self, index, method, copy, level, fill_value=np.nan,
586-
limit=None):
586+
limit=None, takeable=False):
587587
if level is not None:
588588
raise TypeError('Reindex by level not supported for sparse')
589589

@@ -614,7 +614,8 @@ def _reindex_index(self, index, method, copy, level, fill_value=np.nan,
614614
return SparseDataFrame(new_series, index=index, columns=self.columns,
615615
default_fill_value=self.default_fill_value)
616616

617-
def _reindex_columns(self, columns, copy, level, fill_value, limit=None):
617+
def _reindex_columns(self, columns, copy, level, fill_value, limit=None,
618+
takeable=False):
618619
if level is not None:
619620
raise TypeError('Reindex by level not supported for sparse')
620621

pandas/tests/test_indexing.py

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
import numpy as np
99
from numpy.testing import assert_array_equal
1010

11-
import pandas as pan
11+
import pandas as pd
1212
import pandas.core.common as com
1313
from pandas.core.api import (DataFrame, Index, Series, Panel, notnull, isnull,
1414
MultiIndex, DatetimeIndex, Timestamp)
@@ -1037,6 +1037,36 @@ def test_loc_name(self):
10371037
result = df.loc[[0, 1]].index.name
10381038
self.assert_(result == 'index_name')
10391039

1040+
def test_iloc_non_unique_indexing(self):
1041+
1042+
#GH 4017, non-unique indexing (on the axis)
1043+
df = DataFrame({'A' : [0.1] * 3000, 'B' : [1] * 3000})
1044+
idx = np.array(range(30)) * 99
1045+
expected = df.iloc[idx]
1046+
1047+
df3 = pd.concat([df, 2*df, 3*df])
1048+
result = df3.iloc[idx]
1049+
1050+
assert_frame_equal(result, expected)
1051+
1052+
df2 = DataFrame({'A' : [0.1] * 1000, 'B' : [1] * 1000})
1053+
df2 = pd.concat([df2, 2*df2, 3*df2])
1054+
1055+
sidx = df2.index.to_series()
1056+
expected = df2.iloc[idx[idx<=sidx.max()]]
1057+
1058+
new_list = []
1059+
for r, s in expected.iterrows():
1060+
new_list.append(s)
1061+
new_list.append(s*2)
1062+
new_list.append(s*3)
1063+
1064+
expected = DataFrame(new_list)
1065+
expected = pd.concat([ expected, DataFrame(index=idx[idx>sidx.max()]) ])
1066+
result = df2.loc[idx]
1067+
assert_frame_equal(result, expected)
1068+
1069+
10401070
if __name__ == '__main__':
10411071
import nose
10421072
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],

vb_suite/indexing.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,3 +148,19 @@
148148

149149
indexing_panel_subset = Benchmark('p.ix[inds, inds, inds]', setup,
150150
start_date=datetime(2012, 1, 1))
151+
152+
#----------------------------------------------------------------------
153+
# Iloc
154+
155+
setup = common_setup + """
156+
df = DataFrame({'A' : [0.1] * 3000, 'B' : [1] * 3000})
157+
idx = np.array(range(30)) * 99
158+
df2 = DataFrame({'A' : [0.1] * 1000, 'B' : [1] * 1000})
159+
df2 = concat([df2, 2*df2, 3*df2])
160+
"""
161+
162+
frame_iloc_dups = Benchmark('df2.iloc[idx]', setup,
163+
start_date=datetime(2013, 1, 1))
164+
165+
frame_loc_dups = Benchmark('df2.loc[idx]', setup,
166+
start_date=datetime(2013, 1, 1))

0 commit comments

Comments
 (0)