diff --git a/doc/source/release.rst b/doc/source/release.rst index 7a271688c318b..dce1a25cf434b 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -318,6 +318,8 @@ pandas 0.12 iterated over when regex=False (:issue:`4115`) - Fixed bug in ``convert_objects(convert_numeric=True)`` where a mixed numeric and object Series/Frame was not converting properly (:issue:`4119`) + - Fixed bugs in multi-index selection with column multi-index and duplicates + (:issue:`4145`, :issue:`4146`) pandas 0.11.0 diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 27c12fcd2e8eb..fea7f3153b8a6 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -608,7 +608,7 @@ def _convert_to_indexer(self, obj, axis=0): mask = check == -1 if mask.any(): raise KeyError('%s not in index' % objarr[mask]) - + return indexer else: @@ -1100,9 +1100,14 @@ def _check_slice_bounds(slobj, values): def _maybe_droplevels(index, key): # drop levels + original_index = index if isinstance(key, tuple): for _ in key: - index = index.droplevel(0) + try: + index = index.droplevel(0) + except: + # we have dropped too much, so back out + return original_index else: index = index.droplevel(0) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 99af2d7becb39..f23a89635aaf2 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1660,18 +1660,23 @@ def get(self, item): # duplicate index but only a single result if com.is_integer(indexer): + b, loc = ref_locs[indexer] - return b.iget(loc) + values = [ b.iget(loc) ] + index = Index([ self.items[indexer] ]) + + # we have a multiple result, potentially across blocks else: - # we have a multiple result, potentially across blocks values = [ block.iget(i) for block, i in ref_locs[indexer] ] index = self.items[indexer] - axes = [ index ] + self.axes[1:] - blocks = form_blocks(values, index, axes) - mgr = BlockManager(blocks, axes) - mgr._consolidate_inplace() - return mgr + + # create and return a new block manager + axes = [ index ] + self.axes[1:] + blocks = form_blocks(values, index, axes) + mgr = BlockManager(blocks, axes) + mgr._consolidate_inplace() + return mgr def iget(self, i): item = self.items[i] diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index 8b6bf1ed7f651..7cd31b8f04b3a 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -2,6 +2,7 @@ import unittest import nose import itertools +from StringIO import StringIO from numpy import random, nan from numpy.random import randn @@ -45,7 +46,7 @@ def _get_value(f, i, values=False): # check agains values if values: return f.values[i] - + # this is equiv of f[col][row]..... #v = f #for a in reversed(i): @@ -70,7 +71,7 @@ def _get_result(obj, method, key, axis): xp = getattr(obj, method).__getitem__(_axify(obj,key,axis)) except: xp = getattr(obj, method).__getitem__(key) - + return xp def _axify(obj, key, axis): @@ -127,11 +128,11 @@ def setUp(self): setattr(self,o,d) def check_values(self, f, func, values = False): - + if f is None: return axes = f.axes indicies = itertools.product(*axes) - + for i in indicies: result = getattr(f,func)[i] @@ -194,7 +195,7 @@ def _print(result, error = None): if fails is True: if result == 'fail': result = 'ok (fail)' - + if not result.startswith('ok'): raise AssertionError(_print(result)) @@ -212,7 +213,7 @@ def _print(result, error = None): result = 'ok (%s)' % type(detail).__name__ _print(result) return - + result = type(detail).__name__ raise AssertionError(_print(result, error = detail)) @@ -244,14 +245,14 @@ def _print(result, error = None): obj = d[t] if obj is not None: obj = obj.copy() - + k2 = key2 _eq(t, o, a, obj, key1, k2) def test_at_and_iat_get(self): def _check(f, func, values = False): - + if f is not None: indicies = _generate_indices(f, values) for i in indicies: @@ -260,7 +261,7 @@ def _check(f, func, values = False): assert_almost_equal(result, expected) for o in self._objs: - + d = getattr(self,o) # iat @@ -274,11 +275,11 @@ def _check(f, func, values = False): _check(d['labels'],'at') _check(d['ts'], 'at') _check(d['floats'],'at') - + def test_at_and_iat_set(self): def _check(f, func, values = False): - + if f is not None: indicies = _generate_indices(f, values) for i in indicies: @@ -287,7 +288,7 @@ def _check(f, func, values = False): assert_almost_equal(expected, 1) for t in self._objs: - + d = getattr(self,t) _check(d['ints'],'iat',values=True) @@ -302,12 +303,12 @@ def _check(f, func, values = False): _check(d['floats'],'at') def test_at_timestamp(self): - + # as timestamp is not a tuple! dates = date_range('1/1/2000', periods=8) df = DataFrame(randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D']) s = df['A'] - + result = s.at[dates[5]] xp = s.values[5] self.assert_(result == xp) @@ -320,7 +321,7 @@ def test_iloc_getitem_int(self): # integer self.check_result('integer', 'iloc', 2, 'ix', { 0 : 4, 1: 6, 2: 8 }, typs = ['ints']) self.check_result('integer', 'iloc', 2, 'indexer', 2, typs = ['labels','mixed','ts','floats','empty'], fails = IndexError) - + def test_iloc_getitem_neg_int(self): # neg integer @@ -332,7 +333,7 @@ def test_iloc_getitem_list_int(self): # list of ints self.check_result('list int', 'iloc', [0,1,2], 'ix', { 0 : [0,2,4], 1 : [0,3,6], 2: [0,4,8] }, typs = ['ints']) self.check_result('list int', 'iloc', [0,1,2], 'indexer', [0,1,2], typs = ['labels','mixed','ts','floats','empty'], fails = IndexError) - + def test_iloc_getitem_dups(self): # no dups in panel (bug?) @@ -378,7 +379,7 @@ def test_iloc_setitem(self): assert_frame_equal(result, expected) def test_iloc_multiindex(self): - df = DataFrame(np.random.randn(3, 3), + df = DataFrame(np.random.randn(3, 3), columns=[[2,2,4],[6,8,10]], index=[[4,4,8],[8,10,12]]) @@ -415,7 +416,7 @@ def test_loc_getitem_label_out_of_range(self): # out of range label self.check_result('label range', 'loc', 'f', 'ix', 'f', typs = ['ints','labels','mixed','ts','floats'], fails=KeyError) - + def test_loc_getitem_label_list(self): # list of labels @@ -426,7 +427,7 @@ def test_loc_getitem_label_list(self): self.check_result('list lbl', 'loc', ['A','B','C'], 'ix', ['A','B','C'], typs = ['labels'], axes=1) self.check_result('list lbl', 'loc', ['Z','Y','W'], 'ix', ['Z','Y','W'], typs = ['labels'], axes=2) self.check_result('list lbl', 'loc', [2,8,'null'], 'ix', [2,8,'null'], typs = ['mixed'], axes=0) - self.check_result('list lbl', 'loc', [Timestamp('20130102'),Timestamp('20130103')], 'ix', + self.check_result('list lbl', 'loc', [Timestamp('20130102'),Timestamp('20130103')], 'ix', [Timestamp('20130102'),Timestamp('20130103')], typs = ['ts'], axes=0) # fails @@ -434,7 +435,7 @@ def test_loc_getitem_label_list(self): self.check_result('list lbl', 'loc', [0,2,3], 'ix', [0,2,3], typs = ['ints'], axes=0, fails = KeyError) self.check_result('list lbl', 'loc', [3,6,7], 'ix', [3,6,9], typs = ['ints'], axes=1, fails = KeyError) self.check_result('list lbl', 'loc', [4,8,10], 'ix', [4,8,12], typs = ['ints'], axes=2, fails = KeyError) - + # array like self.check_result('array like', 'loc', Series(index=[0,2,4]).index, 'ix', [0,2,4], typs = ['ints'], axes=0) self.check_result('array like', 'loc', Series(index=[3,6,9]).index, 'ix', [3,6,9], typs = ['ints'], axes=1) @@ -449,10 +450,10 @@ def test_loc_getitem_bool(self): def test_loc_getitem_int_slice(self): - # int slices in int + # int slices in int self.check_result('int slice1', 'loc', slice(2,4), 'ix', { 0 : [2,4], 1: [3,6], 2: [4,8] }, typs = ['ints'], fails=KeyError) - # ok + # ok self.check_result('int slice2', 'loc', slice(2,4), 'ix', [2,4], typs = ['ints'], axes = 0) self.check_result('int slice2', 'loc', slice(3,6), 'ix', [3,6], typs = ['ints'], axes = 1) self.check_result('int slice2', 'loc', slice(4,8), 'ix', [4,8], typs = ['ints'], axes = 2) @@ -589,7 +590,7 @@ def test_iloc_getitem_frame(self): result = df.iloc[s.index] expected = df.ix[[2,4,6,8]] assert_frame_equal(result, expected) - + # out-of-bounds slice self.assertRaises(IndexError, df.iloc.__getitem__, tuple([slice(None),slice(1,5,None)])) self.assertRaises(IndexError, df.iloc.__getitem__, tuple([slice(None),slice(-5,3,None)])) @@ -648,7 +649,7 @@ def test_iloc_multiindex(self): ['A', 'A', 'B']], index=[['i', 'i', 'j', 'k'], ['X', 'X', 'Y','Y']]) - mi_int = DataFrame(np.random.randn(3, 3), + mi_int = DataFrame(np.random.randn(3, 3), columns=[[2,2,4],[6,8,10]], index=[[4,4,8],[8,10,12]]) @@ -679,7 +680,7 @@ def test_loc_multiindex(self): ['A', 'A', 'B']], index=[['i', 'i', 'j'], ['X', 'X', 'Y']]) - mi_int = DataFrame(np.random.randn(3, 3), + mi_int = DataFrame(np.random.randn(3, 3), columns=[[2,2,4],[6,8,10]], index=[[4,4,8],[8,10,12]]) @@ -749,7 +750,7 @@ def test_xs_multiindex(self): assert_frame_equal(result, expected) def test_setitem_dtype_upcast(self): - + # GH3216 df = DataFrame([{"a": 1}, {"a": 3, "b": 2}]) df['c'] = np.nan @@ -761,7 +762,7 @@ def test_setitem_dtype_upcast(self): def test_setitem_iloc(self): - + # setitem with an iloc list df = DataFrame(np.arange(9).reshape((3, 3)), index=["A", "B", "C"], columns=["A", "B", "C"]) df.iloc[[0,1],[1,2]] @@ -830,20 +831,20 @@ def test_indexing_mixed_frame_bug(self): self.assert_(df.iloc[0,2] == '-----') #if I look at df, then element [0,2] equals '_'. If instead I type df.ix[idx,'test'], I get '-----', finally by typing df.iloc[0,2] I get '_'. - + def test_set_index_nan(self): # GH 3586 - df = DataFrame({'PRuid': {17: 'nonQC', 18: 'nonQC', 19: 'nonQC', 20: '10', 21: '11', 22: '12', 23: '13', - 24: '24', 25: '35', 26: '46', 27: '47', 28: '48', 29: '59', 30: '10'}, - 'QC': {17: 0.0, 18: 0.0, 19: 0.0, 20: nan, 21: nan, 22: nan, 23: nan, 24: 1.0, 25: nan, - 26: nan, 27: nan, 28: nan, 29: nan, 30: nan}, - 'data': {17: 7.9544899999999998, 18: 8.0142609999999994, 19: 7.8591520000000008, 20: 0.86140349999999999, + df = DataFrame({'PRuid': {17: 'nonQC', 18: 'nonQC', 19: 'nonQC', 20: '10', 21: '11', 22: '12', 23: '13', + 24: '24', 25: '35', 26: '46', 27: '47', 28: '48', 29: '59', 30: '10'}, + 'QC': {17: 0.0, 18: 0.0, 19: 0.0, 20: nan, 21: nan, 22: nan, 23: nan, 24: 1.0, 25: nan, + 26: nan, 27: nan, 28: nan, 29: nan, 30: nan}, + 'data': {17: 7.9544899999999998, 18: 8.0142609999999994, 19: 7.8591520000000008, 20: 0.86140349999999999, 21: 0.87853110000000001, 22: 0.8427041999999999, 23: 0.78587700000000005, 24: 0.73062459999999996, - 25: 0.81668560000000001, 26: 0.81927080000000008, 27: 0.80705009999999999, 28: 0.81440240000000008, - 29: 0.80140849999999997, 30: 0.81307740000000006}, - 'year': {17: 2006, 18: 2007, 19: 2008, 20: 1985, 21: 1985, 22: 1985, 23: 1985, + 25: 0.81668560000000001, 26: 0.81927080000000008, 27: 0.80705009999999999, 28: 0.81440240000000008, + 29: 0.80140849999999997, 30: 0.81307740000000006}, + 'year': {17: 2006, 18: 2007, 19: 2008, 20: 1985, 21: 1985, 22: 1985, 23: 1985, 24: 1985, 25: 1985, 26: 1985, 27: 1985, 28: 1985, 29: 1985, 30: 1986}}).reset_index() result = df.set_index(['year','PRuid','QC']).reset_index().reindex(columns=df.columns) @@ -871,7 +872,7 @@ def test_iloc_panel_issue(self): self.assert_(p.iloc[1, 1, :3].shape == (3,)) self.assert_(p.iloc[1, :3, 1].shape == (3,)) self.assert_(p.iloc[:3, 1, 1].shape == (3,)) - + def test_multi_assign(self): # GH 3626, an assignement of a sub-df to a df @@ -892,7 +893,7 @@ def test_multi_assign(self): 'PF':[0,0,0,0,1,1], 'col1':Series([0,1,4,6,8,10]), 'col2':[12,7,16,np.nan,20,22]}) - + # frame on rhs df2.ix[mask, cols]= dft.ix[mask, cols] @@ -1006,7 +1007,7 @@ def test_non_unique_loc(self): ## https://groups.google.com/forum/?fromgroups#!topic/pydata/zTm2No0crYs # these are going to raise becuase the we are non monotonic - df = DataFrame({'A' : [1,2,3,4,5,6], 'B' : [3,4,5,6,7,8]}, index = [0,1,0,1,2,3]) + df = DataFrame({'A' : [1,2,3,4,5,6], 'B' : [3,4,5,6,7,8]}, index = [0,1,0,1,2,3]) self.assertRaises(KeyError, df.loc.__getitem__, tuple([slice(1,None)])) self.assertRaises(KeyError, df.loc.__getitem__, tuple([slice(0,None)])) self.assertRaises(KeyError, df.loc.__getitem__, tuple([slice(1,2)])) @@ -1066,6 +1067,36 @@ def test_iloc_non_unique_indexing(self): result = df2.loc[idx] assert_frame_equal(result, expected) + def test_mi_access(self): + + # GH 4145 + data = """h1 main h3 sub h5 +0 a A 1 A1 1 +1 b B 2 B1 2 +2 c B 3 A1 3 +3 d A 4 B2 4 +4 e A 5 B2 5 +5 f B 6 A2 6 +""" + + df = pd.read_csv(StringIO(data),sep='\s+',index_col=0) + df2 = df.set_index(['main', 'sub']).T.sort_index(1) + index = Index(['h1','h3','h5']) + columns = MultiIndex.from_tuples([('A','A1')],names=['main','sub']) + expected = DataFrame([['a',1,1]],index=columns,columns=index).T + + result = df2.loc[:,('A','A1')] + assert_frame_equal(result,expected) + + result = df2[('A','A1')] + assert_frame_equal(result,expected) + + # GH 4146, not returning a block manager when selecting a unique index + # from a duplicate index + expected = DataFrame([['a',1,1]],index=['A1'],columns=['h1','h3','h5'],).T + df3 = df2['A'] + result = df3['A1'] + assert_frame_equal(result,expected) if __name__ == '__main__': import nose