From 432c67254edc09a3ef456047156c814e4b43fed1 Mon Sep 17 00:00:00 2001
From: jreback
Date: Mon, 29 Apr 2013 12:04:14 -0400
Subject: [PATCH 1/5] BUG: GH3468 Fix assigning a new index to a duplicate
 index in a DataFrame would fail

---
 RELEASE.rst                |  2 ++
 pandas/core/internals.py   | 15 ++++++++++-----
 pandas/tests/test_frame.py | 15 +++++++++++++++
 3 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/RELEASE.rst b/RELEASE.rst
index f3fb98535cb61..e368b70b721ce 100644
--- a/RELEASE.rst
+++ b/RELEASE.rst
@@ -61,6 +61,7 @@ pandas 0.11.1
   - Fix regression in a DataFrame apply with axis=1, objects were not being
     converted back to base dtypes correctly (GH3480_)
   - Fix issue when storing uint dtypes in an HDFStore. (GH3493_)
+  - Fix assigning a new index to a duplicate index in a DataFrame would fail (GH3468_)
 
 .. _GH3164: https://github.com/pydata/pandas/issues/3164
 .. _GH3251: https://github.com/pydata/pandas/issues/3251
@@ -75,6 +76,7 @@ pandas 0.11.1
 .. _GH3455: https://github.com/pydata/pandas/issues/3455
 .. _GH3457: https://github.com/pydata/pandas/issues/3457
 .. _GH3461: https://github.com/pydata/pandas/issues/3461
+.. _GH3468: https://github.com/pydata/pandas/issues/3468
 .. _GH3448: https://github.com/pydata/pandas/issues/3448
 .. _GH3449: https://github.com/pydata/pandas/issues/3449
 .. _GH3493: https://github.com/pydata/pandas/issues/3493
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index 03cfd18f5afe5..2052b269a8165 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -56,11 +56,16 @@ def _gi(self, arg):
     @property
     def ref_locs(self):
         if self._ref_locs is None:
-            indexer = self.ref_items.get_indexer(self.items)
-            indexer = com._ensure_platform_int(indexer)
-            if (indexer == -1).any():
-                raise AssertionError('Some block items were not in block '
-                                     'ref_items')
+            ri = self.ref_items
+            if ri.is_unique:
+                indexer = ri.get_indexer(self.items)
+                indexer = com._ensure_platform_int(indexer)
+                if (indexer == -1).any():
+                    raise AssertionError('Some block items were not in block '
+                                         'ref_items')
+            else:
+                indexer = np.arange(len(ri))
+
             self._ref_locs = indexer
         return self._ref_locs
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
index 7bafed216b9b9..6b69de604818f 100644
--- a/pandas/tests/test_frame.py
+++ b/pandas/tests/test_frame.py
@@ -9201,6 +9201,21 @@ def test_assign_columns(self):
         assert_series_equal(self.frame['C'], frame['baz'])
         assert_series_equal(self.frame['hi'], frame['foo2'])
 
+    def test_assign_columns_with_dups(self):
+
+        # GH 3468 related
+        df = DataFrame([[1,2]], columns=['a','a'])
+        df.columns = ['a','a.1']
+
+        expected = DataFrame([[1,2]], columns=['a','a.1'])
+        assert_frame_equal(df, expected)
+
+        df = DataFrame([[1,2]], columns=['a','a'])
+        df.columns = ['b','b']
+
+        expected = DataFrame([[1,2]], columns=['b','b'])
+        assert_frame_equal(df, expected)
+
     def test_cast_internals(self):
         casted = DataFrame(self.frame._data, dtype=int)
         expected = DataFrame(self.frame._series, dtype=int)
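A quick illustration of what this first patch fixes, as a minimal sketch (assumes a pandas build with the patch applied; it mirrors the new test above):

from pandas import DataFrame

# a frame with duplicate column labels; re-assigning df.columns previously
# raised AssertionError('Some block items were not in block ref_items')
df = DataFrame([[1, 2]], columns=['a', 'a'])
df.columns = ['a', 'a.1']   # with the fix, labels are taken positionally
assert list(df.columns) == ['a', 'a.1']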
From 4c756e207ac8b6fe4411176bda30e089b8b3c9cc Mon Sep 17 00:00:00 2001
From: jreback
Date: Mon, 29 Apr 2013 16:09:03 -0400
Subject: [PATCH 2/5] ENH: support for having duplicative indices across
 blocks (dtypes)

BUG: fix construction of a DataFrame with duplicative indices
---
 RELEASE.rst                    |   6 ++
 pandas/core/internals.py       | 138 +++++++++++++++++++++----------
 pandas/tests/test_frame.py     |  34 +++++++-
 pandas/tests/test_indexing.py  |   7 ++
 pandas/tests/test_internals.py |   2 +-
 5 files changed, 145 insertions(+), 42 deletions(-)

diff --git a/RELEASE.rst b/RELEASE.rst
index e368b70b721ce..feb94053b5f73 100644
--- a/RELEASE.rst
+++ b/RELEASE.rst
@@ -62,6 +62,12 @@ pandas 0.11.1
     to base dtypes correctly (GH3480_)
   - Fix issue when storing uint dtypes in an HDFStore. (GH3493_)
   - Fix assigning a new index to a duplicate index in a DataFrame would fail (GH3468_)
+  - ref_locs support to allow duplicative indices across dtypes (GH3468_)
+  - Non-unique index support clarified (GH3468_)
+
+    - Fix assigning a new index to a duplicate index in a DataFrame would fail
+    - Fix construction of a DataFrame with a duplicate index
+    - ref_locs support to allow duplicative indices across dtypes
 
 .. _GH3164: https://github.com/pydata/pandas/issues/3164
 .. _GH3251: https://github.com/pydata/pandas/issues/3251
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index 2052b269a8165..5b690869708dd 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -56,15 +56,11 @@ def _gi(self, arg):
     @property
     def ref_locs(self):
         if self._ref_locs is None:
-            ri = self.ref_items
-            if ri.is_unique:
-                indexer = ri.get_indexer(self.items)
-                indexer = com._ensure_platform_int(indexer)
-                if (indexer == -1).any():
-                    raise AssertionError('Some block items were not in block '
-                                         'ref_items')
-            else:
-                indexer = np.arange(len(ri))
+            indexer = self.ref_items.get_indexer(self.items)
+            indexer = com._ensure_platform_int(indexer)
+            if (indexer == -1).any():
+                raise AssertionError('Some block items were not in block '
+                                     'ref_items')
 
             self._ref_locs = indexer
         return self._ref_locs
@@ -884,7 +880,7 @@ class BlockManager(object):
     -----
     This is *not* a public API class
     """
-    __slots__ = ['axes', 'blocks', '_known_consolidated', '_is_consolidated']
+    __slots__ = ['axes', 'blocks', '_known_consolidated', '_is_consolidated', '_ref_locs']
 
     def __init__(self, blocks, axes, do_integrity_check=True):
         self.axes = [_ensure_index(ax) for ax in axes]
@@ -920,11 +916,83 @@ def set_axis(self, axis, value):
         if len(value) != len(cur_axis):
             raise Exception('Length mismatch (%d vs %d)'
                             % (len(value), len(cur_axis)))
+
         self.axes[axis] = value
 
         if axis == 0:
-            for block in self.blocks:
-                block.set_ref_items(self.items, maybe_rename=True)
+
+            # unique, we can take
+            if cur_axis.is_unique:
+                for block in self.blocks:
+                    block.set_ref_items(self.items, maybe_rename=True)
+
+            # compute a duplicate indexer that we can use to take
+            # the new items from ref_items (in place of _ref_items)
+            else:
+                self.set_ref_locs(cur_axis)
+                for block in self.blocks:
+                    block.set_ref_items(self.items, maybe_rename=True)
+
+    def set_ref_locs(self, labels = None):
+        # if we have a non-unique index on this axis, set the indexers
+        # we need to set an absolute indexer for the blocks
+        # return the indexer if we are not unique
+        if labels is None:
+            labels = self.items
+
+        if labels.is_unique:
+            return None
+
+        #### THIS IS POTENTIALLY VERY SLOW #####
+
+        # if we are already computed, then we are done
+        if getattr(self,'_ref_locs',None) is not None:
+            return self._ref_locs
+
+        blocks = self.blocks
+
+        # initialize
+        blockmap = dict()
+        for b in blocks:
+            arr = np.empty(len(b.items),dtype='int64')
+            arr.fill(-1)
+            b._ref_locs = arr
+
+            # add this block to the blockmap for each
+            # of the items in the block
+            for item in b.items:
+                if item not in blockmap:
+                    blockmap[item] = []
+                blockmap[item].append(b)
+
+        rl = np.empty(len(labels),dtype=object)
+        for i, item in enumerate(labels.values):
+
+            try:
+                block = blockmap[item].pop(0)
+            except:
+                raise Exception("not enough items in set_ref_locs")
+
+            indexer = np.arange(len(block.items))
+            mask = (block.items == item) & (block._ref_locs == -1)
+            if not mask.any():
+
+                # this case will catch a comparison of an index of tuples
+                mask = np.empty(len(block.items),dtype=bool)
+                mask.fill(False)
+                for j, (bitem, brl) in enumerate(zip(block.items,block._ref_locs)):
+                    mask[j] = bitem == item and brl == -1
+
+            indices = indexer[mask]
+            if len(indices):
+                idx = indices[0]
+            else:
+                raise Exception("already set too many items in set_ref_locs")
+
+            block._ref_locs[idx] = i
+            rl[i] = (block,idx)
+
+        self._ref_locs = rl
+        return rl
 
     # make items read only for now
     def _get_items(self):
@@ -1392,26 +1460,11 @@ def iget(self, i):
         item = self.items[i]
         if self.items.is_unique:
             return self.get(item)
-        else:
-            # ugh
-            try:
-                inds, = (self.items == item).nonzero()
-            except AttributeError:  # MultiIndex
-                inds, = self.items.map(lambda x: x == item).nonzero()
-
-            _, block = self._find_block(item)
-
-            try:
-                binds, = (block.items == item).nonzero()
-            except AttributeError:  # MultiIndex
-                binds, = block.items.map(lambda x: x == item).nonzero()
-
-            for j, (k, b) in enumerate(zip(inds, binds)):
-                if i == k:
-                    return block.values[b]
-
-            raise Exception('Cannot have duplicate column names '
-                            'split across dtypes')
+
+        # compute the duplicative indexer if needed
+        ref_locs = self.set_ref_locs()
+        b, loc = ref_locs[i]
+        return b.values[loc]
 
     def get_scalar(self, tup):
         """
@@ -1587,6 +1640,8 @@ def _reindex_indexer_items(self, new_items, indexer, fill_value):
         # keep track of what items aren't found anywhere
         mask = np.zeros(len(item_order), dtype=bool)
 
+        new_axes = [new_items] + self.axes[1:]
+
         new_blocks = []
         for blk in self.blocks:
             blk_indexer = blk.items.get_indexer(item_order)
@@ -1610,7 +1665,7 @@ def _reindex_indexer_items(self, new_items, indexer, fill_value):
             new_blocks.append(na_block)
 
         new_blocks = _consolidate(new_blocks, new_items)
-        return BlockManager(new_blocks, [new_items] + self.axes[1:])
+        return BlockManager(new_blocks, new_axes)
 
     def reindex_items(self, new_items, copy=True, fill_value=np.nan):
         """
@@ -1624,6 +1679,7 @@ def reindex_items(self, new_items, copy=True, fill_value=np.nan):
 
         # TODO: this part could be faster (!)
         new_items, indexer = self.items.reindex(new_items)
+        new_axes = [new_items] + self.axes[1:]
 
         # could have some pathological (MultiIndex) issues here
         new_blocks = []
@@ -1648,7 +1704,7 @@ def reindex_items(self, new_items, copy=True, fill_value=np.nan):
             new_blocks.append(na_block)
 
         new_blocks = _consolidate(new_blocks, new_items)
-        return BlockManager(new_blocks, [new_items] + self.axes[1:])
+        return BlockManager(new_blocks, new_axes)
 
     def _make_na_block(self, items, ref_items, fill_value=np.nan):
         # TODO: infer dtypes other than float64 from fill_value
@@ -1690,11 +1746,11 @@ def merge(self, other, lsuffix=None, rsuffix=None):
         this, other = self._maybe_rename_join(other, lsuffix, rsuffix)
 
         cons_items = this.items + other.items
-        consolidated = _consolidate(this.blocks + other.blocks, cons_items)
-
         new_axes = list(this.axes)
         new_axes[0] = cons_items
 
+        consolidated = _consolidate(this.blocks + other.blocks, cons_items)
+
         return BlockManager(consolidated, new_axes)
 
     def _maybe_rename_join(self, other, lsuffix, rsuffix, copydata=True):
@@ -1907,7 +1963,6 @@ def form_blocks(arrays, names, axes):
             na_block = make_block(block_values, extra_items, items)
             blocks.append(na_block)
 
-    blocks = _consolidate(blocks, items)
 
     return blocks
 
@@ -1958,9 +2013,6 @@ def _shape_compat(x):
 
     names, arrays = zip(*tuples)
 
-    # index may box values
-    items = ref_items[ref_items.isin(names)]
-
     first = arrays[0]
     shape = (len(arrays),) + _shape_compat(first)
 
@@ -1968,6 +2020,14 @@ def _shape_compat(x):
     for i, arr in enumerate(arrays):
         stacked[i] = _asarray_compat(arr)
 
+    # index may box values
+    if ref_items.is_unique:
+        items = ref_items[ref_items.isin(names)]
+    else:
+        items = _ensure_index([ n for n in names if n in ref_items ])
+        if len(items) != len(stacked):
+            raise Exception("invalid names passed _stack_arrays")
+
     return items, stacked
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
index 6b69de604818f..ee409c4a83256 100644
--- a/pandas/tests/test_frame.py
+++ b/pandas/tests/test_frame.py
@@ -9204,18 +9204,48 @@ def test_assign_columns(self):
     def test_assign_columns_with_dups(self):
 
         # GH 3468 related
+
+        # basic
         df = DataFrame([[1,2]], columns=['a','a'])
         df.columns = ['a','a.1']
-
+        str(df)
         expected = DataFrame([[1,2]], columns=['a','a.1'])
         assert_frame_equal(df, expected)
 
+        df = DataFrame([[1,2,3]], columns=['b','a','a'])
+        df.columns = ['b','a','a.1']
+        str(df)
+        expected = DataFrame([[1,2,3]], columns=['b','a','a.1'])
+        assert_frame_equal(df, expected)
+
+        # with a dup index
         df = DataFrame([[1,2]], columns=['a','a'])
         df.columns = ['b','b']
-
+        str(df)
         expected = DataFrame([[1,2]], columns=['b','b'])
         assert_frame_equal(df, expected)
 
+        # multi-dtype
+        df = DataFrame([[1,2,1.,2.,3.,'foo','bar']], columns=['a','a','b','b','d','c','c'])
+        df.columns = list('ABCDEFG')
+        str(df)
+        expected = DataFrame([[1,2,1.,2.,3.,'foo','bar']], columns=list('ABCDEFG'))
+        assert_frame_equal(df, expected)
+
+        # this is an error because we cannot disambiguate the dup columns
+        self.assertRaises(Exception, lambda: DataFrame([[1,2,'foo','bar']], columns=['a','a','a','a']))
+
+        # dups across blocks
+        df_float = DataFrame(np.random.randn(10, 3),dtype='float64')
+        df_int = DataFrame(np.random.randn(10, 3),dtype='int64')
+        df_bool = DataFrame(True,index=df_float.index,columns=df_float.columns)
+        df_object = DataFrame('foo',index=df_float.index,columns=df_float.columns)
+        df_dt = DataFrame(Timestamp('20010101'),index=df_float.index,columns=df_float.columns)
+        df = pan.concat([ df_float, df_int, df_bool, df_object, df_dt ], axis=1)
+
+        result = df._data.set_ref_locs()
+        self.assert_(len(result) == len(df.columns))
+
     def test_cast_internals(self):
         casted = DataFrame(self.frame._data, dtype=int)
         expected = DataFrame(self.frame._series, dtype=int)
diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py
index 86cd0ef524b35..8e1ea569973a6 100644
--- a/pandas/tests/test_indexing.py
+++ b/pandas/tests/test_indexing.py
@@ -772,6 +772,13 @@ def test_dups_fancy_indexing(self):
         expected = Index(['b','a','a'])
         self.assert_(result.equals(expected))
 
+        # across dtypes
+        df = DataFrame([[1,2,1.,2.,3.,'foo','bar']], columns=list('aaaaaaa'))
+        result = DataFrame([[1,2,1.,2.,3.,'foo','bar']])
+        result.columns = list('aaaaaaa')
+        assert_frame_equal(df,result)
+
+
 if __name__ == '__main__':
     import nose
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py
index eec5f5632d36b..e25bd0de769a7 100644
--- a/pandas/tests/test_internals.py
+++ b/pandas/tests/test_internals.py
@@ -268,7 +268,7 @@ def test_duplicate_item_failure(self):
             b.ref_items = items
 
         mgr = BlockManager(blocks, [items, np.arange(N)])
-        self.assertRaises(Exception, mgr.iget, 1)
+        mgr.iget(1)
 
     def test_contains(self):
         self.assert_('a' in self.mgr)
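A sketch of what patch 2 enables end-to-end (assumes the patched build; `pd.concat` stands in for the tests' `pan.concat` alias):

import numpy as np
import pandas as pd

# duplicate labels that live in *different* blocks (float, int, object);
# positional access previously raised
# 'Cannot have duplicate column names split across dtypes'
df = pd.concat([pd.DataFrame({'a': np.arange(3.0)}),       # float64 block
                pd.DataFrame({'a': np.arange(3)}),         # int64 block
                pd.DataFrame({'a': ['x', 'y', 'z']})],     # object block
               axis=1)

# iget/iloc now resolve each position through the (block, offset) ref_locs map
for i in range(len(df.columns)):
    df.iloc[:, i]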
From b4677c195f337224cd960fcb2d856ca122a98b5c Mon Sep 17 00:00:00 2001
From: jreback
Date: Tue, 30 Apr 2013 13:13:30 -0400
Subject: [PATCH 3/5] BUG: enabled applymap to work (and updated
 internals/convert to use iget) when using a non-unique index (GH2786 for
 the warning and GH3230 for applymap)

TST: test for GH2194 (which is fixed)
---
 RELEASE.rst                |  6 ++++++
 pandas/core/frame.py       |  3 ---
 pandas/core/internals.py   | 25 +++++++++++++------------
 pandas/tests/test_frame.py | 28 +++++++++++++++++++++-------
 4 files changed, 40 insertions(+), 22 deletions(-)

diff --git a/RELEASE.rst b/RELEASE.rst
index feb94053b5f73..38298fde12ff0 100644
--- a/RELEASE.rst
+++ b/RELEASE.rst
@@ -68,8 +68,14 @@ pandas 0.11.1
     - Fix assigning a new index to a duplicate index in a DataFrame would fail
     - Fix construction of a DataFrame with a duplicate index
     - ref_locs support to allow duplicative indices across dtypes
+      (GH2194_)
+  - applymap on a DataFrame with a non-unique index now works
+    (removed warning) (GH2786_), and fix (GH3230_)
 
 .. _GH3164: https://github.com/pydata/pandas/issues/3164
+.. _GH2786: https://github.com/pydata/pandas/issues/2786
+.. _GH2194: https://github.com/pydata/pandas/issues/2194
+.. _GH3230: https://github.com/pydata/pandas/issues/3230
 .. _GH3251: https://github.com/pydata/pandas/issues/3251
 .. _GH3379: https://github.com/pydata/pandas/issues/3379
 .. _GH3480: https://github.com/pydata/pandas/issues/3480
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 2cb7608c7aba6..8bfdee3b75170 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -4261,9 +4261,6 @@ def infer(x):
             if com.is_datetime64_dtype(x):
                 x = lib.map_infer(x, lib.Timestamp)
             return lib.map_infer(x, func)
-        #GH2786
-        if not self.columns.is_unique:
-            raise ValueError("applymap does not support dataframes having duplicate column labels")
         return self.apply(infer)
 
     #----------------------------------------------------------------------
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index 5b690869708dd..c874b061dd63d 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -165,6 +165,9 @@ def get(self, item):
         loc = self.items.get_loc(item)
         return self.values[loc]
 
+    def iget(self, i):
+        return self.values[i]
+
     def set(self, item, value):
         """
         Modify Block in-place with new item value
@@ -711,7 +714,7 @@ def convert(self, convert_dates = True, convert_numeric = True, copy = True):
         # attempt to create new type blocks
         blocks = []
         for i, c in enumerate(self.items):
-            values = self.get(c)
+            values = self.iget(i)
 
             values = com._possibly_convert_objects(values, convert_dates=convert_dates, convert_numeric=convert_numeric)
             values = _block_shape(values)
@@ -920,17 +923,14 @@ def set_axis(self, axis, value):
 
         self.axes[axis] = value
 
         if axis == 0:
 
-            # unique, we can take
-            if cur_axis.is_unique:
-                for block in self.blocks:
-                    block.set_ref_items(self.items, maybe_rename=True)
+            # we have a non-unique index, so set up the ref_locs
+            if not cur_axis.is_unique:
+                self.set_ref_locs(cur_axis)
 
-            # compute a duplicate indexer that we can use to take
-            # the new items from ref_items (in place of _ref_items)
-            else:
-                self.set_ref_locs(cur_axis)
-                for block in self.blocks:
-                    block.set_ref_items(self.items, maybe_rename=True)
+            # take via ref_locs
+            for block in self.blocks:
+                block.set_ref_items(self.items, maybe_rename=True)
 
     def set_ref_locs(self, labels = None):
         # if we have a non-unique index on this axis, set the indexers
@@ -945,8 +945,9 @@ def set_ref_locs(self, labels = None):
         #### THIS IS POTENTIALLY VERY SLOW #####
 
         # if we are already computed, then we are done
-        if getattr(self,'_ref_locs',None) is not None:
-            return self._ref_locs
+        rl = getattr(self,'_ref_locs',None)
+        if rl is not None:
+            return rl
 
         blocks = self.blocks
 
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
index ee409c4a83256..cb3799c28d0cf 100644
--- a/pandas/tests/test_frame.py
+++ b/pandas/tests/test_frame.py
@@ -7492,12 +7492,15 @@ def test_applymap(self):
         self.assert_(result.dtypes[0] == object)
 
         # GH2786
-        df = DataFrame(np.random.random((3,4)))
-        df.columns = ['a','a','a','a']
-        try:
-            df.applymap(str)
-        except ValueError as e:
-            self.assertTrue("support" in str(e))
+        df = DataFrame(np.random.random((3,4)))
+        df2 = df.copy()
+        cols = ['a','a','a','a']
+        df.columns = cols
+
+        expected = df2.applymap(str)
+        expected.columns = cols
+        result = df.applymap(str)
+        assert_frame_equal(result,expected)
 
     def test_filter(self):
         # items
@@ -9201,7 +9204,7 @@ def test_assign_columns(self):
         assert_series_equal(self.frame['C'], frame['baz'])
         assert_series_equal(self.frame['hi'], frame['foo2'])
 
-    def test_assign_columns_with_dups(self):
+    def test_columns_with_dups(self):
 
         # GH 3468 related
 
@@ -9246,6 +9249,17 @@ def test_columns_with_dups(self):
         result = df._data.set_ref_locs()
         self.assert_(len(result) == len(df.columns))
 
+        # testing iget
+        for i in range(len(df.columns)):
+            df.iloc[:,i]
+
+        # dup columns across dtype GH 2079/2194
+        vals = [[1, -1, 2.], [2, -2, 3.]]
+        rs = DataFrame(vals, columns=['A', 'A', 'B'])
+        xp = DataFrame(vals)
+        xp.columns = ['A', 'A', 'B']
+        assert_frame_equal(rs, xp)
+
     def test_cast_internals(self):
         casted = DataFrame(self.frame._data, dtype=int)
         expected = DataFrame(self.frame._series, dtype=int)
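What patch 3 unlocks, as a short sketch (assumes the patched build; this is essentially the rewritten GH2786 test, which previously expected a ValueError):

import numpy as np
from pandas import DataFrame

df = DataFrame(np.random.random((3, 4)))
df.columns = ['a', 'a', 'a', 'a']

# previously: ValueError("applymap does not support dataframes having
# duplicate column labels"); now each cell is mapped positionally, since
# convert() walks blocks by position (iget) instead of by label (get)
result = df.applymap(str)
assert (result.dtypes == object).all()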
From b8382a3ca71f1c06d453b909005024a0ff7cab93 Mon Sep 17 00:00:00 2001
From: jreback
Date: Wed, 1 May 2013 20:23:43 -0400
Subject: [PATCH 4/5] BUG: GH3495 change core/format/CSVFormatter.save to
 allow generic way of dealing with columns duplicate or not

---
 RELEASE.rst                   |  10 +-
 pandas/core/format.py         |  35 ++---
 pandas/core/internals.py      | 232 ++++++++++++++++++++++------------
 pandas/tests/test_frame.py    |  34 +++--
 pandas/tests/test_indexing.py |   6 +
 5 files changed, 198 insertions(+), 119 deletions(-)

diff --git a/RELEASE.rst b/RELEASE.rst
index 38298fde12ff0..1a86ac02b2f7e 100644
--- a/RELEASE.rst
+++ b/RELEASE.rst
@@ -61,16 +61,15 @@ pandas 0.11.1
   - Fix regression in a DataFrame apply with axis=1, objects were not being
     converted back to base dtypes correctly (GH3480_)
   - Fix issue when storing uint dtypes in an HDFStore. (GH3493_)
-  - Fix assigning a new index to a duplicate index in a DataFrame would fail (GH3468_)
-  - ref_locs support to allow duplicative indices across dtypes (GH3468_)
   - Non-unique index support clarified (GH3468_)
 
-    - Fix assigning a new index to a duplicate index in a DataFrame would fail
+    - Fix assigning a new index to a duplicate index in a DataFrame would fail (GH3468_)
     - Fix construction of a DataFrame with a duplicate index
-    - ref_locs support to allow duplicative indices across dtypes
-      (GH2194_)
-  - applymap on a DataFrame with a non-unique index now works
-    (removed warning) (GH2786_), and fix (GH3230_)
+    - ref_locs support to allow duplicative indices across dtypes,
+      allows iget support to always find the index (even across dtypes) (GH2194_)
+    - applymap on a DataFrame with a non-unique index now works
+      (removed warning) (GH2786_), and fix (GH3230_)
+    - Fix to_csv to handle non-unique columns (GH3495_)
 
 .. _GH3164: https://github.com/pydata/pandas/issues/3164
 .. _GH2786: https://github.com/pydata/pandas/issues/2786
@@ -91,6 +90,7 @@ pandas 0.11.1
 .. _GH3468: https://github.com/pydata/pandas/issues/3468
 .. _GH3448: https://github.com/pydata/pandas/issues/3448
 .. _GH3449: https://github.com/pydata/pandas/issues/3449
+.. _GH3495: https://github.com/pydata/pandas/issues/3495
 .. _GH3493: https://github.com/pydata/pandas/issues/3493
diff --git a/pandas/core/format.py b/pandas/core/format.py
index 5b68b26a41b77..fa2135bb4310c 100644
--- a/pandas/core/format.py
+++ b/pandas/core/format.py
@@ -820,21 +820,7 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None,
         self.blocks = self.obj._data.blocks
         ncols = sum(len(b.items) for b in self.blocks)
         self.data = [None] * ncols
-
-        if self.obj.columns.is_unique:
-            self.colname_map = dict((k,i) for i,k in enumerate(self.obj.columns))
-        else:
-            ks = [set(x.items) for x in self.blocks]
-            u = len(reduce(lambda a,x: a.union(x),ks,set()))
-            t = sum(map(len,ks))
-            if u != t:
-                if len(set(self.cols)) != len(self.cols):
-                    raise NotImplementedError("duplicate columns with differing dtypes are unsupported")
-                else:
-                    # if columns are not unique and we acces this,
-                    # we're doing it wrong
-                    pass
+        self.column_map = self.obj._data.get_items_map()
 
         if chunksize is None:
             chunksize = (100000/ (len(self.cols) or 1)) or 1
@@ -1034,18 +1020,13 @@ def _save_chunk(self, start_i, end_i):
 
         # create the data for a chunk
         slicer = slice(start_i,end_i)
-        if self.obj.columns.is_unique:
-            for i in range(len(self.blocks)):
-                b = self.blocks[i]
-                d = b.to_native_types(slicer=slicer, na_rep=self.na_rep, float_format=self.float_format)
-                for j, k in enumerate(b.items):
-                    # self.data is a preallocated list
-                    self.data[self.colname_map[k]] = d[j]
-        else:
-            # self.obj should contain a proper view of the dataframes
-            # with the specified ordering of cols if cols was specified
-            for i in range(len(self.obj.columns)):
-                self.data[i] = self.obj.icol(i).values[slicer].tolist()
+        for i in range(len(self.blocks)):
+            b = self.blocks[i]
+            d = b.to_native_types(slicer=slicer, na_rep=self.na_rep, float_format=self.float_format)
+            for j, item in enumerate(b.items):
+
+                # self.data is a preallocated list
+                self.data[self.column_map[b][j]] = d[j]
 
         ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep, float_format=self.float_format)
 
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index c874b061dd63d..5c0f9253beb62 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -65,6 +65,11 @@ def ref_locs(self):
             self._ref_locs = indexer
         return self._ref_locs
 
+    def set_ref_locs(self, placement):
+        """ explicitly set the ref_locs indexer, only necessary for duplicate indices """
+        if placement is not None:
+            self._ref_locs = np.array(placement,dtype='int64')
+
     def set_ref_items(self, ref_items, maybe_rename=True):
         """
         If maybe_rename=True, need to set the items for this guy
@@ -883,7 +888,7 @@ class BlockManager(object):
     -----
     This is *not* a public API class
     """
-    __slots__ = ['axes', 'blocks', '_known_consolidated', '_is_consolidated', '_ref_locs']
+    __slots__ = ['axes', 'blocks', '_known_consolidated', '_is_consolidated', '_ref_locs', '_items_map']
 
     def __init__(self, blocks, axes, do_integrity_check=True):
         self.axes = [_ensure_index(ax) for ax in axes]
@@ -901,6 +906,10 @@ def __init__(self, blocks, axes, do_integrity_check=True):
 
         self._consolidate_check()
 
+        # we have a duplicate items index, set up the block maps
+        if not self.items.is_unique:
+            self._set_ref_locs(do_refs=True)
+
     @classmethod
     def make_empty(self):
         return BlockManager([], [[], []])
@@ -924,76 +933,135 @@ def set_axis(self, axis, value):
 
         if axis == 0:
 
-            # we have a non-unique index, so set up the ref_locs
-            if not cur_axis.is_unique:
-                self.set_ref_locs(cur_axis)
+            # set/reset ref_locs based on the current index
+            # and map the new index if needed
+            self._set_ref_locs(labels=cur_axis)
 
             # take via ref_locs
             for block in self.blocks:
                 block.set_ref_items(self.items, maybe_rename=True)
 
-    def set_ref_locs(self, labels = None):
-        # if we have a non-unique index on this axis, set the indexers
-        # we need to set an absolute indexer for the blocks
-        # return the indexer if we are not unique
+            # set/reset ref_locs based on the new index
+            self._set_ref_locs(labels=value, do_refs=True)
+
+    def _set_ref_locs(self, labels=None, do_refs=False):
+        """
+        if we have a non-unique index on this axis, set the indexers
+        we need to set an absolute indexer for the blocks
+        return the indexer if we are not unique
+
+        labels : the (new) labels for this manager
+        do_refs : boolean, whether to reset the labels (on a 1-1 mapping)
+
+        """
+
+        im = None
         if labels is None:
             labels = self.items
+        else:
+            _ensure_index(labels)
 
-        if labels.is_unique:
-            return None
+        # we are unique, and coming from a unique
+        if labels.is_unique and not do_refs:
 
-        #### THIS IS POTENTIALLY VERY SLOW #####
+            # reset our ref locs
+            self._ref_locs = None
+            for b in self.blocks:
+                b._ref_locs = None
 
-        # if we are already computed, then we are done
-        rl = getattr(self,'_ref_locs',None)
-        if rl is not None:
-            return rl
+            return None
 
-        blocks = self.blocks
+        # we are going to a non-unique index
+        # we have ref_locs on the block at this point
+        # or if ref_locs are not set, then we must assume a block
+        # ordering
+        if not labels.is_unique and do_refs:
+
+            # create the items map
+            im = getattr(self,'_items_map',None)
+            if im is None:
+
+                im = dict()
+
+                def maybe_create_block(block):
+                    try:
+                        return im[block]
+                    except:
+                        im[block] = l = [ None ] * len(block.items)
+                        return l
+
+                count_items = 0
+                for block in self.blocks:
+
+                    # if we have a duplicate index but
+                    # _ref_locs have not been set....then
+                    # have to assume ordered blocks are passed
+                    num_items = len(block.items)
+                    try:
+                        rl = block.ref_locs
+                    except:
+                        rl = np.arange(num_items) + count_items
+
+                    m = maybe_create_block(block)
+                    for i, item in enumerate(block.items):
+                        m[i] = rl[i]
+                    count_items += num_items
+
+                self._items_map = im
+
+            # create the _ref_loc map here
+            rl = np.empty(len(labels),dtype=object)
+            for block, items in im.items():
+                for i, loc in enumerate(items):
+                    rl[loc] = (block,i)
+            self._ref_locs = rl
+            return rl
 
-        # initialize
-        blockmap = dict()
-        for b in blocks:
-            arr = np.empty(len(b.items),dtype='int64')
-            arr.fill(-1)
-            b._ref_locs = arr
+        # return our cached _ref_locs (or will compute again
+        # when we recreate the block manager if needed)
+        return getattr(self,'_ref_locs',None)
 
-            # add this block to the blockmap for each
-            # of the items in the block
-            for item in b.items:
-                if item not in blockmap:
-                    blockmap[item] = []
-                blockmap[item].append(b)
+    def get_items_map(self):
+        """
+        return an inverted ref_loc map for an item index
+        block -> item (in that block) location -> column location
+        """
 
-        rl = np.empty(len(labels),dtype=object)
-        for i, item in enumerate(labels.values):
+        # cache check
+        im = getattr(self,'_items_map',None)
+        if im is not None:
+            return im
+
+        im = dict()
+        rl = self._set_ref_locs()
 
+        def maybe_create_block(block):
             try:
-                block = blockmap[item].pop(0)
+                return im[block]
             except:
-                raise Exception("not enough items in set_ref_locs")
+                im[block] = l = [ None ] * len(block.items)
+                return l
 
-            indexer = np.arange(len(block.items))
-            mask = (block.items == item) & (block._ref_locs == -1)
-            if not mask.any():
+        # we have a non-duplicative index
+        if rl is None:
 
-                # this case will catch a comparison of an index of tuples
-                mask = np.empty(len(block.items),dtype=bool)
-                mask.fill(False)
-                for j, (bitem, brl) in enumerate(zip(block.items,block._ref_locs)):
-                    mask[j] = bitem == item and brl == -1
+            axis = self.axes[0]
+            for block in self.blocks:
 
-            indices = indexer[mask]
-            if len(indices):
-                idx = indices[0]
-            else:
-                raise Exception("already set too many items in set_ref_locs")
+                m = maybe_create_block(block)
+                for i, item in enumerate(block.items):
+                    m[i] = axis.get_loc(item)
 
-            block._ref_locs[idx] = i
-            rl[i] = (block,idx)
+        # use the ref_locs to construct the map
+        else:
 
-        self._ref_locs = rl
-        return rl
+            for i, (block, idx) in enumerate(rl):
+
+                m = maybe_create_block(block)
+                m[idx] = i
+
+        self._items_map = im
+        return im
 
     # make items read only for now
     def _get_items(self):
@@ -1259,13 +1327,16 @@ def get_slice(self, slobj, axis=0, raise_on_error=False):
                                   new_items,
                                   klass=blk.__class__,
                                   fastpath=True)
+                newb.set_ref_locs(blk._ref_locs)
                 new_blocks = [newb]
             else:
                 return self.reindex_items(new_items)
         else:
             new_blocks = self._slice_blocks(slobj, axis)
 
-        return BlockManager(new_blocks, new_axes, do_integrity_check=False)
+        bm = BlockManager(new_blocks, new_axes, do_integrity_check=False)
+        bm._consolidate_inplace()
+        return bm
 
     def _slice_blocks(self, slobj, axis):
         new_blocks = []
@@ -1280,6 +1351,7 @@ def _slice_blocks(self, slobj, axis):
                               block.ref_items,
                               klass=block.__class__,
                               fastpath=True)
+            newb.set_ref_locs(block._ref_locs)
             new_blocks.append(newb)
 
         return new_blocks
@@ -1463,9 +1535,9 @@ def iget(self, i):
             return self.get(item)
 
         # compute the duplicative indexer if needed
-        ref_locs = self.set_ref_locs()
+        ref_locs = self._set_ref_locs()
         b, loc = ref_locs[i]
-        return b.values[loc]
+        return b.iget(loc)
 
     def get_scalar(self, tup):
         """
@@ -1904,54 +1976,55 @@ def form_blocks(arrays, names, axes):
     bool_items = []
     object_items = []
     datetime_items = []
-    for k, v in zip(names, arrays):
+    for i, (k, v) in enumerate(zip(names, arrays)):
         if issubclass(v.dtype.type, np.floating):
-            float_items.append((k, v))
+            float_items.append((i, k, v))
         elif issubclass(v.dtype.type, np.complexfloating):
-            complex_items.append((k, v))
+            complex_items.append((i, k, v))
         elif issubclass(v.dtype.type, np.datetime64):
             if v.dtype != _NS_DTYPE:
                 v = tslib.cast_to_nanoseconds(v)
             if hasattr(v, 'tz') and v.tz is not None:
-                object_items.append((k, v))
+                object_items.append((i, k, v))
             else:
-                datetime_items.append((k, v))
+                datetime_items.append((i, k, v))
         elif issubclass(v.dtype.type, np.integer):
             if v.dtype == np.uint64:
                 # HACK #2355 definite overflow
                 if (v > 2 ** 63 - 1).any():
-                    object_items.append((k, v))
+                    object_items.append((i, k, v))
                     continue
-            int_items.append((k, v))
+            int_items.append((i, k, v))
         elif v.dtype == np.bool_:
-            bool_items.append((k, v))
+            bool_items.append((i, k, v))
         else:
-            object_items.append((k, v))
+            object_items.append((i, k, v))
 
+    is_unique = items.is_unique
     blocks = []
     if len(float_items):
-        float_blocks = _multi_blockify(float_items, items)
+        float_blocks = _multi_blockify(float_items, items, is_unique=is_unique)
         blocks.extend(float_blocks)
 
     if len(complex_items):
-        complex_blocks = _simple_blockify(complex_items, items, np.complex128)
+        complex_blocks = _simple_blockify(complex_items, items, np.complex128, is_unique=is_unique)
         blocks.extend(complex_blocks)
 
     if len(int_items):
-        int_blocks = _multi_blockify(int_items, items)
+        int_blocks = _multi_blockify(int_items, items, is_unique=is_unique)
        blocks.extend(int_blocks)
 
     if len(datetime_items):
-        datetime_blocks = _simple_blockify(datetime_items, items, _NS_DTYPE)
+        datetime_blocks = _simple_blockify(datetime_items, items, _NS_DTYPE, is_unique=is_unique)
         blocks.extend(datetime_blocks)
 
     if len(bool_items):
-        bool_blocks = _simple_blockify(bool_items, items, np.bool_)
+        bool_blocks = _simple_blockify(bool_items, items, np.bool_, is_unique=is_unique)
         blocks.extend(bool_blocks)
 
     if len(object_items) > 0:
-        object_blocks = _simple_blockify(object_items, items, np.object_)
+        object_blocks = _simple_blockify(object_items, items, np.object_, is_unique=is_unique)
         blocks.extend(object_blocks)
 
     if len(extra_items):
@@ -1959,7 +2032,6 @@ def form_blocks(arrays, names, axes):
 
         # empty items -> dtype object
         block_values = np.empty(shape, dtype=object)
-
         block_values.fill(nan)
 
         na_block = make_block(block_values, extra_items, items)
@@ -1968,28 +2040,32 @@ def form_blocks(arrays, names, axes):
 
     return blocks
 
-def _simple_blockify(tuples, ref_items, dtype):
+def _simple_blockify(tuples, ref_items, dtype, is_unique=True):
     """ return a single array of a block that has a single dtype; if dtype is not None, coerce to this dtype """
-    block_items, values = _stack_arrays(tuples, ref_items, dtype)
+    block_items, values, placement = _stack_arrays(tuples, ref_items, dtype)
 
     # CHECK DTYPE?
     if dtype is not None and values.dtype != dtype:  # pragma: no cover
         values = values.astype(dtype)
 
-    return [ make_block(values, block_items, ref_items) ]
-
+    block = make_block(values, block_items, ref_items)
+    if not is_unique:
+        block.set_ref_locs(placement)
+    return [ block ]
 
-def _multi_blockify(tuples, ref_items, dtype = None):
+def _multi_blockify(tuples, ref_items, dtype = None, is_unique=True):
     """ return an array of blocks that potentially have different dtypes """
 
     # group by dtype
-    grouper = itertools.groupby(tuples, lambda x: x[1].dtype)
+    grouper = itertools.groupby(tuples, lambda x: x[2].dtype)
 
     new_blocks = []
     for dtype, tup_block in grouper:
 
-        block_items, values = _stack_arrays(list(tup_block), ref_items, dtype)
+        block_items, values, placement = _stack_arrays(list(tup_block), ref_items, dtype)
         block = make_block(values, block_items, ref_items)
+        if not is_unique:
+            block.set_ref_locs(placement)
         new_blocks.append(block)
 
     return new_blocks
@@ -2012,7 +2088,7 @@ def _shape_compat(x):
     else:
         return x.shape
 
-    names, arrays = zip(*tuples)
+    placement, names, arrays = zip(*tuples)
 
     first = arrays[0]
     shape = (len(arrays),) + _shape_compat(first)
@@ -2029,7 +2105,7 @@ def _shape_compat(x):
     if len(items) != len(stacked):
         raise Exception("invalid names passed _stack_arrays")
 
-    return items, stacked
+    return items, stacked, placement
 
 
 def _blocks_to_series_dict(blocks, index=None):
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
index cb3799c28d0cf..69225c40e36df 100644
--- a/pandas/tests/test_frame.py
+++ b/pandas/tests/test_frame.py
@@ -4973,17 +4973,33 @@ def test_to_csv_dups_cols(self):
         with ensure_clean() as filename:
             df.to_csv(filename) # single dtype, fine
+            result = read_csv(filename,index_col=0)
+            result.columns = df.columns
+            assert_frame_equal(result,df)
 
-        df_float = DataFrame(np.random.randn(1000, 3),dtype='float64')
         df_int = DataFrame(np.random.randn(1000, 3),dtype='int64')
         df_bool = DataFrame(True,index=df_float.index,columns=range(3))
         df_object = DataFrame('foo',index=df_float.index,columns=range(3))
         df_dt = DataFrame(Timestamp('20010101'),index=df_float.index,columns=range(3))
         df = pan.concat([ df_float, df_int, df_bool, df_object, df_dt ], axis=1, ignore_index=True)
 
-        #### this raises because we have duplicate column names across dtypes ####
+        cols = []
+        for i in range(5):
+            cols.extend([0,1,2])
+        df.columns = cols
+
+        from pandas import to_datetime
         with ensure_clean() as filename:
-            self.assertRaises(Exception, df.to_csv, filename)
+            df.to_csv(filename)
+            result = read_csv(filename,index_col=0)
+
+            # date cols
+            for i in ['0.4','1.4','2.4']:
+                result[i] = to_datetime(result[i])
+
+            result.columns = df.columns
+            assert_frame_equal(result,df)
 
         # GH3457
         from pandas.util.testing import makeCustomDataframe as mkdf
@@ -9246,7 +9262,7 @@ def test_columns_with_dups(self):
         df_dt = DataFrame(Timestamp('20010101'),index=df_float.index,columns=df_float.columns)
         df = pan.concat([ df_float, df_int, df_bool, df_object, df_dt ], axis=1)
 
-        result = df._data.set_ref_locs()
+        result = df._data._set_ref_locs()
         self.assert_(len(result) == len(df.columns))
 
         # testing iget
diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py
index 8e1ea569973a6..ae71ec8b35422 100644
--- a/pandas/tests/test_indexing.py
+++ b/pandas/tests/test_indexing.py
@@ -774,8 +774,14 @@ def test_dups_fancy_indexing(self):
         # across dtypes
         df = DataFrame([[1,2,1.,2.,3.,'foo','bar']], columns=list('aaaaaaa'))
+        df.head()
+        str(df)
         result = DataFrame([[1,2,1.,2.,3.,'foo','bar']])
         result.columns = list('aaaaaaa')
+
+        df_v = df.iloc[:,4]
+        res_v = result.iloc[:,4]
+
         assert_frame_equal(df,result)
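The to_csv side of patch 4 as a round-trip sketch (hedged: the file name and the testing helper import are illustrative, and read_csv mangles duplicate headers, so the columns are restored by hand before comparing, just as the rewritten test does):

import pandas as pd

df = pd.DataFrame([[1, 2, 'foo'], [3, 4, 'bar']], columns=['a', 'a', 'b'])

# the writer now scatters each block's values through the
# block -> column-position map (get_items_map) rather than a
# name-keyed dict, so duplicate names across dtypes serialize in order
df.to_csv('dups.csv')

result = pd.read_csv('dups.csv', index_col=0)   # comes back as 'a', 'a.1', 'b'
result.columns = df.columns
pd.testing.assert_frame_equal(result, df)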
From 8c08acaef77e3b901dcbe09b612de785b0c5e782 Mon Sep 17 00:00:00 2001
From: jreback
Date: Thu, 2 May 2013 10:11:09 -0400
Subject: [PATCH 5/5] PERF: allow a cache_readonly to be 'set' if
 allow_setting is passed on the decoration

useful when specifying an index that is **known** to be unique
(e.g. in the case of a default range index)
---
 pandas/core/common.py     |  1 +
 pandas/core/index.py      |  2 +-
 pandas/src/properties.pyx | 35 +++++++++++++++++++++----------
 3 files changed, 27 insertions(+), 11 deletions(-)

diff --git a/pandas/core/common.py b/pandas/core/common.py
index e6ce9fc5fc925..490f269c8c104 100644
--- a/pandas/core/common.py
+++ b/pandas/core/common.py
@@ -1156,6 +1156,7 @@ def _default_index(n):
         values = np.arange(n, dtype=np.int64)
         result = values.view(Int64Index)
         result.name = None
+        result.is_unique = True
         return result
 
diff --git a/pandas/core/index.py b/pandas/core/index.py
index 34edd26a49617..101b69ffc3c7e 100644
--- a/pandas/core/index.py
+++ b/pandas/core/index.py
@@ -278,7 +278,7 @@ def is_monotonic(self):
     def is_lexsorted_for_tuple(self, tup):
         return True
 
-    @cache_readonly
+    @cache_readonly(allow_setting=True)
     def is_unique(self):
         return self._engine.is_unique
 
diff --git a/pandas/src/properties.pyx b/pandas/src/properties.pyx
index 53bb561ef9110..1df11cecf7b94 100644
--- a/pandas/src/properties.pyx
+++ b/pandas/src/properties.pyx
@@ -4,16 +4,20 @@ from cpython cimport PyDict_Contains, PyDict_GetItem, PyDict_SetItem
 
 cdef class cache_readonly(object):
 
     cdef readonly:
-        object fget, name
+        object func, name, allow_setting
 
-    def __init__(self, func):
-        self.fget = func
-        self.name = func.__name__
+    def __init__(self, func=None, allow_setting=False):
+        if func is not None:
+            self.func = func
+            self.name = func.__name__
+        self.allow_setting = allow_setting
 
-    def __get__(self, obj, type):
-        if obj is None:
-            return self.fget
+    def __call__(self, func, doc=None):
+        self.func = func
+        self.name = func.__name__
+        return self
 
+    def __get__(self, obj, typ):
         # Get the cache or set a default one if needed
         cache = getattr(obj, '_cache', None)
@@ -23,12 +27,23 @@ cdef class cache_readonly(object):
         if PyDict_Contains(cache, self.name):
             # not necessary to Py_INCREF
             val = PyDict_GetItem(cache, self.name)
-            return val
         else:
-            val = self.fget(obj)
+            val = self.func(obj)
             PyDict_SetItem(cache, self.name, val)
-            return val
+        return val
+
+    def __set__(self, obj, value):
+
+        if not self.allow_setting:
+            raise Exception("cannot set values for [%s]" % self.name)
+
+        # Get the cache or set a default one if needed
+        cache = getattr(obj, '_cache', None)
+        if cache is None:
+            cache = obj._cache = {}
+        PyDict_SetItem(cache, self.name, value)
+
 cdef class AxisProperty(object):
 
     cdef:
         Py_ssize_t axis
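For reference, a pure-Python sketch of the descriptor behavior this last patch adds (the real implementation above is Cython; the semantics follow properties.pyx):

class cache_readonly(object):

    def __init__(self, func=None, allow_setting=False):
        if func is not None:
            self.func = func
            self.name = func.__name__
        self.allow_setting = allow_setting

    def __call__(self, func, doc=None):
        # supports the @cache_readonly(allow_setting=True) decorator form
        self.func = func
        self.name = func.__name__
        return self

    def __get__(self, obj, typ):
        if obj is None:
            return self
        cache = getattr(obj, '_cache', None)
        if cache is None:
            cache = obj._cache = {}
        if self.name not in cache:
            cache[self.name] = self.func(obj)
        return cache[self.name]

    def __set__(self, obj, value):
        if not self.allow_setting:
            raise Exception("cannot set values for [%s]" % self.name)
        cache = getattr(obj, '_cache', None)
        if cache is None:
            cache = obj._cache = {}
        cache[self.name] = value

This is why _default_index can pre-seed `result.is_unique = True` above: the value is dropped straight into the instance cache, so the O(n) engine uniqueness check never runs for the default integer index.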