From ecf084769c35a997584123059d636041bf9a5e0b Mon Sep 17 00:00:00 2001 From: jreback Date: Sat, 7 Sep 2013 15:12:04 -0400 Subject: [PATCH 1/2] BUG: Bug in concatenation with duplicate columns across dtypes not merging with axis=0 (GH4771) TST: Bug in iloc with a slice index failing (GH4771) --- doc/source/release.rst | 2 ++ pandas/core/internals.py | 2 +- pandas/tests/test_indexing.py | 25 ++++++++++++++++++++++++- pandas/tools/merge.py | 6 +++++- pandas/tools/tests/test_merge.py | 27 +++++++++++++++++++++++++++ 5 files changed, 59 insertions(+), 3 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index e12e6c91d46d0..930f100fd86dc 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -331,6 +331,8 @@ See :ref:`Internal Refactoring` - Bug in multi-indexing with a partial string selection as one part of a MultIndex (:issue:`4758`) - Bug with reindexing on the index with a non-unique index will now raise ``ValueError`` (:issue:`4746`) - Bug in setting with ``loc/ix`` a single indexer with a multi-index axis and a numpy array, related to (:issue:`3777`) + - Bug in concatenation with duplicate columns across dtypes not merging with axis=0 (:issue:`4771`) + - Bug in ``iloc`` with a slice index failing (:issue:`4771`) pandas 0.12 =========== diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 57db36b252e3c..e27430b06c45c 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -2174,7 +2174,7 @@ def get_slice(self, slobj, axis=0, raise_on_error=False): placement=blk._ref_locs) new_blocks = [newb] else: - return self.reindex_items(new_items) + return self.reindex_items(new_items, indexer=np.arange(len(self.items))[slobj]) else: new_blocks = self._slice_blocks(slobj, axis) diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index d6088c2d72525..18ee89fbc5c66 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -16,7 +16,7 @@ MultiIndex, DatetimeIndex, Timestamp) from pandas.util.testing import (assert_almost_equal, assert_series_equal, assert_frame_equal, assert_panel_equal) -from pandas import compat +from pandas import compat, concat import pandas.util.testing as tm import pandas.lib as lib @@ -359,6 +359,29 @@ def test_iloc_getitem_slice(self): self.check_result('slice', 'iloc', slice(1,3), 'ix', { 0 : [2,4], 1: [3,6], 2: [4,8] }, typs = ['ints']) self.check_result('slice', 'iloc', slice(1,3), 'indexer', slice(1,3), typs = ['labels','mixed','ts','floats','empty'], fails = IndexError) + def test_iloc_getitem_slice_dups(self): + + df1 = DataFrame(np.random.randn(10,4),columns=['A','A','B','B']) + df2 = DataFrame(np.random.randint(0,10,size=20).reshape(10,2),columns=['A','C']) + + # axis=1 + df = concat([df1,df2],axis=1) + assert_frame_equal(df.iloc[:,:4],df1) + assert_frame_equal(df.iloc[:,4:],df2) + + df = concat([df2,df1],axis=1) + assert_frame_equal(df.iloc[:,:2],df2) + assert_frame_equal(df.iloc[:,2:],df1) + + assert_frame_equal(df.iloc[:,0:3],concat([df2,df1.iloc[:,[0]]],axis=1)) + + # axis=0 + df = concat([df,df],axis=0) + assert_frame_equal(df.iloc[0:10,:2],df2) + assert_frame_equal(df.iloc[0:10,2:],df1) + assert_frame_equal(df.iloc[10:,:2],df2) + assert_frame_equal(df.iloc[10:,2:],df1) + def test_iloc_getitem_out_of_bounds(self): # out-of-bounds slice diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index 765dbc07b464f..077a3fe6294da 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -992,6 +992,7 @@ def _prepare_blocks(self): blockmaps = [] for data in reindexed_data: data = data.consolidate() + data._set_ref_locs() blockmaps.append(data.get_block_map(typ='dict')) return blockmaps, reindexed_data @@ -1063,7 +1064,10 @@ def _concat_blocks(self, blocks): # or maybe would require performance test) raise PandasError('dtypes are not consistent throughout ' 'DataFrames') - return make_block(concat_values, blocks[0].items, self.new_axes[0]) + return make_block(concat_values, + blocks[0].items, + self.new_axes[0], + placement=blocks[0]._ref_locs) else: offsets = np.r_[0, np.cumsum([len(x._data.axes[0]) for diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index 5cfe22781f362..fde6eb59dfa10 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -1396,6 +1396,33 @@ def test_crossed_dtypes_weird_corner(self): [df, df2], keys=['one', 'two'], names=['first', 'second']) self.assertEqual(result.index.names, ('first', 'second')) + def test_dups_index(self): + # GH 4771 + + # single dtypes + df = DataFrame(np.random.randint(0,10,size=40).reshape(10,4),columns=['A','A','C','C']) + + result = concat([df,df],axis=1) + assert_frame_equal(result.iloc[:,:4],df) + assert_frame_equal(result.iloc[:,4:],df) + + result = concat([df,df],axis=0) + assert_frame_equal(result.iloc[:10],df) + assert_frame_equal(result.iloc[10:],df) + + # multi dtypes + df = concat([DataFrame(np.random.randn(10,4),columns=['A','A','B','B']), + DataFrame(np.random.randint(0,10,size=20).reshape(10,2),columns=['A','C'])], + axis=1) + + result = concat([df,df],axis=1) + assert_frame_equal(result.iloc[:,:6],df) + assert_frame_equal(result.iloc[:,6:],df) + + result = concat([df,df],axis=0) + assert_frame_equal(result.iloc[:10],df) + assert_frame_equal(result.iloc[10:],df) + def test_handle_empty_objects(self): df = DataFrame(np.random.randn(10, 4), columns=list('abcd')) From 75d378bd0213891fcabb6caba43247a0ade662b9 Mon Sep 17 00:00:00 2001 From: jreback Date: Sat, 7 Sep 2013 16:08:51 -0400 Subject: [PATCH 2/2] TST: add append/join tests for merging dup columns BUG: join on dup columns (internally) failing --- pandas/tools/merge.py | 36 +++++++++++++++++++++++--------- pandas/tools/tests/test_merge.py | 21 +++++++++++++++++++ 2 files changed, 47 insertions(+), 10 deletions(-) diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index 077a3fe6294da..d7fedecdb0ef2 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -649,6 +649,7 @@ def __init__(self, data_list, join_index, indexers, axis=1, copy=True): for data, indexer in zip(data_list, indexers): if not data.is_consolidated(): data = data.consolidate() + data._set_ref_locs() self.units.append(_JoinUnit(data.blocks, indexer)) self.join_index = join_index @@ -682,7 +683,6 @@ def get_result(self): blockmaps = self._prepare_blocks() kinds = _get_merge_block_kinds(blockmaps) - result_is_unique = self.result_axes[0].is_unique result_blocks = [] # maybe want to enable flexible copying <-- what did I mean? @@ -692,23 +692,28 @@ def get_result(self): if klass in mapping: klass_blocks.extend((unit, b) for b in mapping[klass]) res_blk = self._get_merged_block(klass_blocks) - - # if we have a unique result index, need to clear the _ref_locs - # a non-unique is set as we are creating - if result_is_unique: - res_blk.set_ref_locs(None) - result_blocks.append(res_blk) return BlockManager(result_blocks, self.result_axes) def _get_merged_block(self, to_merge): if len(to_merge) > 1: + + # placement set here return self._merge_blocks(to_merge) else: unit, block = to_merge[0] - return unit.reindex_block(block, self.axis, - self.result_items, copy=self.copy) + blk = unit.reindex_block(block, self.axis, + self.result_items, copy=self.copy) + + # set placement / invalidate on a unique result + if self.result_items.is_unique and blk._ref_locs is not None: + if not self.copy: + blk = blk.copy() + blk.set_ref_locs(None) + + return blk + def _merge_blocks(self, merge_chunks): """ @@ -736,7 +741,18 @@ def _merge_blocks(self, merge_chunks): # does not sort new_block_items = _concat_indexes([b.items for _, b in merge_chunks]) - return make_block(out, new_block_items, self.result_items) + + # need to set placement if we have a non-unique result + # calculate by the existing placement plus the offset in the result set + placement = None + if not self.result_items.is_unique: + nchunks = len(merge_chunks) + offsets = np.array([0] + [ len(self.result_items) / nchunks ] * (nchunks-1)).cumsum() + placement = [] + for (unit, blk), offset in zip(merge_chunks,offsets): + placement.extend(blk.ref_locs+offset) + + return make_block(out, new_block_items, self.result_items, placement=placement) class _JoinUnit(object): diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index fde6eb59dfa10..f7eb3c125db61 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -1423,6 +1423,27 @@ def test_dups_index(self): assert_frame_equal(result.iloc[:10],df) assert_frame_equal(result.iloc[10:],df) + # append + result = df.iloc[0:8,:].append(df.iloc[8:]) + assert_frame_equal(result, df) + + result = df.iloc[0:8,:].append(df.iloc[8:9]).append(df.iloc[9:10]) + assert_frame_equal(result, df) + + expected = concat([df,df],axis=0) + result = df.append(df) + assert_frame_equal(result, expected) + + def test_join_dups(self): + df = concat([DataFrame(np.random.randn(10,4),columns=['A','A','B','B']), + DataFrame(np.random.randint(0,10,size=20).reshape(10,2),columns=['A','C'])], + axis=1) + + expected = concat([df,df],axis=1) + result = df.join(df,rsuffix='_2') + result.columns = expected.columns + assert_frame_equal(result, expected) + def test_handle_empty_objects(self): df = DataFrame(np.random.randn(10, 4), columns=list('abcd'))