
BUG: Bug in concatenation with duplicate columns across dtypes not merging with axis=0 (GH4771) #4772

Merged: 2 commits, Sep 7, 2013
2 changes: 2 additions & 0 deletions doc/source/release.rst
@@ -331,6 +331,8 @@ See :ref:`Internal Refactoring<whatsnew_0130.refactoring>`
- Bug in multi-indexing with a partial string selection as one part of a MultiIndex (:issue:`4758`)
- Bug with reindexing on the index with a non-unique index will now raise ``ValueError`` (:issue:`4746`)
- Bug in setting with ``loc/ix`` using a single indexer with a multi-index axis and a numpy array (related to :issue:`3777`)
- Bug in concatenation with ``axis=0`` where duplicate columns across dtypes were not merged (:issue:`4771`)
- Bug in ``iloc`` with a slice index failing (:issue:`4771`)

pandas 0.12
===========
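For context, a minimal sketch reproducing the scenario behind GH4771, assuming the public pandas API of this era (``pandas.util.testing`` was the testing namespace at the time; column names are illustrative and mirror the tests added below):

```python
import numpy as np
import pandas as pd
import pandas.util.testing as tm

# Duplicate column labels spread across two dtypes: after consolidation this
# frame holds a float block with items ['A', 'A', 'B', 'B'] and an int block
# with items ['A', 'C'].
df = pd.concat(
    [pd.DataFrame(np.random.randn(10, 4), columns=['A', 'A', 'B', 'B']),
     pd.DataFrame(np.random.randint(0, 10, size=(10, 2)), columns=['A', 'C'])],
    axis=1)

# Before this fix, stacking the frame on itself along axis=0 could scramble
# which duplicate column came from which block; afterwards the two halves
# round-trip exactly.
result = pd.concat([df, df], axis=0)
tm.assert_frame_equal(result.iloc[:10], df)
tm.assert_frame_equal(result.iloc[10:], df)
```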
2 changes: 1 addition & 1 deletion pandas/core/internals.py
@@ -2174,7 +2174,7 @@ def get_slice(self, slobj, axis=0, raise_on_error=False):
placement=blk._ref_locs)
new_blocks = [newb]
else:
return self.reindex_items(new_items)
return self.reindex_items(new_items, indexer=np.arange(len(self.items))[slobj])
else:
new_blocks = self._slice_blocks(slobj, axis)

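The one-line fix above converts the slice into an explicit positional indexer before reindexing, so duplicate labels can no longer be conflated. A standalone sketch of that conversion, pure NumPy with illustrative values:

```python
import numpy as np

items = ['A', 'A', 'B', 'B', 'A', 'C']  # BlockManager items; may be non-unique
slobj = slice(2, 5)

# np.arange(len(items))[slobj] maps the slice to concrete integer positions,
# which stays unambiguous even when the labels themselves repeat.
indexer = np.arange(len(items))[slobj]
print(indexer)                       # [2 3 4]
print([items[i] for i in indexer])   # ['B', 'B', 'A']
```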
25 changes: 24 additions & 1 deletion pandas/tests/test_indexing.py
@@ -16,7 +16,7 @@
MultiIndex, DatetimeIndex, Timestamp)
from pandas.util.testing import (assert_almost_equal, assert_series_equal,
assert_frame_equal, assert_panel_equal)
from pandas import compat
from pandas import compat, concat

import pandas.util.testing as tm
import pandas.lib as lib
@@ -359,6 +359,29 @@ def test_iloc_getitem_slice(self):
self.check_result('slice', 'iloc', slice(1,3), 'ix', { 0 : [2,4], 1: [3,6], 2: [4,8] }, typs = ['ints'])
self.check_result('slice', 'iloc', slice(1,3), 'indexer', slice(1,3), typs = ['labels','mixed','ts','floats','empty'], fails = IndexError)

def test_iloc_getitem_slice_dups(self):

df1 = DataFrame(np.random.randn(10,4),columns=['A','A','B','B'])
df2 = DataFrame(np.random.randint(0,10,size=20).reshape(10,2),columns=['A','C'])

# axis=1
df = concat([df1,df2],axis=1)
assert_frame_equal(df.iloc[:,:4],df1)
assert_frame_equal(df.iloc[:,4:],df2)

df = concat([df2,df1],axis=1)
assert_frame_equal(df.iloc[:,:2],df2)
assert_frame_equal(df.iloc[:,2:],df1)

assert_frame_equal(df.iloc[:,0:3],concat([df2,df1.iloc[:,[0]]],axis=1))

# axis=0
df = concat([df,df],axis=0)
assert_frame_equal(df.iloc[0:10,:2],df2)
assert_frame_equal(df.iloc[0:10,2:],df1)
assert_frame_equal(df.iloc[10:,:2],df2)
assert_frame_equal(df.iloc[10:,2:],df1)

def test_iloc_getitem_out_of_bounds(self):

# out-of-bounds slice
42 changes: 31 additions & 11 deletions pandas/tools/merge.py
@@ -649,6 +649,7 @@ def __init__(self, data_list, join_index, indexers, axis=1, copy=True):
for data, indexer in zip(data_list, indexers):
if not data.is_consolidated():
data = data.consolidate()
data._set_ref_locs()
self.units.append(_JoinUnit(data.blocks, indexer))

self.join_index = join_index
@@ -682,7 +683,6 @@ def get_result(self):
blockmaps = self._prepare_blocks()
kinds = _get_merge_block_kinds(blockmaps)

result_is_unique = self.result_axes[0].is_unique
result_blocks = []

# maybe want to enable flexible copying <-- what did I mean?
@@ -692,23 +692,28 @@
if klass in mapping:
klass_blocks.extend((unit, b) for b in mapping[klass])
res_blk = self._get_merged_block(klass_blocks)

# if we have a unique result index, need to clear the _ref_locs
# a non-unique is set as we are creating
if result_is_unique:
res_blk.set_ref_locs(None)

result_blocks.append(res_blk)

return BlockManager(result_blocks, self.result_axes)

def _get_merged_block(self, to_merge):
if len(to_merge) > 1:

# placement set here
return self._merge_blocks(to_merge)
else:
unit, block = to_merge[0]
return unit.reindex_block(block, self.axis,
self.result_items, copy=self.copy)
blk = unit.reindex_block(block, self.axis,
self.result_items, copy=self.copy)

# set placement / invalidate on a unique result
if self.result_items.is_unique and blk._ref_locs is not None:
if not self.copy:
blk = blk.copy()
blk.set_ref_locs(None)

return blk


def _merge_blocks(self, merge_chunks):
"""
@@ -736,7 +741,18 @@ def _merge_blocks(self, merge_chunks):

# does not sort
new_block_items = _concat_indexes([b.items for _, b in merge_chunks])
return make_block(out, new_block_items, self.result_items)

# need to set placement if we have a non-unique result
# calculate by the existing placement plus the offset in the result set
placement = None
if not self.result_items.is_unique:
nchunks = len(merge_chunks)
offsets = np.array([0] + [ len(self.result_items) / nchunks ] * (nchunks-1)).cumsum()
placement = []
for (unit, blk), offset in zip(merge_chunks,offsets):
placement.extend(blk.ref_locs+offset)

return make_block(out, new_block_items, self.result_items, placement=placement)


class _JoinUnit(object):
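The placement arithmetic in ``_merge_blocks`` above assumes every chunk contributes an equal, contiguous share of the result items, which holds for the ``concat([df, df])`` cases this PR targets. A worked sketch of the offset computation with illustrative numbers:

```python
import numpy as np

n_result_items = 12   # len(self.result_items): e.g. two chunks of 6 columns each
nchunks = 2
# ref_locs of the float block within each source chunk ('A','A','B','B'):
chunk_ref_locs = [np.array([0, 1, 2, 3]), np.array([0, 1, 2, 3])]

# Each chunk starts len(result_items) / nchunks positions after the previous.
offsets = np.array([0] + [n_result_items // nchunks] * (nchunks - 1)).cumsum()
print(offsets)        # [0 6]

# The merged float block occupies [0 1 2 3] from chunk 0 and [6 7 8 9] from chunk 1.
placement = []
for ref_locs, offset in zip(chunk_ref_locs, offsets):
    placement.extend(ref_locs + offset)
print(placement)      # [0, 1, 2, 3, 6, 7, 8, 9]
```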
@@ -992,6 +1008,7 @@ def _prepare_blocks(self):
blockmaps = []
for data in reindexed_data:
data = data.consolidate()
data._set_ref_locs()
blockmaps.append(data.get_block_map(typ='dict'))
return blockmaps, reindexed_data

@@ -1063,7 +1080,10 @@ def _concat_blocks(self, blocks):
# or maybe would require performance test)
raise PandasError('dtypes are not consistent throughout '
'DataFrames')
return make_block(concat_values, blocks[0].items, self.new_axes[0])
return make_block(concat_values,
blocks[0].items,
self.new_axes[0],
placement=blocks[0]._ref_locs)
else:

offsets = np.r_[0, np.cumsum([len(x._data.axes[0]) for
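The recurring ``_set_ref_locs`` / ``placement`` threading in this file exists because, once labels repeat, a block's columns can no longer be located in the result by name. A conceptual sketch of what a placement array records (``_ref_locs`` is the internal attribute name in this era of pandas; the values here are illustrative, not the real BlockManager API):

```python
import numpy as np

# Result columns of concat([float_df, int_df], axis=1):
columns = np.array(['A', 'A', 'B', 'B', 'A', 'C'])

# Consolidated storage keeps one block per dtype; each block remembers the
# *positions* it occupies, since the label 'A' alone is ambiguous.
float_block_ref_locs = np.array([0, 1, 2, 3])  # the float 'A','A','B','B'
int_block_ref_locs = np.array([4, 5])          # the int 'A','C'

# Mapping each result position back to (block, column-within-block):
lookup = {}
for i, loc in enumerate(float_block_ref_locs):
    lookup[loc] = ('float_block', i)
for i, loc in enumerate(int_block_ref_locs):
    lookup[loc] = ('int_block', i)
print(lookup[4])   # ('int_block', 0): the third 'A' lives in the int block
```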
48 changes: 48 additions & 0 deletions pandas/tools/tests/test_merge.py
@@ -1396,6 +1396,54 @@ def test_crossed_dtypes_weird_corner(self):
[df, df2], keys=['one', 'two'], names=['first', 'second'])
self.assertEqual(result.index.names, ('first', 'second'))

def test_dups_index(self):
# GH 4771

# single dtypes
df = DataFrame(np.random.randint(0,10,size=40).reshape(10,4),columns=['A','A','C','C'])

result = concat([df,df],axis=1)
assert_frame_equal(result.iloc[:,:4],df)
assert_frame_equal(result.iloc[:,4:],df)

result = concat([df,df],axis=0)
assert_frame_equal(result.iloc[:10],df)
assert_frame_equal(result.iloc[10:],df)

# multi dtypes
df = concat([DataFrame(np.random.randn(10,4),columns=['A','A','B','B']),
DataFrame(np.random.randint(0,10,size=20).reshape(10,2),columns=['A','C'])],
axis=1)

result = concat([df,df],axis=1)
assert_frame_equal(result.iloc[:,:6],df)
assert_frame_equal(result.iloc[:,6:],df)

result = concat([df,df],axis=0)
assert_frame_equal(result.iloc[:10],df)
assert_frame_equal(result.iloc[10:],df)

# append
result = df.iloc[0:8,:].append(df.iloc[8:])
assert_frame_equal(result, df)

result = df.iloc[0:8,:].append(df.iloc[8:9]).append(df.iloc[9:10])
assert_frame_equal(result, df)

expected = concat([df,df],axis=0)
result = df.append(df)
assert_frame_equal(result, expected)

def test_join_dups(self):
df = concat([DataFrame(np.random.randn(10,4),columns=['A','A','B','B']),
DataFrame(np.random.randint(0,10,size=20).reshape(10,2),columns=['A','C'])],
axis=1)

expected = concat([df,df],axis=1)
result = df.join(df,rsuffix='_2')
result.columns = expected.columns
assert_frame_equal(result, expected)

def test_handle_empty_objects(self):
df = DataFrame(np.random.randn(10, 4), columns=list('abcd'))
