diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt index a30400322716c..98186c8fc32b1 100644 --- a/doc/source/v0.15.0.txt +++ b/doc/source/v0.15.0.txt @@ -310,6 +310,7 @@ Bug Fixes - Bug in ``DatetimeIndex.value_counts`` doesn't preserve tz (:issue:`7735`) - Bug in ``PeriodIndex.value_counts`` results in ``Int64Index`` (:issue:`7735`) +- Bug in ``DataFrame.join`` when doing left join on index and there are multiple matches (:issue:`5391`) diff --git a/pandas/src/join.pyx b/pandas/src/join.pyx index 91102a2fa6a18..4c32aa902d64d 100644 --- a/pandas/src/join.pyx +++ b/pandas/src/join.pyx @@ -103,12 +103,18 @@ def left_outer_join(ndarray[int64_t] left, ndarray[int64_t] right, left_indexer = _get_result_indexer(left_sorter, left_indexer) right_indexer = _get_result_indexer(right_sorter, right_indexer) - if not sort: - if left_sorter.dtype != np.int_: - left_sorter = left_sorter.astype(np.int_) - - rev = np.empty(len(left), dtype=np.int_) - rev.put(left_sorter, np.arange(len(left))) + if not sort: # if not asked to sort, revert to original order + if len(left) == len(left_indexer): + # no multiple matches for any row on the left + # this is a short-cut to avoid np.argsort; + # otherwise, the `else` path also works in this case + if left_sorter.dtype != np.int_: + left_sorter = left_sorter.astype(np.int_) + + rev = np.empty(len(left), dtype=np.int_) + rev.put(left_sorter, np.arange(len(left))) + else: + rev = np.argsort(left_indexer) right_indexer = right_indexer.take(rev) left_indexer = left_indexer.take(rev) diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index ee594ef031e82..7ad871e78a53b 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -235,7 +235,9 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): key_col.put(na_indexer, com.take_1d(self.left_join_keys[i], left_na_indexer)) - elif left_indexer is not None: + elif left_indexer is not None \ + and isinstance(self.left_join_keys[i], np.ndarray): + if name is None: name = 'key_%d' % i @@ -562,9 +564,6 @@ def _get_single_indexer(join_key, index, sort=False): def _left_join_on_index(left_ax, right_ax, join_keys, sort=False): - join_index = left_ax - left_indexer = None - if len(join_keys) > 1: if not ((isinstance(right_ax, MultiIndex) and len(join_keys) == right_ax.nlevels)): @@ -573,22 +572,21 @@ def _left_join_on_index(left_ax, right_ax, join_keys, sort=False): "number of join keys must be the number of " "levels in right_ax") - left_tmp, right_indexer = \ - _get_multiindex_indexer(join_keys, right_ax, - sort=sort) - if sort: - left_indexer = left_tmp - join_index = left_ax.take(left_indexer) + left_indexer, right_indexer = \ + _get_multiindex_indexer(join_keys, right_ax, sort=sort) else: jkey = join_keys[0] - if sort: - left_indexer, right_indexer = \ - _get_single_indexer(jkey, right_ax, sort=sort) - join_index = left_ax.take(left_indexer) - else: - right_indexer = right_ax.get_indexer(jkey) - return join_index, left_indexer, right_indexer + left_indexer, right_indexer = \ + _get_single_indexer(jkey, right_ax, sort=sort) + + if sort or len(left_ax) != len(left_indexer): + # if asked to sort or there are 1-to-many matches + join_index = left_ax.take(left_indexer) + return join_index, left_indexer, right_indexer + else: + # left frame preserves order & length of its index + return left_ax, None, right_indexer def _right_outer_join(x, y, max_groups): diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index df2f270346e20..151bb82f9c61f 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -958,6 +958,98 @@ def test_left_join_index_preserve_order(self): right_on=['k1', 'k2'], how='right') tm.assert_frame_equal(joined.ix[:, expected.columns], expected) + def test_left_join_index_multi_match_multiindex(self): + left = DataFrame([ + ['X', 'Y', 'C', 'a'], + ['W', 'Y', 'C', 'e'], + ['V', 'Q', 'A', 'h'], + ['V', 'R', 'D', 'i'], + ['X', 'Y', 'D', 'b'], + ['X', 'Y', 'A', 'c'], + ['W', 'Q', 'B', 'f'], + ['W', 'R', 'C', 'g'], + ['V', 'Y', 'C', 'j'], + ['X', 'Y', 'B', 'd']], + columns=['cola', 'colb', 'colc', 'tag'], + index=[3, 2, 0, 1, 7, 6, 4, 5, 9, 8]) + + right = DataFrame([ + ['W', 'R', 'C', 0], + ['W', 'Q', 'B', 3], + ['W', 'Q', 'B', 8], + ['X', 'Y', 'A', 1], + ['X', 'Y', 'A', 4], + ['X', 'Y', 'B', 5], + ['X', 'Y', 'C', 6], + ['X', 'Y', 'C', 9], + ['X', 'Q', 'C', -6], + ['X', 'R', 'C', -9], + ['V', 'Y', 'C', 7], + ['V', 'R', 'D', 2], + ['V', 'R', 'D', -1], + ['V', 'Q', 'A', -3]], + columns=['col1', 'col2', 'col3', 'val']) + + right.set_index(['col1', 'col2', 'col3'], inplace=True) + result = left.join(right, on=['cola', 'colb', 'colc'], how='left') + + expected = DataFrame([ + ['X', 'Y', 'C', 'a', 6], + ['X', 'Y', 'C', 'a', 9], + ['W', 'Y', 'C', 'e', nan], + ['V', 'Q', 'A', 'h', -3], + ['V', 'R', 'D', 'i', 2], + ['V', 'R', 'D', 'i', -1], + ['X', 'Y', 'D', 'b', nan], + ['X', 'Y', 'A', 'c', 1], + ['X', 'Y', 'A', 'c', 4], + ['W', 'Q', 'B', 'f', 3], + ['W', 'Q', 'B', 'f', 8], + ['W', 'R', 'C', 'g', 0], + ['V', 'Y', 'C', 'j', 7], + ['X', 'Y', 'B', 'd', 5]], + columns=['cola', 'colb', 'colc', 'tag', 'val'], + index=[3, 3, 2, 0, 1, 1, 7, 6, 6, 4, 4, 5, 9, 8]) + + tm.assert_frame_equal(result, expected) + + def test_left_join_index_multi_match(self): + left = DataFrame([ + ['c', 0], + ['b', 1], + ['a', 2], + ['b', 3]], + columns=['tag', 'val'], + index=[2, 0, 1, 3]) + + right = DataFrame([ + ['a', 'v'], + ['c', 'w'], + ['c', 'x'], + ['d', 'y'], + ['a', 'z'], + ['c', 'r'], + ['e', 'q'], + ['c', 's']], + columns=['tag', 'char']) + + right.set_index('tag', inplace=True) + result = left.join(right, on='tag', how='left') + + expected = DataFrame([ + ['c', 0, 'w'], + ['c', 0, 'x'], + ['c', 0, 'r'], + ['c', 0, 's'], + ['b', 1, nan], + ['a', 2, 'v'], + ['a', 2, 'z'], + ['b', 3, nan]], + columns=['tag', 'val', 'char'], + index=[2, 2, 2, 2, 0, 1, 1, 3]) + + tm.assert_frame_equal(result, expected) + def test_join_multi_dtypes(self): # test with multi dtypes in the join index