Skip to content

BUG: left join on index with multiple matches now works (GH5391) #7853

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/v0.15.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -310,6 +310,7 @@ Bug Fixes

- Bug in ``DatetimeIndex.value_counts`` doesn't preserve tz (:issue:`7735`)
- Bug in ``PeriodIndex.value_counts`` results in ``Int64Index`` (:issue:`7735`)
- Bug in ``DataFrame.join`` when doing left join on index and there are multiple matches (:issue:`5391`)



Expand Down
18 changes: 12 additions & 6 deletions pandas/src/join.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -103,12 +103,18 @@ def left_outer_join(ndarray[int64_t] left, ndarray[int64_t] right,
left_indexer = _get_result_indexer(left_sorter, left_indexer)
right_indexer = _get_result_indexer(right_sorter, right_indexer)

if not sort:
if left_sorter.dtype != np.int_:
left_sorter = left_sorter.astype(np.int_)

rev = np.empty(len(left), dtype=np.int_)
rev.put(left_sorter, np.arange(len(left)))
if not sort: # if not asked to sort, revert to original order
if len(left) == len(left_indexer):
# no multiple matches for any row on the left
# this is a short-cut to avoid np.argsort;
# otherwise, the `else` path also works in this case
if left_sorter.dtype != np.int_:
left_sorter = left_sorter.astype(np.int_)

rev = np.empty(len(left), dtype=np.int_)
rev.put(left_sorter, np.arange(len(left)))
else:
rev = np.argsort(left_indexer)

right_indexer = right_indexer.take(rev)
left_indexer = left_indexer.take(rev)
Expand Down
32 changes: 15 additions & 17 deletions pandas/tools/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,9 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer):
key_col.put(na_indexer, com.take_1d(self.left_join_keys[i],
left_na_indexer))

elif left_indexer is not None:
elif left_indexer is not None \
and isinstance(self.left_join_keys[i], np.ndarray):

if name is None:
name = 'key_%d' % i

Expand Down Expand Up @@ -562,9 +564,6 @@ def _get_single_indexer(join_key, index, sort=False):


def _left_join_on_index(left_ax, right_ax, join_keys, sort=False):
join_index = left_ax
left_indexer = None

if len(join_keys) > 1:
if not ((isinstance(right_ax, MultiIndex) and
len(join_keys) == right_ax.nlevels)):
Expand All @@ -573,22 +572,21 @@ def _left_join_on_index(left_ax, right_ax, join_keys, sort=False):
"number of join keys must be the number of "
"levels in right_ax")

left_tmp, right_indexer = \
_get_multiindex_indexer(join_keys, right_ax,
sort=sort)
if sort:
left_indexer = left_tmp
join_index = left_ax.take(left_indexer)
left_indexer, right_indexer = \
_get_multiindex_indexer(join_keys, right_ax, sort=sort)
else:
jkey = join_keys[0]
if sort:
left_indexer, right_indexer = \
_get_single_indexer(jkey, right_ax, sort=sort)
join_index = left_ax.take(left_indexer)
else:
right_indexer = right_ax.get_indexer(jkey)

return join_index, left_indexer, right_indexer
left_indexer, right_indexer = \
_get_single_indexer(jkey, right_ax, sort=sort)

if sort or len(left_ax) != len(left_indexer):
# if asked to sort or there are 1-to-many matches
join_index = left_ax.take(left_indexer)
return join_index, left_indexer, right_indexer
else:
# left frame preserves order & length of its index
return left_ax, None, right_indexer


def _right_outer_join(x, y, max_groups):
Expand Down
92 changes: 92 additions & 0 deletions pandas/tools/tests/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -958,6 +958,98 @@ def test_left_join_index_preserve_order(self):
right_on=['k1', 'k2'], how='right')
tm.assert_frame_equal(joined.ix[:, expected.columns], expected)

def test_left_join_index_multi_match_multiindex(self):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this work the same way for right / inner / outer joins when there are multiple matches? That is, is the left join a special case? If so, why? If not, can you add tests for the other `how` values? Thanks.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have not tried a right join.
As noted in your own comment in #5391, it already works for an outer join.

For a left join, Wes added the extra `if not sort:` block to preserve the order of the left frame. That block implicitly assumes there are no 1-to-many matches, and it breaks when there are. See here.

Everything up to just before the `if not sort:` block works fine.

left = DataFrame([
['X', 'Y', 'C', 'a'],
['W', 'Y', 'C', 'e'],
['V', 'Q', 'A', 'h'],
['V', 'R', 'D', 'i'],
['X', 'Y', 'D', 'b'],
['X', 'Y', 'A', 'c'],
['W', 'Q', 'B', 'f'],
['W', 'R', 'C', 'g'],
['V', 'Y', 'C', 'j'],
['X', 'Y', 'B', 'd']],
columns=['cola', 'colb', 'colc', 'tag'],
index=[3, 2, 0, 1, 7, 6, 4, 5, 9, 8])

right = DataFrame([
['W', 'R', 'C', 0],
['W', 'Q', 'B', 3],
['W', 'Q', 'B', 8],
['X', 'Y', 'A', 1],
['X', 'Y', 'A', 4],
['X', 'Y', 'B', 5],
['X', 'Y', 'C', 6],
['X', 'Y', 'C', 9],
['X', 'Q', 'C', -6],
['X', 'R', 'C', -9],
['V', 'Y', 'C', 7],
['V', 'R', 'D', 2],
['V', 'R', 'D', -1],
['V', 'Q', 'A', -3]],
columns=['col1', 'col2', 'col3', 'val'])

right.set_index(['col1', 'col2', 'col3'], inplace=True)
result = left.join(right, on=['cola', 'colb', 'colc'], how='left')

expected = DataFrame([
['X', 'Y', 'C', 'a', 6],
['X', 'Y', 'C', 'a', 9],
['W', 'Y', 'C', 'e', nan],
['V', 'Q', 'A', 'h', -3],
['V', 'R', 'D', 'i', 2],
['V', 'R', 'D', 'i', -1],
['X', 'Y', 'D', 'b', nan],
['X', 'Y', 'A', 'c', 1],
['X', 'Y', 'A', 'c', 4],
['W', 'Q', 'B', 'f', 3],
['W', 'Q', 'B', 'f', 8],
['W', 'R', 'C', 'g', 0],
['V', 'Y', 'C', 'j', 7],
['X', 'Y', 'B', 'd', 5]],
columns=['cola', 'colb', 'colc', 'tag', 'val'],
index=[3, 3, 2, 0, 1, 1, 7, 6, 6, 4, 4, 5, 9, 8])

tm.assert_frame_equal(result, expected)

def test_left_join_index_multi_match(self):
# GH5391: left join of a column against a non-unique, single-level
# right index. 'c' matches four right rows and 'a' matches two, so those
# left rows are repeated; 'b' has no match and gets NaN in 'char'.
left = DataFrame([
['c', 0],
['b', 1],
['a', 2],
['b', 3]],
columns=['tag', 'val'],
index=[2, 0, 1, 3])

right = DataFrame([
['a', 'v'],
['c', 'w'],
['c', 'x'],
['d', 'y'],
['a', 'z'],
['c', 'r'],
['e', 'q'],
['c', 's']],
columns=['tag', 'char'])

right.set_index('tag', inplace=True)
result = left.join(right, on='tag', how='left')

# The expected frame keeps the left frame's row order; each left index
# label is repeated once per matching right row.
expected = DataFrame([
['c', 0, 'w'],
['c', 0, 'x'],
['c', 0, 'r'],
['c', 0, 's'],
['b', 1, nan],
['a', 2, 'v'],
['a', 2, 'z'],
['b', 3, nan]],
columns=['tag', 'val', 'char'],
index=[2, 2, 2, 2, 0, 1, 1, 3])

tm.assert_frame_equal(result, expected)

def test_join_multi_dtypes(self):

# test with multi dtypes in the join index
Expand Down