-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
ENH: Allow for join between two multi-index dataframe instances #20356
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
b581789
2d61a12
4d4acc5
66d82fb
c091bb4
d56ebcd
0cdad73
c2a65aa
571fdf7
ae2d8ad
405c1a4
1d2d9f3
f0ac24d
5ac40ff
be862c7
e10cbde
06d48d0
f54c151
c75108d
c690260
4092b34
cfd5fcc
6c8131d
ecaf515
8b5d0aa
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -813,8 +813,11 @@ def _get_merge_keys(self): | |
|
||
left, right = self.left, self.right | ||
|
||
is_lkey = lambda x: is_array_like(x) and len(x) == len(left) | ||
is_rkey = lambda x: is_array_like(x) and len(x) == len(right) | ||
def is_lkey(x): | ||
return is_array_like(x) and len(x) == len(left) | ||
|
||
def is_rkey(x): | ||
return is_array_like(x) and len(x) == len(right) | ||
|
||
# Note that pd.merge_asof() has separate 'on' and 'by' parameters. A | ||
# user could, for example, request 'left_index' and 'left_by'. In a | ||
|
@@ -968,11 +971,11 @@ def _maybe_coerce_merge_keys(self): | |
|
||
# boolean values are considered as numeric, but are still allowed | ||
# to be merged on object boolean values | ||
elif ((is_numeric_dtype(lk) and not is_bool_dtype(lk)) | ||
and not is_numeric_dtype(rk)): | ||
elif ((is_numeric_dtype(lk) and not is_bool_dtype(lk)) and not | ||
is_numeric_dtype(rk)): | ||
raise ValueError(msg) | ||
elif (not is_numeric_dtype(lk) | ||
and (is_numeric_dtype(rk) and not is_bool_dtype(rk))): | ||
elif (not is_numeric_dtype(lk) and | ||
(is_numeric_dtype(rk) and not is_bool_dtype(rk))): | ||
raise ValueError(msg) | ||
elif is_datetimelike(lk) and not is_datetimelike(rk): | ||
raise ValueError(msg) | ||
|
@@ -1138,6 +1141,82 @@ def _get_join_indexers(left_keys, right_keys, sort=False, how='inner', | |
return join_func(lkey, rkey, count, **kwargs) | ||
|
||
|
||
def _complete_multilevel_join(left, right, how, dropped_levels,
                              join_idx, lidx, ridx):
    """
    *this is an internal non-public method*

    Return the levels, labels and names of a multilevel to multilevel join.

    The join itself is computed on the levels that ``left`` and ``right``
    have in common; this function appends the levels named in
    ``dropped_levels`` back onto the joined index. ``lidx`` and ``ridx``
    hold, for every row of the join result, the position of the matching
    row in ``left`` / ``right``, with -1 marking rows where no match exists.

    Parameters
    ----------
    left, right : MultiIndex
        The left and right index. Only their ``names``, ``levels`` and
        ``labels`` attributes (and ``len``) are used.
    how : {'left', 'right', 'outer', 'inner'}
        Type of join. Accepted for interface compatibility; it is not
        consulted by this function.
    dropped_levels : list of str
        Names of the non-common levels that must be restored. A name that
        is not found in ``left.names`` is looked up in ``right``.
    join_idx : MultiIndex
        The index of the join between the common levels of left and right.
    lidx, ridx : intp array or None
        The left and right indexer. ``None`` means that every row of the
        corresponding index takes part in the join, in its original order.

    Returns
    -------
    levels : list-like of arrays
        ``join_idx.levels`` followed by the values of each restored level.
    labels : list-like of intp arrays
        ``join_idx.labels`` followed by the labels of each restored level,
        with -1 wherever the indexer is -1.
    names : list-like of str
        ``join_idx.names`` followed by the restored level names.

    Raises
    ------
    ValueError
        If a name in ``dropped_levels`` is found in neither ``left.names``
        nor ``right.names`` (raised by the ``names.index`` lookup).
    """
    join_levels = join_idx.levels
    join_labels = join_idx.labels
    join_names = join_idx.names

    # lidx and ridx hold the positions where the join occurred for left and
    # right respectively. If lidx (ridx) is None, the join occurred on all
    # rows of left (right).
    if lidx is None:
        lidx = np.arange(len(left), dtype=np.intp)
    else:
        lidx = np.asarray(lidx, dtype=np.intp)

    if ridx is None:
        ridx = np.arange(len(right), dtype=np.intp)
    else:
        ridx = np.asarray(ridx, dtype=np.intp)

    # Iterate through the levels that must be restored
    for dl in dropped_levels:
        if dl in left.names:
            idx, indexer = left, lidx
        else:
            idx, indexer = right, ridx

        # The position of the level to be restored within its index
        name_idx = idx.names.index(dl)

        restore_levels = idx.levels[name_idx].values
        restore_labels = np.asarray(idx.labels[name_idx])

        # Start from "all missing" and fill in only the matched rows. Taking
        # through the mask (rather than taking everything and patching the
        # -1 entries afterwards) keeps this safe when the source index is
        # empty and every indexer entry is -1.
        matched = indexer != -1
        labels = np.full(len(indexer), -1, dtype=np.intp)
        labels[matched] = restore_labels[indexer[matched]]

        # `+` builds new containers, so join_idx's own attributes are never
        # modified in place.
        join_levels = join_levels + [restore_levels]
        join_labels = join_labels + [labels]
        join_names = join_names + [dl]

    return join_levels, join_labels, join_names
|
||
|
||
class _OrderedMerge(_MergeOperation): | ||
_merge_type = 'ordered_merge' | ||
|
||
|
@@ -1450,6 +1529,9 @@ def flip(xs): | |
def _get_multiindex_indexer(join_keys, index, sort): | ||
from functools import partial | ||
|
||
def i8copy(a): | ||
return a.astype('i8', subok=False, copy=True) | ||
|
||
# bind `sort` argument | ||
fkeys = partial(_factorize_keys, sort=sort) | ||
|
||
|
@@ -1458,7 +1540,6 @@ def _get_multiindex_indexer(join_keys, index, sort): | |
if sort: | ||
rlab = list(map(np.take, rlab, index.labels)) | ||
else: | ||
i8copy = lambda a: a.astype('i8', subok=False, copy=True) | ||
rlab = list(map(i8copy, index.labels)) | ||
|
||
# fix right labels if there were any nulls | ||
|
@@ -1604,8 +1685,11 @@ def _sort_labels(uniques, left, right): | |
|
||
def _get_join_keys(llab, rlab, shape, sort): | ||
|
||
def pred(i): | ||
return not is_int64_overflow_possible(shape[:i]) | ||
|
||
# how many levels can be done without overflow | ||
pred = lambda i: not is_int64_overflow_possible(shape[:i]) | ||
|
||
nlev = next(filter(pred, range(len(shape), 0, -1))) | ||
|
||
# get keys for the first `nlev` levels | ||
|
Uh oh!
There was an error while loading. Please reload this page.