Skip to content

Commit 1d2d9f3

Browse files
harisbalharisbal
authored and
harisbal
committed
Fix ci
1 parent 2eef865 commit 1d2d9f3

File tree

5 files changed

+868
-555
lines changed

5 files changed

+868
-555
lines changed

doc/source/whatsnew/v0.24.0.txt

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,47 @@ array, but rather an ``ExtensionArray``:
180180
This is the same behavior as ``Series.values`` for categorical data. See
181181
:ref:`whatsnew_0240.api_breaking.interval_values` for more.
182182

183+
.. _whatsnew_0240.enhancements.join_with_two_multiindexes:
184+
185+
Joining with two multi-indexes
186+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
187+
188+
:func:`DataFrame.merge` and :func:`DataFrame.join` can now be used to join multi-indexed ``DataFrame`` instances on the overlapping index levels (:issue:`6360`)
189+
190+
See the :ref:`Merge, join, and concatenate
191+
<merging.Join_with_two_multi_indexes>` documentation section.
192+
193+
.. ipython:: python
194+
195+
index_left = pd.MultiIndex.from_tuples([('K0', 'X0'), ('K0', 'X1'),
196+
('K1', 'X2')],
197+
names=['key', 'X'])
198+
199+
200+
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
201+
'B': ['B0', 'B1', 'B2']},
202+
index=index_left)
203+
204+
205+
index_right = pd.MultiIndex.from_tuples([('K0', 'Y0'), ('K1', 'Y1'),
206+
('K2', 'Y2'), ('K2', 'Y3')],
207+
names=['key', 'Y'])
208+
209+
210+
right = pd.DataFrame({'C': ['C0', 'C1', 'C2', 'C3'],
211+
'D': ['D0', 'D1', 'D2', 'D3']},
212+
index=index_right)
213+
214+
215+
left.join(right)
216+
217+
For earlier versions this can be done using the following.
218+
219+
.. ipython:: python
220+
221+
pd.merge(left.reset_index(), right.reset_index(),
222+
on=['key'], how='inner').set_index(['key','X','Y'])
223+
183224
.. _whatsnew_0240.enhancements.rename_axis:
184225

185226
Renaming names in a MultiIndex
@@ -227,6 +268,7 @@ Other Enhancements
227268
- :class:`Series` and :class:`DataFrame` now support :class:`Iterable` in constructor (:issue:`2193`)
228269
- :class:`DatetimeIndex` gained :attr:`DatetimeIndex.timetz` attribute. Returns local time with timezone information. (:issue:`21358`)
229270
- :meth:`round`, :meth:`ceil`, and :meth:`floor` for :class:`DatetimeIndex` and :class:`Timestamp` now support an ``ambiguous`` argument for handling datetimes that are rounded to ambiguous times (:issue:`18946`)
271+
- :meth:`round`, :meth:`ceil`, and :meth:`floor` for :class:`DatetimeIndex` and :class:`Timestamp` now support a ``nonexistent`` argument for handling datetimes that are rounded to nonexistent times. See :ref:`timeseries.timezone_nonexsistent` (:issue:`22647`)
230272
- :class:`Resampler` now is iterable like :class:`GroupBy` (:issue:`15314`).
231273
- :meth:`Series.resample` and :meth:`DataFrame.resample` have gained the :meth:`Resampler.quantile` (:issue:`15023`).
232274
- :meth:`pandas.core.dtypes.is_list_like` has gained a keyword ``allow_sets`` which is ``True`` by default; if ``False``,
@@ -1060,6 +1102,7 @@ Performance Improvements
10601102
- Improved the performance of :func:`pandas.get_dummies` with ``sparse=True`` (:issue:`21997`)
10611103
- Improved performance of :func:`IndexEngine.get_indexer_non_unique` for sorted, non-unique indexes (:issue:`9466`)
10621104
- Improved performance of :func:`PeriodIndex.unique` (:issue:`23083`)
1105+
- Improved performance of :func:`pd.concat` for ``Series`` objects (:issue:`23404`)
10631106

10641107

10651108
.. _whatsnew_0240.docs:

pandas/core/indexes/base.py

Lines changed: 69 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
from pandas.core.dtypes.generic import (
1919
ABCSeries, ABCDataFrame,
2020
ABCMultiIndex,
21-
ABCPeriodIndex, ABCTimedeltaIndex,
21+
ABCPeriodIndex, ABCTimedeltaIndex, ABCDatetimeIndex,
2222
ABCDateOffset)
2323
from pandas.core.dtypes.missing import isna, array_equivalent
2424
from pandas.core.dtypes.cast import maybe_cast_to_integer_array
@@ -545,6 +545,10 @@ def _shallow_copy(self, values=None, **kwargs):
545545

546546
# _simple_new expects an ndarray
547547
values = getattr(values, 'values', values)
548+
if isinstance(values, ABCDatetimeIndex):
549+
# `self.values` returns `self` for tz-aware, so we need to unwrap
550+
# more specifically
551+
values = values.asi8
548552

549553
return self._simple_new(values, **attributes)
550554

@@ -2947,7 +2951,8 @@ def difference(self, other):
29472951
self._assert_can_do_setop(other)
29482952

29492953
if self.equals(other):
2950-
return self._shallow_copy([])
2954+
# pass an empty np.ndarray with the appropriate dtype
2955+
return self._shallow_copy(self._data[:0])
29512956

29522957
other, result_name = self._convert_can_do_setop(other)
29532958

@@ -3715,7 +3720,8 @@ def reindex(self, target, method=None, level=None, limit=None,
37153720
if not isinstance(target, Index) and len(target) == 0:
37163721
attrs = self._get_attributes_dict()
37173722
attrs.pop('freq', None) # don't preserve freq
3718-
target = self._simple_new(None, dtype=self.dtype, **attrs)
3723+
values = self._data[:0] # appropriately-dtyped empty array
3724+
target = self._simple_new(values, dtype=self.dtype, **attrs)
37193725
else:
37203726
target = ensure_index(target)
37213727

@@ -3930,46 +3936,72 @@ def join(self, other, how='left', level=None, return_indexers=False,
39303936

39313937
def _join_multi(self, other, how, return_indexers=True):
39323938
from .multi import MultiIndex
3939+
from pandas.core.reshape.merge import _restore_dropped_levels_multijoin
3940+
3941+
# figure out join names
3942+
self_names = set(com._not_none(*self.names))
3943+
other_names = set(com._not_none(*other.names))
3944+
overlap = self_names & other_names
3945+
3946+
# need at least 1 in common
3947+
if not overlap:
3948+
raise ValueError("cannot join with no overlapping index names")
3949+
39333950
self_is_mi = isinstance(self, MultiIndex)
39343951
other_is_mi = isinstance(other, MultiIndex)
39353952

3936-
# figure out join names
3937-
self_names = com._not_none(*self.names)
3938-
other_names = com._not_none(*other.names)
3939-
overlap = list(set(self_names) & set(other_names))
3940-
3941-
# need at least 1 in common, but not more than 1
3942-
if not len(overlap):
3943-
raise ValueError("cannot join with no level specified and no "
3944-
"overlapping names")
3945-
if len(overlap) > 1:
3946-
raise NotImplementedError("merging with more than one level "
3947-
"overlap on a multi-index is not "
3948-
"implemented")
3949-
jl = overlap[0]
3953+
if self_is_mi and other_is_mi:
3954+
3955+
# Drop the non-matching levels from left and right respectively
3956+
ldrop_names = list(self_names - overlap)
3957+
rdrop_names = list(other_names - overlap)
3958+
3959+
self_jnlevels = self.droplevel(ldrop_names)
3960+
other_jnlevels = other.droplevel(rdrop_names)
3961+
3962+
# Join left and right
3963+
# Join on same leveled multi-index frames is supported
3964+
join_idx, lidx, ridx = self_jnlevels.join(other_jnlevels, how,
3965+
return_indexers=True)
3966+
3967+
# Restore the dropped levels
3968+
# Returned index level order is
3969+
# common levels, ldrop_names, rdrop_names
3970+
dropped_names = ldrop_names + rdrop_names
3971+
3972+
levels, labels, names = (
3973+
_restore_dropped_levels_multijoin(self, other,
3974+
dropped_names,
3975+
join_idx,
3976+
lidx, ridx))
39503977

3978+
# Re-create the multi-index
3979+
multi_join_idx = MultiIndex(levels=levels, labels=labels,
3980+
names=names, verify_integrity=False)
3981+
3982+
multi_join_idx = multi_join_idx.remove_unused_levels()
3983+
3984+
return multi_join_idx, lidx, ridx
3985+
3986+
jl = list(overlap)[0]
3987+
3988+
# Case where only one index is multi
39513989
# make the indices into mi's that match
3952-
if not (self_is_mi and other_is_mi):
3953-
3954-
flip_order = False
3955-
if self_is_mi:
3956-
self, other = other, self
3957-
flip_order = True
3958-
# flip if join method is right or left
3959-
how = {'right': 'left', 'left': 'right'}.get(how, how)
3960-
3961-
level = other.names.index(jl)
3962-
result = self._join_level(other, level, how=how,
3963-
return_indexers=return_indexers)
3964-
3965-
if flip_order:
3966-
if isinstance(result, tuple):
3967-
return result[0], result[2], result[1]
3968-
return result
3990+
flip_order = False
3991+
if self_is_mi:
3992+
self, other = other, self
3993+
flip_order = True
3994+
# flip if join method is right or left
3995+
how = {'right': 'left', 'left': 'right'}.get(how, how)
3996+
3997+
level = other.names.index(jl)
3998+
result = self._join_level(other, level, how=how,
3999+
return_indexers=return_indexers)
39694000

3970-
# 2 multi-indexes
3971-
raise NotImplementedError("merging with both multi-indexes is not "
3972-
"implemented")
4001+
if flip_order:
4002+
if isinstance(result, tuple):
4003+
return result[0], result[2], result[1]
4004+
return result
39734005

39744006
def _join_non_unique(self, other, how='left', return_indexers=False):
39754007
from pandas.core.reshape.merge import _get_join_indexers

pandas/core/reshape/merge.py

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1122,6 +1122,88 @@ def _get_join_indexers(left_keys, right_keys, sort=False, how='inner',
11221122
return join_func(lkey, rkey, count, **kwargs)
11231123

11241124

1125+
def _restore_dropped_levels_multijoin(left, right, dropped_level_names,
1126+
join_idx, lidx, ridx):
1127+
"""
1128+
*this is an internal non-public method*
1129+
1130+
Returns the levels, labels and names of a multi-index to multi-index join.
1131+
Depending on the type of join, this method restores the appropriate
1132+
dropped levels of the joined multi-index. The method relies on lidx, ridx
1133+
which hold the index positions of left and right, where a join was feasible
1134+
1135+
Parameters
1136+
----------
1137+
left : Index
1138+
left index
1139+
right : Index
1140+
right index
1141+
dropped_level_names : str array
1142+
list of non-common levels
1143+
join_idx : Index
1144+
the index of the join between the common levels of left and right
1145+
lidx : intp array
1146+
left indexer
1147+
ridx : intp array
1148+
right indexer
1149+
1150+
Returns
1151+
-------
1152+
levels : list of array-like
1153+
levels of combined multiindexes
1154+
labels : list of intp arrays
1155+
labels of combined multiindexes
1156+
names : str array
1157+
names of combined multiindexes
1158+
1159+
"""
1160+
1161+
# Convert to 1 level multi-index if not
1162+
if not isinstance(join_idx, MultiIndex):
1163+
levels = [join_idx.values]
1164+
labels = [list(range(join_idx.size))]
1165+
names = [join_idx.name]
1166+
join_idx = MultiIndex(levels=levels, labels=labels,
1167+
names=names, verify_integrity=False)
1168+
1169+
join_levels = join_idx.levels
1170+
join_labels = join_idx.labels
1171+
join_names = join_idx.names
1172+
1173+
# lidx and ridx hold the indexes where the join occurred
1174+
# for left and right respectively. If lidx (ridx) is None then
1175+
# the join occurred on all indices of left (right)
1176+
if lidx is None:
1177+
lidx = range(left.size)
1178+
1179+
if ridx is None:
1180+
ridx = range(right.size)
1181+
1182+
# Iterate through the levels that must be restored
1183+
for dropped_level_name in dropped_level_names:
1184+
if dropped_level_name in left.names:
1185+
idx = left
1186+
indexer = lidx
1187+
else:
1188+
idx = right
1189+
indexer = ridx
1190+
1191+
# The index of the level name to be restored
1192+
name_idx = idx.names.index(dropped_level_name)
1193+
1194+
restore_levels = idx.levels[name_idx].values
1195+
# Inject -1 in the labels list where a join was not possible
1196+
# IOW indexer[i]=-1
1197+
labels = idx.labels[name_idx]
1198+
restore_labels = algos.take_nd(labels, indexer, fill_value=-1)
1199+
1200+
join_levels = join_levels.__add__([restore_levels])
1201+
join_labels = join_labels.__add__([restore_labels])
1202+
join_names = join_names.__add__([dropped_level_name])
1203+
1204+
return join_levels, join_labels, join_names
1205+
1206+
11251207
class _OrderedMerge(_MergeOperation):
11261208
_merge_type = 'ordered_merge'
11271209

0 commit comments

Comments
 (0)