Skip to content

Commit b6ee864

Browse files
committed
ENH: closes #720, clarification on docs, vbench for sortlevel
1 parent 30d8d6a commit b6ee864

File tree

5 files changed

+47
-21
lines changed

5 files changed

+47
-21
lines changed

doc/source/indexing.rst

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -654,7 +654,7 @@ instance:
654654

655655
.. ipython:: python
656656
657-
midx = MultiIndex(levels=[['one', 'two'], ['x','y']],
657+
midx = MultiIndex(levels=[['zero', 'one'], ['x','y']],
658658
labels=[[1,1,0,0],[1,0,1,0]])
659659
df = DataFrame(randn(4,2), index=midx)
660660
print df
@@ -670,13 +670,15 @@ The need for sortedness
670670
~~~~~~~~~~~~~~~~~~~~~~~
671671

672672
**Caveat emptor**: the present implementation of ``MultiIndex`` requires that
673-
the labels be lexicographically sorted into groups for some of the slicing /
674-
indexing routines to work correctly. You can think about this as meaning that
675-
the axis is broken up into a tree structure, where every leaf in a particular
676-
branch shares the same labels at that level of the hierarchy. However, the
677-
``MultiIndex`` does not enforce this: **you are responsible for ensuring that
678-
things are properly sorted**. There is an important new method ``sortlevel``
679-
which will lexicographically sort an axis with a ``MultiIndex``:
673+
the labels be sorted for some of the slicing / indexing routines to work
674+
correctly. You can think of this as breaking the axis into distinct groups: at
675+
the hierarchical level of interest, all entries in a group share one label, and
676+
no two groups have the same label. However, the ``MultiIndex`` does not enforce this:
677+
**you are responsible for ensuring that things are properly sorted**. There is
678+
an important new method ``sortlevel``, which sorts an axis that has a ``MultiIndex``
679+
so that its labels are grouped and sorted by the original ordering of the
680+
associated factor at that level. Note that this does not necessarily mean the
681+
labels will be sorted lexicographically!
680682

681683
.. ipython:: python
682684

pandas/core/frame.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4055,9 +4055,17 @@ def complete_dataframe(obj, prev_completions):
40554055
except Exception:
40564056
pass
40574057

4058-
def _lexsort_indexer(keys):
4058+
def _indexer_from_factorized(labels, shape):
40594059
from pandas.core.groupby import get_group_index, _compress_group_index
40604060

4061+
group_index = get_group_index(labels, shape)
4062+
comp_ids, obs_ids = _compress_group_index(group_index)
4063+
max_group = len(obs_ids)
4064+
indexer, _ = lib.groupsort_indexer(comp_ids.astype('i4'), max_group)
4065+
4066+
return indexer
4067+
4068+
def _lexsort_indexer(keys):
40614069
labels = []
40624070
shape = []
40634071
for key in keys:
@@ -4069,12 +4077,7 @@ def _lexsort_indexer(keys):
40694077
ids, _ = rizer.factorize(key, sort=True)
40704078
labels.append(ids)
40714079
shape.append(len(rizer.uniques))
4072-
4073-
group_index = get_group_index(labels, shape)
4074-
comp_ids, obs_ids = _compress_group_index(group_index)
4075-
max_group = len(obs_ids)
4076-
indexer, _ = lib.groupsort_indexer(comp_ids.astype('i4'), max_group)
4077-
return indexer
4080+
return _indexer_from_factorized(labels, shape)
40784081

40794082
if __name__ == '__main__':
40804083
import nose

pandas/core/groupby.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1323,6 +1323,10 @@ def _get_slice(slob):
13231323
yield i, _get_slice(slice(start, end))
13241324

13251325
def get_group_index(label_list, shape):
1326+
"""
1327+
Gets the offsets into what would be the cartesian product of all
1328+
possible labels given the label_list.
1329+
"""
13261330
if len(label_list) == 1:
13271331
return label_list[0]
13281332

pandas/core/index.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1506,7 +1506,8 @@ def __getslice__(self, i, j):
15061506

15071507
def sortlevel(self, level=0, ascending=True):
15081508
"""
1509-
Sort MultiIndex lexicographically by requested level
1509+
Sort MultiIndex at the requested level. The result will respect the
1510+
original ordering of the associated factor at that level.
15101511
15111512
Parameters
15121513
----------
@@ -1519,19 +1520,19 @@ def sortlevel(self, level=0, ascending=True):
15191520
-------
15201521
sorted_index : MultiIndex
15211522
"""
1522-
# TODO: check if lexsorted when level=0
1523+
from pandas.core.frame import _indexer_from_factorized
15231524

15241525
labels = list(self.labels)
1526+
15251527
level = self._get_level_number(level)
15261528
primary = labels.pop(level)
1527-
1528-
# Lexsort starts from END
1529-
indexer = np.lexsort(tuple(labels[::-1]) + (primary,))
1530-
1529+
indexer = _indexer_from_factorized((primary,) + tuple(labels),
1530+
self.levshape)
15311531
if not ascending:
15321532
indexer = indexer[::-1]
15331533

15341534
new_labels = [lab.take(indexer) for lab in self.labels]
1535+
15351536
new_index = MultiIndex(levels=self.levels, labels=new_labels,
15361537
names=self.names, sortorder=level)
15371538

vb_suite/indexing.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,3 +93,19 @@
9393
indexing_dataframe_boolean_rows_object = \
9494
Benchmark("df[obj_indexer]", setup,
9595
name='indexing_dataframe_boolean_rows_object')
96+
97+
#----------------------------------------------------------------------
98+
# MultiIndex sortlevel
99+
100+
setup = common_setup + """
101+
level1 = np.array([tm.rands(10) for _ in xrange(1000)], dtype='O')
102+
level2 = np.array([tm.rands(10) for _ in xrange(10)], dtype='O')
103+
label1 = np.random.randint(0, 1000, size=100000)
104+
label2 = np.random.randint(0, 10, size=100000)
105+
midx = MultiIndex(labels=[label1,label2],
106+
levels=[level1,level2])
107+
"""
108+
sorting_level_zero = Benchmark("midx.sortlevel(0)", setup,
109+
start_date=datetime(2012,1,1))
110+
sorting_level_one = Benchmark("midx.sortlevel(1)", setup,
111+
start_date=datetime(2012,1,1))

0 commit comments

Comments
 (0)