ENH: closes #720, clarification on docs, vbench for sortlevel

adamklein · adamklein · commit b6ee864616bd · 2012-01-31T16:40:23.000-05:00
diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst
@@ -654,7 +654,7 @@ instance:
 
 .. ipython:: python
 
-   midx = MultiIndex(levels=[['one', 'two'], ['x','y']],
+   midx = MultiIndex(levels=[['zero', 'one'], ['x','y']],
                      labels=[[1,1,0,0],[1,0,1,0]])
    df = DataFrame(randn(4,2), index=midx)
    print df
@@ -670,13 +670,15 @@ The need for sortedness
 ~~~~~~~~~~~~~~~~~~~~~~~
 
 **Caveat emptor**: the present implementation of ``MultiIndex`` requires that
-the labels be lexicographically sorted into groups for some of the slicing /
-indexing routines to work correctly. You can think about this as meaning that
-the axis is broken up into a tree structure, where every leaf in a particular
-branch shares the same labels at that level of the hierarchy. However, the
-``MultiIndex`` does not enforce this: **you are responsible for ensuring that
-things are properly sorted**. There is an important new method ``sortlevel``
-which will lexicographically sort an axis with a ``MultiIndex``:
+the labels be sorted for some of the slicing / indexing routines to work
+correctly. You can think of this as breaking the axis into groups such that,
+at the hierarchical level of interest, all entries in a group share one label
+and no two groups share the same label. However, the ``MultiIndex`` does not
+enforce this: **you are responsible for ensuring that things are properly
+sorted**. There is an important new method ``sortlevel`` which sorts an axis
+that has a ``MultiIndex`` so that its labels are grouped and sorted by the
+original ordering of the associated factor at that level. Note that this does
+not necessarily mean the labels will be sorted lexicographically!
 
 .. ipython:: python
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -4055,9 +4055,17 @@ def complete_dataframe(obj, prev_completions):
     except Exception:
         pass
 
-def _lexsort_indexer(keys):
+def _indexer_from_factorized(labels, shape):
     from pandas.core.groupby import get_group_index, _compress_group_index
 
+    group_index = get_group_index(labels, shape)  # cartesian-product offsets
+    comp_ids, obs_ids = _compress_group_index(group_index)
+    max_group = len(obs_ids)  # presumably the observed-group count; confirm
+    indexer, _ = lib.groupsort_indexer(comp_ids.astype('i4'), max_group)
+    # NOTE(review): callers reorder their label arrays with take(indexer)
+    return indexer
+
+def _lexsort_indexer(keys):
     labels = []
     shape = []
     for key in keys:
@@ -4069,12 +4077,7 @@ def _lexsort_indexer(keys):
         ids, _ = rizer.factorize(key, sort=True)
         labels.append(ids)
         shape.append(len(rizer.uniques))
-
-    group_index = get_group_index(labels, shape)
-    comp_ids, obs_ids = _compress_group_index(group_index)
-    max_group = len(obs_ids)
-    indexer, _ = lib.groupsort_indexer(comp_ids.astype('i4'), max_group)
-    return indexer
+    return _indexer_from_factorized(labels, shape)
 
 if __name__ == '__main__':
     import nose
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -1323,6 +1323,10 @@ def _get_slice(slob):
         yield i, _get_slice(slice(start, end))
 
 def get_group_index(label_list, shape):
+    """
+    Gets the offsets into what would be the cartesian product of all
+    possible labels given the label_list.
+    """
     if len(label_list) == 1:
         return label_list[0]
 
diff --git a/pandas/core/index.py b/pandas/core/index.py
@@ -1506,7 +1506,8 @@ def __getslice__(self, i, j):
 
     def sortlevel(self, level=0, ascending=True):
         """
-        Sort MultiIndex lexicographically by requested level
+        Sort MultiIndex at the requested level. The result will respect the
+        original ordering of the associated factor at that level.
 
         Parameters
         ----------
@@ -1519,19 +1520,19 @@ def sortlevel(self, level=0, ascending=True):
         -------
         sorted_index : MultiIndex
         """
-        # TODO: check if lexsorted when level=0
+        from pandas.core.frame import _indexer_from_factorized
 
         labels = list(self.labels)
+
         level = self._get_level_number(level)
         primary = labels.pop(level)
-
-        # Lexsort starts from END
-        indexer = np.lexsort(tuple(labels[::-1]) + (primary,))
-
+        indexer = _indexer_from_factorized((primary,) + tuple(labels),
+                                           self.levshape)
         if not ascending:
             indexer = indexer[::-1]
 
         new_labels = [lab.take(indexer) for lab in self.labels]
+
         new_index = MultiIndex(levels=self.levels, labels=new_labels,
                                names=self.names, sortorder=level)
 
diff --git a/vb_suite/indexing.py b/vb_suite/indexing.py
@@ -93,3 +93,19 @@
 indexing_dataframe_boolean_rows_object = \
     Benchmark("df[obj_indexer]", setup,
               name='indexing_dataframe_boolean_rows_object')
+
+#----------------------------------------------------------------------
+# MultiIndex sortlevel
+
+setup = common_setup + """
+level1 = np.array([tm.rands(10) for _ in xrange(1000)], dtype='O')
+level2 = np.array([tm.rands(10) for _ in xrange(10)], dtype='O')
+label1 = np.random.randint(0, 1000, size=100000)
+label2 = np.random.randint(0, 10, size=100000)
+midx = MultiIndex(labels=[label1,label2],
+                  levels=[level1,level2])
+"""
+sorting_level_zero = Benchmark("midx.sortlevel(0)", setup,
+                               start_date=datetime(2012,1,1))
+sorting_level_one = Benchmark("midx.sortlevel(1)", setup,
+                              start_date=datetime(2012,1,1))