ENH: better MultiIndex.__repr__

topper-123 · topper-123 · commit ce4622329f41 · 2019-06-11T21:42:12.000+02:00
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
@@ -29,7 +29,8 @@
 from pandas.core.indexes.frozen import FrozenList, _ensure_frozen
 import pandas.core.missing as missing
 
-from pandas.io.formats.printing import pprint_thing
+from pandas.io.formats.printing import (
+    default_pprint, format_object_summary, pprint_thing)
 
 _index_doc_kwargs = dict(ibase._index_doc_kwargs)
 _index_doc_kwargs.update(
@@ -947,28 +948,57 @@ def _nbytes(self, deep=False):
 
     # --------------------------------------------------------------------
     # Rendering Methods
+    def _formatter_func(self, tup):
+        """
+        Format every element of ``tup`` with the formatter of its own level.
+        """
+        per_level = [lvl._formatter_func for lvl in self.levels]
+        return tuple(fmt(item) for fmt, item in zip(per_level, tup))
 
     def _format_attrs(self):
         """
         Return a list of tuples of the (attr,formatted_value)
         """
-        attrs = [
-            ('levels', ibase.default_pprint(self._levels,
-                                            max_seq_items=False)),
-            ('codes', ibase.default_pprint(self._codes,
-                                           max_seq_items=False))]
-        if com._any_not_none(*self.names):
-            attrs.append(('names', ibase.default_pprint(self.names)))
-        if self.sortorder is not None:
-            attrs.append(('sortorder', ibase.default_pprint(self.sortorder)))
+        attrs = [('dtype', "'{}'".format(self.dtype))]
+        # not ``any(self.names)``: that would hide falsy names such as 0
+        if com._any_not_none(*self.names):
+            attrs.append(('names', default_pprint(self.names)))
+        max_seq_items = get_option('display.max_seq_items') or len(self)
+        if len(self) > max_seq_items:
+            attrs.append(('length', len(self)))
         return attrs
 
     def _format_space(self):
-        return "\n%s" % (' ' * (len(self.__class__.__name__) + 1))
+        return " "  # separator placed after each "," between attributes
 
     def _format_data(self, name=None):
-        # we are formatting thru the attributes
-        return None
+        """
+        Return the entries of this index formatted as one unicode string.
+        """
+        return format_object_summary(self, self._formatter_func,
+                                     name=name, is_multi=True)
+
+    def __unicode__(self):
+        """
+        Return the unicode representation of this MultiIndex.
+
+        The text is ``<class name>(<data><attributes>)``, with attributes
+        rendered as ``name=value`` pairs. Unicode on both py2 and py3.
+        """
+        # gather the pieces: data first, then attributes and their separator
+        body = self._format_data()
+        pairs = self._format_attrs()
+        joiner = ",%s" % self._format_space()
+
+        if body is None:
+            # only attributes are available: leave the data part empty
+            body = ''
+
+        attr_text = joiner.join("%s=%s" % (key, val) for key, val in pairs)
+        cls_name = self.__class__.__name__
+
+        # data and attributes are concatenated without an extra separator
+        return "%s(%s%s)" % (cls_name, body, attr_text)
 
     def _format_native_types(self, na_rep='nan', **kwargs):
         new_levels = []
diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py
@@ -265,7 +265,7 @@ class TableSchemaFormatter(BaseFormatter):
 
 
 def format_object_summary(obj, formatter, is_justify=True, name=None,
-                          indent_for_name=True):
+                          indent_for_name=True, is_multi=False):
     """
     Return the formatted obj as a unicode string
 
@@ -282,6 +282,8 @@ def format_object_summary(obj, formatter, is_justify=True, name=None,
     indent_for_name : bool, default True
         Whether subsequent lines should be be indented to
         align with the name.
+    is_multi : bool, default False
+        Whether ``obj`` is a :class:`MultiIndex`.
 
     Returns
     -------
@@ -306,7 +308,7 @@ def format_object_summary(obj, formatter, is_justify=True, name=None,
         space2 = "\n "  # space for the opening '['
 
     n = len(obj)
-    sep = ','
+    sep = ',' if not is_multi else (',\n ' + ' ' * len(name))
     max_seq_items = get_option('display.max_seq_items') or n
 
     # are we a truncated display
@@ -334,10 +336,10 @@ def best_len(values):
 
     if n == 0:
         summary = '[]{}'.format(close)
-    elif n == 1:
+    elif n == 1 and not is_multi:
         first = formatter(obj[0])
         summary = '[{}]{}'.format(first, close)
-    elif n == 2:
+    elif n == 2 and not is_multi:
         first = formatter(obj[0])
         last = formatter(obj[-1])
         summary = '[{}, {}]{}'.format(first, last, close)
@@ -353,15 +355,16 @@ def best_len(values):
 
         # adjust all values to max length if needed
         if is_justify:
-
-            # however, if we are not truncated and we are only a single
-            # line, then don't justify
-            if (is_truncated or
-                    not (len(', '.join(head)) < display_width and
-                         len(', '.join(tail)) < display_width)):
-                max_len = max(best_len(head), best_len(tail))
-                head = [x.rjust(max_len) for x in head]
-                tail = [x.rjust(max_len) for x in tail]
+            head, tail = _justify(head, tail, display_width, best_len,
+                                  is_truncated, is_multi)
+        if is_multi:
+            max_space = display_width - len(space2)
+            item = tail[0]
+            for i in reversed(range(1, len(item) + 1)):
+                if len(_pprint_seq(item, max_seq_items=i)) < max_space:
+                    break
+            head = [_pprint_seq(x, max_seq_items=i) for x in head]
+            tail = [_pprint_seq(x, max_seq_items=i) for x in tail]
 
         summary = ""
         line = space2
@@ -391,7 +394,7 @@ def best_len(values):
         close = ']' + close.rstrip(' ')
         summary += close
 
-        if len(summary) > (display_width):
+        if len(summary) > (display_width) or is_multi:
             summary += space1
         else:  # one row
             summary += ' '
@@ -402,6 +405,52 @@ def best_len(values):
     return summary
 
 
+def _justify(head, tail, display_width, best_len,
+             is_truncated=False, is_multi=False):
+    """
+    Right-justify the entries of ``head`` and ``tail`` to common widths.
+    """
+    if is_multi:
+        # entries are tuples: pad each position to that position's max width
+        widths = _max_level_item_length(head + tail)
+        head = [tuple(s.rjust(w) for s, w in zip(row, widths)) for row in head]
+        tail = [tuple(s.rjust(w) for s, w in zip(row, widths)) for row in tail]
+    elif (is_truncated or len(', '.join(head)) >= display_width or
+            len(', '.join(tail)) >= display_width):
+        # truncated, or too long for a single line: pad to one shared width
+        width = max(best_len(head), best_len(tail))
+        head = [entry.rjust(width) for entry in head]
+        tail = [entry.rjust(width) for entry in tail]
+
+    return head, tail
+
+
+def _max_level_item_length(seq):
+    """
+    Compute, position by position, the longest item over all sequences.
+
+    Used for justifying individual values in a :class:`pandas.MultiIndex`.
+
+    Parameters
+    ----------
+    seq : list-like of list-likes of strings
+
+    Returns
+    -------
+    max_length : list of ints
+
+    Examples
+    --------
+    >>> _max_level_item_length([['s', 'ab'], ['abc', 'a']])
+    [3, 2]
+    """
+    widths = [0] * len(seq[0])
+    for row in seq:
+        row_widths = list(map(len, row))
+        widths = [max(pair) for pair in zip(widths, row_widths)]
+    return widths
+
+
 def format_object_attrs(obj):
     """
     Return a list of tuples of the (attr, formatted_value)
diff --git a/pandas/tests/indexes/multi/test_format.py b/pandas/tests/indexes/multi/test_format.py
@@ -94,3 +94,132 @@ def test_repr_max_seq_item_setting(idx):
     with pd.option_context("display.max_seq_items", None):
         repr(idx)
         assert '...' not in str(idx)
+
+
+class TestRepr(object):  # exact-text checks of MultiIndex.__repr__
+
+    def setup_class(self):  # builds the shared 3-level and 5-level indexes
+        n = 1000  # both indexes end up with 2 * n entries
+        ci = pd.CategoricalIndex(list('a' * n) + (['abc'] * n))
+        dti = pd.date_range('2000-01-01', freq='s', periods=n * 2)
+        self.narrow_mi = pd.MultiIndex.from_arrays([ci, ci.codes + 9, dti],
+                                                   names=['a', 'b', 'dti'])
+
+        levels = [ci, ci.codes + 9, dti, dti, dti]
+        names = ['a', 'b', 'dti_1', 'dti_2', 'dti_3']
+        self.wide_mi = pd.MultiIndex.from_arrays(levels, names=names)
+
+    def test_repr(self, idx):  # idx is not defined here; presumably a fixture
+        result = idx[:1].__repr__()
+        expected = """MultiIndex([('foo', 'one')],
+           dtype='object', names=['first', 'second'])"""
+        assert result == expected
+
+        result = idx.__repr__()
+        expected = """MultiIndex([('foo', 'one'),
+            ('foo', 'two'),
+            ('bar', 'one'),
+            ('baz', 'two'),
+            ('qux', 'one'),
+            ('qux', 'two')],
+           dtype='object', names=['first', 'second'])"""
+        assert result == expected
+
+        with pd.option_context('display.max_seq_items', 5):
+            result = idx.__repr__()
+            expected = """MultiIndex([('foo', 'one'),
+            ('foo', 'two'),
+            ...
+            ('qux', 'one'),
+            ('qux', 'two')],
+           dtype='object', names=['first', 'second'], length=6)"""
+            assert result == expected
+
+    def test_rjust(self):  # values are right-justified within each position
+        result = self.narrow_mi[:1].__repr__()
+        expected = """\
+MultiIndex([('a', 9, '2000-01-01 00:00:00')],
+           dtype='object', names=['a', 'b', 'dti'])"""
+        assert result == expected
+
+        result = self.narrow_mi[::500].__repr__()
+        expected = """\
+MultiIndex([(  'a',  9, '2000-01-01 00:00:00'),
+            (  'a',  9, '2000-01-01 00:08:20'),
+            ('abc', 10, '2000-01-01 00:16:40'),
+            ('abc', 10, '2000-01-01 00:25:00')],
+           dtype='object', names=['a', 'b', 'dti'])"""
+        assert result == expected
+
+        result = self.narrow_mi.__repr__()
+        expected = """\
+MultiIndex([(  'a',  9, '2000-01-01 00:00:00'),
+            (  'a',  9, '2000-01-01 00:00:01'),
+            (  'a',  9, '2000-01-01 00:00:02'),
+            (  'a',  9, '2000-01-01 00:00:03'),
+            (  'a',  9, '2000-01-01 00:00:04'),
+            (  'a',  9, '2000-01-01 00:00:05'),
+            (  'a',  9, '2000-01-01 00:00:06'),
+            (  'a',  9, '2000-01-01 00:00:07'),
+            (  'a',  9, '2000-01-01 00:00:08'),
+            (  'a',  9, '2000-01-01 00:00:09'),
+            ...
+            ('abc', 10, '2000-01-01 00:33:10'),
+            ('abc', 10, '2000-01-01 00:33:11'),
+            ('abc', 10, '2000-01-01 00:33:12'),
+            ('abc', 10, '2000-01-01 00:33:13'),
+            ('abc', 10, '2000-01-01 00:33:14'),
+            ('abc', 10, '2000-01-01 00:33:15'),
+            ('abc', 10, '2000-01-01 00:33:16'),
+            ('abc', 10, '2000-01-01 00:33:17'),
+            ('abc', 10, '2000-01-01 00:33:18'),
+            ('abc', 10, '2000-01-01 00:33:19')],
+           dtype='object', names=['a', 'b', 'dti'], length=2000)"""
+        assert result == expected
+
+    def test_tuple_width(self):  # over-wide tuples are cut short with '...'
+        result = self.wide_mi[:1].__repr__()
+        expected = """MultiIndex([('a', 9, '2000-01-01 00:00:00', '2000-01-01 00:00:00', ...)],
+           dtype='object', names=['a', 'b', 'dti_1', 'dti_2', 'dti_3'])"""
+        assert result == expected
+
+        result = self.wide_mi[:10].__repr__()
+        expected = """\
+MultiIndex([('a', 9, '2000-01-01 00:00:00', '2000-01-01 00:00:00', ...),
+            ('a', 9, '2000-01-01 00:00:01', '2000-01-01 00:00:01', ...),
+            ('a', 9, '2000-01-01 00:00:02', '2000-01-01 00:00:02', ...),
+            ('a', 9, '2000-01-01 00:00:03', '2000-01-01 00:00:03', ...),
+            ('a', 9, '2000-01-01 00:00:04', '2000-01-01 00:00:04', ...),
+            ('a', 9, '2000-01-01 00:00:05', '2000-01-01 00:00:05', ...),
+            ('a', 9, '2000-01-01 00:00:06', '2000-01-01 00:00:06', ...),
+            ('a', 9, '2000-01-01 00:00:07', '2000-01-01 00:00:07', ...),
+            ('a', 9, '2000-01-01 00:00:08', '2000-01-01 00:00:08', ...),
+            ('a', 9, '2000-01-01 00:00:09', '2000-01-01 00:00:09', ...)],
+           dtype='object', names=['a', 'b', 'dti_1', 'dti_2', 'dti_3'])"""
+        assert result == expected
+
+        result = self.wide_mi.__repr__()
+        expected = """\
+MultiIndex([(  'a',  9, '2000-01-01 00:00:00', '2000-01-01 00:00:00', ...),
+            (  'a',  9, '2000-01-01 00:00:01', '2000-01-01 00:00:01', ...),
+            (  'a',  9, '2000-01-01 00:00:02', '2000-01-01 00:00:02', ...),
+            (  'a',  9, '2000-01-01 00:00:03', '2000-01-01 00:00:03', ...),
+            (  'a',  9, '2000-01-01 00:00:04', '2000-01-01 00:00:04', ...),
+            (  'a',  9, '2000-01-01 00:00:05', '2000-01-01 00:00:05', ...),
+            (  'a',  9, '2000-01-01 00:00:06', '2000-01-01 00:00:06', ...),
+            (  'a',  9, '2000-01-01 00:00:07', '2000-01-01 00:00:07', ...),
+            (  'a',  9, '2000-01-01 00:00:08', '2000-01-01 00:00:08', ...),
+            (  'a',  9, '2000-01-01 00:00:09', '2000-01-01 00:00:09', ...),
+            ...
+            ('abc', 10, '2000-01-01 00:33:10', '2000-01-01 00:33:10', ...),
+            ('abc', 10, '2000-01-01 00:33:11', '2000-01-01 00:33:11', ...),
+            ('abc', 10, '2000-01-01 00:33:12', '2000-01-01 00:33:12', ...),
+            ('abc', 10, '2000-01-01 00:33:13', '2000-01-01 00:33:13', ...),
+            ('abc', 10, '2000-01-01 00:33:14', '2000-01-01 00:33:14', ...),
+            ('abc', 10, '2000-01-01 00:33:15', '2000-01-01 00:33:15', ...),
+            ('abc', 10, '2000-01-01 00:33:16', '2000-01-01 00:33:16', ...),
+            ('abc', 10, '2000-01-01 00:33:17', '2000-01-01 00:33:17', ...),
+            ('abc', 10, '2000-01-01 00:33:18', '2000-01-01 00:33:18', ...),
+            ('abc', 10, '2000-01-01 00:33:19', '2000-01-01 00:33:19', ...)],
+           dtype='object', names=['a', 'b', 'dti_1', 'dti_2', 'dti_3'], length=2000)"""  # noqa
+        assert result == expected