diff --git a/doc/source/release.rst b/doc/source/release.rst index 805b8d24d70d9..6ea4e5a3046b2 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -400,6 +400,8 @@ See :ref:`Internal Refactoring` instead they are generated and cached on the fly. The internal representation and handling of DateOffsets has also been clarified. (:issue:`5189`, related :issue:`5004`) + - ``MultiIndex`` constructor now validates that passed levels and labels are + compatible. (:issue:`5213`, :issue:`5214`) .. _release.bug_fixes-0.13.0: diff --git a/pandas/core/index.py b/pandas/core/index.py index 773ca4acf80df..a79670579198b 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -1394,7 +1394,7 @@ def _join_level(self, other, level, how='left', return_indexers=False): new_levels[level] = new_level join_index = MultiIndex(levels=new_levels, labels=new_labels, - names=left.names) + names=left.names, verify_integrity=False) left_indexer = np.arange(len(left))[new_lev_labels != -1] else: join_index = left @@ -1856,7 +1856,7 @@ class MultiIndex(Index): rename = Index.set_names def __new__(cls, levels=None, labels=None, sortorder=None, names=None, - copy=False): + copy=False, verify_integrity=True): if levels is None or labels is None: raise TypeError("Must pass both levels and labels") if len(levels) != len(labels): @@ -1886,12 +1886,36 @@ def __new__(cls, levels=None, labels=None, sortorder=None, names=None, else: subarr.sortorder = sortorder + if verify_integrity: + subarr._verify_integrity() + return subarr + def _verify_integrity(self): + """Raises ValueError if length of levels and labels don't match or any + label would exceed level bounds""" + # NOTE: Currently does not check, among other things, that cached + # nlevels matches nor that sortorder matches the actual sort order. + labels, levels = self.labels, self.levels + if len(levels) != len(labels): + raise ValueError("Length of levels and labels must match. 
NOTE:" + " this index is in an inconsistent state.") + label_length = len(self.labels[0]) + for i, (level, label) in enumerate(zip(levels, labels)): + if len(label) != label_length: + raise ValueError("Unequal label lengths: %s" % ( + [len(lab) for lab in labels])) + if len(label) and label.max() >= len(level): + raise ValueError("On level %d, label max (%d) >= length of" + " level (%d). NOTE: this index is in an" + " inconsistent state" % (i, label.max(), + len(level))) + def _get_levels(self): return self._levels - def _set_levels(self, levels, copy=False, validate=True): + def _set_levels(self, levels, copy=False, validate=True, + verify_integrity=False): # This is NOT part of the levels property because it should be # externally not allowed to set levels. User beware if you change # _levels directly @@ -1907,7 +1931,10 @@ def _set_levels(self, levels, copy=False, validate=True): self._set_names(names) self._tuples = None - def set_levels(self, levels, inplace=False): + if verify_integrity: + self._verify_integrity() + + def set_levels(self, levels, inplace=False, verify_integrity=True): """ Set new levels on MultiIndex. Defaults to returning new index. 
@@ -1918,6 +1945,8 @@ def set_levels(self, levels, inplace=False): new levels to apply inplace : bool if True, mutates in place + verify_integrity : bool (default True) + if True, checks that levels and labels are compatible Returns ------- @@ -1930,27 +1959,33 @@ def set_levels(self, levels, inplace=False): else: idx = self._shallow_copy() idx._reset_identity() - idx._set_levels(levels) + idx._set_levels(levels, validate=True, + verify_integrity=verify_integrity) if not inplace: return idx # remove me in 0.14 and change to read only property __set_levels = deprecate("setting `levels` directly", - partial(set_levels, inplace=True), + partial(set_levels, inplace=True, + verify_integrity=True), alt_name="set_levels") levels = property(fget=_get_levels, fset=__set_levels) def _get_labels(self): return self._labels - def _set_labels(self, labels, copy=False, validate=True): + def _set_labels(self, labels, copy=False, validate=True, + verify_integrity=False): if validate and len(labels) != self.nlevels: raise ValueError("Length of labels must match length of levels") self._labels = FrozenList(_ensure_frozen(labs, copy=copy)._shallow_copy() for labs in labels) self._tuples = None - def set_labels(self, labels, inplace=False): + if verify_integrity: + self._verify_integrity() + + def set_labels(self, labels, inplace=False, verify_integrity=True): """ Set new labels on MultiIndex. Defaults to returning new index. 
@@ -1961,6 +1996,8 @@ def set_labels(self, labels, inplace=False): new labels to apply inplace : bool if True, mutates in place + verify_integrity : bool (default True) + if True, checks that levels and labels are compatible Returns ------- @@ -1973,13 +2010,14 @@ def set_labels(self, labels, inplace=False): else: idx = self._shallow_copy() idx._reset_identity() - idx._set_labels(labels) + idx._set_labels(labels, verify_integrity=verify_integrity) if not inplace: return idx # remove me in 0.14 and change to readonly property __set_labels = deprecate("setting labels directly", - partial(set_labels, inplace=True), + partial(set_labels, inplace=True, + verify_integrity=True), alt_name="set_labels") labels = property(fget=_get_labels, fset=__set_labels) @@ -2392,7 +2430,8 @@ def from_arrays(cls, arrays, sortorder=None, names=None): names = [c.name for c in cats] return MultiIndex(levels=levels, labels=labels, - sortorder=sortorder, names=names) + sortorder=sortorder, names=names, + verify_integrity=False) @classmethod def from_tuples(cls, tuples, sortorder=None, names=None): @@ -2463,6 +2502,7 @@ def __setstate__(self, state): self._set_labels(labels) self._set_names(names) self.sortorder = sortorder + self._verify_integrity() def __getitem__(self, key): if np.isscalar(key): @@ -2502,7 +2542,7 @@ def take(self, indexer, axis=None): indexer = com._ensure_platform_int(indexer) new_labels = [lab.take(indexer) for lab in self.labels] return MultiIndex(levels=self.levels, labels=new_labels, - names=self.names) + names=self.names, verify_integrity=False) def append(self, other): """ @@ -2618,7 +2658,7 @@ def droplevel(self, level=0): return result else: return MultiIndex(levels=new_levels, labels=new_labels, - names=new_names) + names=new_names, verify_integrity=False) def swaplevel(self, i, j): """ @@ -2645,7 +2685,7 @@ def swaplevel(self, i, j): new_names[i], new_names[j] = new_names[j], new_names[i] return MultiIndex(levels=new_levels, labels=new_labels, - 
names=new_names) + names=new_names, verify_integrity=False) def reorder_levels(self, order): """ @@ -2664,7 +2704,7 @@ def reorder_levels(self, order): new_names = [self.names[i] for i in order] return MultiIndex(levels=new_levels, labels=new_labels, - names=new_names) + names=new_names, verify_integrity=False) def __getslice__(self, i, j): return self.__getitem__(slice(i, j)) @@ -2705,7 +2745,8 @@ def sortlevel(self, level=0, ascending=True): new_labels = [lab.take(indexer) for lab in self.labels] new_index = MultiIndex(labels=new_labels, levels=self.levels, - names=self.names, sortorder=level) + names=self.names, sortorder=level, + verify_integrity=False) return new_index, indexer @@ -3086,7 +3127,8 @@ def truncate(self, before=None, after=None): new_labels = [lab[left:right] for lab in self.labels] new_labels[0] = new_labels[0] - i - return MultiIndex(levels=new_levels, labels=new_labels) + return MultiIndex(levels=new_levels, labels=new_labels, + verify_integrity=False) def equals(self, other): """ @@ -3180,7 +3222,7 @@ def intersection(self, other): if len(uniq_tuples) == 0: return MultiIndex(levels=[[]] * self.nlevels, labels=[[]] * self.nlevels, - names=result_names) + names=result_names, verify_integrity=False) else: return MultiIndex.from_arrays(lzip(*uniq_tuples), sortorder=0, names=result_names) @@ -3210,14 +3252,14 @@ def diff(self, other): if self.equals(other): return MultiIndex(levels=[[]] * self.nlevels, labels=[[]] * self.nlevels, - names=result_names) + names=result_names, verify_integrity=False) difference = sorted(set(self.values) - set(other.values)) if len(difference) == 0: return MultiIndex(levels=[[]] * self.nlevels, labels=[[]] * self.nlevels, - names=result_names) + names=result_names, verify_integrity=False) else: return MultiIndex.from_tuples(difference, sortorder=0, names=result_names) @@ -3269,7 +3311,7 @@ def insert(self, loc, item): new_labels.append(np.insert(labels, loc, lev_loc)) return MultiIndex(levels=new_levels, 
labels=new_labels, - names=self.names) + names=self.names, verify_integrity=False) def delete(self, loc): """ @@ -3281,7 +3323,7 @@ def delete(self, loc): """ new_labels = [np.delete(lab, loc) for lab in self.labels] return MultiIndex(levels=self.levels, labels=new_labels, - names=self.names) + names=self.names, verify_integrity=False) get_major_bounds = slice_locs diff --git a/pandas/core/panel.py b/pandas/core/panel.py index 1389445b29943..87e9121b2dffc 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -96,7 +96,8 @@ def panel_index(time, panels, names=['time', 'panel']): labels = [time_factor.labels, panel_factor.labels] levels = [time_factor.levels, panel_factor.levels] - return MultiIndex(levels, labels, sortorder=None, names=names) + return MultiIndex(levels, labels, sortorder=None, names=names, + verify_integrity=False) @@ -838,7 +839,7 @@ def to_frame(self, filter_observations=True): index = MultiIndex(levels=[self.major_axis, self.minor_axis], labels=[major_labels, minor_labels], - names=[maj_name, min_name]) + names=[maj_name, min_name], verify_integrity=False) return DataFrame(data, index=index, columns=self.items) diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index a608b40847228..c2c1a2931d4aa 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -218,7 +218,7 @@ def get_new_columns(self): new_labels.append(np.tile(np.arange(stride), width)) return MultiIndex(levels=new_levels, labels=new_labels, - names=new_names) + names=new_names, verify_integrity=False) def get_new_index(self): result_labels = [] @@ -234,7 +234,8 @@ def get_new_index(self): else: new_index = MultiIndex(levels=self.new_index_levels, labels=result_labels, - names=self.new_index_names) + names=self.new_index_names, + verify_integrity=False) return new_index @@ -286,7 +287,8 @@ def _unstack_multiple(data, clocs): dummy_index = MultiIndex(levels=rlevels + [obs_ids], labels=rlabels + [comp_ids], - names=rnames + ['__placeholder__']) + 
names=rnames + ['__placeholder__'], + verify_integrity=False) if isinstance(data, Series): dummy = Series(data.values, index=dummy_index) @@ -320,7 +322,7 @@ def _unstack_multiple(data, clocs): new_labels.append(rec.take(unstcols.labels[-1])) new_columns = MultiIndex(levels=new_levels, labels=new_labels, - names=new_names) + names=new_names, verify_integrity=False) if isinstance(unstacked, Series): unstacked.index = new_columns @@ -505,13 +507,14 @@ def stack(frame, level=-1, dropna=True): new_names = list(frame.index.names) new_names.append(frame.columns.name) new_index = MultiIndex(levels=new_levels, labels=new_labels, - names=new_names) + names=new_names, verify_integrity=False) else: ilabels = np.arange(N).repeat(K) clabels = np.tile(np.arange(K), N).ravel() new_index = MultiIndex(levels=[frame.index, frame.columns], labels=[ilabels, clabels], - names=[frame.index.name, frame.columns.name]) + names=[frame.index.name, frame.columns.name], + verify_integrity=False) new_values = frame.values.ravel() if dropna: @@ -590,7 +593,7 @@ def _stack_multi_columns(frame, level=-1, dropna=True): new_names.append(frame.columns.names[level]) new_index = MultiIndex(levels=new_levels, labels=new_labels, - names=new_names) + names=new_names, verify_integrity=False) result = DataFrame(new_data, index=new_index, columns=new_columns) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 5b7297c7be2f4..999f0751abe99 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2209,7 +2209,8 @@ def read_multi_index(self, key): lab = self.read_array(label_key) labels.append(lab) - return MultiIndex(levels=levels, labels=labels, names=names) + return MultiIndex(levels=levels, labels=labels, names=names, + verify_integrity=True) def read_index_node(self, node): data = node[:] diff --git a/pandas/sparse/frame.py b/pandas/sparse/frame.py index 88464d683d543..b577f5ba8f5ec 100644 --- a/pandas/sparse/frame.py +++ b/pandas/sparse/frame.py @@ -767,7 +767,8 @@ def 
stack_sparse_frame(frame): major_labels = np.concatenate(inds_to_concat) stacked_values = np.concatenate(vals_to_concat) index = MultiIndex(levels=[frame.index, frame.columns], - labels=[major_labels, minor_labels]) + labels=[major_labels, minor_labels], + verify_integrity=False) lp = DataFrame(stacked_values.reshape((nobs, 1)), index=index, columns=['foo']) diff --git a/pandas/sparse/panel.py b/pandas/sparse/panel.py index 65a24dc1bf25f..74bca7de89bcc 100644 --- a/pandas/sparse/panel.py +++ b/pandas/sparse/panel.py @@ -317,7 +317,8 @@ def to_frame(self, filter_observations=True): minor_labels = inds // N index = MultiIndex(levels=[self.major_axis, self.minor_axis], - labels=[major_labels, minor_labels]) + labels=[major_labels, minor_labels], + verify_integrity=False) df = DataFrame(values, index=index, columns=self.items) return df.sortlevel(level=0) diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 5b2edc31e1fe9..755d74c9ea0bc 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -7,6 +7,7 @@ import re import unittest import nose +import warnings import os import numpy as np @@ -1213,7 +1214,7 @@ def setUp(self): self.index_names = ['first', 'second'] self.index = MultiIndex(levels=[major_axis, minor_axis], labels=[major_labels, minor_labels], - names=self.index_names) + names=self.index_names, verify_integrity=False) def test_hash_error(self): with tm.assertRaisesRegexp(TypeError, @@ -1447,11 +1448,38 @@ def test_constructor_no_levels(self): MultiIndex(labels=[]) def test_constructor_mismatched_label_levels(self): - levels = [np.array([1]), np.array([2]), np.array([3])] - labels = ["a"] + labels = [np.array([1]), np.array([2]), np.array([3])] + levels = ["a"] assertRaisesRegexp(ValueError, "Length of levels and labels must be" " the same", MultiIndex, levels=levels, labels=labels) + length_error = re.compile('>= length of level') + label_error = re.compile(r'Unequal label lengths: \[4, 2\]') + + # important to check 
that it's looking at the right thing. + with tm.assertRaisesRegexp(ValueError, length_error): + MultiIndex(levels=[['a'], ['b']], labels=[[0, 1, 2, 3], [0, 3, 4, 1]]) + + with tm.assertRaisesRegexp(ValueError, label_error): + MultiIndex(levels=[['a'], ['b']], labels=[[0, 0, 0, 0], [0, 0]]) + + # external API + with tm.assertRaisesRegexp(ValueError, length_error): + self.index.copy().set_levels([['a'], ['b']]) + + with tm.assertRaisesRegexp(ValueError, label_error): + self.index.copy().set_labels([[0, 0, 0, 0], [0, 0]]) + + # deprecated properties + with warnings.catch_warnings(): + warnings.simplefilter('ignore') + + with tm.assertRaisesRegexp(ValueError, length_error): + self.index.copy().levels = [['a'], ['b']] + + with tm.assertRaisesRegexp(ValueError, label_error): + self.index.copy().labels = [[0, 0, 0, 0], [0, 0]] + def assert_multiindex_copied(self, copy, original): # levels shoudl be (at least, shallow copied) diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py index 2a9e7f8642601..b0a64d282e814 100644 --- a/pandas/tests/test_internals.py +++ b/pandas/tests/test_internals.py @@ -488,9 +488,6 @@ def _check(new_mgr,block_type, citems): _check(new_mgr,BoolBlock,['bool']) _check(new_mgr,DatetimeBlock,['dt']) - def test_xs(self): - pass - def test_interleave(self): pass diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index 3a99793937096..c76bdea950650 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -1366,7 +1366,8 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None): # also copies names = names + _get_consensus_names(indexes) - return MultiIndex(levels=levels, labels=label_list, names=names) + return MultiIndex(levels=levels, labels=label_list, names=names, + verify_integrity=False) new_index = indexes[0] n = len(new_index) @@ -1402,7 +1403,8 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None): if len(new_names) < len(new_levels): new_names.extend(new_index.names) - return 
MultiIndex(levels=new_levels, labels=new_labels, names=new_names) + return MultiIndex(levels=new_levels, labels=new_labels, names=new_names, + verify_integrity=False) def _should_fill(lname, rname):