Skip to content

BUG/CLN: MI now checks level & label compatibility #5214

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Oct 16, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -400,6 +400,8 @@ See :ref:`Internal Refactoring<whatsnew_0130.refactoring>`
instead they are generated and cached on the fly. The internal
representation and handling of DateOffsets has also been clarified.
(:issue:`5189`, related :issue:`5004`)
- ``MultiIndex`` constructor now validates that passed levels and labels are
compatible. (:issue:`5213`, :issue:`5214`)

.. _release.bug_fixes-0.13.0:

Expand Down
86 changes: 64 additions & 22 deletions pandas/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1394,7 +1394,7 @@ def _join_level(self, other, level, how='left', return_indexers=False):
new_levels[level] = new_level

join_index = MultiIndex(levels=new_levels, labels=new_labels,
names=left.names)
names=left.names, verify_integrity=False)
left_indexer = np.arange(len(left))[new_lev_labels != -1]
else:
join_index = left
Expand Down Expand Up @@ -1856,7 +1856,7 @@ class MultiIndex(Index):
rename = Index.set_names

def __new__(cls, levels=None, labels=None, sortorder=None, names=None,
copy=False):
copy=False, verify_integrity=True):
if levels is None or labels is None:
raise TypeError("Must pass both levels and labels")
if len(levels) != len(labels):
Expand Down Expand Up @@ -1886,12 +1886,36 @@ def __new__(cls, levels=None, labels=None, sortorder=None, names=None,
else:
subarr.sortorder = sortorder

if verify_integrity:
subarr._verify_integrity()

return subarr

def _verify_integrity(self):
    """Check that this index's levels and labels are mutually consistent.

    Returns None when every check passes (including the degenerate case of
    an index with no levels at all, where there is nothing to compare).

    Raises
    ------
    ValueError
        If the number of levels differs from the number of label arrays,
        if the label arrays do not all have the same length, or if the
        largest label on some level is >= the length of that level.
    """
    # NOTE: Currently does not check, among other things, that the cached
    # nlevels matches nor that sortorder matches the actual sort order.
    labels, levels = self.labels, self.levels
    if len(levels) != len(labels):
        raise ValueError("Length of levels and labels must match. NOTE:"
                         " this index is in an inconsistent state.")
    if not len(labels):
        # No levels/labels: nothing to verify, and indexing labels[0]
        # below would otherwise raise an unrelated IndexError.
        return
    label_length = len(labels[0])
    for i, (level, label) in enumerate(zip(levels, labels)):
        if len(label) != label_length:
            raise ValueError("Unequal label lengths: %s" % (
                             [len(lab) for lab in labels]))
        if not len(label):
            # .max() is undefined on an empty label array; an empty
            # array cannot exceed the level bounds anyway.
            continue
        # Reduce once and reuse the value in both the test and the message.
        max_label = label.max()
        if max_label >= len(level):
            raise ValueError("On level %d, label max (%d) >= length of"
                             " level (%d). NOTE: this index is in an"
                             " inconsistent state" % (i, max_label,
                                                      len(level)))

def _get_levels(self):
return self._levels

def _set_levels(self, levels, copy=False, validate=True):
def _set_levels(self, levels, copy=False, validate=True,
verify_integrity=False):
# This is NOT part of the levels property because it should be
# externally not allowed to set levels. User beware if you change
# _levels directly
Expand All @@ -1907,7 +1931,10 @@ def _set_levels(self, levels, copy=False, validate=True):
self._set_names(names)
self._tuples = None

def set_levels(self, levels, inplace=False):
if verify_integrity:
self._verify_integrity()

def set_levels(self, levels, inplace=False, verify_integrity=True):
"""
Set new levels on MultiIndex. Defaults to returning
new index.
Expand All @@ -1918,6 +1945,8 @@ def set_levels(self, levels, inplace=False):
new levels to apply
inplace : bool
if True, mutates in place
verify_integrity : bool (default True)
if True, checks that levels and labels are compatible

Returns
-------
Expand All @@ -1930,27 +1959,33 @@ def set_levels(self, levels, inplace=False):
else:
idx = self._shallow_copy()
idx._reset_identity()
idx._set_levels(levels)
idx._set_levels(levels, validate=True,
verify_integrity=verify_integrity)
if not inplace:
return idx

# remove me in 0.14 and change to read only property
__set_levels = deprecate("setting `levels` directly",
partial(set_levels, inplace=True),
partial(set_levels, inplace=True,
verify_integrity=True),
alt_name="set_levels")
levels = property(fget=_get_levels, fset=__set_levels)

def _get_labels(self):
return self._labels

def _set_labels(self, labels, copy=False, validate=True):
def _set_labels(self, labels, copy=False, validate=True,
verify_integrity=False):
if validate and len(labels) != self.nlevels:
raise ValueError("Length of labels must match length of levels")
self._labels = FrozenList(_ensure_frozen(labs, copy=copy)._shallow_copy()
for labs in labels)
self._tuples = None

def set_labels(self, labels, inplace=False):
if verify_integrity:
self._verify_integrity()

def set_labels(self, labels, inplace=False, verify_integrity=True):
"""
Set new labels on MultiIndex. Defaults to returning
new index.
Expand All @@ -1961,6 +1996,8 @@ def set_labels(self, labels, inplace=False):
new labels to apply
inplace : bool
if True, mutates in place
verify_integrity : bool (default True)
if True, checks that levels and labels are compatible

Returns
-------
Expand All @@ -1973,13 +2010,14 @@ def set_labels(self, labels, inplace=False):
else:
idx = self._shallow_copy()
idx._reset_identity()
idx._set_labels(labels)
idx._set_labels(labels, verify_integrity=verify_integrity)
if not inplace:
return idx

# remove me in 0.14 and change to readonly property
__set_labels = deprecate("setting labels directly",
partial(set_labels, inplace=True),
partial(set_labels, inplace=True,
verify_integrity=True),
alt_name="set_labels")
labels = property(fget=_get_labels, fset=__set_labels)

Expand Down Expand Up @@ -2392,7 +2430,8 @@ def from_arrays(cls, arrays, sortorder=None, names=None):
names = [c.name for c in cats]

return MultiIndex(levels=levels, labels=labels,
sortorder=sortorder, names=names)
sortorder=sortorder, names=names,
verify_integrity=False)

@classmethod
def from_tuples(cls, tuples, sortorder=None, names=None):
Expand Down Expand Up @@ -2463,6 +2502,7 @@ def __setstate__(self, state):
self._set_labels(labels)
self._set_names(names)
self.sortorder = sortorder
self._verify_integrity()

def __getitem__(self, key):
if np.isscalar(key):
Expand Down Expand Up @@ -2502,7 +2542,7 @@ def take(self, indexer, axis=None):
indexer = com._ensure_platform_int(indexer)
new_labels = [lab.take(indexer) for lab in self.labels]
return MultiIndex(levels=self.levels, labels=new_labels,
names=self.names)
names=self.names, verify_integrity=False)

def append(self, other):
"""
Expand Down Expand Up @@ -2618,7 +2658,7 @@ def droplevel(self, level=0):
return result
else:
return MultiIndex(levels=new_levels, labels=new_labels,
names=new_names)
names=new_names, verify_integrity=False)

def swaplevel(self, i, j):
"""
Expand All @@ -2645,7 +2685,7 @@ def swaplevel(self, i, j):
new_names[i], new_names[j] = new_names[j], new_names[i]

return MultiIndex(levels=new_levels, labels=new_labels,
names=new_names)
names=new_names, verify_integrity=False)

def reorder_levels(self, order):
"""
Expand All @@ -2664,7 +2704,7 @@ def reorder_levels(self, order):
new_names = [self.names[i] for i in order]

return MultiIndex(levels=new_levels, labels=new_labels,
names=new_names)
names=new_names, verify_integrity=False)

def __getslice__(self, i, j):
return self.__getitem__(slice(i, j))
Expand Down Expand Up @@ -2705,7 +2745,8 @@ def sortlevel(self, level=0, ascending=True):
new_labels = [lab.take(indexer) for lab in self.labels]

new_index = MultiIndex(labels=new_labels, levels=self.levels,
names=self.names, sortorder=level)
names=self.names, sortorder=level,
verify_integrity=False)

return new_index, indexer

Expand Down Expand Up @@ -3086,7 +3127,8 @@ def truncate(self, before=None, after=None):
new_labels = [lab[left:right] for lab in self.labels]
new_labels[0] = new_labels[0] - i

return MultiIndex(levels=new_levels, labels=new_labels)
return MultiIndex(levels=new_levels, labels=new_labels,
verify_integrity=False)

def equals(self, other):
"""
Expand Down Expand Up @@ -3180,7 +3222,7 @@ def intersection(self, other):
if len(uniq_tuples) == 0:
return MultiIndex(levels=[[]] * self.nlevels,
labels=[[]] * self.nlevels,
names=result_names)
names=result_names, verify_integrity=False)
else:
return MultiIndex.from_arrays(lzip(*uniq_tuples), sortorder=0,
names=result_names)
Expand Down Expand Up @@ -3210,14 +3252,14 @@ def diff(self, other):
if self.equals(other):
return MultiIndex(levels=[[]] * self.nlevels,
labels=[[]] * self.nlevels,
names=result_names)
names=result_names, verify_integrity=False)

difference = sorted(set(self.values) - set(other.values))

if len(difference) == 0:
return MultiIndex(levels=[[]] * self.nlevels,
labels=[[]] * self.nlevels,
names=result_names)
names=result_names, verify_integrity=False)
else:
return MultiIndex.from_tuples(difference, sortorder=0,
names=result_names)
Expand Down Expand Up @@ -3269,7 +3311,7 @@ def insert(self, loc, item):
new_labels.append(np.insert(labels, loc, lev_loc))

return MultiIndex(levels=new_levels, labels=new_labels,
names=self.names)
names=self.names, verify_integrity=False)

def delete(self, loc):
"""
Expand All @@ -3281,7 +3323,7 @@ def delete(self, loc):
"""
new_labels = [np.delete(lab, loc) for lab in self.labels]
return MultiIndex(levels=self.levels, labels=new_labels,
names=self.names)
names=self.names, verify_integrity=False)

get_major_bounds = slice_locs

Expand Down
5 changes: 3 additions & 2 deletions pandas/core/panel.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,8 @@ def panel_index(time, panels, names=['time', 'panel']):

labels = [time_factor.labels, panel_factor.labels]
levels = [time_factor.levels, panel_factor.levels]
return MultiIndex(levels, labels, sortorder=None, names=names)
return MultiIndex(levels, labels, sortorder=None, names=names,
verify_integrity=False)



Expand Down Expand Up @@ -838,7 +839,7 @@ def to_frame(self, filter_observations=True):

index = MultiIndex(levels=[self.major_axis, self.minor_axis],
labels=[major_labels, minor_labels],
names=[maj_name, min_name])
names=[maj_name, min_name], verify_integrity=False)

return DataFrame(data, index=index, columns=self.items)

Expand Down
17 changes: 10 additions & 7 deletions pandas/core/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ def get_new_columns(self):
new_labels.append(np.tile(np.arange(stride), width))

return MultiIndex(levels=new_levels, labels=new_labels,
names=new_names)
names=new_names, verify_integrity=False)

def get_new_index(self):
result_labels = []
Expand All @@ -234,7 +234,8 @@ def get_new_index(self):
else:
new_index = MultiIndex(levels=self.new_index_levels,
labels=result_labels,
names=self.new_index_names)
names=self.new_index_names,
verify_integrity=False)

return new_index

Expand Down Expand Up @@ -286,7 +287,8 @@ def _unstack_multiple(data, clocs):

dummy_index = MultiIndex(levels=rlevels + [obs_ids],
labels=rlabels + [comp_ids],
names=rnames + ['__placeholder__'])
names=rnames + ['__placeholder__'],
verify_integrity=False)

if isinstance(data, Series):
dummy = Series(data.values, index=dummy_index)
Expand Down Expand Up @@ -320,7 +322,7 @@ def _unstack_multiple(data, clocs):
new_labels.append(rec.take(unstcols.labels[-1]))

new_columns = MultiIndex(levels=new_levels, labels=new_labels,
names=new_names)
names=new_names, verify_integrity=False)

if isinstance(unstacked, Series):
unstacked.index = new_columns
Expand Down Expand Up @@ -505,13 +507,14 @@ def stack(frame, level=-1, dropna=True):
new_names = list(frame.index.names)
new_names.append(frame.columns.name)
new_index = MultiIndex(levels=new_levels, labels=new_labels,
names=new_names)
names=new_names, verify_integrity=False)
else:
ilabels = np.arange(N).repeat(K)
clabels = np.tile(np.arange(K), N).ravel()
new_index = MultiIndex(levels=[frame.index, frame.columns],
labels=[ilabels, clabels],
names=[frame.index.name, frame.columns.name])
names=[frame.index.name, frame.columns.name],
verify_integrity=False)

new_values = frame.values.ravel()
if dropna:
Expand Down Expand Up @@ -590,7 +593,7 @@ def _stack_multi_columns(frame, level=-1, dropna=True):
new_names.append(frame.columns.names[level])

new_index = MultiIndex(levels=new_levels, labels=new_labels,
names=new_names)
names=new_names, verify_integrity=False)

result = DataFrame(new_data, index=new_index, columns=new_columns)

Expand Down
3 changes: 2 additions & 1 deletion pandas/io/pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -2209,7 +2209,8 @@ def read_multi_index(self, key):
lab = self.read_array(label_key)
labels.append(lab)

return MultiIndex(levels=levels, labels=labels, names=names)
return MultiIndex(levels=levels, labels=labels, names=names,
verify_integrity=True)

def read_index_node(self, node):
data = node[:]
Expand Down
3 changes: 2 additions & 1 deletion pandas/sparse/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -767,7 +767,8 @@ def stack_sparse_frame(frame):
major_labels = np.concatenate(inds_to_concat)
stacked_values = np.concatenate(vals_to_concat)
index = MultiIndex(levels=[frame.index, frame.columns],
labels=[major_labels, minor_labels])
labels=[major_labels, minor_labels],
verify_integrity=False)

lp = DataFrame(stacked_values.reshape((nobs, 1)), index=index,
columns=['foo'])
Expand Down
3 changes: 2 additions & 1 deletion pandas/sparse/panel.py
Original file line number Diff line number Diff line change
Expand Up @@ -317,7 +317,8 @@ def to_frame(self, filter_observations=True):
minor_labels = inds // N

index = MultiIndex(levels=[self.major_axis, self.minor_axis],
labels=[major_labels, minor_labels])
labels=[major_labels, minor_labels],
verify_integrity=False)

df = DataFrame(values, index=index, columns=self.items)
return df.sortlevel(level=0)
Expand Down
Loading