Skip to content

Commit e686c5f

Browse files
committed
Merge pull request #5209 from jtratner/validate-everything-on-mi
CLN/BUG: Better validation of levels/labels/names
2 parents 4559e32 + 87fb073 commit e686c5f

File tree

6 files changed

+119
-45
lines changed

6 files changed

+119
-45
lines changed

doc/source/release.rst

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,8 @@ Improvements to existing features
109109
(:issue:`4039`)
110110
- Add ``rename`` and ``set_names`` methods to ``Index`` as well as
111111
``set_names``, ``set_levels``, ``set_labels`` to ``MultiIndex``.
112-
(:issue:`4039`)
112+
(:issue:`4039`) with improved validation for all (:issue:`4039`,
113+
:issue:`4794`)
113114
- A Series of dtype ``timedelta64[ns]`` can now be divided/multiplied
114115
by an integer series (:issue`4521`)
115116
- A Series of dtype ``timedelta64[ns]`` can now be divided by another

doc/source/v0.13.0.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ API changes
5252
# but setting names is not deprecated.
5353
index = index.set_names(["bob", "cranberry"])
5454

55-
# and all methods take an inplace kwarg
55+
# and all methods take an inplace kwarg - but returns None
5656
index.set_names(["bob", "cranberry"], inplace=True)
5757

5858
- Infer and downcast dtype if ``downcast='infer'`` is passed to ``fillna/ffill/bfill`` (:issue:`4604`)

pandas/core/index.py

Lines changed: 39 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1672,8 +1672,8 @@ def __new__(cls, data, dtype=None, copy=False, name=None, fastpath=False):
16721672
subarr = np.array(data, dtype=np.int64, copy=copy)
16731673
if len(data) > 0:
16741674
if (subarr != data).any():
1675-
raise TypeError('Unsafe NumPy casting, you must '
1676-
'explicitly cast')
1675+
raise TypeError('Unsafe NumPy casting to integer, you must'
1676+
' explicitly cast')
16771677

16781678
subarr = subarr.view(cls)
16791679
subarr.name = name
@@ -1857,11 +1857,12 @@ class MultiIndex(Index):
18571857

18581858
def __new__(cls, levels=None, labels=None, sortorder=None, names=None,
18591859
copy=False):
1860+
if levels is None or labels is None:
1861+
raise TypeError("Must pass both levels and labels")
18601862
if len(levels) != len(labels):
1861-
raise ValueError(
1862-
'Length of levels and labels must be the same')
1863+
raise ValueError('Length of levels and labels must be the same.')
18631864
if len(levels) == 0:
1864-
raise TypeError('Must pass non-zero number of levels/labels')
1865+
raise ValueError('Must pass non-zero number of levels/labels')
18651866
if len(levels) == 1:
18661867
if names:
18671868
name = names[0]
@@ -1872,10 +1873,12 @@ def __new__(cls, levels=None, labels=None, sortorder=None, names=None,
18721873

18731874
# v3, 0.8.0
18741875
subarr = np.empty(0, dtype=object).view(cls)
1875-
subarr._set_levels(levels, copy=copy)
1876-
subarr._set_labels(labels, copy=copy)
1876+
# we've already validated levels and labels, so shortcut here
1877+
subarr._set_levels(levels, copy=copy, validate=False)
1878+
subarr._set_labels(labels, copy=copy, validate=False)
18771879

18781880
if names is not None:
1881+
# handles name validation
18791882
subarr._set_names(names)
18801883

18811884
if sortorder is not None:
@@ -1888,12 +1891,14 @@ def __new__(cls, levels=None, labels=None, sortorder=None, names=None,
18881891
def _get_levels(self):
18891892
return self._levels
18901893

1891-
def _set_levels(self, levels, copy=False):
1894+
def _set_levels(self, levels, copy=False, validate=True):
18921895
# This is NOT part of the levels property because it should be
18931896
# externally not allowed to set levels. User beware if you change
18941897
# _levels directly
1895-
if len(levels) == 0:
1896-
raise ValueError("Must set non-zero number of levels.")
1898+
if validate and len(levels) == 0:
1899+
raise ValueError('Must set non-zero number of levels.')
1900+
if validate and len(levels) != len(self._labels):
1901+
raise ValueError('Length of levels must match length of labels.')
18971902
levels = FrozenList(_ensure_index(lev, copy=copy)._shallow_copy()
18981903
for lev in levels)
18991904
names = self.names
@@ -1917,13 +1922,16 @@ def set_levels(self, levels, inplace=False):
19171922
-------
19181923
new index (of same type and class...etc)
19191924
"""
1925+
if not com.is_list_like(levels) or not com.is_list_like(levels[0]):
1926+
raise TypeError("Levels must be list of lists-like")
19201927
if inplace:
19211928
idx = self
19221929
else:
19231930
idx = self._shallow_copy()
19241931
idx._reset_identity()
19251932
idx._set_levels(levels)
1926-
return idx
1933+
if not inplace:
1934+
return idx
19271935

19281936
# remove me in 0.14 and change to read only property
19291937
__set_levels = deprecate("setting `levels` directly",
@@ -1934,9 +1942,9 @@ def set_levels(self, levels, inplace=False):
19341942
def _get_labels(self):
19351943
return self._labels
19361944

1937-
def _set_labels(self, labels, copy=False):
1938-
if len(labels) != self.nlevels:
1939-
raise ValueError("Length of levels and labels must be the same.")
1945+
def _set_labels(self, labels, copy=False, validate=True):
1946+
if validate and len(labels) != self.nlevels:
1947+
raise ValueError("Length of labels must match length of levels")
19401948
self._labels = FrozenList(_ensure_frozen(labs, copy=copy)._shallow_copy()
19411949
for labs in labels)
19421950

@@ -1956,13 +1964,16 @@ def set_labels(self, labels, inplace=False):
19561964
-------
19571965
new index (of same type and class...etc)
19581966
"""
1967+
if not com.is_list_like(labels) or not com.is_list_like(labels[0]):
1968+
raise TypeError("Labels must be list of lists-like")
19591969
if inplace:
19601970
idx = self
19611971
else:
19621972
idx = self._shallow_copy()
19631973
idx._reset_identity()
19641974
idx._set_labels(labels)
1965-
return idx
1975+
if not inplace:
1976+
return idx
19661977

19671978
# remove me in 0.14 and change to readonly property
19681979
__set_labels = deprecate("setting labels directly",
@@ -2021,7 +2032,8 @@ def __array_finalize__(self, obj):
20212032
# instance.
20222033
return
20232034

2024-
self._set_levels(getattr(obj, 'levels', []))
2035+
# skip the validation on first, rest will catch the errors
2036+
self._set_levels(getattr(obj, 'levels', []), validate=False)
20252037
self._set_labels(getattr(obj, 'labels', []))
20262038
self._set_names(getattr(obj, 'names', []))
20272039
self.sortorder = getattr(obj, 'sortorder', None)
@@ -2083,16 +2095,15 @@ def _convert_slice_indexer(self, key, typ=None):
20832095
def _get_names(self):
20842096
return FrozenList(level.name for level in self.levels)
20852097

2086-
def _set_names(self, values):
2098+
def _set_names(self, values, validate=True):
20872099
"""
20882100
sets names on levels. WARNING: mutates!
20892101
20902102
Note that you generally want to set this *after* changing levels, so that it only
20912103
acts on copies"""
20922104
values = list(values)
2093-
if len(values) != self.nlevels:
2094-
raise ValueError('Length of names (%d) must be same as level '
2095-
'(%d)' % (len(values), self.nlevels))
2105+
if validate and len(values) != self.nlevels:
2106+
raise ValueError('Length of names must match length of levels')
20962107
# set the name
20972108
for name, level in zip(values, self.levels):
20982109
level.rename(name, inplace=True)
@@ -2446,7 +2457,7 @@ def __setstate__(self, state):
24462457
np.ndarray.__setstate__(self, nd_state)
24472458
levels, labels, sortorder, names = own_state
24482459

2449-
self._set_levels([Index(x) for x in levels])
2460+
self._set_levels([Index(x) for x in levels], validate=False)
24502461
self._set_labels(labels)
24512462
self._set_names(names)
24522463
self.sortorder = sortorder
@@ -2473,7 +2484,7 @@ def __getitem__(self, key):
24732484
new_labels = [lab[key] for lab in self.labels]
24742485

24752486
# an optimization
2476-
result._set_levels(self.levels)
2487+
result._set_levels(self.levels, validate=False)
24772488
result._set_labels(new_labels)
24782489
result.sortorder = sortorder
24792490
result._set_names(self.names)
@@ -3351,17 +3362,12 @@ def _ensure_index(index_like, copy=False):
33513362
return Index(index_like)
33523363

33533364

3354-
def _ensure_frozen(nd_array_like, copy=False):
3355-
if not isinstance(nd_array_like, FrozenNDArray):
3356-
arr = np.asarray(nd_array_like, dtype=np.int_)
3357-
# have to do this separately so that non-index input gets copied
3358-
if copy:
3359-
arr = arr.copy()
3360-
nd_array_like = arr.view(FrozenNDArray)
3361-
else:
3362-
if copy:
3363-
nd_array_like = nd_array_like.copy()
3364-
return nd_array_like
3365+
def _ensure_frozen(array_like, copy=False):
3366+
array_like = np.asanyarray(array_like, dtype=np.int_)
3367+
array_like = array_like.view(FrozenNDArray)
3368+
if copy:
3369+
array_like = array_like.copy()
3370+
return array_like
33653371

33663372

33673373
def _validate_join_method(method):

pandas/tests/test_index.py

Lines changed: 68 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -823,7 +823,12 @@ def test_constructor_corner(self):
823823

824824
# preventing casting
825825
arr = np.array([1, '2', 3, '4'], dtype=object)
826-
self.assertRaises(TypeError, Int64Index, arr)
826+
with tm.assertRaisesRegexp(TypeError, 'casting'):
827+
Int64Index(arr)
828+
829+
arr_with_floats = [0, 2, 3, 4, 5, 1.25, 3, -1]
830+
with tm.assertRaisesRegexp(TypeError, 'casting'):
831+
Int64Index(arr_with_floats)
827832

828833
def test_hash_error(self):
829834
with tm.assertRaisesRegexp(TypeError,
@@ -1240,6 +1245,62 @@ def test_set_levels_and_set_labels(self):
12401245
minor_labels = [(x + 1) % 1 for x in minor_labels]
12411246
new_labels = [major_labels, minor_labels]
12421247

1248+
def assert_matching(actual, expected):
1249+
# avoid specifying internal representation
1250+
# as much as possible
1251+
self.assertEqual(len(actual), len(expected))
1252+
for act, exp in zip(actual, expected):
1253+
act = np.asarray(act)
1254+
exp = np.asarray(exp)
1255+
assert_almost_equal(act, exp)
1256+
1257+
# level changing [w/o mutation]
1258+
ind2 = self.index.set_levels(new_levels)
1259+
assert_matching(ind2.levels, new_levels)
1260+
assert_matching(self.index.levels, levels)
1261+
1262+
# level changing [w/ mutation]
1263+
ind2 = self.index.copy()
1264+
inplace_return = ind2.set_levels(new_levels, inplace=True)
1265+
self.assert_(inplace_return is None)
1266+
assert_matching(ind2.levels, new_levels)
1267+
1268+
# label changing [w/o mutation]
1269+
ind2 = self.index.set_labels(new_labels)
1270+
assert_matching(ind2.labels, new_labels)
1271+
assert_matching(self.index.labels, labels)
1272+
1273+
# label changing [w/ mutation]
1274+
ind2 = self.index.copy()
1275+
inplace_return = ind2.set_labels(new_labels, inplace=True)
1276+
self.assert_(inplace_return is None)
1277+
assert_matching(ind2.labels, new_labels)
1278+
1279+
def test_set_levels_labels_names_bad_input(self):
1280+
levels, labels = self.index.levels, self.index.labels
1281+
names = self.index.names
1282+
1283+
with tm.assertRaisesRegexp(ValueError, 'Length of levels'):
1284+
self.index.set_levels([levels[0]])
1285+
1286+
with tm.assertRaisesRegexp(ValueError, 'Length of labels'):
1287+
self.index.set_labels([labels[0]])
1288+
1289+
with tm.assertRaisesRegexp(ValueError, 'Length of names'):
1290+
self.index.set_names([names[0]])
1291+
1292+
# shouldn't scalar data error, instead should demand list-like
1293+
with tm.assertRaisesRegexp(TypeError, 'list of lists-like'):
1294+
self.index.set_levels(levels[0])
1295+
1296+
# shouldn't scalar data error, instead should demand list-like
1297+
with tm.assertRaisesRegexp(TypeError, 'list of lists-like'):
1298+
self.index.set_labels(labels[0])
1299+
1300+
# shouldn't scalar data error, instead should demand list-like
1301+
with tm.assertRaisesRegexp(TypeError, 'list-like'):
1302+
self.index.set_names(names[0])
1303+
12431304
def test_metadata_immutable(self):
12441305
levels, labels = self.index.levels, self.index.labels
12451306
# shouldn't be able to set at either the top level or base level
@@ -1342,8 +1403,13 @@ def test_constructor_single_level(self):
13421403
self.assert_(single_level.name is None)
13431404

13441405
def test_constructor_no_levels(self):
1345-
assertRaisesRegexp(TypeError, "non-zero number of levels/labels",
1406+
assertRaisesRegexp(ValueError, "non-zero number of levels/labels",
13461407
MultiIndex, levels=[], labels=[])
1408+
both_re = re.compile('Must pass both levels and labels')
1409+
with tm.assertRaisesRegexp(TypeError, both_re):
1410+
MultiIndex(levels=[])
1411+
with tm.assertRaisesRegexp(TypeError, both_re):
1412+
MultiIndex(labels=[])
13471413

13481414
def test_constructor_mismatched_label_levels(self):
13491415
levels = [np.array([1]), np.array([2]), np.array([3])]

pandas/tests/test_multilevel.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -347,8 +347,8 @@ def test_frame_setitem_multi_column(self):
347347
# it broadcasts
348348
df['B', '1'] = [1, 2, 3]
349349
df['A'] = df['B', '1']
350-
assert_almost_equal(df['A', '1'], df['B', '1'])
351-
assert_almost_equal(df['A', '2'], df['B', '1'])
350+
assert_series_equal(df['A', '1'], df['B', '1'])
351+
assert_series_equal(df['A', '2'], df['B', '1'])
352352

353353
def test_getitem_tuple_plus_slice(self):
354354
# GH #671

pandas/util/testing.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -332,11 +332,11 @@ def equalContents(arr1, arr2):
332332
return frozenset(arr1) == frozenset(arr2)
333333

334334

335-
def assert_isinstance(obj, class_type_or_tuple):
335+
def assert_isinstance(obj, class_type_or_tuple, msg=''):
336336
"""asserts that obj is an instance of class_type_or_tuple"""
337337
assert isinstance(obj, class_type_or_tuple), (
338-
"Expected object to be of type %r, found %r instead" % (
339-
type(obj), class_type_or_tuple))
338+
"%sExpected object to be of type %r, found %r instead" % (
339+
msg, class_type_or_tuple, type(obj)))
340340

341341

342342
def assert_equal(a, b, msg=""):
@@ -355,6 +355,8 @@ def assert_equal(a, b, msg=""):
355355

356356

357357
def assert_index_equal(left, right):
358+
assert_isinstance(left, Index, '[index] ')
359+
assert_isinstance(right, Index, '[index] ')
358360
if not left.equals(right):
359361
raise AssertionError("[index] left [{0} {1}], right [{2} {3}]".format(left.dtype,
360362
left,
@@ -373,6 +375,8 @@ def isiterable(obj):
373375
return hasattr(obj, '__iter__')
374376

375377

378+
# NOTE: don't pass an NDFrame or index to this function - may not handle it
379+
# well.
376380
def assert_almost_equal(a, b, check_less_precise=False):
377381
if isinstance(a, dict) or isinstance(b, dict):
378382
return assert_dict_equal(a, b)
@@ -385,9 +389,6 @@ def assert_almost_equal(a, b, check_less_precise=False):
385389
np.testing.assert_(isiterable(b))
386390
na, nb = len(a), len(b)
387391
assert na == nb, "%s != %s" % (na, nb)
388-
# TODO: Figure out why I thought this needed instance cheacks...
389-
# if (isinstance(a, np.ndarray) and isinstance(b, np.ndarray) and
390-
# np.array_equal(a, b)):
391392
if np.array_equal(a, b):
392393
return True
393394
else:

0 commit comments

Comments
 (0)