Skip to content

Commit d57c17c

Browse files
stephenwlin authored and wesm committed
BUG: Various inconsistencies in DataFrame __getitem__ and __setitem__ behavior
1 parent ccaa428 commit d57c17c

File tree

6 files changed

+197
-156
lines changed

6 files changed

+197
-156
lines changed

pandas/core/frame.py

Lines changed: 108 additions & 94 deletions
Original file line number | Diff line number | Diff line change
@@ -26,7 +26,8 @@
2626
_default_index, _is_sequence)
2727
from pandas.core.generic import NDFrame
2828
from pandas.core.index import Index, MultiIndex, _ensure_index
29-
from pandas.core.indexing import _NDFrameIndexer, _maybe_droplevels
29+
from pandas.core.indexing import (_NDFrameIndexer, _maybe_droplevels,
30+
_is_index_slice, _check_bool_indexer)
3031
from pandas.core.internals import BlockManager, make_block, form_blocks
3132
from pandas.core.series import Series, _radd_compat, _dtype_from_scalar
3233
from pandas.compat.scipy import scoreatpercentile as _quantile
@@ -313,7 +314,7 @@ def f(self, other):
313314
return self._combine_series_infer(other, func)
314315
else:
315316

316-
# straight boolean comparisions we want to allow all columns
317+
# straight boolean comparisions we want to allow all columns
317318
# (regardless of dtype to pass thru)
318319
return self._combine_const(other, func, raise_on_error = False).fillna(True).astype(bool)
319320

@@ -1972,72 +1973,52 @@ def iget_value(self, i, j):
19721973
return self.get_value(row, col)
19731974

19741975
def __getitem__(self, key):
1975-
# slice rows
19761976
if isinstance(key, slice):
1977-
from pandas.core.indexing import _is_index_slice
1978-
idx_type = self.index.inferred_type
1979-
if idx_type == 'floating':
1980-
indexer = self.ix._convert_to_indexer(key, axis=0)
1981-
elif idx_type == 'integer' or _is_index_slice(key):
1982-
indexer = key
1983-
else:
1984-
indexer = self.ix._convert_to_indexer(key, axis=0)
1985-
new_data = self._data.get_slice(indexer, axis=1)
1986-
return self._constructor(new_data)
1987-
# either boolean or fancy integer index
1977+
# slice rows
1978+
return self._getitem_slice(key)
19881979
elif isinstance(key, (np.ndarray, list)):
1989-
if isinstance(key, list):
1990-
key = lib.list_to_object_array(key)
1991-
1992-
# also raises Exception if object array with NA values
1993-
if com._is_bool_indexer(key):
1994-
key = np.asarray(key, dtype=bool)
1980+
# either boolean or fancy integer index
19951981
return self._getitem_array(key)
1982+
elif isinstance(key, DataFrame):
1983+
return self._getitem_frame(key)
19961984
elif isinstance(self.columns, MultiIndex):
19971985
return self._getitem_multilevel(key)
1998-
elif isinstance(key, DataFrame):
1999-
if key.values.dtype == bool:
2000-
return self.where(key, try_cast = False)
2001-
else:
2002-
raise ValueError('Cannot index using non-boolean DataFrame')
20031986
else:
1987+
# get column
20041988
return self._get_item_cache(key)
20051989

1990+
def _getitem_slice(self, key):
1991+
idx_type = self.index.inferred_type
1992+
if idx_type == 'floating':
1993+
indexer = self.ix._convert_to_indexer(key, axis=0)
1994+
elif idx_type == 'integer' or _is_index_slice(key):
1995+
indexer = key
1996+
else:
1997+
indexer = self.ix._convert_to_indexer(key, axis=0)
1998+
return self._slice(indexer, axis=0)
1999+
20062000
def _getitem_array(self, key):
2007-
if key.dtype == np.bool_:
2008-
if len(key) != len(self.index):
2001+
# also raises Exception if object array with NA values
2002+
if com._is_bool_indexer(key):
2003+
# warning here just in case -- previously __setitem__ was
2004+
# reindexing but __getitem__ was not; it seems more reasonable to
2005+
# go with the __setitem__ behavior since that is more consistent
2006+
# with all other indexing behavior
2007+
if isinstance(key, Series) and not key.index.equals(self.index):
2008+
import warnings
2009+
warnings.warn("Boolean Series key will be reindexed to match "
2010+
"DataFrame index.", UserWarning)
2011+
elif len(key) != len(self.index):
20092012
raise ValueError('Item wrong length %d instead of %d!' %
20102013
(len(key), len(self.index)))
2011-
2012-
inds, = key.nonzero()
2013-
return self.take(inds)
2014-
else:
2015-
if self.columns.is_unique:
2016-
indexer = self.columns.get_indexer(key)
2017-
mask = indexer == -1
2018-
if mask.any():
2019-
raise KeyError("No column(s) named: %s" %
2020-
com.pprint_thing(key[mask]))
2021-
result = self.reindex(columns=key)
2022-
if result.columns.name is None:
2023-
result.columns.name = self.columns.name
2024-
return result
2025-
else:
2026-
mask = self.columns.isin(key)
2027-
for k in key:
2028-
if k not in self.columns:
2029-
raise KeyError("No column(s) named: %s" %
2030-
com.pprint_thing(k))
2031-
return self.take(mask.nonzero()[0], axis=1)
2032-
2033-
def _slice(self, slobj, axis=0):
2034-
if axis == 0:
2035-
mgr_axis = 1
2014+
# _check_bool_indexer will throw exception if Series key cannot
2015+
# be reindexed to match DataFrame rows
2016+
key = _check_bool_indexer(self.index, key)
2017+
indexer = key.nonzero()[0]
2018+
return self.take(indexer, axis=0)
20362019
else:
2037-
mgr_axis = 0
2038-
2039-
new_data = self._data.get_slice(slobj, axis=mgr_axis)
2040-
return self._constructor(new_data)
2020+
indexer = self.ix._convert_to_indexer(key, axis=1)
2021+
return self.take(indexer, axis=1)
20412022

20422023
def _getitem_multilevel(self, key):
20432024
loc = self.columns.get_loc(key)
@@ -2063,6 +2044,20 @@ def _getitem_multilevel(self, key):
20632044
else:
20642045
return self._get_item_cache(key)
20652046

2047+
def _getitem_frame(self, key):
2048+
if key.values.dtype != np.bool_:
2049+
raise ValueError('Must pass DataFrame with boolean values only')
2050+
return self.where(key)
2051+
2052+
def _slice(self, slobj, axis=0):
2053+
if axis == 0:
2054+
mgr_axis = 1
2055+
else:
2056+
mgr_axis = 0
2057+
2058+
new_data = self._data.get_slice(slobj, axis=mgr_axis)
2059+
return self._constructor(new_data)
2060+
20662061
def _box_item_values(self, key, values):
20672062
items = self.columns[self.columns.get_loc(key)]
20682063
if values.ndim == 2:
@@ -2096,34 +2091,56 @@ def __setattr__(self, name, value):
20962091
object.__setattr__(self, name, value)
20972092

20982093
def __setitem__(self, key, value):
2099-
# support boolean setting with DataFrame input, e.g.
2100-
# df[df > df2] = 0
2101-
if isinstance(key, DataFrame):
2102-
self._boolean_set(key, value)
2094+
if isinstance(key, slice):
2095+
# slice rows
2096+
self._setitem_slice(key, value)
21032097
elif isinstance(key, (np.ndarray, list)):
2104-
return self._set_item_multiple(key, value)
2098+
self._setitem_array(key, value)
2099+
elif isinstance(key, DataFrame):
2100+
self._setitem_frame(key, value)
21052101
else:
21062102
# set column
21072103
self._set_item(key, value)
21082104

2109-
def _boolean_set(self, key, value):
2110-
if key.values.dtype != np.bool_:
2111-
raise ValueError('Must pass DataFrame with boolean values only')
2112-
self.where(-key, value, inplace=True)
2105+
def _setitem_slice(self, key, value):
2106+
idx_type = self.index.inferred_type
2107+
if idx_type == 'floating':
2108+
indexer = self.ix._convert_to_indexer(key, axis=0)
2109+
elif idx_type == 'integer' or _is_index_slice(key):
2110+
indexer = key
2111+
else:
2112+
indexer = self.ix._convert_to_indexer(key, axis=0)
2113+
self.ix._setitem_with_indexer(indexer, value)
21132114

2114-
def _set_item_multiple(self, keys, value):
2115-
if isinstance(value, DataFrame):
2116-
if len(value.columns) != len(keys):
2117-
raise AssertionError('Columns must be same length as keys')
2118-
for k1, k2 in zip(keys, value.columns):
2119-
self[k1] = value[k2]
2115+
def _setitem_array(self, key, value):
2116+
# also raises Exception if object array with NA values
2117+
if com._is_bool_indexer(key):
2118+
if len(key) != len(self.index):
2119+
raise ValueError('Item wrong length %d instead of %d!' %
2120+
(len(key), len(self.index)))
2121+
key = _check_bool_indexer(self.index, key)
2122+
indexer = key.nonzero()[0]
2123+
self.ix._setitem_with_indexer(indexer, value)
21202124
else:
2121-
if isinstance(keys, np.ndarray) and keys.dtype == np.bool_:
2122-
# boolean slicing should happen on rows, consistent with
2123-
# behavior of getitem
2124-
self.ix[keys, :] = value
2125+
if isinstance(value, DataFrame):
2126+
if len(value.columns) != len(key):
2127+
raise AssertionError('Columns must be same length as key')
2128+
for k1, k2 in zip(key, value.columns):
2129+
self[k1] = value[k2]
21252130
else:
2126-
self.ix[:, keys] = value
2131+
indexer = self.ix._convert_to_indexer(key, axis=1)
2132+
self.ix._setitem_with_indexer((slice(None), indexer), value)
2133+
2134+
def _setitem_frame(self, key, value):
2135+
# support boolean setting with DataFrame input, e.g.
2136+
# df[df > df2] = 0
2137+
if key.values.dtype != np.bool_:
2138+
raise ValueError('Must pass DataFrame with boolean values only')
2139+
2140+
if self._is_mixed_type:
2141+
raise ValueError('Cannot do boolean setting on mixed-type frame')
2142+
2143+
self.where(-key, value, inplace=True)
21272144

21282145
def _set_item(self, key, value):
21292146
"""
@@ -2918,7 +2935,7 @@ def take(self, indices, axis=0):
29182935
"""
29192936
if isinstance(indices, list):
29202937
indices = np.array(indices)
2921-
if self._data.is_mixed_dtype():
2938+
if self._is_mixed_type:
29222939
if axis == 0:
29232940
new_data = self._data.take(indices, axis=1)
29242941
return DataFrame(new_data)
@@ -3247,7 +3264,7 @@ def sortlevel(self, level=0, axis=0, ascending=True, inplace=False):
32473264

32483265
new_axis, indexer = the_axis.sortlevel(level, ascending=ascending)
32493266

3250-
if self._data.is_mixed_dtype() and not inplace:
3267+
if self._is_mixed_type and not inplace:
32513268
if axis == 0:
32523269
return self.reindex(index=new_axis)
32533270
else:
@@ -3472,7 +3489,7 @@ def replace(self, to_replace, value=None, method='pad', axis=0,
34723489
'in length. Expecting %d got %d ' %
34733490
(len(to_replace), len(value)))
34743491

3475-
new_data = self._data.replace_list(to_replace, value,
3492+
new_data = self._data.replace_list(to_replace, value,
34763493
inplace=inplace)
34773494

34783495
else: # [NA, ''] -> 0
@@ -5055,7 +5072,7 @@ def clip(self, lower=None, upper=None):
50555072
# GH 2747 (arguments were reversed)
50565073
if lower is not None and upper is not None:
50575074
lower, upper = min(lower,upper), max(lower,upper)
5058-
5075+
50595076
return self.apply(lambda x: x.clip(lower=lower, upper=upper))
50605077

50615078
def clip_upper(self, threshold):
@@ -5246,25 +5263,22 @@ def where(self, cond, other=NA, inplace=False, try_cast=False, raise_on_error=Tr
52465263
-------
52475264
wh : DataFrame
52485265
"""
5249-
if not hasattr(cond, 'shape'):
5250-
raise ValueError('where requires an ndarray like object for its '
5251-
'condition')
5252-
5253-
if isinstance(cond, np.ndarray):
5266+
if isinstance(cond, DataFrame):
5267+
# this already checks for index/column equality
5268+
cond = cond.reindex(self.index, columns=self.columns)
5269+
else:
5270+
if not hasattr(cond, 'shape'):
5271+
raise ValueError('where requires an ndarray like object for its '
5272+
'condition')
52545273
if cond.shape != self.shape:
52555274
raise ValueError('Array conditional must be same shape as self')
52565275
cond = self._constructor(cond, index=self.index,
52575276
columns=self.columns)
52585277

5259-
if cond.shape != self.shape:
5260-
cond = cond.reindex(self.index, columns=self.columns)
5261-
5262-
if inplace:
5263-
cond = -(cond.fillna(True).astype(bool))
5264-
else:
5265-
cond = cond.fillna(False).astype(bool)
5266-
elif inplace:
5267-
cond = -cond
5278+
if inplace:
5279+
cond = -(cond.fillna(True).astype(bool))
5280+
else:
5281+
cond = cond.fillna(False).astype(bool)
52685282

52695283
if isinstance(other, DataFrame):
52705284
_, other = self.align(other, join='left', fill_value=NA)

pandas/core/indexing.py

Lines changed: 21 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -265,7 +265,7 @@ def _convert_for_reindex(self, key, axis=0):
265265

266266
if com._is_bool_indexer(key):
267267
key = _check_bool_indexer(labels, key)
268-
return labels[np.asarray(key)]
268+
return labels[key]
269269
else:
270270
if isinstance(key, Index):
271271
# want Index objects to pass through untouched
@@ -340,28 +340,19 @@ def _getitem_axis(self, key, axis=0):
340340
raise ValueError('Cannot index with multidimensional key')
341341

342342
return self._getitem_iterable(key, axis=axis)
343-
elif axis == 0:
344-
is_int_index = _is_integer_index(labels)
345-
346-
idx = key
343+
else:
347344
if com.is_integer(key):
348-
if isinstance(labels, MultiIndex):
345+
if axis == 0 and isinstance(labels, MultiIndex):
349346
try:
350-
return self._get_label(key, axis=0)
347+
return self._get_label(key, axis=axis)
351348
except (KeyError, TypeError):
352349
if _is_integer_index(self.obj.index.levels[0]):
353350
raise
354351

355-
if not is_int_index:
356-
return self._get_loc(key, axis=0)
352+
if not _is_integer_index(labels):
353+
return self._get_loc(key, axis=axis)
357354

358-
return self._get_label(idx, axis=0)
359-
else:
360-
labels = self.obj._get_axis(axis)
361-
lab = key
362-
if com.is_integer(key) and not _is_integer_index(labels):
363-
return self._get_loc(key, axis=axis)
364-
return self._get_label(lab, axis=axis)
355+
return self._get_label(key, axis=axis)
365356

366357
def _getitem_iterable(self, key, axis=0):
367358
labels = self.obj._get_axis(axis)
@@ -377,11 +368,10 @@ def _reindex(keys, level=None):
377368

378369
if com._is_bool_indexer(key):
379370
key = _check_bool_indexer(labels, key)
380-
inds, = np.asarray(key, dtype=bool).nonzero()
371+
inds, = key.nonzero()
381372
return self.obj.take(inds, axis=axis)
382373
else:
383-
was_index = isinstance(key, Index)
384-
if was_index:
374+
if isinstance(key, Index):
385375
# want Index objects to pass through untouched
386376
keyarr = key
387377
else:
@@ -489,8 +479,9 @@ def _convert_to_indexer(self, obj, axis=0):
489479

490480
elif _is_list_like(obj):
491481
if com._is_bool_indexer(obj):
492-
objarr = _check_bool_indexer(labels, obj)
493-
return objarr
482+
obj = _check_bool_indexer(labels, obj)
483+
inds, = obj.nonzero()
484+
return inds
494485
else:
495486
if isinstance(obj, Index):
496487
objarr = obj.values
@@ -672,17 +663,19 @@ def _setitem_with_indexer(self, indexer, value):
672663
def _check_bool_indexer(ax, key):
673664
# boolean indexing, need to check that the data are aligned, otherwise
674665
# disallowed
675-
result = key
676-
if _is_series(key) and key.dtype == np.bool_:
677-
if not key.index.equals(ax):
678-
result = key.reindex(ax)
679666

680-
if isinstance(result, np.ndarray) and result.dtype == np.object_:
667+
# this function assumes that com._is_bool_indexer(key) == True
668+
669+
result = key
670+
if _is_series(key) and not key.index.equals(ax):
671+
result = result.reindex(ax)
681672
mask = com.isnull(result)
682673
if mask.any():
683-
raise IndexingError('cannot index with vector containing '
684-
'NA / NaN values')
674+
raise IndexingError('Unalignable boolean Series key provided')
685675

676+
# com._is_bool_indexer has already checked for nulls in the case of an
677+
# object array key, so no check needed here
678+
result = np.asarray(result, dtype=bool)
686679
return result
687680

688681

0 commit comments

Comments
 (0)