diff --git a/doc/source/release.rst b/doc/source/release.rst
index daee460fc50a1..66c3dcd203a6a 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -497,6 +497,7 @@ Bug Fixes
   - Fixed wrong index name during read_csv if using usecols. Applies to c parser only. (:issue:`4201`)
   - ``Timestamp`` objects can now appear in the left hand side of a comparison
     operation with a ``Series`` or ``DataFrame`` object (:issue:`4982`).
+  - Fix a bug when indexing with ``np.nan`` via ``iloc/loc`` (:issue:`5016`)
 
 pandas 0.12.0
 -------------
diff --git a/pandas/core/index.py b/pandas/core/index.py
index d488a29182a18..63bda40932647 100644
--- a/pandas/core/index.py
+++ b/pandas/core/index.py
@@ -424,7 +424,7 @@ def _convert_scalar_indexer(self, key, typ=None):
         def to_int():
             ikey = int(key)
             if ikey != key:
-                self._convert_indexer_error(key, 'label')
+                return self._convert_indexer_error(key, 'label')
             return ikey
 
         if typ == 'iloc':
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
index afbeb53d857e2..eb377c4b7955f 100644
--- a/pandas/core/indexing.py
+++ b/pandas/core/indexing.py
@@ -1,12 +1,12 @@
 # pylint: disable=W0223
 
 from datetime import datetime
-from pandas.core.common import _asarray_tuplesafe, is_list_like
 from pandas.core.index import Index, MultiIndex, _ensure_index
 from pandas.compat import range, zip
 import pandas.compat as compat
 import pandas.core.common as com
 from pandas.core.common import (_is_bool_indexer, is_integer_dtype,
+                                _asarray_tuplesafe, is_list_like, isnull,
                                 ABCSeries, ABCDataFrame, ABCPanel)
 import pandas.lib as lib
 
@@ -979,12 +979,20 @@ def _has_valid_type(self, key, axis):
         else:
 
             def error():
+                if isnull(key):
+                    raise ValueError("cannot use label indexing with a null key")
                 raise KeyError("the label [%s] is not in the [%s]" %
                                (key,self.obj._get_axis_name(axis)))
 
-            key = self._convert_scalar_indexer(key, axis)
             try:
+                key = self._convert_scalar_indexer(key, axis)
                 if not key in ax:
                     error()
+            except (TypeError) as e:
+
+                # python 3 type errors should be raised
+                if 'unorderable' in str(e):  # pragma: no cover
+                    error()
+                raise
             except:
                 error()
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index 8fcb64e6d0eda..f10e1612f7fe9 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -97,8 +97,13 @@ def ref_locs(self):
             indexer = self.ref_items.get_indexer(self.items)
             indexer = com._ensure_platform_int(indexer)
             if (indexer == -1).any():
-                raise AssertionError('Some block items were not in block '
-                                     'ref_items')
+
+                # this means that we have nan's in our block
+                try:
+                    indexer[indexer == -1] = np.arange(len(self.items))[isnull(self.items)]
+                except:
+                    raise AssertionError('Some block items were not in block '
+                                         'ref_items')
 
             self._ref_locs = indexer
         return self._ref_locs
@@ -2500,9 +2505,18 @@ def _consolidate_inplace(self):
 
     def get(self, item):
         if self.items.is_unique:
+
+            if isnull(item):
+                indexer = np.arange(len(self.items))[isnull(self.items)]
+                return self.get_for_nan_indexer(indexer)
+
             _, block = self._find_block(item)
             return block.get(item)
         else:
+
+            if isnull(item):
+                raise ValueError("cannot label index with a null key")
+
             indexer = self.items.get_loc(item)
             ref_locs = np.array(self._set_ref_locs())
 
@@ -2528,14 +2542,31 @@ def get(self, item):
 
     def iget(self, i):
         item = self.items[i]
+
+        # unique
         if self.items.is_unique:
-            return self.get(item)
+            if notnull(item):
+                return self.get(item)
+            return self.get_for_nan_indexer(i)
 
-        # compute the duplicative indexer if needed
         ref_locs = self._set_ref_locs()
         b, loc = ref_locs[i]
         return b.iget(loc)
 
+    def get_for_nan_indexer(self, indexer):
+
+        # allow a single nan location indexer
+        if not np.isscalar(indexer):
+            if len(indexer) == 1:
+                indexer = indexer.item()
+            else:
+                raise ValueError("cannot label index with a null key")
+
+        # take a nan indexer and return the values
+        ref_locs = self._set_ref_locs(do_refs='force')
+        b, loc = ref_locs[indexer]
+        return b.iget(loc)
+
     def get_scalar(self, tup):
         """
         Retrieve single item
diff --git a/pandas/core/series.py b/pandas/core/series.py
index d9e9a0034b56b..77c777042ab5f 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -1053,10 +1053,10 @@ def __setitem__(self, key, value):
         except TypeError as e:
             if isinstance(key, tuple) and not isinstance(self.index, MultiIndex):
                 raise ValueError("Can only tuple-index with a MultiIndex")
+
             # python 3 type errors should be raised
             if 'unorderable' in str(e):  # pragma: no cover
                 raise IndexError(key)
-            # Could not hash item
 
         if _is_bool_indexer(key):
             key = _check_bool_indexer(self.index, key)
diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx
index 164fc8c94924e..1b132ea91f515 100644
--- a/pandas/hashtable.pyx
+++ b/pandas/hashtable.pyx
@@ -643,6 +643,8 @@ cdef class Float64HashTable(HashTable):
 
         return uniques.to_array()
 
+na_sentinel = object
+
 cdef class PyObjectHashTable(HashTable):
     # cdef kh_pymap_t *table
 
@@ -660,6 +662,8 @@ cdef class PyObjectHashTable(HashTable):
     def __contains__(self, object key):
         cdef khiter_t k
         hash(key)
+        if key != key or key is None:
+            key = na_sentinel
         k = kh_get_pymap(self.table, key)
         return k != self.table.n_buckets
 
@@ -669,6 +673,8 @@ cdef class PyObjectHashTable(HashTable):
 
     cpdef get_item(self, object val):
         cdef khiter_t k
+        if val != val or val is None:
+            val = na_sentinel
         k = kh_get_pymap(self.table, val)
         if k != self.table.n_buckets:
             return self.table.vals[k]
@@ -677,6 +683,8 @@ cdef class PyObjectHashTable(HashTable):
 
     def get_iter_test(self, object key, Py_ssize_t iterations):
         cdef Py_ssize_t i, val
+        if key != key or key is None:
+            key = na_sentinel
         for i in range(iterations):
             k = kh_get_pymap(self.table, key)
             if k != self.table.n_buckets:
@@ -689,6 +697,8 @@ cdef class PyObjectHashTable(HashTable):
             char* buf
 
         hash(key)
+        if key != key or key is None:
+            key = na_sentinel
         k = kh_put_pymap(self.table, key, &ret)
         # self.table.keys[k] = key
         if kh_exist_pymap(self.table, k):
@@ -706,6 +716,9 @@ cdef class PyObjectHashTable(HashTable):
         for i in range(n):
             val = values[i]
             hash(val)
+            if val != val or val is None:
+                val = na_sentinel
+
             k = kh_put_pymap(self.table, val, &ret)
             self.table.vals[k] = i
 
@@ -720,6 +733,9 @@ cdef class PyObjectHashTable(HashTable):
         for i in range(n):
             val = values[i]
             hash(val)
+            if val != val or val is None:
+                val = na_sentinel
+
             k = kh_get_pymap(self.table, val)
             if k != self.table.n_buckets:
                 locs[i] = self.table.vals[k]
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
index e5d2bb17ec7a8..eeb2c34ea9394 100644
--- a/pandas/tests/test_frame.py
+++ b/pandas/tests/test_frame.py
@@ -642,6 +642,8 @@ def test_setitem_clear_caches(self):
     def test_setitem_None(self):
         # GH #766
         self.frame[None] = self.frame['A']
+        assert_series_equal(self.frame.iloc[:,-1], self.frame['A'])
+        assert_series_equal(self.frame.loc[:,None], self.frame['A'])
         assert_series_equal(self.frame[None], self.frame['A'])
         repr(self.frame)
 
@@ -4475,6 +4477,41 @@ def test_constructor_lists_to_object_dtype(self):
         self.assert_(d['a'].dtype == np.object_)
         self.assert_(d['a'][1] is False)
 
+    def test_constructor_with_nas(self):
+        # GH 5016
+        # na's in indices
+
+        def check(df):
+            for i in range(len(df.columns)):
+                df.iloc[:,i]
+
+            # allow single nans to succeed
+            indexer = np.arange(len(df.columns))[isnull(df.columns)]
+
+            if len(indexer) == 1:
+                assert_series_equal(df.iloc[:,indexer[0]],df.loc[:,np.nan])
+
+
+            # multiple nans should fail
+            else:
+
+                def f():
+                    df.loc[:,np.nan]
+                self.assertRaises(ValueError, f)
+
+
+        df = DataFrame([[1,2,3],[4,5,6]], index=[1,np.nan])
+        check(df)
+
+        df = DataFrame([[1,2,3],[4,5,6]], columns=[1.1,2.2,np.nan])
+        check(df)
+
+        df = DataFrame([[0,1,2,3],[4,5,6,7]], columns=[np.nan,1.1,2.2,np.nan])
+        check(df)
+
+        df = DataFrame([[0.0,1,2,3.0],[4,5,6,7]], columns=[np.nan,1.1,2.2,np.nan])
+        check(df)
+
     def test_logical_with_nas(self):
         d = DataFrame({'a': [np.nan, False], 'b': [True, True]})
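
Illustration (not part of the patch): a minimal sketch of the behaviour that the new test_constructor_with_nas exercises, assuming a pandas checkout with this change applied; it uses only the public DataFrame/.iloc/.loc API and the same frames as the test cases above.

    import numpy as np
    from pandas import DataFrame

    # A single NaN column label: selecting by position (.iloc) and by the
    # NaN label (.loc) now return the same column instead of raising.
    df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=[1.1, 2.2, np.nan])
    print(df.iloc[:, 2])      # third column, by position
    print(df.loc[:, np.nan])  # the same column, by its NaN label

    # With more than one NaN column label the lookup is ambiguous, and the
    # patched BlockManager.get raises ValueError
    # ("cannot label index with a null key").
    df2 = DataFrame([[0, 1, 2, 3], [4, 5, 6, 7]],
                    columns=[np.nan, 1.1, 2.2, np.nan])
    try:
        df2.loc[:, np.nan]
    except ValueError as e:
        print("ValueError:", e)

The test frame built with index=[1, np.nan] covers the remaining case: construction and positional column access must keep working when the NaN sits in the row index rather than in the columns.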