Skip to content

remove NaN in categories checking #20372

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 16, 2018
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 6 additions & 34 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -1258,7 +1258,7 @@ def isna(self):
"""
Detect missing values

- Both missing values (-1 in .codes) and NA as a category are detected.
+ Missing values (-1 in .codes) are detected.

Returns
-------
Expand All @@ -1273,13 +1273,6 @@ def isna(self):
"""

ret = self._codes == -1

- # String/object and float categories can hold np.nan
- if self.categories.dtype.kind in ['S', 'O', 'f']:
- if np.nan in self.categories:
- nan_pos = np.where(isna(self.categories))[0]
- # we only have one NA in categories
- ret = np.logical_or(ret, self._codes == nan_pos)
return ret
isnull = isna

Expand Down Expand Up @@ -1315,16 +1308,14 @@ def dropna(self):
"""
Return the Categorical without null values.

- Both missing values (-1 in .codes) and NA as a category are detected.
- NA is removed from the categories if present.
+ Missing values (-1 in .codes) are detected.

Returns
-------
valid : Categorical
"""
result = self[self.notna()]
- if isna(result.categories).any():
- result = result.remove_categories([np.nan])

return result

def value_counts(self, dropna=True):
Expand All @@ -1336,7 +1327,7 @@ def value_counts(self, dropna=True):
Parameters
----------
dropna : boolean, default True
- Don't include counts of NaN, even if NaN is a category.
+ Don't include counts of NaN.

Returns
-------
Expand All @@ -1348,11 +1339,9 @@ def value_counts(self, dropna=True):

"""
from numpy import bincount
- from pandas import isna, Series, CategoricalIndex
+ from pandas import Series, CategoricalIndex

- obj = (self.remove_categories([np.nan]) if dropna and
- isna(self.categories).any() else self)
- code, cat = obj._codes, obj.categories
+ code, cat = self._codes, self.categories
ncat, mask = len(cat), 0 <= code
ix, clean = np.arange(ncat), mask.all()

Expand Down Expand Up @@ -1627,14 +1616,6 @@ def fillna(self, value=None, method=None, limit=None):

values = self._codes

- # Make sure that we also get NA in categories
- if self.categories.dtype.kind in ['S', 'O', 'f']:
- if np.nan in self.categories:
- values = values.copy()
- nan_pos = np.where(isna(self.categories))[0]
- # we only have one NA in categories
- values[values == nan_pos] = -1

# pad / bfill
if method is not None:

Expand Down Expand Up @@ -1888,15 +1869,6 @@ def __setitem__(self, key, value):
key = np.asarray(key)

lindexer = self.categories.get_indexer(rvalue)

- # FIXME: the following can be removed after GH7820 is fixed:
- # https://github.com/pandas-dev/pandas/issues/7820
- # float categories do currently return -1 for np.nan, even if np.nan is
- # included in the index -> "repair" this here
- if isna(rvalue).any() and isna(self.categories).any():
- nan_pos = np.where(isna(self.categories))[0]
- lindexer[lindexer == -1] = nan_pos

lindexer = self._maybe_coerce_indexer(lindexer)
self._codes[key] = lindexer

Expand Down