Skip to content

REF: share Categorical.fillna with NDArrayBackedExtensionArray #40383

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 15, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion pandas/core/arrays/_mixins.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,9 @@ def __getitem__(
def fillna(
self: NDArrayBackedExtensionArrayT, value=None, method=None, limit=None
) -> NDArrayBackedExtensionArrayT:
value, method = validate_fillna_kwargs(value, method)
value, method = validate_fillna_kwargs(
value, method, validate_scalar_dict_value=False
)

mask = self.isna()
value = missing.check_value_size(value, mask, len(self))
Expand All @@ -291,6 +293,10 @@ def fillna(
new_values = self.copy()
new_values[mask] = value
else:
# We validate the fill_value even if there is nothing to fill
if value is not None:
self._validate_setitem_value(value)

new_values = self.copy()
return new_values

Expand Down
67 changes: 1 addition & 66 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,7 @@
cache_readonly,
deprecate_kwarg,
)
from pandas.util._validators import (
validate_bool_kwarg,
validate_fillna_kwargs,
)
from pandas.util._validators import validate_bool_kwarg

from pandas.core.dtypes.cast import (
coerce_indexer_dtype,
Expand Down Expand Up @@ -102,7 +99,6 @@
sanitize_array,
)
from pandas.core.indexers import deprecate_ndim_indexing
from pandas.core.missing import interpolate_2d
from pandas.core.ops.common import unpack_zerodim_and_defer
from pandas.core.sorting import nargsort
from pandas.core.strings.object_array import ObjectStringArrayMixin
Expand Down Expand Up @@ -1730,67 +1726,6 @@ def to_dense(self):
)
return np.asarray(self)

def fillna(self, value=None, method=None, limit=None):
"""
Fill NA/NaN values using the specified method.

Parameters
----------
value : scalar, dict, Series
If a scalar value is passed it is used to fill all missing values.
Alternatively, a Series or dict can be used to fill in different
values for each index. The value should not be a list. The
value(s) passed should either be in the categories or should be
NaN.
method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
Method to use for filling holes in reindexed Series
pad / ffill: propagate last valid observation forward to next valid
backfill / bfill: use NEXT valid observation to fill gap
limit : int, default None
(Not implemented yet for Categorical!)
If method is specified, this is the maximum number of consecutive
NaN values to forward/backward fill. In other words, if there is
a gap with more than this number of consecutive NaNs, it will only
be partially filled. If method is not specified, this is the
maximum number of entries along the entire axis where NaNs will be
filled.

Returns
-------
filled : Categorical with NA/NaN filled
"""
value, method = validate_fillna_kwargs(
value, method, validate_scalar_dict_value=False
)
value = extract_array(value, extract_numpy=True)

if value is None:
value = np.nan
if limit is not None:
raise NotImplementedError(
"specifying a limit for fillna has not been implemented yet"
)

if method is not None:
# pad / bfill

# TODO: dispatch when self.categories is EA-dtype
values = np.asarray(self).reshape(-1, len(self))
values = interpolate_2d(values, method, 0, None).astype(
self.categories.dtype
)[0]
codes = _get_codes_for_values(values, self.categories)

else:
# We copy even if there is nothing to fill
codes = self._ndarray.copy()
mask = self.isna()

new_codes = self._validate_setitem_value(value)
np.putmask(codes, mask, new_codes)

return self._from_backing_data(codes)

# ------------------------------------------------------------------
# NDArrayBackedExtensionArray compat

Expand Down
10 changes: 9 additions & 1 deletion pandas/core/indexes/category.py
Original file line number Diff line number Diff line change
Expand Up @@ -369,7 +369,15 @@ def __contains__(self, key: Any) -> bool:
@doc(Index.fillna)
def fillna(self, value, downcast=None):
value = self._require_scalar(value)
cat = self._data.fillna(value)
try:
cat = self._data.fillna(value)
except (ValueError, TypeError):
# invalid fill_value
if not self.isna().any():
# nothing to fill, we can get away without casting
return self.copy()
return self.astype(object).fillna(value, downcast=downcast)

return type(self)._simple_new(cat, name=self.name)

@doc(Index.unique)
Expand Down
10 changes: 0 additions & 10 deletions pandas/tests/extension/test_numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,16 +304,6 @@ class TestBooleanReduce(BaseNumPyTests, base.BaseBooleanReduceTests):


class TestMissing(BaseNumPyTests, base.BaseMissingTests):
@skip_nested
def test_fillna_scalar(self, data_missing):
# Non-scalar "scalar" values.
super().test_fillna_scalar(data_missing)

@skip_nested
def test_fillna_no_op_returns_copy(self, data):
# Non-scalar "scalar" values.
super().test_fillna_no_op_returns_copy(data)

@skip_nested
def test_fillna_series(self, data_missing):
# Non-scalar "scalar" values.
Expand Down
15 changes: 11 additions & 4 deletions pandas/tests/indexes/categorical/test_fillna.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,16 @@ def test_fillna_categorical(self):
exp = CategoricalIndex([1.0, 1.0, 3.0, 1.0], name="x")
tm.assert_index_equal(idx.fillna(1.0), exp)

# fill by value not in categories raises ValueError
cat = idx._data

# fill by value not in categories raises ValueError on EA, casts on CI
msg = "Cannot setitem on a Categorical with a new category"
with pytest.raises(ValueError, match=msg):
idx.fillna(2.0)
cat.fillna(2.0)

result = idx.fillna(2.0)
expected = idx.astype(object).fillna(2.0)
tm.assert_index_equal(result, expected)

def test_fillna_copies_with_no_nas(self):
# Nothing to fill, should still get a copy
Expand All @@ -37,8 +43,9 @@ def test_fillna_validates_with_no_nas(self):
cat = ci._data

msg = "Cannot setitem on a Categorical with a new category"
with pytest.raises(ValueError, match=msg):
ci.fillna(False)
res = ci.fillna(False)
# nothing to fill, so we dont cast
tm.assert_index_equal(res, ci)

# Same check directly on the Categorical
with pytest.raises(ValueError, match=msg):
Expand Down
6 changes: 4 additions & 2 deletions pandas/tests/series/methods/test_fillna.py
Original file line number Diff line number Diff line change
Expand Up @@ -671,13 +671,15 @@ def test_fillna_categorical_with_new_categories(self, fill_value, expected_outpu
def test_fillna_categorical_raises(self):
data = ["a", np.nan, "b", np.nan, np.nan]
ser = Series(Categorical(data, categories=["a", "b"]))
cat = ser._values

msg = "Cannot setitem on a Categorical with a new category"
with pytest.raises(ValueError, match=msg):
ser.fillna("d")

with pytest.raises(ValueError, match=msg):
ser.fillna(Series("d"))
msg2 = "Length of 'value' does not match."
with pytest.raises(ValueError, match=msg2):
cat.fillna(Series("d"))

with pytest.raises(ValueError, match=msg):
ser.fillna({1: "d", 3: "a"})
Expand Down