Skip to content

REF: de-duplicate Categorical _validate_foo_value #41919

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions doc/source/user_guide/categorical.rst
Original file line number Diff line number Diff line change
Expand Up @@ -777,8 +777,8 @@ value is included in the ``categories``:
df
try:
df.iloc[2:4, :] = [["c", 3], ["c", 3]]
except ValueError as e:
print("ValueError:", str(e))
except TypeError as e:
print("TypeError:", str(e))

Setting values by assigning categorical data will also check that the ``categories`` match:

Expand All @@ -788,8 +788,8 @@ Setting values by assigning categorical data will also check that the ``categori
df
try:
df.loc["j":"k", "cats"] = pd.Categorical(["b", "b"], categories=["a", "b", "c"])
except ValueError as e:
print("ValueError:", str(e))
except TypeError as e:
print("TypeError:", str(e))

Assigning a ``Categorical`` to parts of a column of other types will use the values:

Expand Down
6 changes: 5 additions & 1 deletion doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,11 @@ Bug fixes

Categorical
^^^^^^^^^^^
-
- Bug in setting dtype-incompatible values into a :class:`Categorical` (or ``Series`` or ``DataFrame`` backed by ``Categorical``) raising ``ValueError`` instead of ``TypeError`` (:issue:`41919`)
- Bug in :meth:`Categorical.searchsorted` when passing a dtype-incompatible value raising ``KeyError`` instead of ``TypeError`` (:issue:`41919`)
- Bug in :meth:`Series.where` with ``CategoricalDtype`` when passing a dtype-incompatible value raising ``ValueError`` instead of ``TypeError`` (:issue:`41919`)
- Bug in :meth:`Categorical.fillna` when passing a dtype-incompatible value raising ``ValueError`` instead of ``TypeError`` (:issue:`41919`)
- Bug in :meth:`Categorical.fillna` with a tuple-like category raising ``ValueError`` instead of ``TypeError`` when filling with a non-category tuple (:issue:`41919`)
-

Datetimelike
Expand Down
35 changes: 15 additions & 20 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -1393,17 +1393,14 @@ def map(self, mapper):
# -------------------------------------------------------------
# Validators; ideally these can be de-duplicated

def _validate_searchsorted_value(self, value):
# searchsorted is very performance sensitive. By converting codes
# to same dtype as self.codes, we get much faster performance.
if is_scalar(value):
codes = self._unbox_scalar(value)
def _validate_setitem_value(self, value):
if not is_hashable(value):
# unhashable values are treated as listlikes; anything hashable is validated as a scalar
return self._validate_listlike(value)
else:
locs = [self.categories.get_loc(x) for x in value]
# error: Incompatible types in assignment (expression has type
# "ndarray", variable has type "int")
codes = np.array(locs, dtype=self.codes.dtype) # type: ignore[assignment]
return codes
return self._validate_scalar(value)

_validate_searchsorted_value = _validate_setitem_value

def _validate_scalar(self, fill_value):
"""
Expand All @@ -1429,8 +1426,8 @@ def _validate_scalar(self, fill_value):
fill_value = self._unbox_scalar(fill_value)
else:
raise TypeError(
f"'fill_value={fill_value}' is not present "
"in this Categorical's categories"
"Cannot setitem on a Categorical with a new "
f"category ({fill_value}), set the categories first"
)
return fill_value

Expand Down Expand Up @@ -2015,37 +2012,35 @@ def __getitem__(self, key):
deprecate_ndim_indexing(result)
return result

def _validate_setitem_value(self, value):
def _validate_listlike(self, value):
# NB: here we assume scalar-like tuples have already been excluded
value = extract_array(value, extract_numpy=True)

# require identical categories set
if isinstance(value, Categorical):
if not is_dtype_equal(self.dtype, value.dtype):
raise ValueError(
raise TypeError(
"Cannot set a Categorical with another, "
"without identical categories"
)
# is_dtype_equal implies categories_match_up_to_permutation
value = self._encode_with_my_categories(value)
return value._codes

# wrap scalars and hashable-listlikes in list
rvalue = value if not is_hashable(value) else [value]

from pandas import Index

# tupleize_cols=False for e.g. test_fillna_iterable_category GH#41914
to_add = Index(rvalue, tupleize_cols=False).difference(self.categories)
to_add = Index(value, tupleize_cols=False).difference(self.categories)

# no assignments of values not in categories, but it's always ok to set
# something to np.nan
if len(to_add) and not isna(to_add).all():
raise ValueError(
raise TypeError(
"Cannot setitem on a Categorical with a new "
"category, set the categories first"
)

codes = self.categories.get_indexer(rvalue)
codes = self.categories.get_indexer(value)
return codes.astype(self._ndarray.dtype, copy=False)

def _reverse_indexer(self) -> dict[Hashable, np.ndarray]:
Expand Down
4 changes: 4 additions & 0 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -1627,6 +1627,10 @@ def where(self, other, cond, errors="raise") -> list[Block]:
# NotImplementedError for class not implementing `__setitem__`
# TypeError for SparseArray, which implements just to raise
# a TypeError
if isinstance(result, Categorical):
# TODO: don't special-case
raise

result = type(self.values)._from_sequence(
np.where(cond, self.values, other), dtype=dtype
)
Expand Down
12 changes: 8 additions & 4 deletions pandas/tests/arrays/categorical/test_analytics.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,15 +186,19 @@ def test_searchsorted(self, ordered):
tm.assert_numpy_array_equal(res_ser, exp)

# Searching for a single value that is not from the Categorical
with pytest.raises(KeyError, match="cucumber"):
with pytest.raises(TypeError, match="cucumber"):
cat.searchsorted("cucumber")
with pytest.raises(KeyError, match="cucumber"):
with pytest.raises(TypeError, match="cucumber"):
ser.searchsorted("cucumber")

# Searching for multiple values, one of which is not from the Categorical
with pytest.raises(KeyError, match="cucumber"):
msg = (
"Cannot setitem on a Categorical with a new category, "
"set the categories first"
)
with pytest.raises(TypeError, match=msg):
cat.searchsorted(["bread", "cucumber"])
with pytest.raises(KeyError, match="cucumber"):
with pytest.raises(TypeError, match=msg):
ser.searchsorted(["bread", "cucumber"])

def test_unique(self, ordered):
Expand Down
8 changes: 4 additions & 4 deletions pandas/tests/arrays/categorical/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def test_setitem_different_unordered_raises(self, other):
target = Categorical(["a", "b"], categories=["a", "b"])
mask = np.array([True, False])
msg = "Cannot set a Categorical with another, without identical categories"
with pytest.raises(ValueError, match=msg):
with pytest.raises(TypeError, match=msg):
target[mask] = other[mask]

@pytest.mark.parametrize(
Expand All @@ -89,7 +89,7 @@ def test_setitem_same_ordered_raises(self, other):
target = Categorical(["a", "b"], categories=["a", "b"], ordered=True)
mask = np.array([True, False])
msg = "Cannot set a Categorical with another, without identical categories"
with pytest.raises(ValueError, match=msg):
with pytest.raises(TypeError, match=msg):
target[mask] = other[mask]

def test_setitem_tuple(self):
Expand Down Expand Up @@ -260,7 +260,7 @@ def test_where_other_categorical(self):
def test_where_new_category_raises(self):
ser = Series(Categorical(["a", "b", "c"]))
msg = "Cannot setitem on a Categorical with a new category"
with pytest.raises(ValueError, match=msg):
with pytest.raises(TypeError, match=msg):
ser.where([True, False, True], "d")

def test_where_ordered_differs_rasies(self):
Expand All @@ -270,7 +270,7 @@ def test_where_ordered_differs_rasies(self):
other = Categorical(
["b", "c", "a"], categories=["a", "c", "b", "d"], ordered=True
)
with pytest.raises(ValueError, match="without identical categories"):
with pytest.raises(TypeError, match="without identical categories"):
ser.where([True, False, True], other)


Expand Down
9 changes: 7 additions & 2 deletions pandas/tests/arrays/categorical/test_missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,12 @@ def test_fillna_raises(self, fillna_kwargs, msg):
# https://github.com/pandas-dev/pandas/issues/13628
cat = Categorical([1, 2, 3, None, None])

with pytest.raises(ValueError, match=msg):
if len(fillna_kwargs) == 1 and "value" in fillna_kwargs:
err = TypeError
else:
err = ValueError

with pytest.raises(err, match=msg):
cat.fillna(**fillna_kwargs)

@pytest.mark.parametrize("named", [True, False])
Expand All @@ -104,7 +109,7 @@ def test_fillna_iterable_category(self, named):
# not NotImplementedError GH#41914
cat = Categorical(np.array([Point(1, 0), Point(0, 1), None], dtype=object))
msg = "Cannot setitem on a Categorical with a new category"
with pytest.raises(ValueError, match=msg):
with pytest.raises(TypeError, match=msg):
cat.fillna(Point(0, 0))

def test_fillna_array(self):
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/arrays/categorical/test_take.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def test_take_fill_value(self):
def test_take_fill_value_new_raises(self):
# https://github.com/pandas-dev/pandas/issues/23296
cat = Categorical(["a", "b", "c"])
xpr = r"'fill_value=d' is not present in this Categorical's categories"
xpr = r"Cannot setitem on a Categorical with a new category \(d\)"
with pytest.raises(TypeError, match=xpr):
cat.take([0, 1, -1], fill_value="d", allow_fill=True)

Expand Down
14 changes: 7 additions & 7 deletions pandas/tests/frame/indexing/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1283,7 +1283,7 @@ def test_object_casting_indexing_wraps_datetimelike(using_array_manager):
assert isinstance(val, pd.Timedelta)


msg1 = "Cannot setitem on a Categorical with a new category, set the categories first"
msg1 = r"Cannot setitem on a Categorical with a new category( \(.*\))?, set the"
msg2 = "Cannot set a Categorical with another, without identical categories"


Expand Down Expand Up @@ -1348,7 +1348,7 @@ def test_loc_iloc_setitem_list_of_lists(self, orig, exp_multi_row, indexer):
tm.assert_frame_equal(df, exp_multi_row)

df = orig.copy()
with pytest.raises(ValueError, match=msg1):
with pytest.raises(TypeError, match=msg1):
indexer(df)[key, :] = [["c", 2], ["c", 2]]

@pytest.mark.parametrize("indexer", [tm.loc, tm.iloc, tm.at, tm.iat])
Expand All @@ -1367,7 +1367,7 @@ def test_loc_iloc_at_iat_setitem_single_value_in_categories(
tm.assert_frame_equal(df, exp_single_cats_value)

# "c" is not among the categories for df["cat"]
with pytest.raises(ValueError, match=msg1):
with pytest.raises(TypeError, match=msg1):
indexer(df)[key] = "c"

@pytest.mark.parametrize("indexer", [tm.loc, tm.iloc])
Expand Down Expand Up @@ -1401,7 +1401,7 @@ def test_loc_iloc_setitem_full_row_non_categorical_rhs(
tm.assert_frame_equal(df, exp_single_row)

# "c" is not among the categories for df["cat"]
with pytest.raises(ValueError, match=msg1):
with pytest.raises(TypeError, match=msg1):
indexer(df)[key, :] = ["c", 2]

@pytest.mark.parametrize("indexer", [tm.loc, tm.iloc])
Expand All @@ -1423,14 +1423,14 @@ def test_loc_iloc_setitem_partial_col_categorical_rhs(

# categories do not match df["cat"]'s, but "b" is among them
semi_compat = Categorical(list("bb"), categories=list("abc"))
with pytest.raises(ValueError, match=msg2):
with pytest.raises(TypeError, match=msg2):
# different categories but holdable values
# -> not sure if this should fail or pass
indexer(df)[key] = semi_compat

# categories do not match df["cat"]'s, and "c" is not among them
incompat = Categorical(list("cc"), categories=list("abc"))
with pytest.raises(ValueError, match=msg2):
with pytest.raises(TypeError, match=msg2):
# different values
indexer(df)[key] = incompat

Expand All @@ -1450,5 +1450,5 @@ def test_loc_iloc_setitem_non_categorical_rhs(
tm.assert_frame_equal(df, exp_parts_cats_col)

# "c" not part of the categories
with pytest.raises(ValueError, match=msg1):
with pytest.raises(TypeError, match=msg1):
indexer(df)[key] = ["c", "c"]
2 changes: 1 addition & 1 deletion pandas/tests/frame/methods/test_fillna.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ def test_na_actions_categorical(self):
tm.assert_frame_equal(res, df_exp_fill)

msg = "Cannot setitem on a Categorical with a new category"
with pytest.raises(ValueError, match=msg):
with pytest.raises(TypeError, match=msg):
df.fillna(value={"cats": 4, "vals": "c"})

res = df.fillna(method="pad")
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/test_stack_unstack.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,7 @@ def test_unstack_fill_frame_categorical(self):
tm.assert_frame_equal(result, expected)

# Filling with a non-category value raises a TypeError
msg = r"'fill_value=d' is not present in"
msg = r"Cannot setitem on a Categorical with a new category \(d\)"
with pytest.raises(TypeError, match=msg):
data.unstack(fill_value="d")

Expand Down
6 changes: 3 additions & 3 deletions pandas/tests/indexes/categorical/test_fillna.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@ def test_fillna_categorical(self):

cat = idx._data

# fill by value not in categories raises ValueError on EA, casts on CI
# fill by value not in categories raises TypeError on EA, casts on CI
msg = "Cannot setitem on a Categorical with a new category"
with pytest.raises(ValueError, match=msg):
with pytest.raises(TypeError, match=msg):
cat.fillna(2.0)

result = idx.fillna(2.0)
Expand Down Expand Up @@ -48,5 +48,5 @@ def test_fillna_validates_with_no_nas(self):
tm.assert_index_equal(res, ci)

# Same check directly on the Categorical
with pytest.raises(ValueError, match=msg):
with pytest.raises(TypeError, match=msg):
cat.fillna(False)
2 changes: 1 addition & 1 deletion pandas/tests/indexes/categorical/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,7 +322,7 @@ def test_where_non_categories(self):
tm.assert_index_equal(result, expected)

msg = "Cannot setitem on a Categorical with a new category"
with pytest.raises(ValueError, match=msg):
with pytest.raises(TypeError, match=msg):
# Test the Categorical method directly
ci._data.where(mask, 2)

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/indexes/categorical/test_reindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def test_reindex_empty_index(self):
def test_reindex_missing_category(self):
# GH: 18185
ser = Series([1, 2, 3, 1], dtype="category")
msg = "'fill_value=-1' is not present in this Categorical's categories"
msg = r"Cannot setitem on a Categorical with a new category \(-1\)"
with pytest.raises(TypeError, match=msg):
ser.reindex([1, 2, 3, 4, 5], fill_value=-1)

Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/series/methods/test_fillna.py
Original file line number Diff line number Diff line change
Expand Up @@ -677,14 +677,14 @@ def test_fillna_categorical_raises(self):
cat = ser._values

msg = "Cannot setitem on a Categorical with a new category"
with pytest.raises(ValueError, match=msg):
with pytest.raises(TypeError, match=msg):
ser.fillna("d")

msg2 = "Length of 'value' does not match."
with pytest.raises(ValueError, match=msg2):
cat.fillna(Series("d"))

with pytest.raises(ValueError, match=msg):
with pytest.raises(TypeError, match=msg):
ser.fillna({1: "d", 3: "a"})

msg = '"value" parameter must be a scalar or dict, but you passed a "list"'
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/series/methods/test_shift.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ def test_shift_categorical_fill_value(self):
tm.assert_equal(res, expected)

# check for incorrect fill_value
msg = "'fill_value=f' is not present in this Categorical's categories"
msg = r"Cannot setitem on a Categorical with a new category \(f\)"
with pytest.raises(TypeError, match=msg):
ts.shift(1, fill_value="f")

Expand Down