From 360159ad156f2ac9d7d9d0439b89f4254b185bf7 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Mon, 27 Feb 2023 10:55:20 -0500 Subject: [PATCH 1/4] prevent error when comparing misordered categories --- pandas/core/arrays/categorical.py | 15 +++++++++++++-- pandas/tests/arrays/categorical/test_operators.py | 6 +++--- pandas/tests/indexes/categorical/test_equals.py | 8 +++++++- 3 files changed, 23 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 02ee2eb4a80ce..0f0d2477b1b03 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -139,7 +139,7 @@ def func(self, other): # Two Categoricals can only be compared if the categories are # the same (maybe up to ordering, depending on ordered) - msg = "Categoricals can only be compared if 'categories' are the same." + msg = "Categoricals can only be compared if 'categories' and 'ordered' are the same." if not self._categories_match_up_to_permutation(other): raise TypeError(msg) @@ -2177,7 +2177,18 @@ def _categories_match_up_to_permutation(self, other: Categorical) -> bool: ------- bool """ - return hash(self.dtype) == hash(other.dtype) + try: + if not hasattr(other, "categories"): + other = other.dtype + if set(self.categories) != set(other.categories): + return False + if self.ordered != other.ordered: + return False + if self.ordered and not (self.categories == other.categories).all(): + return False + except AttributeError: + return False + return True def describe(self) -> DataFrame: """ diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index bfe65f8bf3c29..a564c54ffd27d 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -76,7 +76,7 @@ def test_comparisons(self, factor): tm.assert_numpy_array_equal(res, exp) # Only categories with same categories can be compared - msg = "Categoricals can only be compared if 'categories' are the same" + msg = "Categoricals can only be compared if 'categories' and 'ordered' are the same" with pytest.raises(TypeError, match=msg): cat > cat_rev @@ -267,7 +267,7 @@ def test_comparisons(self, data, reverse, base): tm.assert_numpy_array_equal(res_rev.values, exp_rev2) # Only categories with same categories can be compared - msg = "Categoricals can only be compared if 'categories' are the same" + msg = "Categoricals can only be compared if 'categories' and 'ordered' are the same" with pytest.raises(TypeError, match=msg): cat > cat_rev @@ -333,7 +333,7 @@ def test_compare_different_lengths(self): c1 = Categorical([], categories=["a", "b"]) c2 = Categorical([], categories=["a"]) - msg = "Categoricals can only be compared if 'categories' are the same." + msg = "Categoricals can only be compared if 'categories' and 'ordered' are the same." with pytest.raises(TypeError, match=msg): c1 == c2 diff --git a/pandas/tests/indexes/categorical/test_equals.py b/pandas/tests/indexes/categorical/test_equals.py index 1ed8f3a903439..f526be16993fb 100644 --- a/pandas/tests/indexes/categorical/test_equals.py +++ b/pandas/tests/indexes/categorical/test_equals.py @@ -34,7 +34,7 @@ def test_equals_categorical(self): with pytest.raises(ValueError, match="Lengths must match"): ci1 == Index(["a", "b", "c"]) - msg = "Categoricals can only be compared if 'categories' are the same" + msg = "Categoricals can only be compared if 'categories' and 'ordered' are the same" with pytest.raises(TypeError, match=msg): ci1 == ci2 with pytest.raises(TypeError, match=msg): @@ -42,6 +42,12 @@ def test_equals_categorical(self): with pytest.raises(TypeError, match=msg): ci1 == Categorical(ci1.values, categories=list("abc")) + ci1 = CategoricalIndex(["a", "b", 3], categories=["a", "b", 3]) + ci2 = CategoricalIndex(["a", "b", 3], categories=["b", "a", 3]) + + assert ci1.equals(ci2) + assert ci1.astype(object).equals(ci2) + # tests # make sure that we are testing for category inclusion properly ci = CategoricalIndex(list("aabca"), categories=["c", "a", "b"]) From a627801045d992466d09e27f7e9b4d1d3cb1e4e6 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Mon, 27 Feb 2023 14:43:04 -0500 Subject: [PATCH 2/4] lint --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/core/arrays/categorical.py | 16 ++++---- pandas/core/dtypes/concat.py | 2 +- pandas/core/indexes/category.py | 2 +- pandas/core/reshape/merge.py | 2 +- .../tests/arrays/categorical/test_dtypes.py | 38 +++++++++---------- pandas/tests/reshape/merge/test_merge.py | 6 +-- 7 files changed, 32 insertions(+), 35 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index bdbde438217b9..8f104b8213fd3 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -1178,6 +1178,7 @@ Categorical - Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` would reorder categories when used as a grouper (:issue:`48749`) - Bug in :class:`Categorical` constructor when constructing from a :class:`Categorical` object and ``dtype="category"`` losing ordered-ness (:issue:`49309`) - Bug in :meth:`.SeriesGroupBy.min`, :meth:`.SeriesGroupBy.max`, :meth:`.DataFrameGroupBy.min`, and :meth:`.DataFrameGroupBy.max` with unordered :class:`CategoricalDtype` with no groups failing to raise ``TypeError`` (:issue:`51034`) +- Bug in :meth:`Categorical._categories_match_up_to_permutation` would raise an error when comparing unordered categories with different permutations (:issue:`51543`) Datetimelike ^^^^^^^^^^^^ diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 0f0d2477b1b03..b46657c6a8f2f 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -140,7 +140,7 @@ def func(self, other): # the same (maybe up to ordering, depending on ordered) msg = "Categoricals can only be compared if 'categories' and 'ordered' are the same." - if not self._categories_match_up_to_permutation(other): + if not self._categories_match(other): raise TypeError(msg) if not self.ordered and not self.categories.equals(other.categories): @@ -1917,12 +1917,11 @@ def _validate_listlike(self, value): # require identical categories set if isinstance(value, Categorical): - if not is_dtype_equal(self.dtype, value.dtype): + if not self._categories_match(value): raise TypeError( "Cannot set a Categorical with another, " "without identical categories" ) - # is_dtype_equal implies categories_match_up_to_permutation value = self._encode_with_my_categories(value) return value._codes @@ -2110,7 +2109,7 @@ def equals(self, other: object) -> bool: """ if not isinstance(other, Categorical): return False - elif self._categories_match_up_to_permutation(other): + elif self._categories_match(other): other = self._encode_with_my_categories(other) return np.array_equal(self._codes, other._codes) return False @@ -2154,7 +2153,7 @@ def _encode_with_my_categories(self, other: Categorical) -> Categorical: Notes ----- This assumes we have already checked - self._categories_match_up_to_permutation(other). + self._categories_match(other). """ # Indexing on codes is more efficient if categories are the same, # so we can apply some optimizations based on the degree of @@ -2164,10 +2163,11 @@ def _encode_with_my_categories(self, other: Categorical) -> Categorical: ) return self._from_backing_data(codes) - def _categories_match_up_to_permutation(self, other: Categorical) -> bool: + def _categories_match(self, other: Categorical) -> bool: """ - Returns True if categoricals are the same dtype - same categories, and same ordered + Returns True if categoricals have the same dtype, + same ordered, and the same categories regardless + of permutation (when unordered) Parameters ---------- diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 28bc849088d5f..0fe15d9566a1c 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -241,7 +241,7 @@ def _maybe_unwrap(x): raise TypeError("dtype of categories must be the same") ordered = False - if all(first._categories_match_up_to_permutation(other) for other in to_union[1:]): + if all(first._categories_match(other) for other in to_union[1:]): # identical categories - fastpath categories = first.categories ordered = first.ordered diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 51ff92560fe5f..814fdd98bf68e 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -245,7 +245,7 @@ def _is_dtype_compat(self, other) -> Categorical: """ if is_categorical_dtype(other): other = extract_array(other) - if not other._categories_match_up_to_permutation(self): + if not other._categories_match(self): raise TypeError( "categories must match existing categories when appending" ) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 86bf9bf95e1cc..e365b91ad8822 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1287,7 +1287,7 @@ def _maybe_coerce_merge_keys(self) -> None: if lk_is_cat and rk_is_cat: lk = cast(Categorical, lk) rk = cast(Categorical, rk) - if lk._categories_match_up_to_permutation(rk): + if lk._categories_match(rk): continue elif lk_is_cat or rk_is_cat: diff --git a/pandas/tests/arrays/categorical/test_dtypes.py b/pandas/tests/arrays/categorical/test_dtypes.py index c905916772bff..0be4c09ba2d35 100644 --- a/pandas/tests/arrays/categorical/test_dtypes.py +++ b/pandas/tests/arrays/categorical/test_dtypes.py @@ -15,37 +15,33 @@ class TestCategoricalDtypes: - def test_categories_match_up_to_permutation(self): + def test_categories_match(self): # test dtype comparisons between cats c1 = Categorical(list("aabca"), categories=list("abc"), ordered=False) c2 = Categorical(list("aabca"), categories=list("cab"), ordered=False) c3 = Categorical(list("aabca"), categories=list("cab"), ordered=True) - assert c1._categories_match_up_to_permutation(c1) - assert c2._categories_match_up_to_permutation(c2) - assert c3._categories_match_up_to_permutation(c3) - assert c1._categories_match_up_to_permutation(c2) - assert not c1._categories_match_up_to_permutation(c3) - assert not c1._categories_match_up_to_permutation(Index(list("aabca"))) - assert not c1._categories_match_up_to_permutation(c1.astype(object)) - assert c1._categories_match_up_to_permutation(CategoricalIndex(c1)) - assert c1._categories_match_up_to_permutation( - CategoricalIndex(c1, categories=list("cab")) - ) - assert not c1._categories_match_up_to_permutation( - CategoricalIndex(c1, ordered=True) - ) + assert c1._categories_match(c1) + assert c2._categories_match(c2) + assert c3._categories_match(c3) + assert c1._categories_match(c2) + assert not c1._categories_match(c3) + assert not c1._categories_match(Index(list("aabca"))) + assert not c1._categories_match(c1.astype(object)) + assert c1._categories_match(CategoricalIndex(c1)) + assert c1._categories_match(CategoricalIndex(c1, categories=list("cab"))) + assert not c1._categories_match(CategoricalIndex(c1, ordered=True)) # GH 16659 s1 = Series(c1) s2 = Series(c2) s3 = Series(c3) - assert c1._categories_match_up_to_permutation(s1) - assert c2._categories_match_up_to_permutation(s2) - assert c3._categories_match_up_to_permutation(s3) - assert c1._categories_match_up_to_permutation(s2) - assert not c1._categories_match_up_to_permutation(s3) - assert not c1._categories_match_up_to_permutation(s1.astype(object)) + assert c1._categories_match(s1) + assert c2._categories_match(s2) + assert c3._categories_match(s3) + assert c1._categories_match(s2) + assert not c1._categories_match(s3) + assert not c1._categories_match(s1.astype(object)) def test_set_dtype_same(self): c = Categorical(["a", "b", "c"]) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index ad90d5ae147c8..0afdcd69f32e1 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1939,8 +1939,8 @@ def test_other_columns(self, left, right): tm.assert_series_equal(result, expected) # categories are preserved - assert left.X.values._categories_match_up_to_permutation(merged.X.values) - assert right.Z.values._categories_match_up_to_permutation(merged.Z.values) + assert left.X.values._categories_match(merged.X.values) + assert right.Z.values._categories_match(merged.Z.values) @pytest.mark.parametrize( "change", @@ -1957,7 +1957,7 @@ def test_dtype_on_merged_different(self, change, join_type, left, right): X = change(right.X.astype("object")) right = right.assign(X=X) assert is_categorical_dtype(left.X.values.dtype) - # assert not left.X.values._categories_match_up_to_permutation(right.X.values) + # assert not left.X.values._categories_match(right.X.values) merged = merge(left, right, on="X", how=join_type) From b4dcdc59b2dca6a6964fe545b464a1c183a14f35 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Mon, 27 Feb 2023 15:33:39 -0500 Subject: [PATCH 3/4] lint --- pandas/core/arrays/categorical.py | 5 ++++- pandas/tests/arrays/categorical/test_operators.py | 15 ++++++++++++--- pandas/tests/indexes/categorical/test_equals.py | 5 ++++- pandas/tests/reshape/merge/test_merge.py | 2 +- 4 files changed, 21 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index b46657c6a8f2f..207cf4cbc086b 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -139,7 +139,10 @@ def func(self, other): # Two Categoricals can only be compared if the categories are # the same (maybe up to ordering, depending on ordered) - msg = "Categoricals can only be compared if 'categories' and 'ordered' are the same." + msg = ( + "Categoricals can only be compared if 'categories' and 'ordered' " + "are the same." + ) if not self._categories_match(other): raise TypeError(msg) diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index a564c54ffd27d..7e204fa11c7d4 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -76,7 +76,10 @@ def test_comparisons(self, factor): tm.assert_numpy_array_equal(res, exp) # Only categories with same categories can be compared - msg = "Categoricals can only be compared if 'categories' and 'ordered' are the same" + msg = ( + "Categoricals can only be compared if 'categories' and 'ordered' " + "are the same" + ) with pytest.raises(TypeError, match=msg): cat > cat_rev @@ -267,7 +270,10 @@ def test_comparisons(self, data, reverse, base): tm.assert_numpy_array_equal(res_rev.values, exp_rev2) # Only categories with same categories can be compared - msg = "Categoricals can only be compared if 'categories' and 'ordered' are the same" + msg = ( + "Categoricals can only be compared if 'categories' and 'ordered' " + "are the same" + ) with pytest.raises(TypeError, match=msg): cat > cat_rev @@ -333,7 +339,10 @@ def test_compare_different_lengths(self): c1 = Categorical([], categories=["a", "b"]) c2 = Categorical([], categories=["a"]) - msg = "Categoricals can only be compared if 'categories' and 'ordered' are the same." + msg = ( + "Categoricals can only be compared if 'categories' and 'ordered' " + "are the same." + ) with pytest.raises(TypeError, match=msg): c1 == c2 diff --git a/pandas/tests/indexes/categorical/test_equals.py b/pandas/tests/indexes/categorical/test_equals.py index f526be16993fb..a77139f76ba39 100644 --- a/pandas/tests/indexes/categorical/test_equals.py +++ b/pandas/tests/indexes/categorical/test_equals.py @@ -34,7 +34,10 @@ def test_equals_categorical(self): with pytest.raises(ValueError, match="Lengths must match"): ci1 == Index(["a", "b", "c"]) - msg = "Categoricals can only be compared if 'categories' and 'ordered' are the same" + msg = ( + "Categoricals can only be compared if 'categories' and 'ordered' " + "are the same" + ) with pytest.raises(TypeError, match=msg): ci1 == ci2 with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 0afdcd69f32e1..966961ad5c6b1 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1957,7 +1957,7 @@ def test_dtype_on_merged_different(self, change, join_type, left, right): X = change(right.X.astype("object")) right = right.assign(X=X) assert is_categorical_dtype(left.X.values.dtype) - # assert not left.X.values._categories_match(right.X.values) + assert not left.X.values._categories_match(right.X.values) merged = merge(left, right, on="X", how=join_type) From 0c3ca4bc6e956745f47f71a0461ee7a514aa0d83 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Fri, 10 Mar 2023 14:45:09 -0500 Subject: [PATCH 4/4] mypy --- pandas/core/arrays/categorical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index b94edcf745648..ba233512fe2d6 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2225,7 +2225,7 @@ def _categories_match(self, other: Categorical) -> bool: """ try: if not hasattr(other, "categories"): - other = other.dtype + other = other.dtype # type: ignore[assignment] if set(self.categories) != set(other.categories): return False if self.ordered != other.ordered: