diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 81f9456502bf0..363873be87899 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1716,6 +1716,35 @@ def _box_func(self, i: int): return np.NaN return self.categories[i] + def _validate_listlike(self, target: ArrayLike) -> np.ndarray: + """ + Extract integer codes we can use for comparison. + + Notes + ----- + If a value in target is not present, it gets coded as -1. + """ + + if isinstance(target, Categorical): + # Indexing on codes is more efficient if categories are the same, + # so we can apply some optimizations based on the degree of + # dtype-matching. + if self.categories.equals(target.categories): + # We use the same codes, so can go directly to the engine + codes = target.codes + elif self.is_dtype_equal(target): + # We have the same categories up to a reshuffling of codes. + codes = recode_for_categories( + target.codes, target.categories, self.categories + ) + else: + code_indexer = self.categories.get_indexer(target.categories) + codes = take_1d(code_indexer, target.codes, fill_value=-1) + else: + codes = self.categories.get_indexer(target) + + return codes + # ------------------------------------------------------------------ def take_nd(self, indexer, allow_fill: bool = False, fill_value=None): @@ -1890,11 +1919,8 @@ def _validate_setitem_value(self, value): "Cannot set a Categorical with another, " "without identical categories" ) - if not self.categories.equals(value.categories): - new_codes = recode_for_categories( - value.codes, value.categories, self.categories - ) - value = Categorical.from_codes(new_codes, dtype=self.dtype) + new_codes = self._validate_listlike(value) + value = Categorical.from_codes(new_codes, dtype=self.dtype) rvalue = value if is_list_like(value) else [value] @@ -2164,13 +2190,7 @@ def equals(self, other: object) -> bool: if not isinstance(other, Categorical): return False elif self.is_dtype_equal(other): - if self.categories.equals(other.categories): - # fastpath to avoid re-coding - other_codes = other._codes - else: - other_codes = recode_for_categories( - other.codes, other.categories, self.categories - ) + other_codes = self._validate_listlike(other) return np.array_equal(self._codes, other_codes) return False diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index dd005752a4832..1ea4ff117f209 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -310,14 +310,8 @@ def _maybe_unwrap(x): categories = first.categories ordered = first.ordered - if all(first.categories.equals(other.categories) for other in to_union[1:]): - new_codes = np.concatenate([c.codes for c in to_union]) - else: - codes = [first.codes] + [ - recode_for_categories(other.codes, other.categories, first.categories) - for other in to_union[1:] - ] - new_codes = np.concatenate(codes) + all_codes = [first._validate_listlike(x) for x in to_union] + new_codes = np.concatenate(all_codes) if sort_categories and not ignore_order and ordered: raise TypeError("Cannot use sort_categories=True with ordered Categoricals") diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 7509cb35069e8..a450e3b9fdee7 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -23,8 +23,7 @@ from pandas.core.dtypes.missing import is_valid_nat_for_dtype, notna from pandas.core import accessor -from pandas.core.algorithms import take_1d -from pandas.core.arrays.categorical import Categorical, contains, recode_for_categories +from pandas.core.arrays.categorical import Categorical, contains import pandas.core.common as com from pandas.core.construction import extract_array import pandas.core.indexes.base as ibase @@ -558,21 +557,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): "method='nearest' not implemented yet for CategoricalIndex" ) - if isinstance(target, CategoricalIndex) and self._values.is_dtype_equal(target): - if self._values.equals(target._values): - # we have the same codes - codes = target.codes - else: - codes = recode_for_categories( - target.codes, target.categories, self._values.categories - ) - else: - if isinstance(target, CategoricalIndex): - code_indexer = self.categories.get_indexer(target.categories) - codes = take_1d(code_indexer, target.codes, fill_value=-1) - else: - codes = self.categories.get_indexer(target) - + codes = self._values._validate_listlike(target._values) indexer, _ = self._engine.get_indexer_non_unique(codes) return ensure_platform_int(indexer) @@ -580,15 +565,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): def get_indexer_non_unique(self, target): target = ibase.ensure_index(target) - if isinstance(target, CategoricalIndex): - # Indexing on codes is more efficient if categories are the same: - if target.categories is self.categories: - target = target.codes - indexer, missing = self._engine.get_indexer_non_unique(target) - return ensure_platform_int(indexer), missing - target = target._values - - codes = self.categories.get_indexer(target) + codes = self._values._validate_listlike(target._values) indexer, missing = self._engine.get_indexer_non_unique(codes) return ensure_platform_int(indexer), missing diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 9f19ea9aefe09..d95355589fd0c 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -43,7 +43,6 @@ from pandas import Categorical, Index, MultiIndex from pandas.core import groupby import pandas.core.algorithms as algos -from pandas.core.arrays.categorical import recode_for_categories import pandas.core.common as com from pandas.core.construction import extract_array from pandas.core.frame import _merge_doc @@ -1936,12 +1935,8 @@ def _factorize_keys( ): assert isinstance(lk, Categorical) assert isinstance(rk, Categorical) - if lk.categories.equals(rk.categories): - # if we exactly match in categories, allow us to factorize on codes - rk = rk.codes - else: - # Same categories in different orders -> recode - rk = recode_for_categories(rk.codes, rk.categories, lk.categories) + # Cast rk to encoding so we can compare codes with lk + rk = lk._validate_listlike(rk) lk = ensure_int64(lk.codes) rk = ensure_int64(rk)