diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py
index 559aa7050a640..4384ccb7fa8b3 100644
--- a/asv_bench/benchmarks/categoricals.py
+++ b/asv_bench/benchmarks/categoricals.py
@@ -282,4 +282,19 @@ def time_sort_values(self):
         self.index.sort_values(ascending=False)
 
 
+class SearchSorted:
+    def setup(self):
+        N = 10 ** 5
+        # searchsorted requires sorted input, hence the sort_values() call.
+        self.ci = tm.makeCategoricalIndex(N).sort_values()
+        self.c = self.ci.values
+        self.key = self.ci.categories[1]
+
+    def time_categorical_index_searchsorted(self):
+        self.ci.searchsorted(self.key)
+
+    def time_categorical_searchsorted(self):
+        self.c.searchsorted(self.key)
+
+
 from .pandas_vb_common import setup  # noqa: F401 isort:skip
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index 16d23d675a8bb..5f650b18a21ac 100644
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -162,6 +162,7 @@ Performance improvements
 - Performance improvement in :meth:`DataFrame.corr` when ``method`` is ``"spearman"`` (:issue:`28139`)
 - Performance improvement in :meth:`DataFrame.replace` when provided a list of values to replace (:issue:`28099`)
 - Performance improvement in :meth:`DataFrame.select_dtypes` by using vectorization instead of iterating over a loop (:issue:`28317`)
+- Performance improvement in :meth:`Categorical.searchsorted` and :meth:`CategoricalIndex.searchsorted` (:issue:`28795`)
 
 .. _whatsnew_1000.bug_fixes:
 
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 33d1de01fa3db..43e52cb011324 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -1399,14 +1399,16 @@ def memory_usage(self, deep=False):
     @Substitution(klass="Categorical")
     @Appender(_shared_docs["searchsorted"])
     def searchsorted(self, value, side="left", sorter=None):
-        from pandas.core.series import Series
-
-        codes = _get_codes_for_values(Series(value).values, self.categories)
-        if -1 in codes:
-            raise KeyError("Value(s) to be inserted must be in categories.")
-
-        codes = codes[0] if is_scalar(value) else codes
-
+        # searchsorted is very performance sensitive. By converting codes
+        # to the same dtype as self.codes, we get much faster performance.
+        # get_loc raises KeyError for a value that is not among the
+        # categories, so no separate membership check is needed.
+        if is_scalar(value):
+            codes = self.categories.get_loc(value)
+            codes = self.codes.dtype.type(codes)
+        else:
+            locs = [self.categories.get_loc(x) for x in value]
+            codes = np.array(locs, dtype=self.codes.dtype)
         return self.codes.searchsorted(codes, side=side, sorter=sorter)
 
     def isna(self):
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
index c4321c993e638..ed3a4a7953df3 100644
--- a/pandas/core/indexes/category.py
+++ b/pandas/core/indexes/category.py
@@ -10,7 +10,7 @@
 from pandas._libs.hashtable import duplicated_int64
 import pandas.compat as compat
 from pandas.compat.numpy import function as nv
-from pandas.util._decorators import Appender, cache_readonly
+from pandas.util._decorators import Appender, Substitution, cache_readonly
 
 from pandas.core.dtypes.common import (
     ensure_platform_int,
@@ -27,6 +27,7 @@
 from pandas.core import accessor
 from pandas.core.algorithms import take_1d
 from pandas.core.arrays.categorical import Categorical, _recode_for_categories, contains
+from pandas.core.base import _shared_docs
 import pandas.core.common as com
 import pandas.core.indexes.base as ibase
 from pandas.core.indexes.base import Index, _index_shared_docs
@@ -555,6 +556,11 @@ def _can_reindex(self, indexer):
         """ always allow reindexing """
         pass
 
+    @Substitution(klass="CategoricalIndex")
+    @Appender(_shared_docs["searchsorted"])
+    def searchsorted(self, value, side="left", sorter=None):
+        return self._data.searchsorted(value, side=side, sorter=sorter)
+
     @Appender(_index_shared_docs["where"])
     def where(self, cond, other=None):
         # TODO: Investigate an alternative implementation with
diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py
index 86750244d5fb5..279f1492d7dad 100644
--- a/pandas/tests/arrays/categorical/test_analytics.py
+++ b/pandas/tests/arrays/categorical/test_analytics.py
@@ -113,16 +113,15 @@ def test_searchsorted(self, ordered_fixture):
         tm.assert_numpy_array_equal(res_ser, exp)
 
         # Searching for a single value that is not from the Categorical
-        msg = r"Value\(s\) to be inserted must be in categories"
-        with pytest.raises(KeyError, match=msg):
+        with pytest.raises(KeyError, match="cucumber"):
             cat.searchsorted("cucumber")
-        with pytest.raises(KeyError, match=msg):
+        with pytest.raises(KeyError, match="cucumber"):
             ser.searchsorted("cucumber")
 
         # Searching for multiple values one of each is not from the Categorical
-        with pytest.raises(KeyError, match=msg):
+        with pytest.raises(KeyError, match="cucumber"):
             cat.searchsorted(["bread", "cucumber"])
-        with pytest.raises(KeyError, match=msg):
+        with pytest.raises(KeyError, match="cucumber"):
             ser.searchsorted(["bread", "cucumber"])
 
     def test_unique(self):