diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index ea6a832d25058..4acfc1470a82f 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -757,6 +757,7 @@ Performance improvements
 - Performance improvement in :meth:`MultiIndex.putmask` (:issue:`49830`)
 - Performance improvement in :meth:`Index.union` and :meth:`MultiIndex.union` when index contains duplicates (:issue:`48900`)
 - Performance improvement in :meth:`Series.rank` for pyarrow-backed dtypes (:issue:`50264`)
+- Performance improvement in :meth:`Series.searchsorted` for pyarrow-backed dtypes (:issue:`50447`)
 - Performance improvement in :meth:`Series.fillna` for extension array dtypes (:issue:`49722`, :issue:`50078`)
 - Performance improvement in :meth:`Index.join`, :meth:`Index.intersection` and :meth:`Index.union` for masked dtypes when :class:`Index` is monotonic (:issue:`50310`)
 - Performance improvement for :meth:`Series.value_counts` with nullable dtype (:issue:`48338`)
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index d0c17161dea0b..de85ed67e7e8c 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -3,6 +3,7 @@
 from typing import (
     TYPE_CHECKING,
     Any,
+    Literal,
     TypeVar,
     cast,
 )
@@ -116,6 +117,11 @@ def floordiv_compat(
     }
 
 if TYPE_CHECKING:
+    from pandas._typing import (
+        NumpySorter,
+        NumpyValueArrayLike,
+    )
+
     from pandas import Series
 
 ArrowExtensionArrayT = TypeVar("ArrowExtensionArrayT", bound="ArrowExtensionArray")
@@ -693,6 +699,23 @@ def round(
         """
         return type(self)(pc.round(self._data, ndigits=decimals))
 
+    @doc(ExtensionArray.searchsorted)
+    def searchsorted(
+        self,
+        value: NumpyValueArrayLike | ExtensionArray,
+        side: Literal["left", "right"] = "left",
+        sorter: NumpySorter = None,
+    ) -> npt.NDArray[np.intp] | np.intp:
+        if self._hasna:
+            raise ValueError(
+                "searchsorted requires array to be sorted, which is impossible "
+                "with NAs present."
+            )
+        if isinstance(value, ExtensionArray):
+            value = value.astype(object)
+        # Base class searchsorted would cast to object, which is *much* slower.
+        return self.to_numpy().searchsorted(value, side=side, sorter=sorter)
+
     def take(
         self,
         indices: TakeIndexer,
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index e5fb3fc3ff836..9b26db07fc28f 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -1,6 +1,9 @@
 from __future__ import annotations
 
-from typing import TYPE_CHECKING
+from typing import (
+    TYPE_CHECKING,
+    Literal,
+)
 
 import numpy as np
 
@@ -54,6 +57,11 @@
 if TYPE_CHECKING:
     import pyarrow
 
+    from pandas._typing import (
+        NumpySorter,
+        NumpyValueArrayLike,
+    )
+
     from pandas import Series
 
 
@@ -492,6 +500,20 @@ def memory_usage(self, deep: bool = False) -> int:
             return result + lib.memory_usage_of_objects(self._ndarray)
         return result
 
+    @doc(ExtensionArray.searchsorted)
+    def searchsorted(
+        self,
+        value: NumpyValueArrayLike | ExtensionArray,
+        side: Literal["left", "right"] = "left",
+        sorter: NumpySorter = None,
+    ) -> npt.NDArray[np.intp] | np.intp:
+        if self._hasna:
+            raise ValueError(
+                "searchsorted requires array to be sorted, which is impossible "
+                "with NAs present."
+            )
+        return super().searchsorted(value=value, side=side, sorter=sorter)
+
     def _cmp_method(self, other, op):
         from pandas.arrays import BooleanArray
 
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index c1785591f41a9..02f72d67673ae 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -1553,3 +1553,20 @@ def test_round():
     result = ser.round(-1)
     expected = pd.Series([120.0, pd.NA, 60.0], dtype=dtype)
     tm.assert_series_equal(result, expected)
+
+
+def test_searchsorted_with_na_raises(data_for_sorting, as_series):
+    # GH50447
+    b, c, a = data_for_sorting
+    arr = data_for_sorting.take([2, 0, 1])  # to get [a, b, c]
+    arr[-1] = pd.NA
+
+    if as_series:
+        arr = pd.Series(arr)
+
+    msg = (
+        "searchsorted requires array to be sorted, "
+        "which is impossible with NAs present."
+    )
+    with pytest.raises(ValueError, match=msg):
+        arr.searchsorted(b)
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
index de7967a8578b5..3e865947aa968 100644
--- a/pandas/tests/extension/test_string.py
+++ b/pandas/tests/extension/test_string.py
@@ -420,3 +420,20 @@ def arrow_not_supported(self, data, request):
                 reason="2D support not implemented for ArrowStringArray"
             )
             request.node.add_marker(mark)
+
+
+def test_searchsorted_with_na_raises(data_for_sorting, as_series):
+    # GH50447
+    b, c, a = data_for_sorting
+    arr = data_for_sorting.take([2, 0, 1])  # to get [a, b, c]
+    arr[-1] = pd.NA
+
+    if as_series:
+        arr = pd.Series(arr)
+
+    msg = (
+        "searchsorted requires array to be sorted, "
+        "which is impossible with NAs present."
+    )
+    with pytest.raises(ValueError, match=msg):
+        arr.searchsorted(b)