From 5921bf1c9b1c813227b566f7925f7009c3f9a12e Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Fri, 27 Oct 2023 15:02:56 -0400 Subject: [PATCH 1/6] PERF: Use fused types for map_infer_mask --- pandas/_libs/dtypes.pxd | 7 +++++++ pandas/_libs/lib.pyx | 27 +++++++++++++++++++++++++-- pandas/core/arrays/string_.py | 2 ++ 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/dtypes.pxd b/pandas/_libs/dtypes.pxd index ccfb2d2ef4a23..8a459cc0579e0 100644 --- a/pandas/_libs/dtypes.pxd +++ b/pandas/_libs/dtypes.pxd @@ -9,6 +9,7 @@ from numpy cimport ( int16_t, int32_t, int64_t, + npy_bool, uint8_t, uint16_t, uint32_t, @@ -34,3 +35,9 @@ ctypedef fused numeric_t: ctypedef fused numeric_object_t: numeric_t object + +# bool + all numeric types + object, doesn't include complex +ctypedef fused bool_numeric_object_t: + npy_bool + numeric_t + object diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index c0f21d1a7044c..7dd583c7a79a9 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -102,6 +102,7 @@ cdef extern from "pandas/parser/pd_parser.h": PandasParser_IMPORT from pandas._libs cimport util +from pandas._libs.dtypes cimport bool_numeric_object_t from pandas._libs.util cimport ( INT64_MAX, INT64_MIN, @@ -2855,12 +2856,34 @@ no_default = _NoDefault.no_default # Sentinel indicating the default value. NoDefault = Literal[_NoDefault.no_default] +def map_infer_mask( + ndarray[object] arr, + object f, + const uint8_t[:] mask, + bint convert=True, + object na_value=no_default, + cnp.dtype dtype=np.dtype(object) +) -> np.ndarray: + dummy = np.empty(0, dtype=dtype) + result = _map_infer_mask( + arr, + f, + mask, + dummy, + convert, + na_value, + dtype, + ) + return result + + @cython.boundscheck(False) @cython.wraparound(False) -def map_infer_mask( +def _map_infer_mask( ndarray[object] arr, object f, const uint8_t[:] mask, + bool_numeric_object_t[:] dummy, bint convert=True, object na_value=no_default, cnp.dtype dtype=np.dtype(object) @@ -2888,7 +2911,7 @@ def map_infer_mask( """ cdef: Py_ssize_t i, n - ndarray result + ndarray[bool_numeric_object_t] result object val n = len(arr) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 471b37eac783b..32dc4ab63cc21 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -624,6 +624,8 @@ def _str_map( na_value_is_na = isna(na_value) if na_value_is_na: na_value = 1 + elif dtype == np.dtype("bool"): + na_value = bool(na_value) result = lib.map_infer_mask( arr, f, From 5c4d8d99fd85cab7934da0a39fc7e4a6b1ba60c5 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Fri, 27 Oct 2023 16:14:48 -0400 Subject: [PATCH 2/6] Simplify --- pandas/_libs/dtypes.pxd | 7 ------- pandas/_libs/lib.pyx | 6 +++--- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/dtypes.pxd b/pandas/_libs/dtypes.pxd index 8a459cc0579e0..ccfb2d2ef4a23 100644 --- a/pandas/_libs/dtypes.pxd +++ b/pandas/_libs/dtypes.pxd @@ -9,7 +9,6 @@ from numpy cimport ( int16_t, int32_t, int64_t, - npy_bool, uint8_t, uint16_t, uint32_t, @@ -35,9 +34,3 @@ ctypedef fused numeric_t: ctypedef fused numeric_object_t: numeric_t object - -# bool + all numeric types + object, doesn't include complex -ctypedef fused bool_numeric_object_t: - npy_bool - numeric_t - object diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 7dd583c7a79a9..1aeb21d59ec3c 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -102,7 +102,7 @@ cdef extern from "pandas/parser/pd_parser.h": PandasParser_IMPORT from pandas._libs cimport util -from pandas._libs.dtypes cimport bool_numeric_object_t +from pandas._libs.dtypes cimport numeric_object_t from pandas._libs.util cimport ( INT64_MAX, INT64_MIN, @@ -2883,7 +2883,7 @@ def _map_infer_mask( ndarray[object] arr, object f, const uint8_t[:] mask, - bool_numeric_object_t[:] dummy, + numeric_object_t[:] dummy, bint convert=True, object na_value=no_default, cnp.dtype dtype=np.dtype(object) @@ -2911,7 +2911,7 @@ def _map_infer_mask( """ cdef: Py_ssize_t i, n - ndarray[bool_numeric_object_t] result + ndarray[numeric_object_t] result object val n = len(arr) From c4bdb1aa2b6492faf8a53fcefed8ff2c0d7e0c16 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 28 Oct 2023 09:53:07 -0400 Subject: [PATCH 3/6] Refactor and docstrings --- pandas/_libs/lib.pyx | 40 +++++++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 1aeb21d59ec3c..1309c91ae6778 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2864,6 +2864,28 @@ def map_infer_mask( object na_value=no_default, cnp.dtype dtype=np.dtype(object) ) -> np.ndarray: + """ + Substitute for np.vectorize with pandas-friendly dtype inference. + + Parameters + ---------- + arr : ndarray + f : function + mask : ndarray + uint8 dtype ndarray indicating values not to apply `f` to. + convert : bool, default True + Whether to call `maybe_convert_objects` on the resulting ndarray. + na_value : Any, optional + The result value to use for masked values. By default, the + input value is used. + dtype : numpy.dtype + The numpy dtype to use for the result ndarray. + + Returns + ------- + np.ndarray + """ + # Passed so we can use infused types depending on the result dtype dummy = np.empty(0, dtype=dtype) result = _map_infer_mask( arr, @@ -2874,7 +2896,10 @@ def map_infer_mask( na_value, dtype, ) - return result + if convert: + return maybe_convert_objects(result) + else: + return result @cython.boundscheck(False) @@ -2889,7 +2914,7 @@ def _map_infer_mask( cnp.dtype dtype=np.dtype(object) ) -> np.ndarray: """ - Substitute for np.vectorize with pandas-friendly dtype inference. + Helper for map_infer_mask, split off to use fused types based on the result. Parameters ---------- @@ -2897,11 +2922,11 @@ def _map_infer_mask( f : function mask : ndarray uint8 dtype ndarray indicating values not to apply `f` to. - convert : bool, default True - Whether to call `maybe_convert_objects` on the resulting ndarray + dummy : ndarray + Unused. Has the same dtype as the result to allow using fused types. na_value : Any, optional The result value to use for masked values. By default, the - input value is used + input value is used. dtype : numpy.dtype The numpy dtype to use for the result ndarray. @@ -2931,10 +2956,7 @@ def _map_infer_mask( result[i] = val - if convert: - return maybe_convert_objects(result) - else: - return result + return result @cython.boundscheck(False) From c58e682beedef56b7d848ac241cdee03636a146e Mon Sep 17 00:00:00 2001 From: richard Date: Wed, 1 Nov 2023 21:24:10 -0400 Subject: [PATCH 4/6] Rework --- pandas/_libs/dtypes.pxd | 5 +++++ pandas/_libs/lib.pyx | 25 +++++++++++-------------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/pandas/_libs/dtypes.pxd b/pandas/_libs/dtypes.pxd index ccfb2d2ef4a23..de4b70d387b5f 100644 --- a/pandas/_libs/dtypes.pxd +++ b/pandas/_libs/dtypes.pxd @@ -34,3 +34,8 @@ ctypedef fused numeric_t: ctypedef fused numeric_object_t: numeric_t object + +ctypedef fused uint8_int64_object_t: + uint8_t + int64_t + object diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 1309c91ae6778..275f3434452bf 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -102,7 +102,7 @@ cdef extern from "pandas/parser/pd_parser.h": PandasParser_IMPORT from pandas._libs cimport util -from pandas._libs.dtypes cimport numeric_object_t +from pandas._libs.dtypes cimport uint8_int64_object_t from pandas._libs.util cimport ( INT64_MAX, INT64_MIN, @@ -2885,13 +2885,14 @@ def map_infer_mask( ------- np.ndarray """ - # Passed so we can use infused types depending on the result dtype - dummy = np.empty(0, dtype=dtype) - result = _map_infer_mask( + cdef Py_ssize_t n = len(arr) + result = np.empty(n, dtype=dtype) + + _map_infer_mask( + result, arr, f, mask, - dummy, convert, na_value, dtype, @@ -2905,25 +2906,25 @@ def map_infer_mask( @cython.boundscheck(False) @cython.wraparound(False) def _map_infer_mask( + ndarray[uint8_int64_object_t] out, ndarray[object] arr, object f, const uint8_t[:] mask, - numeric_object_t[:] dummy, bint convert=True, object na_value=no_default, cnp.dtype dtype=np.dtype(object) -) -> np.ndarray: +): """ Helper for map_infer_mask, split off to use fused types based on the result. Parameters ---------- + out : ndarray[uint8_int64_object_t] + Values to which this method will write its results. arr : ndarray f : function mask : ndarray uint8 dtype ndarray indicating values not to apply `f` to. - dummy : ndarray - Unused. Has the same dtype as the result to allow using fused types. na_value : Any, optional The result value to use for masked values. By default, the input value is used. @@ -2936,11 +2937,9 @@ def _map_infer_mask( """ cdef: Py_ssize_t i, n - ndarray[numeric_object_t] result object val n = len(arr) - result = np.empty(n, dtype=dtype) for i in range(n): if mask[i]: if na_value is no_default: @@ -2954,9 +2953,7 @@ def _map_infer_mask( # unbox 0-dim arrays, GH#690 val = val.item() - result[i] = val - - return result + out[i] = val @cython.boundscheck(False) From 4634a24598bb5e4c570a626a6144241e595fe4b8 Mon Sep 17 00:00:00 2001 From: richard Date: Wed, 1 Nov 2023 21:27:42 -0400 Subject: [PATCH 5/6] Remove docstring --- pandas/_libs/lib.pyx | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 4254a99024df3..d099ba50c42ba 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2917,24 +2917,6 @@ def _map_infer_mask( ): """ Helper for map_infer_mask, split off to use fused types based on the result. - - Parameters - ---------- - out : ndarray[uint8_int64_object_t] - Values to which this method will write its results. - arr : ndarray - f : function - mask : ndarray - uint8 dtype ndarray indicating values not to apply `f` to. - na_value : Any, optional - The result value to use for masked values. By default, the - input value is used. - dtype : numpy.dtype - The numpy dtype to use for the result ndarray. - - Returns - ------- - np.ndarray """ cdef: Py_ssize_t i, n From 59c836de7c051dd34de283ab07db92c432a07333 Mon Sep 17 00:00:00 2001 From: richard Date: Thu, 2 Nov 2023 22:08:29 -0400 Subject: [PATCH 6/6] Cleanup, whatsnew --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/_libs/lib.pyx | 6 +----- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 16d279bb0d52c..f6089fb8909ab 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -303,6 +303,7 @@ Performance improvements - Performance improvement in :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` when indexed by a :class:`MultiIndex` (:issue:`54835`) - Performance improvement in :meth:`Index.difference` (:issue:`55108`) - Performance improvement in :meth:`Series.duplicated` for pyarrow dtypes (:issue:`55255`) +- Performance improvement in :meth:`Series.str` methods (:issue:`55736`) - Performance improvement in :meth:`SeriesGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`DataFrameGroupBy.idxmin` (:issue:`54234`) - Performance improvement when indexing with more than 4 keys (:issue:`54550`) - Performance improvement when localizing time to UTC (:issue:`55241`) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index d099ba50c42ba..7ec70c8700a0a 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2894,9 +2894,7 @@ def map_infer_mask( arr, f, mask, - convert, na_value, - dtype, ) if convert: return maybe_convert_objects(result) @@ -2911,10 +2909,8 @@ def _map_infer_mask( ndarray[object] arr, object f, const uint8_t[:] mask, - bint convert=True, object na_value=no_default, - cnp.dtype dtype=np.dtype(object) -): +) -> None: """ Helper for map_infer_mask, split off to use fused types based on the result. """