From 964bbf62cfab6fefeb297030486e394171db78a9 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 29 Apr 2024 21:20:18 -0400 Subject: [PATCH 1/5] CLN: Simplify map_infer_mask --- pandas/_libs/dtypes.pxd | 5 --- pandas/_libs/lib.pyx | 84 +++++++++++++---------------------------- 2 files changed, 27 insertions(+), 62 deletions(-) diff --git a/pandas/_libs/dtypes.pxd b/pandas/_libs/dtypes.pxd index de4b70d387b5f..ccfb2d2ef4a23 100644 --- a/pandas/_libs/dtypes.pxd +++ b/pandas/_libs/dtypes.pxd @@ -34,8 +34,3 @@ ctypedef fused numeric_t: ctypedef fused numeric_object_t: numeric_t object - -ctypedef fused uint8_int64_object_t: - uint8_t - int64_t - object diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 4fd68a1593e49..5202b3392c9d7 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -53,6 +53,7 @@ from numpy cimport ( PyArray_ITER_DATA, PyArray_ITER_NEXT, PyArray_IterNew, + PyArray_SETITEM, complex128_t, flatiter, float64_t, @@ -75,7 +76,6 @@ cdef extern from "pandas/parser/pd_parser.h": PandasParser_IMPORT from pandas._libs cimport util -from pandas._libs.dtypes cimport uint8_int64_object_t from pandas._libs.util cimport ( INT64_MAX, INT64_MIN, @@ -2845,83 +2845,53 @@ no_default = _NoDefault.no_default # Sentinel indicating the default value. NoDefault = Literal[_NoDefault.no_default] -def map_infer_mask( - ndarray[object] arr, - object f, - const uint8_t[:] mask, - *, - bint convert=True, - object na_value=no_default, - cnp.dtype dtype=np.dtype(object) -) -> "ArrayLike": - """ - Substitute for np.vectorize with pandas-friendly dtype inference. - - Parameters - ---------- - arr : ndarray - f : function - mask : ndarray - uint8 dtype ndarray indicating values not to apply `f` to. - convert : bool, default True - Whether to call `maybe_convert_objects` on the resulting ndarray. - na_value : Any, optional - The result value to use for masked values. By default, the - input value is used. - dtype : numpy.dtype - The numpy dtype to use for the result ndarray. - - Returns - ------- - np.ndarray or an ExtensionArray - """ - cdef Py_ssize_t n = len(arr) - result = np.empty(n, dtype=dtype) - - _map_infer_mask( - result, - arr, - f, - mask, - na_value, - ) - if convert: - return maybe_convert_objects(result) - else: - return result - - @cython.boundscheck(False) @cython.wraparound(False) -def _map_infer_mask( - ndarray[uint8_int64_object_t] out, - ndarray[object] arr, - object f, - const uint8_t[:] mask, - object na_value=no_default, +def map_infer_mask( + ndarray arr, + object f, + const uint8_t[:] mask, + *, + bint convert=True, + object na_value=no_default, ) -> None: """ Helper for map_infer_mask, split off to use fused types based on the result. """ cdef: - Py_ssize_t i, n + Py_ssize_t i + Py_ssize_t n = len(arr) object val + ndarray result = np.empty(n, dtype=object) + + flatiter arr_it = PyArray_IterNew(arr) + flatiter result_it = PyArray_IterNew(result) + n = len(arr) for i in range(n): if mask[i]: if na_value is no_default: - val = arr[i] + val = PyArray_GETITEM(arr, PyArray_ITER_DATA(arr_it)) else: val = na_value else: - val = f(arr[i]) + val = PyArray_GETITEM(arr, PyArray_ITER_DATA(arr_it)) + val = f(val) if cnp.PyArray_IsZeroDim(val): # unbox 0-dim arrays, GH#690 val = val.item() - out[i] = val + PyArray_SETITEM(result, PyArray_ITER_DATA(result_it), val) + + PyArray_ITER_NEXT(arr_it) + PyArray_ITER_NEXT(result_it) + + if convert: + return maybe_convert_objects(result) + else: + return result @cython.boundscheck(False) From fa1808bf57052b967ca8663b1c3562926ccf0af3 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 29 Apr 2024 22:05:46 -0400 Subject: [PATCH 2/5] fix some tests --- asv_bench/benchmarks/series_methods.py | 16 ++++++++++------ pandas/_libs/lib.pyx | 23 +++++++++++++++++++++-- 2 files changed, 31 insertions(+), 8 deletions(-) diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index b021af4694d7d..85d34cac5a7bf 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -148,10 +148,14 @@ def time_searchsorted(self, dtype): class Map: - params = (["dict", "Series", "lambda"], ["object", "category", "int"]) - param_names = "mapper" - - def setup(self, mapper, dtype): + params = ( + ["dict", "Series", "lambda"], + ["object", "category", "int"], + [None, "ignore"], + ) + param_names = ["mapper", "dtype", "na_action"] + + def setup(self, mapper, dtype, na_action): map_size = 1000 map_data = Series(map_size - np.arange(map_size), dtype=dtype) @@ -168,8 +172,8 @@ def setup(self, mapper, dtype): self.s = Series(np.random.randint(0, map_size, 10000), dtype=dtype) - def time_map(self, mapper, *args, **kwargs): - self.s.map(self.map_data) + def time_map(self, mapper, dtype, na_action): + self.s.map(self.map_data, na_action=na_action) class Clip: diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 5202b3392c9d7..d442b711123c2 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2854,16 +2854,35 @@ def map_infer_mask( *, bint convert=True, object na_value=no_default, + cnp.dtype dtype=np.dtype(object) ) -> None: """ - Helper for map_infer_mask, split off to use fused types based on the result. + Substitute for np.vectorize with pandas-friendly dtype inference. + + Parameters + ---------- + arr : ndarray + f : function + mask : ndarray + uint8 dtype ndarray indicating values not to apply `f` to. + convert : bool, default True + Whether to call `maybe_convert_objects` on the resulting ndarray. + na_value : Any, optional + The result value to use for masked values. By default, the + input value is used. + dtype : numpy.dtype + The numpy dtype to use for the result ndarray. + + Returns + ------- + np.ndarray or an ExtensionArray """ cdef: Py_ssize_t i Py_ssize_t n = len(arr) object val - ndarray result = np.empty(n, dtype=object) + ndarray result = np.empty(n, dtype=dtype) flatiter arr_it = PyArray_IterNew(arr) flatiter result_it = PyArray_IterNew(result) From 701ca99323b33f133094b43ef7e2c5683550364b Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 29 Apr 2024 23:43:09 -0400 Subject: [PATCH 3/5] fix tests? --- pandas/_libs/lib.pyx | 3 +-- pandas/core/arrays/string_arrow.py | 41 ++++++++++++++---------------- 2 files changed, 20 insertions(+), 24 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index d442b711123c2..70f0dfc5c78b2 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2854,7 +2854,7 @@ def map_infer_mask( *, bint convert=True, object na_value=no_default, - cnp.dtype dtype=np.dtype(object) + object dtype=np.dtype(object) ) -> None: """ Substitute for np.vectorize with pandas-friendly dtype inference. @@ -2887,7 +2887,6 @@ def map_infer_mask( flatiter arr_it = PyArray_IterNew(arr) flatiter result_it = PyArray_IterNew(result) - n = len(arr) for i in range(n): if mask[i]: if na_value is no_default: diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index ec2534ce174ac..b102f5a6e036c 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -627,28 +627,25 @@ def _str_map( na_value = np.nan else: na_value = False - try: - result = lib.map_infer_mask( - arr, - f, - mask.view("uint8"), - convert=False, - na_value=na_value, - dtype=np.dtype(cast(type, dtype)), - ) - return result - - except ValueError: - result = lib.map_infer_mask( - arr, - f, - mask.view("uint8"), - convert=False, - na_value=na_value, - ) - if convert and result.dtype == object: - result = lib.maybe_convert_objects(result) - return result + + dtype = np.dtype(cast(type, dtype)) + if mask.any(): + # numpy int/bool dtypes cannot hold NaNs so we must convert to + # float64 for int (to match maybe_convert_objects) or + # object for bool (again to match maybe_convert_objects) + if is_integer_dtype(dtype): + dtype = np.float64 + else: + dtype = np.dtype(object) + result = lib.map_infer_mask( + arr, + f, + mask.view("uint8"), + convert=False, + na_value=na_value, + dtype=dtype, + ) + return result elif is_string_dtype(dtype) and not is_object_dtype(dtype): # i.e. StringDtype From ee941f7ad90074d810c1b89a71ee94e4b74085e3 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Tue, 30 Apr 2024 14:39:27 -0400 Subject: [PATCH 4/5] Fix types? --- pandas/core/arrays/string_arrow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index b102f5a6e036c..7f19c6e668409 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -634,7 +634,7 @@ def _str_map( # float64 for int (to match maybe_convert_objects) or # object for bool (again to match maybe_convert_objects) if is_integer_dtype(dtype): - dtype = np.float64 + dtype = np.dtype("float64") else: dtype = np.dtype(object) result = lib.map_infer_mask( From 3875adc62a8570a3193e74bd43145f1a56e718a1 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 1 May 2024 16:44:48 -0400 Subject: [PATCH 5/5] fixup annotations --- pandas/_libs/lib.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 70f0dfc5c78b2..6a31ce84ed418 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2854,8 +2854,8 @@ def map_infer_mask( *, bint convert=True, object na_value=no_default, - object dtype=np.dtype(object) -) -> None: + cnp.dtype dtype=np.dtype(object) +) -> "ArrayLike": """ Substitute for np.vectorize with pandas-friendly dtype inference.