From a6add5c73f7c05d1e62c5d3aab35bb9256f3b6f6 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 21 Aug 2023 10:33:21 -0700 Subject: [PATCH 1/9] ENH: numba engine in df.apply --- doc/source/whatsnew/v2.2.0.rst | 2 +- pandas/core/_numba/executor.py | 39 ++++++++++++++++ pandas/core/apply.py | 29 ++++++++++-- pandas/core/frame.py | 25 +++++++++++ pandas/tests/apply/test_frame_apply.py | 62 ++++++++++++++++++++------ 5 files changed, 139 insertions(+), 18 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 4ad450c965464..8dd97fcd81626 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -28,7 +28,7 @@ enhancement2 Other enhancements ^^^^^^^^^^^^^^^^^^ -- +- DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/_numba/executor.py b/pandas/core/_numba/executor.py index 5cd4779907146..0a26acb7df60a 100644 --- a/pandas/core/_numba/executor.py +++ b/pandas/core/_numba/executor.py @@ -15,6 +15,45 @@ from pandas.compat._optional import import_optional_dependency +@functools.cache +def generate_apply_looper(func, nopython=True, nogil=True, parallel=False): + if TYPE_CHECKING: + import numba + else: + numba = import_optional_dependency("numba") + nb_compat_func = numba.extending.register_jitable(func) + + @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) + def nb_looper(values, axis): + # Operate on the first row/col in order to get + # the output shape + if axis == 0: + first_elem = values[:, 0] + dim0 = values.shape[1] + else: + first_elem = values[0] + dim0 = values.shape[0] + res0 = nb_compat_func(first_elem) + # Use np.asarray to get shape for + # https://github.com/numba/numba/issues/4202#issuecomment-1185981507 + buf_shape = (dim0,) + 
np.atleast_1d(np.asarray(res0)).shape + if axis == 0: + buf_shape = buf_shape[::-1] + buff = np.empty(buf_shape) + + if axis == 1: + buff[0] = res0 + for i in numba.prange(1, values.shape[0]): + buff[i] = nb_compat_func(values[i]) + else: + buff[:, 0] = res0 + for j in numba.prange(1, values.shape[1]): + buff[:, j] = nb_compat_func(values[:, j]) + return buff + + return nb_looper + + @functools.cache def make_looper(func, result_dtype, is_grouped_kernel, nopython, nogil, parallel): if TYPE_CHECKING: diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 6ab2f958b8730..e018115734851 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -49,6 +49,7 @@ ABCSeries, ) +from pandas.core._numba.executor import generate_apply_looper import pandas.core.common as com from pandas.core.construction import ensure_wrapped_if_datetimelike @@ -80,6 +81,8 @@ def frame_apply( raw: bool = False, result_type: str | None = None, by_row: Literal[False, "compat"] = "compat", + engine: str = "python", + engine_kwargs: dict = {}, args=None, kwargs=None, ) -> FrameApply: @@ -100,6 +103,8 @@ def frame_apply( raw=raw, result_type=result_type, by_row=by_row, + engine=engine, + engine_kwargs=engine_kwargs, args=args, kwargs=kwargs, ) @@ -748,11 +753,15 @@ def __init__( result_type: str | None, *, by_row: Literal[False, "compat"] = False, + engine: str = "python", + engine_kwargs: dict = {}, args, kwargs, ) -> None: if by_row is not False and by_row != "compat": raise ValueError(f"by_row={by_row} not allowed") + self.engine = engine + self.engine_kwargs = engine_kwargs super().__init__( obj, func, raw, result_type, by_row=by_row, args=args, kwargs=kwargs ) @@ -797,6 +806,12 @@ def values(self): def apply(self) -> DataFrame | Series: """compute the results""" + + if self.engine == "numba" and not self.raw: + raise ValueError( + "The numba engine in DataFrame.apply can only be used when raw=True" + ) + # dispatch to handle list-like or dict-like if is_list_like(self.func): return 
self.apply_list_or_dict_like() @@ -826,7 +841,7 @@ def apply(self) -> DataFrame | Series: # raw elif self.raw: - return self.apply_raw() + return self.apply_raw(engine=self.engine, engine_kwargs=self.engine_kwargs) return self.apply_standard() @@ -899,7 +914,7 @@ def apply_empty_result(self): else: return self.obj.copy() - def apply_raw(self): + def apply_raw(self, engine="python", engine_kwargs={}): """apply to the values as a numpy array""" def wrap_function(func): @@ -917,7 +932,15 @@ def wrapper(*args, **kwargs): return wrapper - result = np.apply_along_axis(wrap_function(self.func), self.axis, self.values) + if engine == "numba": + nb_looper = generate_apply_looper(self.func, **engine_kwargs) + result = nb_looper(self.values, self.axis) + # If we made the result 2-D, squeeze it back to 1-D + result = np.squeeze(result) + else: + result = np.apply_along_axis( + wrap_function(self.func), self.axis, self.values + ) # TODO: mixed type case if result.ndim == 2: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1e10e8f11a575..e35b2f5e1bcb6 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9874,6 +9874,8 @@ def apply( result_type: Literal["expand", "reduce", "broadcast"] | None = None, args=(), by_row: Literal[False, "compat"] = "compat", + engine: str = "python", + engine_kwargs: dict = {}, **kwargs, ): """ @@ -9933,6 +9935,27 @@ def apply( If False, the funcs will be passed the whole Series at once. .. versionadded:: 2.1.0 + + engine: {'python', 'numba'}, default 'python' + Choose between the python (default) engine or the numba engine in apply. + + The numba engine will attempt to JIT compile the passed function, + which may result in speedups for large DataFrames. + It also supports the following engine_kwargs + - nopython (compile the function in nopython mode) + - nogil (release the GIL inside the JIT compiled function) + - parallel (try to apply the function in parallel over the DataFrame) + + .. 
note:: + + As of right now, the numba engine can only be used with raw=True. + + .. versionadded:: 2.2.0 + + engine_kwargs: dict + Pass keyword arguments to the engine. + This is currently only used by the numba engine, + see the documentation for the engine argument for more information. **kwargs Additional keyword arguments to pass as keywords arguments to `func`. @@ -10033,6 +10056,8 @@ def apply( raw=raw, result_type=result_type, by_row=by_row, + engine=engine, + engine_kwargs=engine_kwargs, args=args, kwargs=kwargs, ) diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 3a3f73a68374b..3f2accc23e2d6 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -18,6 +18,13 @@ from pandas.tests.frame.common import zip_frames +@pytest.fixture(params=["python", "numba"]) +def engine(request): + if request.param == "numba": + pytest.importorskip("numba") + return request.param + + def test_apply(float_frame): with np.errstate(all="ignore"): # ufunc @@ -234,36 +241,42 @@ def test_apply_broadcast_series_lambda_func(int_frame_const_col): @pytest.mark.parametrize("axis", [0, 1]) -def test_apply_raw_float_frame(float_frame, axis): +def test_apply_raw_float_frame(float_frame, axis, engine): + if engine == "numba": + pytest.skip("numba can't handle when UDF returns None.") + def _assert_raw(x): assert isinstance(x, np.ndarray) assert x.ndim == 1 - float_frame.apply(_assert_raw, axis=axis, raw=True) + float_frame.apply(_assert_raw, axis=axis, engine=engine, raw=True) @pytest.mark.parametrize("axis", [0, 1]) -def test_apply_raw_float_frame_lambda(float_frame, axis): - result = float_frame.apply(np.mean, axis=axis, raw=True) +def test_apply_raw_float_frame_lambda(float_frame, axis, engine): + result = float_frame.apply(np.mean, axis=axis, engine=engine, raw=True) expected = float_frame.apply(lambda x: x.values.mean(), axis=axis) tm.assert_series_equal(result, expected) -def 
test_apply_raw_float_frame_no_reduction(float_frame): +def test_apply_raw_float_frame_no_reduction(float_frame, engine): # no reduction - result = float_frame.apply(lambda x: x * 2, raw=True) + result = float_frame.apply(lambda x: x * 2, engine=engine, raw=True) expected = float_frame * 2 tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("axis", [0, 1]) -def test_apply_raw_mixed_type_frame(mixed_type_frame, axis): +def test_apply_raw_mixed_type_frame(mixed_type_frame, axis, engine): + if engine == "numba": + pytest.skip("isinstance check doesn't work with numba") + def _assert_raw(x): assert isinstance(x, np.ndarray) assert x.ndim == 1 # Mixed dtype (GH-32423) - mixed_type_frame.apply(_assert_raw, axis=axis, raw=True) + mixed_type_frame.apply(_assert_raw, axis=axis, engine=engine, raw=True) def test_apply_axis1(float_frame): @@ -300,14 +313,20 @@ def test_apply_mixed_dtype_corner_indexing(): ) @pytest.mark.parametrize("raw", [True, False]) @pytest.mark.parametrize("axis", [0, 1]) -def test_apply_empty_infer_type(ax, func, raw, axis): +def test_apply_empty_infer_type(ax, func, raw, axis, engine, request): df = DataFrame(**{ax: ["a", "b", "c"]}) with np.errstate(all="ignore"): test_res = func(np.array([], dtype="f8")) is_reduction = not isinstance(test_res, np.ndarray) - result = df.apply(func, axis=axis, raw=raw) + if engine == "numba" and raw is False: + mark = pytest.mark.xfail( + reason="numba engine only supports raw=True at the moment" + ) + request.node.add_marker(mark) + + result = df.apply(func, axis=axis, engine=engine, raw=raw) if is_reduction: agg_axis = df._get_agg_axis(axis) assert isinstance(result, Series) @@ -607,8 +626,10 @@ def non_reducing_function(row): assert names == list(df.index) -def test_apply_raw_function_runs_once(): +def test_apply_raw_function_runs_once(engine): # https://github.com/pandas-dev/pandas/issues/34506 + if engine == "numba": + pytest.skip("appending to list outside of numba func is not supported") df = 
DataFrame({"a": [1, 2, 3]}) values = [] # Save row values function is applied to @@ -623,7 +644,7 @@ def non_reducing_function(row): for func in [reducing_function, non_reducing_function]: del values[:] - df.apply(func, raw=True, axis=1) + df.apply(func, engine=engine, raw=True, axis=1) assert values == list(df.a.to_list()) @@ -1449,10 +1470,12 @@ def test_apply_no_suffix_index(): tm.assert_frame_equal(result, expected) -def test_apply_raw_returns_string(): +def test_apply_raw_returns_string(engine): # https://github.com/pandas-dev/pandas/issues/35940 + if engine == "numba": + pytest.skip("No object dtype support in numba") df = DataFrame({"A": ["aa", "bbb"]}) - result = df.apply(lambda x: x[0], axis=1, raw=True) + result = df.apply(lambda x: x[0], engine=engine, axis=1, raw=True) expected = Series(["aa", "bbb"]) tm.assert_series_equal(result, expected) @@ -1632,3 +1655,14 @@ def test_agg_dist_like_and_nonunique_columns(): result = df.agg({"A": "count"}) expected = df["A"].count() tm.assert_series_equal(result, expected) + + +def test_numba_unsupported(): + df = DataFrame( + {"A": [None, 2, 3], "B": [1.0, np.nan, 3.0], "C": ["foo", None, "bar"]} + ) + with pytest.raises( + ValueError, + match="The numba engine in DataFrame.apply can only be used when raw=True", + ): + df.apply(lambda x: x, engine="numba", raw=False) From cca465623d557e6b5e8a8bcf84e6f4f25704a476 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Tue, 22 Aug 2023 22:03:31 -0700 Subject: [PATCH 2/9] fixes --- pandas/core/apply.py | 8 +++++++- pandas/core/frame.py | 6 +++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index e018115734851..141921a430291 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -933,7 +933,13 @@ def wrapper(*args, **kwargs): return wrapper if engine == "numba": - nb_looper = generate_apply_looper(self.func, **engine_kwargs) + # error: Argument 1 to "__call__" of 
"_lru_cache_wrapper" has + # incompatible type "Callable[..., Any] | str | list[Callable + # [..., Any] | str] | dict[Hashable,Callable[..., Any] | str | + # list[Callable[..., Any] | str]]"; expected "Hashable" + nb_looper = generate_apply_looper( + self.func, **engine_kwargs + ) # type: ignore[arg-type] result = nb_looper(self.values, self.axis) # If we made the result 2-D, squeeze it back to 1-D result = np.squeeze(result) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e35b2f5e1bcb6..5b592ac2d7763 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9936,12 +9936,12 @@ def apply( .. versionadded:: 2.1.0 - engine: {'python', 'numba'}, default 'python' + engine : {'python', 'numba'}, default 'python' Choose between the python (default) engine or the numba engine in apply. The numba engine will attempt to JIT compile the passed function, which may result in speedups for large DataFrames. - It also supports the following engine_kwargs + It also supports the following engine_kwargs: - nopython (compile the function in nopython mode) - nogil (release the GIL inside the JIT compiled function) - parallel (try to apply the function in parallel over the DataFrame) @@ -9952,7 +9952,7 @@ def apply( .. versionadded:: 2.2.0 - engine_kwargs: dict + engine_kwargs : dict Pass keyword arguments to the engine. This is currently only used by the numba engine, see the documentation for the engine argument for more information. 
From 83d45980b3b34dbd964ffb82b4fee09c53561b30 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 23 Aug 2023 06:39:33 -0700 Subject: [PATCH 3/9] more fixes --- pandas/core/frame.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5b592ac2d7763..7c1dd86feb946 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9946,6 +9946,7 @@ def apply( - nogil (release the GIL inside the JIT compiled function) - parallel (try to apply the function in parallel over the DataFrame) + .. note:: As of right now, the numba engine can only be used with raw=True. From 81e85cc5372297481b42182149d15a61d97a06a7 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 23 Aug 2023 22:05:39 -0700 Subject: [PATCH 4/9] try to fix --- pandas/core/apply.py | 4 ++-- pandas/core/frame.py | 12 +++++------- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 141921a430291..0a0af205ade40 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -938,8 +938,8 @@ def wrapper(*args, **kwargs): # [..., Any] | str] | dict[Hashable,Callable[..., Any] | str | # list[Callable[..., Any] | str]]"; expected "Hashable" nb_looper = generate_apply_looper( - self.func, **engine_kwargs - ) # type: ignore[arg-type] + self.func, **engine_kwargs # type: ignore[arg-type] + ) result = nb_looper(self.values, self.axis) # If we made the result 2-D, squeeze it back to 1-D result = np.squeeze(result) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7c1dd86feb946..032276c763b8e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9941,15 +9941,13 @@ def apply( The numba engine will attempt to JIT compile the passed function, which may result in speedups for large DataFrames. 
- It also supports the following engine_kwargs: - nopython (compile the function in nopython mode) - nogil (release the GIL inside the JIT compiled function) - parallel (try to apply the function in parallel over the DataFrame) + It also supports the following engine_kwargs : + - nopython (compile the function in nopython mode) + - nogil (release the GIL inside the JIT compiled function) + - parallel (try to apply the function in parallel over the DataFrame) - .. note:: - - As of right now, the numba engine can only be used with raw=True. + As of right now, the numba engine can only be used with raw=True. .. versionadded:: 2.2.0 From 5da0723751e7bda5ee3ec066f3bce197942093e2 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Fri, 8 Sep 2023 16:32:31 -0400 Subject: [PATCH 5/9] address code review --- pandas/core/frame.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 35e44bbd23422..6443db9b6bf52 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9919,8 +9919,8 @@ def apply( result_type: Literal["expand", "reduce", "broadcast"] | None = None, args=(), by_row: Literal[False, "compat"] = "compat", - engine: str = "python", - engine_kwargs: dict = {}, + engine: Literal["python", "numba"] = "python", + engine_kwargs: dict[str, bool] | None = None, **kwargs, ): """ @@ -9992,6 +9992,13 @@ def apply( - nogil (release the GIL inside the JIT compiled function) - parallel (try to apply the function in parallel over the DataFrame) + Note: The numba compiler only supports a subset of valid Python/numpy operations. + Please read more about the `supported python features + <https://numba.pydata.org/numba-doc/dev/reference/pysupported.html>`_ + and `supported numpy features + <https://numba.pydata.org/numba-doc/dev/reference/numpysupported.html>`_ + in numba to learn what you can or cannot use in the passed function. + As of right now, the numba engine can only be used with raw=True. .. 
versionadded:: 2.2.0 From dae9a150921106968f998c2fcc690965fffbd307 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 8 Sep 2023 20:51:29 +0000 Subject: [PATCH 6/9] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pandas/core/frame.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6443db9b6bf52..973c056bcadc8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9993,9 +9993,9 @@ def apply( - parallel (try to apply the function in parallel over the DataFrame) Note: The numba compiler only supports a subset of valid Python/numpy operations. - Please read more about the `supported python features + Please read more about the `supported python features <https://numba.pydata.org/numba-doc/dev/reference/pysupported.html>`_ - and `supported numpy features + and `supported numpy features <https://numba.pydata.org/numba-doc/dev/reference/numpysupported.html>`_ in numba to learn what you can or cannot use in the passed function. 
From dc5a734ffc1331d49a49732520833ad9720cb353 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Fri, 8 Sep 2023 18:24:20 -0400 Subject: [PATCH 7/9] go for green --- pandas/core/apply.py | 7 +++++-- pandas/core/frame.py | 4 +++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index fb798e5e5ef45..30e6f7f81b845 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -82,7 +82,7 @@ def frame_apply( result_type: str | None = None, by_row: Literal[False, "compat"] = "compat", engine: str = "python", - engine_kwargs: dict = {}, + engine_kwargs: dict[str, bool] | None = None, args=None, kwargs=None, ) -> FrameApply: @@ -916,7 +916,7 @@ def apply_empty_result(self): else: return self.obj.copy() - def apply_raw(self, engine="python", engine_kwargs={}): + def apply_raw(self, engine="python", engine_kwargs=None): """apply to the values as a numpy array""" def wrap_function(func): @@ -935,6 +935,9 @@ def wrapper(*args, **kwargs): return wrapper if engine == "numba": + + engine_kwargs = {} if engine_kwargs is None else engine_kwargs + # error: Argument 1 to "__call__" of "_lru_cache_wrapper" has # incompatible type "Callable[..., Any] | str | list[Callable # [..., Any] | str] | dict[Hashable,Callable[..., Any] | str | diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 973c056bcadc8..156863c0aa333 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9992,7 +9992,9 @@ def apply( - nogil (release the GIL inside the JIT compiled function) - parallel (try to apply the function in parallel over the DataFrame) - Note: The numba compiler only supports a subset of valid Python/numpy operations. + Note: The numba compiler only supports a subset of + valid Python/numpy operations. 
+ Please read more about the `supported python features <https://numba.pydata.org/numba-doc/dev/reference/pysupported.html>`_ and `supported numpy features From 8504f4540f810cd007b8da5d286e0921764d1426 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 8 Sep 2023 22:51:40 +0000 Subject: [PATCH 8/9] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pandas/core/apply.py | 1 - pandas/core/frame.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 30e6f7f81b845..0264030fde9a4 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -935,7 +935,6 @@ def wrapper(*args, **kwargs): return wrapper if engine == "numba": - engine_kwargs = {} if engine_kwargs is None else engine_kwargs # error: Argument 1 to "__call__" of "_lru_cache_wrapper" has diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 156863c0aa333..5b17d1480c811 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9992,9 +9992,9 @@ def apply( - nogil (release the GIL inside the JIT compiled function) - parallel (try to apply the function in parallel over the DataFrame) - Note: The numba compiler only supports a subset of + Note: The numba compiler only supports a subset of valid Python/numpy operations. 
- + Please read more about the `supported python features <https://numba.pydata.org/numba-doc/dev/reference/pysupported.html>`_ and `supported numpy features From 4fbd6ae48b3f148b43a8c2d3559239f1e2ba5c86 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Fri, 8 Sep 2023 19:29:09 -0400 Subject: [PATCH 9/9] update type --- pandas/core/apply.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 0264030fde9a4..2f53263444eed 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -756,7 +756,7 @@ def __init__( *, by_row: Literal[False, "compat"] = False, engine: str = "python", - engine_kwargs: dict = {}, + engine_kwargs: dict[str, bool] | None = None, args, kwargs, ) -> None: