From 76aa92e55ebd3b6df61d714d770c6038ca43c9a9 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sun, 14 Nov 2021 19:32:39 -0800 Subject: [PATCH 01/10] ENH: Add numba engine to rolling.var --- doc/source/whatsnew/v1.4.0.rst | 2 + pandas/core/_numba/executor.py | 45 +++++++--- pandas/core/_numba/kernels/__init__.py | 3 +- pandas/core/_numba/kernels/var_.py | 116 +++++++++++++++++++++++++ pandas/core/window/expanding.py | 13 ++- pandas/core/window/rolling.py | 21 ++++- pandas/tests/window/conftest.py | 1 + pandas/tests/window/test_numba.py | 45 ++++++---- 8 files changed, 212 insertions(+), 34 deletions(-) create mode 100644 pandas/core/_numba/kernels/var_.py diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index a593a03de5c25..d6b71747f3757 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -210,6 +210,8 @@ Other enhancements - :meth:`read_excel` now accepts a ``decimal`` argument that allow the user to specify the decimal point when parsing string columns to numeric (:issue:`14403`) - :meth:`.GroupBy.mean` now supports `Numba `_ execution with the ``engine`` keyword (:issue:`43731`) - :meth:`Timestamp.isoformat`, now handles the ``timespec`` argument from the base :class:``datetime`` class (:issue:`26131`) +- :meth:`.Rolling.var` and :meth:`.Expanding.var` now support `Numba `_ execution with the ``engine`` keyword (:issue:`38895`, :issue:`41267`) + .. --------------------------------------------------------------------------- diff --git a/pandas/core/_numba/executor.py b/pandas/core/_numba/executor.py index c2b6191c05152..2ff478658eaa7 100644 --- a/pandas/core/_numba/executor.py +++ b/pandas/core/_numba/executor.py @@ -44,17 +44,38 @@ def generate_shared_aggregator( numba = import_optional_dependency("numba") - # error: Untyped decorator makes function "column_looper" untyped - @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) # type: ignore[misc] - def column_looper( - values: np.ndarray, - start: np.ndarray, - end: np.ndarray, - min_periods: int, - ): - result = np.empty((len(start), values.shape[1]), dtype=np.float64) - for i in numba.prange(values.shape[1]): - result[:, i] = func(values[:, i], start, end, min_periods) - return result + # Avoiding **kwargs usage: https://github.com/numba/numba/issues/2916 + if "var" in cache_key_str: + # error: Untyped decorator makes function "column_looper" untyped + @numba.jit( + nopython=nopython, nogil=nogil, parallel=parallel + ) # type: ignore[misc] + def column_looper( + values: np.ndarray, + start: np.ndarray, + end: np.ndarray, + min_periods: int, + ddof: int, + ): + result = np.empty((len(start), values.shape[1]), dtype=np.float64) + for i in numba.prange(values.shape[1]): + result[:, i] = func(values[:, i], start, end, min_periods, ddof) + return result + + else: + # error: Untyped decorator makes function "column_looper" untyped + @numba.jit( + nopython=nopython, nogil=nogil, parallel=parallel + ) # type: ignore[misc] + def column_looper( + values: np.ndarray, + start: np.ndarray, + end: np.ndarray, + min_periods: int, + ): + result = np.empty((len(start), values.shape[1]), dtype=np.float64) + for i in numba.prange(values.shape[1]): + result[:, i] = func(values[:, i], start, end, min_periods) + return result return column_looper diff --git a/pandas/core/_numba/kernels/__init__.py b/pandas/core/_numba/kernels/__init__.py index 23b0ec5c3d8aa..2753a1e01161d 100644 --- a/pandas/core/_numba/kernels/__init__.py +++ b/pandas/core/_numba/kernels/__init__.py @@ -1,4 +1,5 @@ from pandas.core._numba.kernels.mean_ import sliding_mean from pandas.core._numba.kernels.sum_ import sliding_sum +from pandas.core._numba.kernels.var_ import sliding_var -__all__ = ["sliding_mean", "sliding_sum"] +__all__ = ["sliding_mean", "sliding_sum", "sliding_var"] diff --git a/pandas/core/_numba/kernels/var_.py b/pandas/core/_numba/kernels/var_.py new file mode 100644 index 0000000000000..fdbd53b9240f5 --- /dev/null +++ b/pandas/core/_numba/kernels/var_.py @@ -0,0 +1,116 @@ +""" +Numba 1D var kernels that can be shared by +* Dataframe / Series +* groupby +* rolling / expanding + +Mirrors pandas/_libs/window/aggregation.pyx +""" +from __future__ import annotations + +import numba +import numpy as np + +from pandas.core._numba.kernels.shared import is_monotonic_increasing + + +@numba.jit(nopython=True, nogil=True, parallel=False) +def add_var( + val: float, nobs: int, mean_x: float, ssqdm_x: int, compensation: float +) -> tuple[int, float, int, float]: + if not np.isnan(val): + nobs += 1 + prev_mean = mean_x - compensation + y = val - compensation + t = y - mean_x + compensation = t + mean_x - y + delta = t + if nobs: + mean_x += delta / nobs + else: + mean_x = 0 + ssqdm_x += (val - prev_mean) * (val - mean_x) + return nobs, mean_x, ssqdm_x, compensation + + +@numba.jit(nopython=True, nogil=True, parallel=False) +def remove_var( + val: float, nobs: int, mean_x: float, ssqdm_x: int, compensation: float +) -> tuple[int, float, int, float]: + if not np.isnan(val): + nobs -= 1 + if nobs: + prev_mean = mean_x - compensation + y = val - compensation + t = y - mean_x + compensation = t + mean_x - y + delta = t + mean_x -= delta / nobs + ssqdm_x -= (val - prev_mean) * (val - mean_x) + else: + mean_x = 0 + ssqdm_x = 0 + return nobs, mean_x, ssqdm_x, compensation + + +@numba.jit(nopython=True, nogil=True, parallel=False) +def sliding_var( + values: np.ndarray, + start: np.ndarray, + end: np.ndarray, + min_periods: int, + ddof: int = 1, +) -> np.ndarray: + N = len(start) + nobs = 0 + mean_x = 0.0 + ssqdm_x = 0.0 + compensation_add = 0.0 + compensation_remove = 0.0 + + min_periods = max(min_periods, 1) + is_monotonic_increasing_bounds = is_monotonic_increasing( + start + ) and is_monotonic_increasing(end) + + output = np.empty(N, dtype=np.float64) + + for i in range(N): + s = start[i] + e = end[i] + if i == 0 or not is_monotonic_increasing_bounds: + for j in range(s, e): + val = values[j] + nobs, mean_x, ssqdm_x, compensation_add = add_var( + val, nobs, mean_x, ssqdm_x, compensation_add + ) + else: + for j in range(start[i - 1], s): + val = values[j] + nobs, mean_x, ssqdm_x, compensation_remove = remove_var( + val, nobs, mean_x, ssqdm_x, compensation_remove + ) + + for j in range(end[i - 1], e): + val = values[j] + nobs, mean_x, ssqdm_x, compensation_add = add_var( + val, nobs, mean_x, ssqdm_x, compensation_add + ) + + if nobs >= min_periods and nobs > ddof: + if nobs == 1: + result = 0 + else: + result = ssqdm_x / (nobs - ddof) + else: + result = np.nan + + output[i] = result + + if not is_monotonic_increasing_bounds: + nobs = 0 + mean_x = 0.0 + ssqdm_x = 0.0 + compensation_remove = 0.0 + + return output diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py index 796849e622ff2..10dfc89ea8b87 100644 --- a/pandas/core/window/expanding.py +++ b/pandas/core/window/expanding.py @@ -446,9 +446,18 @@ def std(self, ddof: int = 1, *args, **kwargs): aggregation_description="variance", agg_method="var", ) - def var(self, ddof: int = 1, *args, **kwargs): + def var( + self, + ddof: int = 1, + engine: str | None = None, + engine_kwargs: dict[str, bool] | None = None, + *args, + **kwargs, + ): nv.validate_expanding_func("var", args, kwargs) - return super().var(ddof=ddof, **kwargs) + return super().var( + ddof=ddof, engine=engine, engine_kwargs=engine_kwargs, **kwargs + ) @doc( template_header, diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index f7799912937b7..9be0cc2d233c0 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -589,6 +589,7 @@ def _numba_apply( func: Callable[..., Any], numba_cache_key_str: str, engine_kwargs: dict[str, bool] | None = None, + **func_kwargs, ): window_indexer = self._get_window_indexer() min_periods = ( @@ -611,7 +612,7 @@ def _numba_apply( aggregator = executor.generate_shared_aggregator( func, engine_kwargs, numba_cache_key_str ) - result = aggregator(values, start, end, min_periods) + result = aggregator(values, start, end, min_periods, **func_kwargs) NUMBA_FUNC_CACHE[(func, numba_cache_key_str)] = aggregator result = result.T if self.axis == 1 else result if obj.ndim == 1: @@ -1462,7 +1463,23 @@ def zsqrt_func(values, begin, end, min_periods): **kwargs, ) - def var(self, ddof: int = 1, *args, **kwargs): + def var( + self, + ddof: int = 1, + engine: str | None = None, + engine_kwargs: dict[str, bool] | None = None, + *args, + **kwargs, + ): + if maybe_use_numba(engine): + if self.method == "table": + raise NotImplementedError("var not supported with method='table'") + else: + from pandas.core._numba.kernels import sliding_var + + return self._numba_apply( + sliding_var, "rolling_var", engine_kwargs, ddof=ddof + ) nv.validate_window_func("var", args, kwargs) window_func = partial(window_aggregations.roll_var, ddof=ddof) return self._apply( diff --git a/pandas/tests/window/conftest.py b/pandas/tests/window/conftest.py index 7b1aa93b5923a..29b68ac75c939 100644 --- a/pandas/tests/window/conftest.py +++ b/pandas/tests/window/conftest.py @@ -69,6 +69,7 @@ def arithmetic_win_operators(request): "median", "max", "min", + "var", ] ) def arithmetic_numba_supported_operators(request): diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py index 9fd4bd422178a..87f30d51bad74 100644 --- a/pandas/tests/window/test_numba.py +++ b/pandas/tests/window/test_numba.py @@ -59,7 +59,7 @@ def test_numba_vs_cython_rolling_methods( expected = getattr(roll, method)(engine="cython") # Check the cache - if method not in ("mean", "sum"): + if method not in ("mean", "sum", "var"): assert ( getattr(np, f"nan{method}"), "Rolling_apply_single", @@ -84,7 +84,7 @@ def test_numba_vs_cython_expanding_methods( expected = getattr(expand, method)(engine="cython") # Check the cache - if method not in ("mean", "sum"): + if method not in ("mean", "sum", "var"): assert ( getattr(np, f"nan{method}"), "Expanding_apply_single", @@ -287,14 +287,19 @@ def test_table_method_rolling_methods( engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} df = DataFrame(np.eye(3)) - - result = getattr( - df.rolling(2, method="table", axis=axis, min_periods=0), method - )(engine_kwargs=engine_kwargs, engine="numba") - expected = getattr( - df.rolling(2, method="single", axis=axis, min_periods=0), method - )(engine_kwargs=engine_kwargs, engine="numba") - tm.assert_frame_equal(result, expected) + if method == "var": + with pytest.raises(NotImplementedError, match=f"{method} not supported"): + getattr( + df.rolling(2, method="table", axis=axis, min_periods=0), method + )(engine_kwargs=engine_kwargs, engine="numba") + else: + result = getattr( + df.rolling(2, method="table", axis=axis, min_periods=0), method + )(engine_kwargs=engine_kwargs, engine="numba") + expected = getattr( + df.rolling(2, method="single", axis=axis, min_periods=0), method + )(engine_kwargs=engine_kwargs, engine="numba") + tm.assert_frame_equal(result, expected) def test_table_method_rolling_apply(self, axis, nogil, parallel, nopython): engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} @@ -355,13 +360,19 @@ def test_table_method_expanding_methods( df = DataFrame(np.eye(3)) - result = getattr(df.expanding(method="table", axis=axis), method)( - engine_kwargs=engine_kwargs, engine="numba" - ) - expected = getattr(df.expanding(method="single", axis=axis), method)( - engine_kwargs=engine_kwargs, engine="numba" - ) - tm.assert_frame_equal(result, expected) + if method == "var": + with pytest.raises(NotImplementedError, match=f"{method} not supported"): + getattr(df.expanding(method="table", axis=axis), method)( + engine_kwargs=engine_kwargs, engine="numba" + ) + else: + result = getattr(df.expanding(method="table", axis=axis), method)( + engine_kwargs=engine_kwargs, engine="numba" + ) + expected = getattr(df.expanding(method="single", axis=axis), method)( + engine_kwargs=engine_kwargs, engine="numba" + ) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("data", [np.eye(3), np.ones((2, 3)), np.ones((3, 2))]) @pytest.mark.parametrize("method", ["mean", "sum"]) From 450e601b90d1a75fef3b4f87c72de906e308f91d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sun, 14 Nov 2021 19:39:44 -0800 Subject: [PATCH 02/10] Fix typing --- pandas/core/_numba/kernels/var_.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/_numba/kernels/var_.py b/pandas/core/_numba/kernels/var_.py index fdbd53b9240f5..2e5660673701b 100644 --- a/pandas/core/_numba/kernels/var_.py +++ b/pandas/core/_numba/kernels/var_.py @@ -16,8 +16,8 @@ @numba.jit(nopython=True, nogil=True, parallel=False) def add_var( - val: float, nobs: int, mean_x: float, ssqdm_x: int, compensation: float -) -> tuple[int, float, int, float]: + val: float, nobs: int, mean_x: float, ssqdm_x: float, compensation: float +) -> tuple[int, float, float, float]: if not np.isnan(val): nobs += 1 prev_mean = mean_x - compensation @@ -35,8 +35,8 @@ def add_var( @numba.jit(nopython=True, nogil=True, parallel=False) def remove_var( - val: float, nobs: int, mean_x: float, ssqdm_x: int, compensation: float -) -> tuple[int, float, int, float]: + val: float, nobs: int, mean_x: float, ssqdm_x: float, compensation: float +) -> tuple[int, float, float, float]: if not np.isnan(val): nobs -= 1 if nobs: @@ -99,7 +99,7 @@ def sliding_var( if nobs >= min_periods and nobs > ddof: if nobs == 1: - result = 0 + result = 0.0 else: result = ssqdm_x / (nobs - ddof) else: From d9391fe26c7e5a3909d29f7945c5bf8fa2a989cf Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sun, 14 Nov 2021 20:05:12 -0800 Subject: [PATCH 03/10] Add std, support multiple versions in numba args docstring --- doc/source/whatsnew/v1.4.0.rst | 2 +- pandas/core/_numba/executor.py | 2 +- pandas/core/window/doc.py | 16 ++++++++---- pandas/core/window/ewm.py | 4 +-- pandas/core/window/expanding.py | 21 +++++++++++----- pandas/core/window/rolling.py | 41 +++++++++++++++++++++++++------ pandas/tests/window/test_numba.py | 4 +-- 7 files changed, 66 insertions(+), 24 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index d6b71747f3757..4614441def361 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -210,7 +210,7 @@ Other enhancements - :meth:`read_excel` now accepts a ``decimal`` argument that allow the user to specify the decimal point when parsing string columns to numeric (:issue:`14403`) - :meth:`.GroupBy.mean` now supports `Numba `_ execution with the ``engine`` keyword (:issue:`43731`) - :meth:`Timestamp.isoformat`, now handles the ``timespec`` argument from the base :class:``datetime`` class (:issue:`26131`) -- :meth:`.Rolling.var` and :meth:`.Expanding.var` now support `Numba `_ execution with the ``engine`` keyword (:issue:`38895`, :issue:`41267`) +- :meth:`.Rolling.var`, :meth:`.Expanding.var`, :meth:`.Rolling.std`, :meth:`.Expanding.std` now support `Numba `_ execution with the ``engine`` keyword (:issue:`38895`, :issue:`41267`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/_numba/executor.py b/pandas/core/_numba/executor.py index 2ff478658eaa7..8fa28199a8450 100644 --- a/pandas/core/_numba/executor.py +++ b/pandas/core/_numba/executor.py @@ -45,7 +45,7 @@ def generate_shared_aggregator( numba = import_optional_dependency("numba") # Avoiding **kwargs usage: https://github.com/numba/numba/issues/2916 - if "var" in cache_key_str: + if any(func in cache_key_str for func in ("var", "std")): # error: Untyped decorator makes function "column_looper" untyped @numba.jit( nopython=nopython, nogil=nogil, parallel=parallel diff --git a/pandas/core/window/doc.py b/pandas/core/window/doc.py index 2cc7962c6bd7b..e4c4f82b423cf 100644 --- a/pandas/core/window/doc.py +++ b/pandas/core/window/doc.py @@ -98,14 +98,17 @@ def create_section_header(header: str) -> str: "extended documentation and performance considerations for the Numba engine.\n\n" ) -window_agg_numba_parameters = dedent( - """ + +def window_agg_numba_parameters(version: str = "1.3") -> str: + return ( + dedent( + """ engine : str, default None * ``'cython'`` : Runs the operation through C-extensions from cython. * ``'numba'`` : Runs the operation through JIT compiled code from numba. * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba`` - .. versionadded:: 1.3.0 + .. versionadded:: {version}.0 engine_kwargs : dict, default None * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` @@ -114,6 +117,9 @@ def create_section_header(header: str) -> str: ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` - .. versionadded:: 1.3.0\n + .. versionadded:: {version}.0\n """ -).replace("\n", "", 1) + ) + .replace("\n", "", 1) + .replace("{version}", version) + ) diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index f5f681d9de797..7ea312219c33e 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -504,7 +504,7 @@ def aggregate(self, func, *args, **kwargs): template_header, create_section_header("Parameters"), args_compat, - window_agg_numba_parameters, + window_agg_numba_parameters(), kwargs_compat, create_section_header("Returns"), template_returns, @@ -558,7 +558,7 @@ def mean(self, *args, engine=None, engine_kwargs=None, **kwargs): template_header, create_section_header("Parameters"), args_compat, - window_agg_numba_parameters, + window_agg_numba_parameters(), kwargs_compat, create_section_header("Returns"), template_returns, diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py index 10dfc89ea8b87..2d213ca10801e 100644 --- a/pandas/core/window/expanding.py +++ b/pandas/core/window/expanding.py @@ -227,7 +227,7 @@ def apply( template_header, create_section_header("Parameters"), args_compat, - window_agg_numba_parameters, + window_agg_numba_parameters(), kwargs_compat, create_section_header("Returns"), template_returns, @@ -253,7 +253,7 @@ def sum( template_header, create_section_header("Parameters"), args_compat, - window_agg_numba_parameters, + window_agg_numba_parameters(), kwargs_compat, create_section_header("Returns"), template_returns, @@ -279,7 +279,7 @@ def max( template_header, create_section_header("Parameters"), args_compat, - window_agg_numba_parameters, + window_agg_numba_parameters(), kwargs_compat, create_section_header("Returns"), template_returns, @@ -305,7 +305,7 @@ def min( template_header, create_section_header("Parameters"), args_compat, - window_agg_numba_parameters, + window_agg_numba_parameters(), kwargs_compat, create_section_header("Returns"), template_returns, @@ -330,7 +330,7 @@ def mean( @doc( template_header, create_section_header("Parameters"), - window_agg_numba_parameters, + window_agg_numba_parameters(), kwargs_compat, create_section_header("Returns"), template_returns, @@ -360,6 +360,7 @@ def median( is ``N - ddof``, where ``N`` represents the number of elements.\n """ ).replace("\n", "", 1), + window_agg_numba_parameters("1.4"), args_compat, kwargs_compat, create_section_header("Returns"), @@ -396,7 +397,14 @@ def median( aggregation_description="standard deviation", agg_method="std", ) - def std(self, ddof: int = 1, *args, **kwargs): + def std( + self, + ddof: int = 1, + engine: str | None = None, + engine_kwargs: dict[str, bool] | None = None, + *args, + **kwargs, + ): nv.validate_expanding_func("std", args, kwargs) return super().std(ddof=ddof, **kwargs) @@ -410,6 +418,7 @@ def std(self, ddof: int = 1, *args, **kwargs): is ``N - ddof``, where ``N`` represents the number of elements.\n """ ).replace("\n", "", 1), + window_agg_numba_parameters("1.4"), args_compat, kwargs_compat, create_section_header("Returns"), diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 9be0cc2d233c0..553a95c61c921 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1450,7 +1450,25 @@ def median( window_func = window_aggregations.roll_median_c return self._apply(window_func, name="median", **kwargs) - def std(self, ddof: int = 1, *args, **kwargs): + def std( + self, + ddof: int = 1, + engine: str | None = None, + engine_kwargs: dict[str, bool] | None = None, + *args, + **kwargs, + ): + if maybe_use_numba(engine): + if self.method == "table": + raise NotImplementedError("std not supported with method='table'") + else: + from pandas.core._numba.kernels import sliding_var + + return zsqrt( + self._numba_apply( + sliding_var, "rolling_std", engine_kwargs, ddof=ddof + ) + ) nv.validate_window_func("std", args, kwargs) window_func = window_aggregations.roll_var @@ -1823,7 +1841,7 @@ def apply( template_header, create_section_header("Parameters"), args_compat, - window_agg_numba_parameters, + window_agg_numba_parameters(), kwargs_compat, create_section_header("Returns"), template_returns, @@ -1897,7 +1915,7 @@ def sum( template_header, create_section_header("Parameters"), args_compat, - window_agg_numba_parameters, + window_agg_numba_parameters(), kwargs_compat, create_section_header("Returns"), template_returns, @@ -1923,7 +1941,7 @@ def max( template_header, create_section_header("Parameters"), args_compat, - window_agg_numba_parameters, + window_agg_numba_parameters(), kwargs_compat, create_section_header("Returns"), template_returns, @@ -1964,7 +1982,7 @@ def min( template_header, create_section_header("Parameters"), args_compat, - window_agg_numba_parameters, + window_agg_numba_parameters(), kwargs_compat, create_section_header("Returns"), template_returns, @@ -2011,7 +2029,7 @@ def mean( @doc( template_header, create_section_header("Parameters"), - window_agg_numba_parameters, + window_agg_numba_parameters(), kwargs_compat, create_section_header("Returns"), template_returns, @@ -2056,6 +2074,7 @@ def median( is ``N - ddof``, where ``N`` represents the number of elements. """ ).replace("\n", "", 1), + window_agg_numba_parameters("1.4"), args_compat, kwargs_compat, create_section_header("Returns"), @@ -2108,6 +2127,7 @@ def std(self, ddof: int = 1, *args, **kwargs): is ``N - ddof``, where ``N`` represents the number of elements. """ ).replace("\n", "", 1), + window_agg_numba_parameters("1.4"), args_compat, kwargs_compat, create_section_header("Returns"), @@ -2146,7 +2166,14 @@ def std(self, ddof: int = 1, *args, **kwargs): aggregation_description="variance", agg_method="var", ) - def var(self, ddof: int = 1, *args, **kwargs): + def var( + self, + ddof: int = 1, + engine: str | None = None, + engine_kwargs: dict[str, bool] | None = None, + *args, + **kwargs, + ): nv.validate_rolling_func("var", args, kwargs) return super().var(ddof=ddof, **kwargs) diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py index 87f30d51bad74..0cb5fb43223d6 100644 --- a/pandas/tests/window/test_numba.py +++ b/pandas/tests/window/test_numba.py @@ -59,7 +59,7 @@ def test_numba_vs_cython_rolling_methods( expected = getattr(roll, method)(engine="cython") # Check the cache - if method not in ("mean", "sum", "var"): + if method not in ("mean", "sum", "var", "std"): assert ( getattr(np, f"nan{method}"), "Rolling_apply_single", @@ -84,7 +84,7 @@ def test_numba_vs_cython_expanding_methods( expected = getattr(expand, method)(engine="cython") # Check the cache - if method not in ("mean", "sum", "var"): + if method not in ("mean", "sum", "var", "std"): assert ( getattr(np, f"nan{method}"), "Expanding_apply_single", From 5b7b44850f7d059d0bf9dae0f9514329b5d6336e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sun, 14 Nov 2021 20:47:48 -0800 Subject: [PATCH 04/10] Fix tests for std --- pandas/core/_numba/executor.py | 46 +++++++++---------------------- pandas/core/window/expanding.py | 4 ++- pandas/core/window/rolling.py | 27 ++++++++++++------ pandas/tests/window/conftest.py | 15 ++++++---- pandas/tests/window/test_numba.py | 36 +++++++++++++----------- 5 files changed, 63 insertions(+), 65 deletions(-) diff --git a/pandas/core/_numba/executor.py b/pandas/core/_numba/executor.py index 8fa28199a8450..acb0c6d175c51 100644 --- a/pandas/core/_numba/executor.py +++ b/pandas/core/_numba/executor.py @@ -44,38 +44,18 @@ def generate_shared_aggregator( numba = import_optional_dependency("numba") - # Avoiding **kwargs usage: https://github.com/numba/numba/issues/2916 - if any(func in cache_key_str for func in ("var", "std")): - # error: Untyped decorator makes function "column_looper" untyped - @numba.jit( - nopython=nopython, nogil=nogil, parallel=parallel - ) # type: ignore[misc] - def column_looper( - values: np.ndarray, - start: np.ndarray, - end: np.ndarray, - min_periods: int, - ddof: int, - ): - result = np.empty((len(start), values.shape[1]), dtype=np.float64) - for i in numba.prange(values.shape[1]): - result[:, i] = func(values[:, i], start, end, min_periods, ddof) - return result - - else: - # error: Untyped decorator makes function "column_looper" untyped - @numba.jit( - nopython=nopython, nogil=nogil, parallel=parallel - ) # type: ignore[misc] - def column_looper( - values: np.ndarray, - start: np.ndarray, - end: np.ndarray, - min_periods: int, - ): - result = np.empty((len(start), values.shape[1]), dtype=np.float64) - for i in numba.prange(values.shape[1]): - result[:, i] = func(values[:, i], start, end, min_periods) - return result + # error: Untyped decorator makes function "column_looper" untyped + @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) # type: ignore[misc] + def column_looper( + values: np.ndarray, + start: np.ndarray, + end: np.ndarray, + min_periods: int, + *args, + ): + result = np.empty((len(start), values.shape[1]), dtype=np.float64) + for i in numba.prange(values.shape[1]): + result[:, i] = func(values[:, i], start, end, min_periods, *args) + return result return column_looper diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py index 2d213ca10801e..e15faeb4097de 100644 --- a/pandas/core/window/expanding.py +++ b/pandas/core/window/expanding.py @@ -406,7 +406,9 @@ def std( **kwargs, ): nv.validate_expanding_func("std", args, kwargs) - return super().std(ddof=ddof, **kwargs) + return super().std( + ddof=ddof, engine=engine, engine_kwargs=engine_kwargs, **kwargs + ) @doc( template_header, diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 553a95c61c921..f28049e77d8df 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -589,7 +589,7 @@ def _numba_apply( func: Callable[..., Any], numba_cache_key_str: str, engine_kwargs: dict[str, bool] | None = None, - **func_kwargs, + *func_args, ): window_indexer = self._get_window_indexer() min_periods = ( @@ -612,7 +612,7 @@ def _numba_apply( aggregator = executor.generate_shared_aggregator( func, engine_kwargs, numba_cache_key_str ) - result = aggregator(values, start, end, min_periods, **func_kwargs) + result = aggregator(values, start, end, min_periods, *func_args) NUMBA_FUNC_CACHE[(func, numba_cache_key_str)] = aggregator result = result.T if self.axis == 1 else result if obj.ndim == 1: @@ -1465,9 +1465,7 @@ def std( from pandas.core._numba.kernels import sliding_var return zsqrt( - self._numba_apply( - sliding_var, "rolling_std", engine_kwargs, ddof=ddof - ) + self._numba_apply(sliding_var, "rolling_std", engine_kwargs, ddof) ) nv.validate_window_func("std", args, kwargs) window_func = window_aggregations.roll_var @@ -1496,7 +1494,7 @@ def var( from pandas.core._numba.kernels import sliding_var return self._numba_apply( - sliding_var, "rolling_var", engine_kwargs, ddof=ddof + sliding_var, "rolling_var", engine_kwargs, ddof ) nv.validate_window_func("var", args, kwargs) window_func = partial(window_aggregations.roll_var, ddof=ddof) @@ -2113,9 +2111,18 @@ def median( aggregation_description="standard deviation", agg_method="std", ) - def std(self, ddof: int = 1, *args, **kwargs): + def std( + self, + ddof: int = 1, + engine: str | None = None, + engine_kwargs: dict[str, bool] | None = None, + *args, + **kwargs, + ): nv.validate_rolling_func("std", args, kwargs) - return super().std(ddof=ddof, **kwargs) + return super().std( + ddof=ddof, engine=engine, engine_kwargs=engine_kwargs, **kwargs + ) @doc( template_header, @@ -2175,7 +2182,9 @@ def var( **kwargs, ): nv.validate_rolling_func("var", args, kwargs) - return super().var(ddof=ddof, **kwargs) + return super().var( + ddof=ddof, engine=engine, engine_kwargs=engine_kwargs, **kwargs + ) @doc( template_header, diff --git a/pandas/tests/window/conftest.py b/pandas/tests/window/conftest.py index 29b68ac75c939..bf1af0c83c93f 100644 --- a/pandas/tests/window/conftest.py +++ b/pandas/tests/window/conftest.py @@ -64,12 +64,15 @@ def arithmetic_win_operators(request): @pytest.fixture( params=[ - "sum", - "mean", - "median", - "max", - "min", - "var", + ["sum", {}], + ["mean", {}], + ["median", {}], + ["max", {}], + ["min", {}], + ["var", {}], + ["var", {"ddof": 0}], + ["std", {}], + ["std", {"ddof": 0}], ] ) def arithmetic_numba_supported_operators(request): diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py index 0cb5fb43223d6..96a69ff1a224e 100644 --- a/pandas/tests/window/test_numba.py +++ b/pandas/tests/window/test_numba.py @@ -50,13 +50,15 @@ def test_numba_vs_cython_rolling_methods( self, data, nogil, parallel, nopython, arithmetic_numba_supported_operators ): - method = arithmetic_numba_supported_operators + method, kwargs = arithmetic_numba_supported_operators engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} roll = data.rolling(2) - result = getattr(roll, method)(engine="numba", engine_kwargs=engine_kwargs) - expected = getattr(roll, method)(engine="cython") + result = getattr(roll, method)( + engine="numba", engine_kwargs=engine_kwargs, **kwargs + ) + expected = getattr(roll, method)(engine="cython", **kwargs) # Check the cache if method not in ("mean", "sum", "var", "std"): @@ -74,14 +76,16 @@ def test_numba_vs_cython_expanding_methods( self, data, nogil, parallel, nopython, arithmetic_numba_supported_operators ): - method = arithmetic_numba_supported_operators + method, kwargs = arithmetic_numba_supported_operators engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} data = DataFrame(np.eye(5)) expand = data.expanding() - result = getattr(expand, method)(engine="numba", engine_kwargs=engine_kwargs) - expected = getattr(expand, method)(engine="cython") + result = getattr(expand, method)( + engine="numba", engine_kwargs=engine_kwargs, **kwargs + ) + expected = getattr(expand, method)(engine="cython", **kwargs) # Check the cache if method not in ("mean", "sum", "var", "std"): @@ -282,23 +286,23 @@ def f(x): def test_table_method_rolling_methods( self, axis, nogil, parallel, nopython, arithmetic_numba_supported_operators ): - method = arithmetic_numba_supported_operators + method, kwargs = arithmetic_numba_supported_operators engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} df = DataFrame(np.eye(3)) - if method == "var": + if method in ("var", "std"): with pytest.raises(NotImplementedError, match=f"{method} not supported"): getattr( df.rolling(2, method="table", axis=axis, min_periods=0), method - )(engine_kwargs=engine_kwargs, engine="numba") + )(engine_kwargs=engine_kwargs, engine="numba", **kwargs) else: result = getattr( df.rolling(2, method="table", axis=axis, min_periods=0), method - )(engine_kwargs=engine_kwargs, engine="numba") + )(engine_kwargs=engine_kwargs, engine="numba", **kwargs) expected = getattr( df.rolling(2, method="single", axis=axis, min_periods=0), method - )(engine_kwargs=engine_kwargs, engine="numba") + )(engine_kwargs=engine_kwargs, engine="numba", **kwargs) tm.assert_frame_equal(result, expected) def test_table_method_rolling_apply(self, axis, nogil, parallel, nopython): @@ -354,23 +358,23 @@ def f(x): def test_table_method_expanding_methods( self, axis, nogil, parallel, nopython, arithmetic_numba_supported_operators ): - method = arithmetic_numba_supported_operators + method, kwargs = arithmetic_numba_supported_operators engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} df = DataFrame(np.eye(3)) - if method == "var": + if method in ("var", "std"): with pytest.raises(NotImplementedError, match=f"{method} not supported"): getattr(df.expanding(method="table", axis=axis), method)( - engine_kwargs=engine_kwargs, engine="numba" + engine_kwargs=engine_kwargs, engine="numba", **kwargs ) else: result = getattr(df.expanding(method="table", axis=axis), method)( - engine_kwargs=engine_kwargs, engine="numba" + engine_kwargs=engine_kwargs, engine="numba", **kwargs ) expected = getattr(df.expanding(method="single", axis=axis), method)( - engine_kwargs=engine_kwargs, engine="numba" + engine_kwargs=engine_kwargs, engine="numba", **kwargs ) tm.assert_frame_equal(result, expected) From f3e7e692ef9a2a3377a369422f8b445f55d0c7eb Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sun, 14 Nov 2021 20:53:55 -0800 Subject: [PATCH 05/10] Replace issue number in whatsnew --- doc/source/whatsnew/v1.4.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 4614441def361..b71752a5429da 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -210,7 +210,7 @@ Other enhancements - :meth:`read_excel` now accepts a ``decimal`` argument that allow the user to specify the decimal point when parsing string columns to numeric (:issue:`14403`) - :meth:`.GroupBy.mean` now supports `Numba `_ execution with the ``engine`` keyword (:issue:`43731`) - :meth:`Timestamp.isoformat`, now handles the ``timespec`` argument from the base :class:``datetime`` class (:issue:`26131`) -- :meth:`.Rolling.var`, :meth:`.Expanding.var`, :meth:`.Rolling.std`, :meth:`.Expanding.std` now support `Numba `_ execution with the ``engine`` keyword (:issue:`38895`, :issue:`41267`) +- :meth:`.Rolling.var`, :meth:`.Expanding.var`, :meth:`.Rolling.std`, :meth:`.Expanding.std` now support `Numba `_ execution with the ``engine`` keyword (:issue:`44461`) .. --------------------------------------------------------------------------- From 91bd851532f0f0d193171150f4f64912f32fd9fe Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sun, 14 Nov 2021 20:56:07 -0800 Subject: [PATCH 06/10] Add benchmarks --- asv_bench/benchmarks/rolling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index 406b27dd37ea5..41dbe00848ecd 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -51,7 +51,7 @@ class NumbaEngine: ["DataFrame", "Series"], ["int", "float"], [np.sum, lambda x: np.sum(x) + 5], - ["sum", "max", "min", "median", "mean"], + ["sum", "max", "min", "median", "mean", "var", "std"], [True, False], [None, 100], ) From 443d21e2fcd99ace6dbc6a20724fd1db6acf1f14 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sun, 14 Nov 2021 21:55:05 -0800 Subject: [PATCH 07/10] Ensure args are keyword only --- pandas/core/window/expanding.py | 4 ++-- pandas/core/window/rolling.py | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py index e15faeb4097de..c436db04c549d 100644 --- a/pandas/core/window/expanding.py +++ b/pandas/core/window/expanding.py @@ -400,9 +400,9 @@ def median( def std( self, ddof: int = 1, + *args, engine: str | None = None, engine_kwargs: dict[str, bool] | None = None, - *args, **kwargs, ): nv.validate_expanding_func("std", args, kwargs) @@ -460,9 +460,9 @@ def std( def var( self, ddof: int = 1, + *args, engine: str | None = None, engine_kwargs: dict[str, bool] | None = None, - *args, **kwargs, ): nv.validate_expanding_func("var", args, kwargs) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index f28049e77d8df..1734e229740c6 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1453,11 +1453,12 @@ def median( def std( self, ddof: int = 1, + *args, engine: str | None = None, engine_kwargs: dict[str, bool] | None = None, - *args, **kwargs, ): + nv.validate_window_func("std", args, kwargs) if maybe_use_numba(engine): if self.method == "table": raise NotImplementedError("std not supported with method='table'") @@ -1467,7 +1468,6 @@ def std( return zsqrt( self._numba_apply(sliding_var, "rolling_std", engine_kwargs, ddof) ) - nv.validate_window_func("std", args, kwargs) window_func = window_aggregations.roll_var def zsqrt_func(values, begin, end, min_periods): @@ -1482,11 +1482,12 @@ def zsqrt_func(values, begin, end, min_periods): def var( self, ddof: int = 1, + *args, engine: str | None = None, engine_kwargs: dict[str, bool] | None = None, - *args, **kwargs, ): + nv.validate_window_func("var", args, kwargs) if maybe_use_numba(engine): if self.method == "table": raise NotImplementedError("var not supported with method='table'") @@ -1496,7 +1497,6 @@ def var( return self._numba_apply( sliding_var, "rolling_var", engine_kwargs, ddof ) - nv.validate_window_func("var", args, kwargs) window_func = partial(window_aggregations.roll_var, ddof=ddof) return self._apply( window_func, @@ -2114,9 +2114,9 @@ def median( def std( self, ddof: int = 1, + *args, engine: str | None = None, engine_kwargs: dict[str, bool] | None = None, - *args, **kwargs, ): nv.validate_rolling_func("std", args, kwargs) @@ -2176,9 +2176,9 @@ def std( def var( self, ddof: int = 1, + *args, engine: str | None = None, engine_kwargs: dict[str, bool] | None = None, - *args, **kwargs, ): nv.validate_rolling_func("var", args, kwargs) From d7182809d44bf7f8249d83913c298f8adb5f97cd Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 15 Nov 2021 11:29:40 -0800 Subject: [PATCH 08/10] Split calls --- pandas/tests/window/test_numba.py | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py index 96a69ff1a224e..4ec005fc5d804 100644 --- a/pandas/tests/window/test_numba.py +++ b/pandas/tests/window/test_numba.py @@ -291,18 +291,20 @@ def test_table_method_rolling_methods( engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} df = DataFrame(np.eye(3)) + roll_table = df.rolling(2, method="table", axis=axis, min_periods=0) if method in ("var", "std"): with pytest.raises(NotImplementedError, match=f"{method} not supported"): - getattr( - df.rolling(2, method="table", axis=axis, min_periods=0), method - )(engine_kwargs=engine_kwargs, engine="numba", **kwargs) + getattr(roll_table, method)( + engine_kwargs=engine_kwargs, engine="numba", **kwargs + ) else: - result = getattr( - df.rolling(2, method="table", axis=axis, min_periods=0), method - )(engine_kwargs=engine_kwargs, engine="numba", **kwargs) - expected = getattr( - df.rolling(2, method="single", axis=axis, min_periods=0), method - )(engine_kwargs=engine_kwargs, engine="numba", **kwargs) + roll_single = df.rolling(2, method="single", axis=axis, min_periods=0) + result = getattr(roll_table, method)( + engine_kwargs=engine_kwargs, engine="numba", **kwargs + ) + expected = getattr(roll_single, method)( + engine_kwargs=engine_kwargs, engine="numba", **kwargs + ) tm.assert_frame_equal(result, expected) def test_table_method_rolling_apply(self, axis, nogil, parallel, nopython): @@ -363,17 +365,18 @@ def test_table_method_expanding_methods( engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} df = DataFrame(np.eye(3)) - + expand_table = df.expanding(method="table", axis=axis) if method in ("var", "std"): with pytest.raises(NotImplementedError, match=f"{method} not supported"): - getattr(df.expanding(method="table", axis=axis), method)( + getattr(expand_table, method)( engine_kwargs=engine_kwargs, engine="numba", **kwargs ) else: - result = getattr(df.expanding(method="table", axis=axis), method)( + expand_single = df.expanding(method="single", axis=axis) + result = getattr(expand_table, method)( engine_kwargs=engine_kwargs, engine="numba", **kwargs ) - expected = getattr(df.expanding(method="single", axis=axis), method)( + expected = getattr(expand_single, method)( engine_kwargs=engine_kwargs, engine="numba", **kwargs ) tm.assert_frame_equal(result, expected) From ec2664a1ca1253545841a902494f3f112dabbaf1 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 15 Nov 2021 11:57:39 -0800 Subject: [PATCH 09/10] Fix ordering of parameters --- pandas/core/window/rolling.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 1734e229740c6..51d323b6eca98 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -2072,8 +2072,8 @@ def median( is ``N - ddof``, where ``N`` represents the number of elements. """ ).replace("\n", "", 1), - window_agg_numba_parameters("1.4"), args_compat, + window_agg_numba_parameters("1.4"), kwargs_compat, create_section_header("Returns"), template_returns, @@ -2134,8 +2134,8 @@ def std( is ``N - ddof``, where ``N`` represents the number of elements. """ ).replace("\n", "", 1), - window_agg_numba_parameters("1.4"), args_compat, + window_agg_numba_parameters("1.4"), kwargs_compat, create_section_header("Returns"), template_returns, From c20e6aef324bf738616fd4589dc260f80eb40c21 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 15 Nov 2021 13:44:28 -0800 Subject: [PATCH 10/10] fix doc ordering in expanding --- pandas/core/window/expanding.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py index c436db04c549d..8c8b7a8284684 100644 --- a/pandas/core/window/expanding.py +++ b/pandas/core/window/expanding.py @@ -360,8 +360,8 @@ def median( is ``N - ddof``, where ``N`` represents the number of elements.\n """ ).replace("\n", "", 1), - window_agg_numba_parameters("1.4"), args_compat, + window_agg_numba_parameters("1.4"), kwargs_compat, create_section_header("Returns"), template_returns, @@ -420,8 +420,8 @@ def std( is ``N - ddof``, where ``N`` represents the number of elements.\n """ ).replace("\n", "", 1), - window_agg_numba_parameters("1.4"), args_compat, + window_agg_numba_parameters("1.4"), kwargs_compat, create_section_header("Returns"), template_returns,