From 39b4313b0f9748eefbfe47992e19b45aa8198ec2 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Tue, 5 Jan 2021 21:43:47 -0800 Subject: [PATCH 1/6] ENH: Add table-wise numba rolling to other agg funcions --- pandas/core/window/numba_.py | 17 ++++++++++ pandas/core/window/rolling.py | 56 ++++++++++++++++++++----------- pandas/tests/window/test_numba.py | 6 ---- 3 files changed, 53 insertions(+), 26 deletions(-) diff --git a/pandas/core/window/numba_.py b/pandas/core/window/numba_.py index 46b47b7e988c4..462e890851fc9 100644 --- a/pandas/core/window/numba_.py +++ b/pandas/core/window/numba_.py @@ -220,3 +220,20 @@ def roll_table( return result return roll_table + + +# This function will no longer be needed once numba supports +# axis for all np.nan* agg functions +# https://github.com/numba/numba/issues/1269 +def generate_manual_numpy_nan_agg_with_axis(nan_func): + numba = import_optional_dependency("numba") + + @numba.jit(nopython=True, nogil=True, parallel=True) + def nan_agg_with_axis(table): + result = np.empty(table.shape[1]) + for i in numba.prange(table.shape[1]): + partition = table[:, i] + result[i] = nan_func(partition) + return result + + return nan_agg_with_axis diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 7ae1e61d426b9..d097574224db8 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -65,6 +65,7 @@ VariableWindowIndexer, ) from pandas.core.window.numba_ import ( + generate_manual_numpy_nan_agg_with_axis, generate_numba_apply_func, generate_numba_table_func, ) @@ -1356,12 +1357,15 @@ def sum(self, *args, engine=None, engine_kwargs=None, **kwargs): nv.validate_window_func("sum", args, kwargs) if maybe_use_numba(engine): if self.method == "table": - raise NotImplementedError("method='table' is not supported.") + func = generate_manual_numpy_nan_agg_with_axis(np.nansum) + else: + func = np.nansum # Once numba supports np.nansum with axis, args will be relevant. - # https://github.com/numba/numba/issues/6610 - args = () if self.method == "single" else (0,) + # https://github.com/numba/numba/issues/1269 + args = () # if self.method == "single" else (0,) + return self.apply( - np.nansum, + func, raw=True, engine=engine, engine_kwargs=engine_kwargs, @@ -1402,12 +1406,15 @@ def max(self, *args, engine=None, engine_kwargs=None, **kwargs): nv.validate_window_func("max", args, kwargs) if maybe_use_numba(engine): if self.method == "table": - raise NotImplementedError("method='table' is not supported.") + func = generate_manual_numpy_nan_agg_with_axis(np.nanmax) + else: + func = np.nanmax # Once numba supports np.nanmax with axis, args will be relevant. - # https://github.com/numba/numba/issues/6610 - args = () if self.method == "single" else (0,) + # https://github.com/numba/numba/issues/1269 + args = () # if self.method == "single" else (0,) + return self.apply( - np.nanmax, + func, raw=True, engine=engine, engine_kwargs=engine_kwargs, @@ -1474,12 +1481,15 @@ def min(self, *args, engine=None, engine_kwargs=None, **kwargs): nv.validate_window_func("min", args, kwargs) if maybe_use_numba(engine): if self.method == "table": - raise NotImplementedError("method='table' is not supported.") + func = generate_manual_numpy_nan_agg_with_axis(np.nanmin) + else: + func = np.nanmin # Once numba supports np.nanmin with axis, args will be relevant. - # https://github.com/numba/numba/issues/6610 - args = () if self.method == "single" else (0,) + # https://github.com/numba/numba/issues/1269 + args = () # if self.method == "single" else (0,) + return self.apply( - np.nanmin, + func, raw=True, engine=engine, engine_kwargs=engine_kwargs, @@ -1492,12 +1502,15 @@ def mean(self, *args, engine=None, engine_kwargs=None, **kwargs): nv.validate_window_func("mean", args, kwargs) if maybe_use_numba(engine): if self.method == "table": - raise NotImplementedError("method='table' is not supported.") + func = generate_manual_numpy_nan_agg_with_axis(np.nanmean) + else: + func = np.nanmean # Once numba supports np.nanmean with axis, args will be relevant. - # https://github.com/numba/numba/issues/6610 - args = () if self.method == "single" else (0,) + # https://github.com/numba/numba/issues/1269 + args = () # if self.method == "single" else (0,) + return self.apply( - np.nanmean, + func, raw=True, engine=engine, engine_kwargs=engine_kwargs, @@ -1562,12 +1575,15 @@ def mean(self, *args, engine=None, engine_kwargs=None, **kwargs): def median(self, engine=None, engine_kwargs=None, **kwargs): if maybe_use_numba(engine): if self.method == "table": - raise NotImplementedError("method='table' is not supported.") + func = generate_manual_numpy_nan_agg_with_axis(np.nanmedian) + else: + func = np.nanmedian # Once numba supports np.nanmedian with axis, args will be relevant. - # https://github.com/numba/numba/issues/6610 - args = () if self.method == "single" else (0,) + # https://github.com/numba/numba/issues/1269 + args = () # if self.method == "single" else (0,) + return self.apply( - np.nanmedian, + func, raw=True, engine=engine, engine_kwargs=engine_kwargs, diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py index 9d9c216801d73..c822c51a10c7f 100644 --- a/pandas/tests/window/test_numba.py +++ b/pandas/tests/window/test_numba.py @@ -177,9 +177,6 @@ def f(x): f, engine="numba", raw=True ) - @pytest.mark.xfail( - raises=NotImplementedError, reason="method='table' is not supported." - ) def test_table_method_rolling_methods( self, axis, nogil, parallel, nopython, arithmetic_numba_supported_operators ): @@ -247,9 +244,6 @@ def f(x): ) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail( - raises=NotImplementedError, reason="method='table' is not supported." - ) def test_table_method_expanding_methods( self, axis, nogil, parallel, nopython, arithmetic_numba_supported_operators ): From bc9a258ba5178ce3cd25f5a89b5437a662b8a84f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Tue, 5 Jan 2021 21:46:23 -0800 Subject: [PATCH 2/6] Add issue number --- doc/source/whatsnew/v1.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 38b7a1d13c253..a0b28dd8390e8 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -37,7 +37,7 @@ For example: :class:`Rolling` and :class:`Expanding` now support a ``method`` argument with a ``'table'`` option that performs the windowing operation over an entire :class:`DataFrame`. -See ref:`window.overview` for performance and functional benefits. (:issue:`15095`) +See ref:`window.overview` for performance and functional benefits. (:issue:`15095`, :issue:`38995`) .. _whatsnew_130.enhancements.other: From 62e2777260bd7c0215c65dad98ef9f89747f18af Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 6 Jan 2021 22:00:16 -0800 Subject: [PATCH 3/6] Add cache to not make new functions each time, consolidate comments --- pandas/core/window/numba_.py | 2 ++ pandas/core/window/rolling.py | 20 -------------------- 2 files changed, 2 insertions(+), 20 deletions(-) diff --git a/pandas/core/window/numba_.py b/pandas/core/window/numba_.py index 462e890851fc9..686e881244f61 100644 --- a/pandas/core/window/numba_.py +++ b/pandas/core/window/numba_.py @@ -1,3 +1,4 @@ +import functools from typing import Any, Callable, Dict, Optional, Tuple import numpy as np @@ -225,6 +226,7 @@ def roll_table( # This function will no longer be needed once numba supports # axis for all np.nan* agg functions # https://github.com/numba/numba/issues/1269 +@functools.lru_cache def generate_manual_numpy_nan_agg_with_axis(nan_func): numba = import_optional_dependency("numba") diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 0af7c8b54e498..393c517a63660 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1382,16 +1382,12 @@ def sum(self, *args, engine=None, engine_kwargs=None, **kwargs): func = generate_manual_numpy_nan_agg_with_axis(np.nansum) else: func = np.nansum - # Once numba supports np.nansum with axis, args will be relevant. - # https://github.com/numba/numba/issues/1269 - args = () # if self.method == "single" else (0,) return self.apply( func, raw=True, engine=engine, engine_kwargs=engine_kwargs, - args=args, ) window_func = window_aggregations.roll_sum return self._apply(window_func, name="sum", **kwargs) @@ -1431,16 +1427,12 @@ def max(self, *args, engine=None, engine_kwargs=None, **kwargs): func = generate_manual_numpy_nan_agg_with_axis(np.nanmax) else: func = np.nanmax - # Once numba supports np.nanmax with axis, args will be relevant. - # https://github.com/numba/numba/issues/1269 - args = () # if self.method == "single" else (0,) return self.apply( func, raw=True, engine=engine, engine_kwargs=engine_kwargs, - args=args, ) window_func = window_aggregations.roll_max return self._apply(window_func, name="max", **kwargs) @@ -1506,16 +1498,12 @@ def min(self, *args, engine=None, engine_kwargs=None, **kwargs): func = generate_manual_numpy_nan_agg_with_axis(np.nanmin) else: func = np.nanmin - # Once numba supports np.nanmin with axis, args will be relevant. - # https://github.com/numba/numba/issues/1269 - args = () # if self.method == "single" else (0,) return self.apply( func, raw=True, engine=engine, engine_kwargs=engine_kwargs, - args=args, ) window_func = window_aggregations.roll_min return self._apply(window_func, name="min", **kwargs) @@ -1527,16 +1515,12 @@ def mean(self, *args, engine=None, engine_kwargs=None, **kwargs): func = generate_manual_numpy_nan_agg_with_axis(np.nanmean) else: func = np.nanmean - # Once numba supports np.nanmean with axis, args will be relevant. - # https://github.com/numba/numba/issues/1269 - args = () # if self.method == "single" else (0,) return self.apply( func, raw=True, engine=engine, engine_kwargs=engine_kwargs, - args=args, ) window_func = window_aggregations.roll_mean return self._apply(window_func, name="mean", **kwargs) @@ -1600,16 +1584,12 @@ def median(self, engine=None, engine_kwargs=None, **kwargs): func = generate_manual_numpy_nan_agg_with_axis(np.nanmedian) else: func = np.nanmedian - # Once numba supports np.nanmedian with axis, args will be relevant. - # https://github.com/numba/numba/issues/1269 - args = () # if self.method == "single" else (0,) return self.apply( func, raw=True, engine=engine, engine_kwargs=engine_kwargs, - args=args, ) window_func = window_aggregations.roll_median_c return self._apply(window_func, name="median", **kwargs) From 966d066a5e87db6378be5e913f8c7d7cc9b16d85 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 6 Jan 2021 23:33:09 -0800 Subject: [PATCH 4/6] Add maxsize for py 3.7 --- pandas/core/window/numba_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/window/numba_.py b/pandas/core/window/numba_.py index 686e881244f61..dea9ad3ebdcb3 100644 --- a/pandas/core/window/numba_.py +++ b/pandas/core/window/numba_.py @@ -226,7 +226,7 @@ def roll_table( # This function will no longer be needed once numba supports # axis for all np.nan* agg functions # https://github.com/numba/numba/issues/1269 -@functools.lru_cache +@functools.lru_cache(max_size=None) def generate_manual_numpy_nan_agg_with_axis(nan_func): numba = import_optional_dependency("numba") From 48feac19bdc3ba168cf242095110ed09f79ae8fe Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Thu, 7 Jan 2021 09:26:04 -0800 Subject: [PATCH 5/6] max_size -> maxsize --- pandas/core/window/numba_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/window/numba_.py b/pandas/core/window/numba_.py index dea9ad3ebdcb3..aa69d4fa675cd 100644 --- a/pandas/core/window/numba_.py +++ b/pandas/core/window/numba_.py @@ -226,7 +226,7 @@ def roll_table( # This function will no longer be needed once numba supports # axis for all np.nan* agg functions # https://github.com/numba/numba/issues/1269 -@functools.lru_cache(max_size=None) +@functools.lru_cache(maxsize=None) def generate_manual_numpy_nan_agg_with_axis(nan_func): numba = import_optional_dependency("numba") From 86c692c722d36d02ba2ea8bcd9b6da2447f2cb4c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Thu, 7 Jan 2021 10:57:09 -0800 Subject: [PATCH 6/6] Mark table tests as slow --- ci/deps/azure-37-slow.yaml | 1 + ci/deps/azure-38-slow.yaml | 1 + pandas/tests/window/test_numba.py | 1 + 3 files changed, 3 insertions(+) diff --git a/ci/deps/azure-37-slow.yaml b/ci/deps/azure-37-slow.yaml index 05b33fa351ac9..5d097e397992c 100644 --- a/ci/deps/azure-37-slow.yaml +++ b/ci/deps/azure-37-slow.yaml @@ -36,3 +36,4 @@ dependencies: - xlwt - moto - flask + - numba diff --git a/ci/deps/azure-38-slow.yaml b/ci/deps/azure-38-slow.yaml index fd40f40294b7f..0a4107917f01a 100644 --- a/ci/deps/azure-38-slow.yaml +++ b/ci/deps/azure-38-slow.yaml @@ -34,3 +34,4 @@ dependencies: - xlwt - moto - flask + - numba diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py index c822c51a10c7f..173e39ef42908 100644 --- a/pandas/tests/window/test_numba.py +++ b/pandas/tests/window/test_numba.py @@ -163,6 +163,7 @@ def test_invalid_kwargs_nopython(): @td.skip_if_no("numba", "0.46.0") +@pytest.mark.slow @pytest.mark.filterwarnings("ignore:\\nThe keyword argument") # Filter warnings when parallel=True and the function can't be parallelized by Numba class TestTableMethod: