From 84c31c2acf13950ad743305922d81f79b7a81af4 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 7 Mar 2023 12:10:09 -0800 Subject: [PATCH 1/7] PERF/REF: make window/groupby imports lazy --- pandas/__init__.py | 11 +- pandas/core/api.py | 17 +- pandas/core/generic.py | 39 ++- pandas/core/reshape/pivot.py | 5 +- pandas/core/shared_docs.py | 514 ++++++++++++++++++++++++++++++++ pandas/core/window/ewm.py | 200 +------------ pandas/core/window/expanding.py | 77 +---- pandas/core/window/rolling.py | 257 +--------------- 8 files changed, 585 insertions(+), 535 deletions(-) diff --git a/pandas/__init__.py b/pandas/__init__.py index 1a549c09d22f7..dd81a661af42b 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -98,11 +98,9 @@ to_timedelta, # misc Flags, - Grouper, factorize, unique, value_counts, - NamedAgg, array, Categorical, set_eng_float_format, @@ -184,6 +182,15 @@ del get_versions, v +def __getattr__(name: str): + # Lazify imports to speed "import pandas as pd" + if name in ("Grouper", "NamedAgg"): + import pandas.core.groupby + + return getattr(pandas.core.groupby, name) + raise AttributeError(name) + + # module level doc-string __doc__ = """ pandas - a powerful data analysis and manipulation library for Python diff --git a/pandas/core/api.py b/pandas/core/api.py index c0b828d9330b4..9eb2521f055bf 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -44,10 +44,6 @@ from pandas.core.arrays.string_ import StringDtype from pandas.core.construction import array from pandas.core.flags import Flags -from pandas.core.groupby import ( - Grouper, - NamedAgg, -) from pandas.core.indexes.api import ( CategoricalIndex, DatetimeIndex, @@ -80,7 +76,18 @@ # DataFrame needs to be imported after NamedAgg to avoid a circular import from pandas.core.frame import DataFrame # isort:skip -__all__ = [ + +def __getattr__(name: str): + # Lazify these so that we can avoid importing groupby at import-time + if name in ("Grouper", "NamedAgg"): + import pandas.core.groupby + + 
return getattr(pandas.core.groupby, name) + else: + raise AttributeError(name) + + +__all__ = [ # noqa: F822 linter doesn't recognize lazy imports "array", "ArrowDtype", "bdate_range", diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 003e4cc5b8b23..5803b713a19ff 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -90,7 +90,10 @@ SettingWithCopyError, SettingWithCopyWarning, ) -from pandas.util._decorators import doc +from pandas.util._decorators import ( + Appender, + doc, +) from pandas.util._exceptions import find_stack_level from pandas.util._validators import ( validate_ascending, @@ -171,14 +174,13 @@ ) from pandas.core.ops import align_method_FRAME from pandas.core.reshape.concat import concat -from pandas.core.shared_docs import _shared_docs -from pandas.core.sorting import get_indexer_indexer -from pandas.core.window import ( - Expanding, - ExponentialMovingWindow, - Rolling, - Window, +from pandas.core.shared_docs import ( + _shared_docs, + expanding_doc, + exponential_moving_window_doc, + window_doc, ) +from pandas.core.sorting import get_indexer_indexer from pandas.io.formats.format import ( DataFrameFormatter, @@ -193,6 +195,12 @@ from pandas.core.indexers.objects import BaseIndexer from pandas.core.resample import Resampler from pandas.core.series import Series + from pandas.core.window import ( + Expanding, + ExponentialMovingWindow, + Rolling, + Window, + ) from pandas.io.pytables import HDFStore @@ -11868,7 +11876,7 @@ def min( setattr(cls, "min", min) @final - @doc(Rolling) + @Appender(window_doc) def rolling( self, window: int | dt.timedelta | str | BaseOffset | BaseIndexer, @@ -11881,6 +11889,11 @@ def rolling( step: int | None = None, method: str = "single", ) -> Window | Rolling: + from pandas.core.window import ( + Rolling, + Window, + ) + axis = self._get_axis_number(axis) if win_type is not None: @@ -11911,18 +11924,20 @@ def rolling( ) @final - @doc(Expanding) + @Appender(expanding_doc) def expanding( self, 
min_periods: int = 1, axis: Axis = 0, method: str = "single", ) -> Expanding: + from pandas.core.window import Expanding + axis = self._get_axis_number(axis) return Expanding(self, min_periods=min_periods, axis=axis, method=method) @final - @doc(ExponentialMovingWindow) + @Appender(exponential_moving_window_doc) def ewm( self, com: float | None = None, @@ -11936,6 +11951,8 @@ def ewm( times: np.ndarray | DataFrame | Series | None = None, method: str = "single", ) -> ExponentialMovingWindow: + from pandas.core.window import ExponentialMovingWindow + axis = self._get_axis_number(axis) return ExponentialMovingWindow( self, diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index ee5851fcc2dd6..247adc50e902c 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -31,7 +31,6 @@ import pandas.core.common as com from pandas.core.frame import _shared_docs -from pandas.core.groupby import Grouper from pandas.core.indexes.api import ( Index, MultiIndex, @@ -143,6 +142,8 @@ def __internal_pivot_table( if i not in data: raise KeyError(i) + from pandas.core.groupby import Grouper + to_filter = [] for x in keys + values: if isinstance(x, Grouper): @@ -483,6 +484,8 @@ def _all_key(): def _convert_by(by): + from pandas.core.groupby import Grouper + if by is None: by = [] elif ( diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index 184b77c880238..915d1eb9a1946 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -889,3 +889,517 @@ Beef co2_emissions dtype: object """ + +window_doc = """ +Provide rolling window calculations. + +Parameters +---------- +window : int, timedelta, str, offset, or BaseIndexer subclass + Size of the moving window. + + If an integer, the fixed number of observations used for + each window. + + If a timedelta, str, or offset, the time period of each window. Each + window will be a variable sized based on the observations included in + the time-period. 
This is only valid for datetimelike indexes. + To learn more about the offsets & frequency strings, please see `this link + <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`__. + + If a BaseIndexer subclass, the window boundaries + based on the defined ``get_window_bounds`` method. Additional rolling + keyword arguments, namely ``min_periods``, ``center``, ``closed`` and + ``step`` will be passed to ``get_window_bounds``. + +min_periods : int, default None + Minimum number of observations in window required to have a value; + otherwise, result is ``np.nan``. + + For a window that is specified by an offset, ``min_periods`` will default to 1. + + For a window that is specified by an integer, ``min_periods`` will default + to the size of the window. + +center : bool, default False + If False, set the window labels as the right edge of the window index. + + If True, set the window labels as the center of the window index. + +win_type : str, default None + If ``None``, all points are evenly weighted. + + If a string, it must be a valid `scipy.signal window function + <https://docs.scipy.org/doc/scipy/reference/signal.windows.html#module-scipy.signal.windows>`__. + + Certain Scipy window types require additional parameters to be passed + in the aggregation function. The additional parameters must match + the keywords specified in the Scipy window type method signature. + +on : str, optional + For a DataFrame, a column label or Index level on which + to calculate the rolling window, rather than the DataFrame's index. + + Provided integer column is ignored and excluded from result since + an integer index is not used to calculate the rolling window. + +axis : int or str, default 0 + If ``0`` or ``'index'``, roll across the rows. + + If ``1`` or ``'columns'``, roll across the columns. + + For `Series` this parameter is unused and defaults to 0. + +closed : str, default None + If ``'right'``, the first point in the window is excluded from calculations. + + If ``'left'``, the last point in the window is excluded from calculations. + + If ``'both'``, no points in the window are excluded from calculations. 
+ + If ``'neither'``, the first and last points in the window are excluded + from calculations. + + Default ``None`` (``'right'``). + + .. versionchanged:: 1.2.0 + + The closed parameter with fixed windows is now supported. + +step : int, default None + + .. versionadded:: 1.5.0 + + Evaluate the window at every ``step`` result, equivalent to slicing as + ``[::step]``. ``window`` must be an integer. Using a step argument other + than None or 1 will produce a result with a different shape than the input. + +method : str {'single', 'table'}, default 'single' + + .. versionadded:: 1.3.0 + + Execute the rolling operation per single column or row (``'single'``) + or over the entire object (``'table'``). + + This argument is only implemented when specifying ``engine='numba'`` + in the method call. + +Returns +------- +``Window`` subclass if a ``win_type`` is passed + +``Rolling`` subclass if ``win_type`` is not passed + +See Also +-------- +expanding : Provides expanding transformations. +ewm : Provides exponential weighted functions. + +Notes +----- +See :ref:`Windowing Operations <window.generic>` for further usage details +and examples. + +Examples +-------- +>>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}) +>>> df + B +0 0.0 +1 1.0 +2 2.0 +3 NaN +4 4.0 + +**window** + +Rolling sum with a window length of 2 observations. + +>>> df.rolling(2).sum() + B +0 NaN +1 1.0 +2 3.0 +3 NaN +4 NaN + +Rolling sum with a window span of 2 seconds. + +>>> df_time = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}, +... index = [pd.Timestamp('20130101 09:00:00'), +... pd.Timestamp('20130101 09:00:02'), +... pd.Timestamp('20130101 09:00:03'), +... pd.Timestamp('20130101 09:00:05'), +... 
pd.Timestamp('20130101 09:00:06')]) + +>>> df_time + B +2013-01-01 09:00:00 0.0 +2013-01-01 09:00:02 1.0 +2013-01-01 09:00:03 2.0 +2013-01-01 09:00:05 NaN +2013-01-01 09:00:06 4.0 + +>>> df_time.rolling('2s').sum() + B +2013-01-01 09:00:00 0.0 +2013-01-01 09:00:02 1.0 +2013-01-01 09:00:03 3.0 +2013-01-01 09:00:05 NaN +2013-01-01 09:00:06 4.0 + +Rolling sum with forward looking windows with 2 observations. + +>>> indexer = pd.api.indexers.FixedForwardWindowIndexer(window_size=2) +>>> df.rolling(window=indexer, min_periods=1).sum() + B +0 1.0 +1 3.0 +2 2.0 +3 4.0 +4 4.0 + +**min_periods** + +Rolling sum with a window length of 2 observations, but only needs a minimum of 1 +observation to calculate a value. + +>>> df.rolling(2, min_periods=1).sum() + B +0 0.0 +1 1.0 +2 3.0 +3 2.0 +4 4.0 + +**center** + +Rolling sum with the result assigned to the center of the window index. + +>>> df.rolling(3, min_periods=1, center=True).sum() + B +0 1.0 +1 3.0 +2 3.0 +3 6.0 +4 4.0 + +>>> df.rolling(3, min_periods=1, center=False).sum() + B +0 0.0 +1 1.0 +2 3.0 +3 3.0 +4 6.0 + +**step** + +Rolling sum with a window length of 2 observations, minimum of 1 observation to +calculate a value, and a step of 2. + +>>> df.rolling(2, min_periods=1, step=2).sum() + B +0 0.0 +2 3.0 +4 4.0 + +**win_type** + +Rolling sum with a window length of 2, using the Scipy ``'gaussian'`` +window type. ``std`` is required in the aggregation function. + +>>> df.rolling(2, win_type='gaussian').sum(std=3) + B +0 NaN +1 0.986207 +2 2.958621 +3 NaN +4 NaN + +**on** + +Rolling sum with a window length of 2 days. + +>>> df = pd.DataFrame({ +... 'A': [pd.to_datetime('2020-01-01'), +... pd.to_datetime('2020-01-01'), +... pd.to_datetime('2020-01-02'),], +... 'B': [1, 2, 3], }, +... 
index=pd.date_range('2020', periods=3)) + +>>> df + A B +2020-01-01 2020-01-01 1 +2020-01-02 2020-01-01 2 +2020-01-03 2020-01-02 3 + +>>> df.rolling('2D', on='A').sum() + A B +2020-01-01 2020-01-01 1.0 +2020-01-02 2020-01-01 3.0 +2020-01-03 2020-01-02 6.0 +""" + + +expanding_doc = """ +Provide expanding window calculations. + +Parameters +---------- +min_periods : int, default 1 + Minimum number of observations in window required to have a value; + otherwise, result is ``np.nan``. + +axis : int or str, default 0 + If ``0`` or ``'index'``, roll across the rows. + + If ``1`` or ``'columns'``, roll across the columns. + + For `Series` this parameter is unused and defaults to 0. + +method : str {'single', 'table'}, default 'single' + Execute the rolling operation per single column or row (``'single'``) + or over the entire object (``'table'``). + + This argument is only implemented when specifying ``engine='numba'`` + in the method call. + + .. versionadded:: 1.3.0 + +Returns +------- +``Expanding`` subclass + +See Also +-------- +rolling : Provides rolling window calculations. +ewm : Provides exponential weighted functions. + +Notes +----- +See :ref:`Windowing Operations <window.expanding>` for further usage details +and examples. + +Examples +-------- +>>> df = pd.DataFrame({"B": [0, 1, 2, np.nan, 4]}) +>>> df + B +0 0.0 +1 1.0 +2 2.0 +3 NaN +4 4.0 + +**min_periods** + +Expanding sum with 1 vs 3 observations needed to calculate a value. + +>>> df.expanding(1).sum() + B +0 0.0 +1 1.0 +2 3.0 +3 3.0 +4 7.0 +>>> df.expanding(3).sum() + B +0 NaN +1 NaN +2 3.0 +3 3.0 +4 7.0 +""" + +exponential_moving_window_doc = r""" +Provide exponentially weighted (EW) calculations. + +Exactly one of ``com``, ``span``, ``halflife``, or ``alpha`` must be +provided if ``times`` is not provided. If ``times`` is provided, +``halflife`` and one of ``com``, ``span`` or ``alpha`` may be provided. 
+ +Parameters +---------- +com : float, optional + Specify decay in terms of center of mass + + :math:`\alpha = 1 / (1 + com)`, for :math:`com \geq 0`. + +span : float, optional + Specify decay in terms of span + + :math:`\alpha = 2 / (span + 1)`, for :math:`span \geq 1`. + +halflife : float, str, timedelta, optional + Specify decay in terms of half-life + + :math:`\alpha = 1 - \exp\left(-\ln(2) / halflife\right)`, for + :math:`halflife > 0`. + + If ``times`` is specified, a timedelta convertible unit over which an + observation decays to half its value. Only applicable to ``mean()``, + and halflife value will not apply to the other functions. + + .. versionadded:: 1.1.0 + +alpha : float, optional + Specify smoothing factor :math:`\alpha` directly + + :math:`0 < \alpha \leq 1`. + +min_periods : int, default 0 + Minimum number of observations in window required to have a value; + otherwise, result is ``np.nan``. + +adjust : bool, default True + Divide by decaying adjustment factor in beginning periods to account + for imbalance in relative weightings (viewing EWMA as a moving average). + + - When ``adjust=True`` (default), the EW function is calculated using weights + :math:`w_i = (1 - \alpha)^i`. For example, the EW moving average of the series + [:math:`x_0, x_1, ..., x_t`] would be: + + .. math:: + y_t = \frac{x_t + (1 - \alpha)x_{t-1} + (1 - \alpha)^2 x_{t-2} + ... + (1 - + \alpha)^t x_0}{1 + (1 - \alpha) + (1 - \alpha)^2 + ... + (1 - \alpha)^t} + + - When ``adjust=False``, the exponentially weighted function is calculated + recursively: + + .. math:: + \begin{split} + y_0 &= x_0\\ + y_t &= (1 - \alpha) y_{t-1} + \alpha x_t, + \end{split} +ignore_na : bool, default False + Ignore missing values when calculating weights. + + - When ``ignore_na=False`` (default), weights are based on absolute positions. 
+ For example, the weights of :math:`x_0` and :math:`x_2` used in calculating + the final weighted average of [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\alpha)^2` and :math:`\alpha` if ``adjust=False``. + + - When ``ignore_na=True``, weights are based + on relative positions. For example, the weights of :math:`x_0` and :math:`x_2` + used in calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are :math:`1-\alpha` and :math:`1` if + ``adjust=True``, and :math:`1-\alpha` and :math:`\alpha` if ``adjust=False``. + +axis : {0, 1}, default 0 + If ``0`` or ``'index'``, calculate across the rows. + + If ``1`` or ``'columns'``, calculate across the columns. + + For `Series` this parameter is unused and defaults to 0. + +times : np.ndarray, Series, default None + + .. versionadded:: 1.1.0 + + Only applicable to ``mean()``. + + Times corresponding to the observations. Must be monotonically increasing and + ``datetime64[ns]`` dtype. + + If 1-D array like, a sequence with the same shape as the observations. + +method : str {'single', 'table'}, default 'single' + .. versionadded:: 1.4.0 + + Execute the rolling operation per single column or row (``'single'``) + or over the entire object (``'table'``). + + This argument is only implemented when specifying ``engine='numba'`` + in the method call. + + Only applicable to ``mean()`` + +Returns +------- +``ExponentialMovingWindow`` subclass + +See Also +-------- +rolling : Provides rolling window calculations. +expanding : Provides expanding transformations. + +Notes +----- +See :ref:`Windowing Operations <window.exponentially_weighted>` +for further usage details and examples. 
+ +Examples +-------- +>>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}) +>>> df + B +0 0.0 +1 1.0 +2 2.0 +3 NaN +4 4.0 + +>>> df.ewm(com=0.5).mean() + B +0 0.000000 +1 0.750000 +2 1.615385 +3 1.615385 +4 3.670213 +>>> df.ewm(alpha=2 / 3).mean() + B +0 0.000000 +1 0.750000 +2 1.615385 +3 1.615385 +4 3.670213 + +**adjust** + +>>> df.ewm(com=0.5, adjust=True).mean() + B +0 0.000000 +1 0.750000 +2 1.615385 +3 1.615385 +4 3.670213 +>>> df.ewm(com=0.5, adjust=False).mean() + B +0 0.000000 +1 0.666667 +2 1.555556 +3 1.555556 +4 3.650794 + +**ignore_na** + +>>> df.ewm(com=0.5, ignore_na=True).mean() + B +0 0.000000 +1 0.750000 +2 1.615385 +3 1.615385 +4 3.225000 +>>> df.ewm(com=0.5, ignore_na=False).mean() + B +0 0.000000 +1 0.750000 +2 1.615385 +3 1.615385 +4 3.670213 + +**times** + +Exponentially weighted mean with weights calculated with a timedelta ``halflife`` +relative to ``times``. + +>>> times = ['2020-01-01', '2020-01-03', '2020-01-10', '2020-01-15', '2020-01-17'] +>>> df.ewm(halflife='4 days', times=pd.DatetimeIndex(times)).mean() + B +0 0.000000 +1 0.585786 +2 1.523889 +3 1.523889 +4 3.233686 +""" diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index 34dc49ff4a82a..ec1a467b95d6a 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -9,7 +9,10 @@ from pandas._libs.tslibs import Timedelta import pandas._libs.window.aggregations as window_aggregations -from pandas.util._decorators import doc +from pandas.util._decorators import ( + Appender, + doc, +) from pandas.core.dtypes.common import ( is_datetime64_ns_dtype, @@ -23,6 +26,7 @@ ExponentialMovingWindowIndexer, GroupbyIndexer, ) +from pandas.core.shared_docs import exponential_moving_window_doc from pandas.core.util.numba_ import ( get_jit_arguments, maybe_use_numba, @@ -124,200 +128,8 @@ def _calculate_deltas( return np.diff(_times) / _halflife +@Appender(exponential_moving_window_doc) class ExponentialMovingWindow(BaseWindow): - r""" - Provide exponentially weighted (EW) 
calculations. - - Exactly one of ``com``, ``span``, ``halflife``, or ``alpha`` must be - provided if ``times`` is not provided. If ``times`` is provided, - ``halflife`` and one of ``com``, ``span`` or ``alpha`` may be provided. - - Parameters - ---------- - com : float, optional - Specify decay in terms of center of mass - - :math:`\alpha = 1 / (1 + com)`, for :math:`com \geq 0`. - - span : float, optional - Specify decay in terms of span - - :math:`\alpha = 2 / (span + 1)`, for :math:`span \geq 1`. - - halflife : float, str, timedelta, optional - Specify decay in terms of half-life - - :math:`\alpha = 1 - \exp\left(-\ln(2) / halflife\right)`, for - :math:`halflife > 0`. - - If ``times`` is specified, a timedelta convertible unit over which an - observation decays to half its value. Only applicable to ``mean()``, - and halflife value will not apply to the other functions. - - .. versionadded:: 1.1.0 - - alpha : float, optional - Specify smoothing factor :math:`\alpha` directly - - :math:`0 < \alpha \leq 1`. - - min_periods : int, default 0 - Minimum number of observations in window required to have a value; - otherwise, result is ``np.nan``. - - adjust : bool, default True - Divide by decaying adjustment factor in beginning periods to account - for imbalance in relative weightings (viewing EWMA as a moving average). - - - When ``adjust=True`` (default), the EW function is calculated using weights - :math:`w_i = (1 - \alpha)^i`. For example, the EW moving average of the series - [:math:`x_0, x_1, ..., x_t`] would be: - - .. math:: - y_t = \frac{x_t + (1 - \alpha)x_{t-1} + (1 - \alpha)^2 x_{t-2} + ... + (1 - - \alpha)^t x_0}{1 + (1 - \alpha) + (1 - \alpha)^2 + ... + (1 - \alpha)^t} - - - When ``adjust=False``, the exponentially weighted function is calculated - recursively: - - .. math:: - \begin{split} - y_0 &= x_0\\ - y_t &= (1 - \alpha) y_{t-1} + \alpha x_t, - \end{split} - ignore_na : bool, default False - Ignore missing values when calculating weights. 
- - - When ``ignore_na=False`` (default), weights are based on absolute positions. - For example, the weights of :math:`x_0` and :math:`x_2` used in calculating - the final weighted average of [:math:`x_0`, None, :math:`x_2`] are - :math:`(1-\alpha)^2` and :math:`1` if ``adjust=True``, and - :math:`(1-\alpha)^2` and :math:`\alpha` if ``adjust=False``. - - - When ``ignore_na=True``, weights are based - on relative positions. For example, the weights of :math:`x_0` and :math:`x_2` - used in calculating the final weighted average of - [:math:`x_0`, None, :math:`x_2`] are :math:`1-\alpha` and :math:`1` if - ``adjust=True``, and :math:`1-\alpha` and :math:`\alpha` if ``adjust=False``. - - axis : {0, 1}, default 0 - If ``0`` or ``'index'``, calculate across the rows. - - If ``1`` or ``'columns'``, calculate across the columns. - - For `Series` this parameter is unused and defaults to 0. - - times : np.ndarray, Series, default None - - .. versionadded:: 1.1.0 - - Only applicable to ``mean()``. - - Times corresponding to the observations. Must be monotonically increasing and - ``datetime64[ns]`` dtype. - - If 1-D array like, a sequence with the same shape as the observations. - - method : str {'single', 'table'}, default 'single' - .. versionadded:: 1.4.0 - - Execute the rolling operation per single column or row (``'single'``) - or over the entire object (``'table'``). - - This argument is only implemented when specifying ``engine='numba'`` - in the method call. - - Only applicable to ``mean()`` - - Returns - ------- - ``ExponentialMovingWindow`` subclass - - See Also - -------- - rolling : Provides rolling window calculations. - expanding : Provides expanding transformations. - - Notes - ----- - See :ref:`Windowing Operations ` - for further usage details and examples. 
- - Examples - -------- - >>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}) - >>> df - B - 0 0.0 - 1 1.0 - 2 2.0 - 3 NaN - 4 4.0 - - >>> df.ewm(com=0.5).mean() - B - 0 0.000000 - 1 0.750000 - 2 1.615385 - 3 1.615385 - 4 3.670213 - >>> df.ewm(alpha=2 / 3).mean() - B - 0 0.000000 - 1 0.750000 - 2 1.615385 - 3 1.615385 - 4 3.670213 - - **adjust** - - >>> df.ewm(com=0.5, adjust=True).mean() - B - 0 0.000000 - 1 0.750000 - 2 1.615385 - 3 1.615385 - 4 3.670213 - >>> df.ewm(com=0.5, adjust=False).mean() - B - 0 0.000000 - 1 0.666667 - 2 1.555556 - 3 1.555556 - 4 3.650794 - - **ignore_na** - - >>> df.ewm(com=0.5, ignore_na=True).mean() - B - 0 0.000000 - 1 0.750000 - 2 1.615385 - 3 1.615385 - 4 3.225000 - >>> df.ewm(com=0.5, ignore_na=False).mean() - B - 0 0.000000 - 1 0.750000 - 2 1.615385 - 3 1.615385 - 4 3.670213 - - **times** - - Exponentially weighted mean with weights calculated with a timedelta ``halflife`` - relative to ``times``. - - >>> times = ['2020-01-01', '2020-01-03', '2020-01-10', '2020-01-15', '2020-01-17'] - >>> df.ewm(halflife='4 days', times=pd.DatetimeIndex(times)).mean() - B - 0 0.000000 - 1 0.585786 - 2 1.523889 - 3 1.523889 - 4 3.233686 - """ - _attributes = [ "com", "span", diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py index b3caa189bd579..4a22807ea56eb 100644 --- a/pandas/core/window/expanding.py +++ b/pandas/core/window/expanding.py @@ -7,13 +7,17 @@ Callable, ) -from pandas.util._decorators import doc +from pandas.util._decorators import ( + Appender, + doc, +) from pandas.core.indexers.objects import ( BaseIndexer, ExpandingIndexer, GroupbyIndexer, ) +from pandas.core.shared_docs import expanding_doc from pandas.core.window.doc import ( _shared_docs, create_section_header, @@ -44,77 +48,8 @@ from pandas.core.generic import NDFrame +@Appender(expanding_doc) class Expanding(RollingAndExpandingMixin): - """ - Provide expanding window calculations. 
- - Parameters - ---------- - min_periods : int, default 1 - Minimum number of observations in window required to have a value; - otherwise, result is ``np.nan``. - - axis : int or str, default 0 - If ``0`` or ``'index'``, roll across the rows. - - If ``1`` or ``'columns'``, roll across the columns. - - For `Series` this parameter is unused and defaults to 0. - - method : str {'single', 'table'}, default 'single' - Execute the rolling operation per single column or row (``'single'``) - or over the entire object (``'table'``). - - This argument is only implemented when specifying ``engine='numba'`` - in the method call. - - .. versionadded:: 1.3.0 - - Returns - ------- - ``Expanding`` subclass - - See Also - -------- - rolling : Provides rolling window calculations. - ewm : Provides exponential weighted functions. - - Notes - ----- - See :ref:`Windowing Operations ` for further usage details - and examples. - - Examples - -------- - >>> df = pd.DataFrame({"B": [0, 1, 2, np.nan, 4]}) - >>> df - B - 0 0.0 - 1 1.0 - 2 2.0 - 3 NaN - 4 4.0 - - **min_periods** - - Expanding sum with 1 vs 3 observations needed to calculate a value. 
- - >>> df.expanding(1).sum() - B - 0 0.0 - 1 1.0 - 2 3.0 - 3 3.0 - 4 7.0 - >>> df.expanding(3).sum() - B - 0 NaN - 1 NaN - 2 3.0 - 3 3.0 - 4 7.0 - """ - _attributes: list[str] = ["min_periods", "axis", "method"] def __init__( diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index b11ff11421ed4..19b5fecce7a5b 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -28,7 +28,10 @@ import pandas._libs.window.aggregations as window_aggregations from pandas.compat._optional import import_optional_dependency from pandas.errors import DataError -from pandas.util._decorators import doc +from pandas.util._decorators import ( + Appender, + doc, +) from pandas.core.dtypes.common import ( ensure_float64, @@ -65,6 +68,7 @@ TimedeltaIndex, ) from pandas.core.reshape.concat import concat +from pandas.core.shared_docs import window_doc from pandas.core.util.numba_ import ( get_jit_arguments, maybe_use_numba, @@ -868,257 +872,8 @@ def _gotitem(self, key, ndim, subset=None): return super()._gotitem(key, ndim, subset=subset) +@Appender(window_doc) class Window(BaseWindow): - """ - Provide rolling window calculations. - - Parameters - ---------- - window : int, timedelta, str, offset, or BaseIndexer subclass - Size of the moving window. - - If an integer, the fixed number of observations used for - each window. - - If a timedelta, str, or offset, the time period of each window. Each - window will be a variable sized based on the observations included in - the time-period. This is only valid for datetimelike indexes. - To learn more about the offsets & frequency strings, please see `this link - `__. - - If a BaseIndexer subclass, the window boundaries - based on the defined ``get_window_bounds`` method. Additional rolling - keyword arguments, namely ``min_periods``, ``center``, ``closed`` and - ``step`` will be passed to ``get_window_bounds``. 
- - min_periods : int, default None - Minimum number of observations in window required to have a value; - otherwise, result is ``np.nan``. - - For a window that is specified by an offset, ``min_periods`` will default to 1. - - For a window that is specified by an integer, ``min_periods`` will default - to the size of the window. - - center : bool, default False - If False, set the window labels as the right edge of the window index. - - If True, set the window labels as the center of the window index. - - win_type : str, default None - If ``None``, all points are evenly weighted. - - If a string, it must be a valid `scipy.signal window function - `__. - - Certain Scipy window types require additional parameters to be passed - in the aggregation function. The additional parameters must match - the keywords specified in the Scipy window type method signature. - - on : str, optional - For a DataFrame, a column label or Index level on which - to calculate the rolling window, rather than the DataFrame's index. - - Provided integer column is ignored and excluded from result since - an integer index is not used to calculate the rolling window. - - axis : int or str, default 0 - If ``0`` or ``'index'``, roll across the rows. - - If ``1`` or ``'columns'``, roll across the columns. - - For `Series` this parameter is unused and defaults to 0. - - closed : str, default None - If ``'right'``, the first point in the window is excluded from calculations. - - If ``'left'``, the last point in the window is excluded from calculations. - - If ``'both'``, the no points in the window are excluded from calculations. - - If ``'neither'``, the first and last points in the window are excluded - from calculations. - - Default ``None`` (``'right'``). - - .. versionchanged:: 1.2.0 - - The closed parameter with fixed windows is now supported. - - step : int, default None - - .. versionadded:: 1.5.0 - - Evaluate the window at every ``step`` result, equivalent to slicing as - ``[::step]``. 
``window`` must be an integer. Using a step argument other - than None or 1 will produce a result with a different shape than the input. - - method : str {'single', 'table'}, default 'single' - - .. versionadded:: 1.3.0 - - Execute the rolling operation per single column or row (``'single'``) - or over the entire object (``'table'``). - - This argument is only implemented when specifying ``engine='numba'`` - in the method call. - - Returns - ------- - ``Window`` subclass if a ``win_type`` is passed - - ``Rolling`` subclass if ``win_type`` is not passed - - See Also - -------- - expanding : Provides expanding transformations. - ewm : Provides exponential weighted functions. - - Notes - ----- - See :ref:`Windowing Operations ` for further usage details - and examples. - - Examples - -------- - >>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}) - >>> df - B - 0 0.0 - 1 1.0 - 2 2.0 - 3 NaN - 4 4.0 - - **window** - - Rolling sum with a window length of 2 observations. - - >>> df.rolling(2).sum() - B - 0 NaN - 1 1.0 - 2 3.0 - 3 NaN - 4 NaN - - Rolling sum with a window span of 2 seconds. - - >>> df_time = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}, - ... index = [pd.Timestamp('20130101 09:00:00'), - ... pd.Timestamp('20130101 09:00:02'), - ... pd.Timestamp('20130101 09:00:03'), - ... pd.Timestamp('20130101 09:00:05'), - ... pd.Timestamp('20130101 09:00:06')]) - - >>> df_time - B - 2013-01-01 09:00:00 0.0 - 2013-01-01 09:00:02 1.0 - 2013-01-01 09:00:03 2.0 - 2013-01-01 09:00:05 NaN - 2013-01-01 09:00:06 4.0 - - >>> df_time.rolling('2s').sum() - B - 2013-01-01 09:00:00 0.0 - 2013-01-01 09:00:02 1.0 - 2013-01-01 09:00:03 3.0 - 2013-01-01 09:00:05 NaN - 2013-01-01 09:00:06 4.0 - - Rolling sum with forward looking windows with 2 observations. 
- - >>> indexer = pd.api.indexers.FixedForwardWindowIndexer(window_size=2) - >>> df.rolling(window=indexer, min_periods=1).sum() - B - 0 1.0 - 1 3.0 - 2 2.0 - 3 4.0 - 4 4.0 - - **min_periods** - - Rolling sum with a window length of 2 observations, but only needs a minimum of 1 - observation to calculate a value. - - >>> df.rolling(2, min_periods=1).sum() - B - 0 0.0 - 1 1.0 - 2 3.0 - 3 2.0 - 4 4.0 - - **center** - - Rolling sum with the result assigned to the center of the window index. - - >>> df.rolling(3, min_periods=1, center=True).sum() - B - 0 1.0 - 1 3.0 - 2 3.0 - 3 6.0 - 4 4.0 - - >>> df.rolling(3, min_periods=1, center=False).sum() - B - 0 0.0 - 1 1.0 - 2 3.0 - 3 3.0 - 4 6.0 - - **step** - - Rolling sum with a window length of 2 observations, minimum of 1 observation to - calculate a value, and a step of 2. - - >>> df.rolling(2, min_periods=1, step=2).sum() - B - 0 0.0 - 2 3.0 - 4 4.0 - - **win_type** - - Rolling sum with a window length of 2, using the Scipy ``'gaussian'`` - window type. ``std`` is required in the aggregation function. - - >>> df.rolling(2, win_type='gaussian').sum(std=3) - B - 0 NaN - 1 0.986207 - 2 2.958621 - 3 NaN - 4 NaN - - **on** - - Rolling sum with a window length of 2 days. - - >>> df = pd.DataFrame({ - ... 'A': [pd.to_datetime('2020-01-01'), - ... pd.to_datetime('2020-01-01'), - ... pd.to_datetime('2020-01-02'),], - ... 'B': [1, 2, 3], }, - ... 
index=pd.date_range('2020', periods=3)) - - >>> df - A B - 2020-01-01 2020-01-01 1 - 2020-01-02 2020-01-01 2 - 2020-01-03 2020-01-02 3 - - >>> df.rolling('2D', on='A').sum() - A B - 2020-01-01 2020-01-01 1.0 - 2020-01-02 2020-01-01 3.0 - 2020-01-03 2020-01-02 6.0 - """ - _attributes = [ "window", "min_periods", From 278a469792226cda03797d22881cfb06f1003dff Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 7 Mar 2023 13:57:23 -0800 Subject: [PATCH 2/7] Fix namespace tests --- pandas/__init__.py | 9 ++++++++- pandas/tests/api/test_api.py | 4 ++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/pandas/__init__.py b/pandas/__init__.py index dd81a661af42b..b12c1a56978ff 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -188,7 +188,14 @@ def __getattr__(name: str): import pandas.core.groupby return getattr(pandas.core.groupby, name) - raise AttributeError(name) + raise AttributeError(f"module 'pandas' has no attribute '{name}'") + + +def __dir__() -> list[str]: + # include lazy imports defined in __getattr__ in dir() + base = list(globals().keys()) + result = base + ["Grouper", "NamedAgg"] + return result # module level doc-string diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 94430e23b054a..7134463271e0f 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -27,6 +27,7 @@ class TestPDApi(Base): # these are optionally imported based on testing # & need to be ignored ignored = ["tests", "locale", "conftest"] + lazy = ["Grouper", "NamedAgg"] # top-level sub-packages public_lib = [ @@ -201,6 +202,9 @@ def test_api(self): + self.private_modules ) self.check(namespace=pd, expected=checkthese, ignored=self.ignored) + for name in self.lazy: + getattr(pd, name) + assert name in dir(pd) def test_api_all(self): expected = set( From 2b8a4e58eed9f05e733342d3c55629a2729d335e Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 8 Mar 2023 16:10:05 -0800 Subject: [PATCH 3/7] mypy fixup --- 
pandas/core/groupby/ops.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 0acc7fe29b5db..15ff654c36560 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -823,7 +823,12 @@ def groups(self) -> dict[Hashable, np.ndarray]: if len(self.groupings) == 1: return self.groupings[0].groups else: - to_groupby = zip(*(ping.grouping_vector for ping in self.groupings)) + # error: Cannot determine type of "grouping_vector" + pings = ( + ping.grouping_vector # type: ignore[has-type] + for ping in self.groupings + ) + to_groupby = zip(*pings) index = Index(to_groupby) return self.axis.groupby(index) From b076299660b27251205da541f01ea2d2ee0f831a Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 10 Mar 2023 10:37:18 -0800 Subject: [PATCH 4/7] pyright fixup --- pandas/__init__.py | 4 ++-- pandas/core/api.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/__init__.py b/pandas/__init__.py index 65d56f7c0cdf1..f8c2d259e8478 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -257,7 +257,7 @@ def __dir__() -> list[str]: "Flags", "Float32Dtype", "Float64Dtype", - "Grouper", + "Grouper", # pyright: ignore[reportUnsupportedDunderAll] "HDFStore", "Index", "IndexSlice", @@ -271,7 +271,7 @@ def __dir__() -> list[str]: "MultiIndex", "NA", "NaT", - "NamedAgg", + "NamedAgg", # pyright: ignore[reportUnsupportedDunderAll] "Period", "PeriodDtype", "PeriodIndex", diff --git a/pandas/core/api.py b/pandas/core/api.py index 9eb2521f055bf..bb333debca5d4 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -104,7 +104,7 @@ def __getattr__(name: str): "Flags", "Float32Dtype", "Float64Dtype", - "Grouper", + "Grouper", # pyright: ignore[reportUnsupportedDunderAll] "Index", "IndexSlice", "Int16Dtype", @@ -119,7 +119,7 @@ def __getattr__(name: str): "isnull", "MultiIndex", "NA", - "NamedAgg", + "NamedAgg", # pyright: ignore[reportUnsupportedDunderAll] "NaT", 
"notna", "notnull", From a2a5b8d545b13e6d0886e2654d24eedcc585be4b Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 10 Mar 2023 13:30:01 -0800 Subject: [PATCH 5/7] pylint fixup --- pandas/__init__.py | 4 ++-- pandas/core/api.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/__init__.py b/pandas/__init__.py index f8c2d259e8478..9f181f08b678a 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -257,7 +257,7 @@ def __dir__() -> list[str]: "Flags", "Float32Dtype", "Float64Dtype", - "Grouper", # pyright: ignore[reportUnsupportedDunderAll] + "Grouper", # pyright: ignore[reportUnsupportedDunderAll] # pylint: disable=undefined-all-variable # noqa:E501 "HDFStore", "Index", "IndexSlice", @@ -271,7 +271,7 @@ def __dir__() -> list[str]: "MultiIndex", "NA", "NaT", - "NamedAgg", # pyright: ignore[reportUnsupportedDunderAll] + "NamedAgg", # pyright: ignore[reportUnsupportedDunderAll] # pylint: disable=undefined-all-variable # noqa:E501 "Period", "PeriodDtype", "PeriodIndex", diff --git a/pandas/core/api.py b/pandas/core/api.py index bb333debca5d4..6d0d8d81feeb0 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -104,7 +104,7 @@ def __getattr__(name: str): "Flags", "Float32Dtype", "Float64Dtype", - "Grouper", # pyright: ignore[reportUnsupportedDunderAll] + "Grouper", # pyright: ignore[reportUnsupportedDunderAll] # pylint: disable=undefined-all-variable # noqa:E501 "Index", "IndexSlice", "Int16Dtype", @@ -119,7 +119,7 @@ def __getattr__(name: str): "isnull", "MultiIndex", "NA", - "NamedAgg", # pyright: ignore[reportUnsupportedDunderAll] + "NamedAgg", # pyright: ignore[reportUnsupportedDunderAll] # pylint: disable=undefined-all-variable # noqa:E501 "NaT", "notna", "notnull", From 9c54f0c938bd67dc2eeb205ad780a7da2488dc20 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 18 Mar 2023 08:36:54 -0700 Subject: [PATCH 6/7] restore core.api imports --- pandas/core/api.py | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) 
diff --git a/pandas/core/api.py b/pandas/core/api.py index 6d0d8d81feeb0..c0b828d9330b4 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -44,6 +44,10 @@ from pandas.core.arrays.string_ import StringDtype from pandas.core.construction import array from pandas.core.flags import Flags +from pandas.core.groupby import ( + Grouper, + NamedAgg, +) from pandas.core.indexes.api import ( CategoricalIndex, DatetimeIndex, @@ -76,18 +80,7 @@ # DataFrame needs to be imported after NamedAgg to avoid a circular import from pandas.core.frame import DataFrame # isort:skip - -def __getattr__(name: str): - # Lazify these so that we can avoid importing groupby at import-time - if name in ("Grouper", "NamedAgg"): - import pandas.core.groupby - - return getattr(pandas.core.groupby, name) - else: - raise AttributeError(name) - - -__all__ = [ # noqa: F822 linter doesn't recognize lazy imports +__all__ = [ "array", "ArrowDtype", "bdate_range", @@ -104,7 +97,7 @@ def __getattr__(name: str): "Flags", "Float32Dtype", "Float64Dtype", - "Grouper", # pyright: ignore[reportUnsupportedDunderAll] # pylint: disable=undefined-all-variable # noqa:E501 + "Grouper", "Index", "IndexSlice", "Int16Dtype", @@ -119,7 +112,7 @@ def __getattr__(name: str): "isnull", "MultiIndex", "NA", - "NamedAgg", # pyright: ignore[reportUnsupportedDunderAll] # pylint: disable=undefined-all-variable # noqa:E501 + "NamedAgg", "NaT", "notna", "notnull", From ba1441fb1de3e970f2bfd76c5a041167d278108f Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 20 Mar 2023 09:19:36 -0700 Subject: [PATCH 7/7] mypy fixup --- pandas/core/groupby/ops.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 15ff654c36560..aae773e992d29 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -823,11 +823,7 @@ def groups(self) -> dict[Hashable, np.ndarray]: if len(self.groupings) == 1: return self.groupings[0].groups else: - # error: Cannot 
determine type of "grouping_vector" - pings = ( - ping.grouping_vector # type: ignore[has-type] - for ping in self.groupings - ) + pings = (ping.grouping_vector for ping in self.groupings) to_groupby = zip(*pings) index = Index(to_groupby) return self.axis.groupby(index)