From 68c7064a851094c24b99870b49db51d2e797f6cf Mon Sep 17 00:00:00 2001 From: tp Date: Tue, 17 Dec 2019 21:47:35 +0000 Subject: [PATCH 1/3] move NDFrame.groupby to (DataFrame|Series).groupby --- pandas/core/frame.py | 79 +++++++++++++++++++++++++++++++++++++- pandas/core/generic.py | 86 ++++-------------------------------------- pandas/core/series.py | 85 ++++++++++++++++++++++++++++++++++++++++- 3 files changed, 169 insertions(+), 81 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1de0d3b58dc5f..bef6d90f00ba8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -94,7 +94,8 @@ ) from pandas.core.dtypes.missing import isna, notna -from pandas.core import algorithms, common as com, nanops, ops +from pandas._typing import Axes, Dtype, FilePathOrBuffer +from pandas.core import algorithms, common as com, groupby, nanops, ops from pandas.core.accessor import CachedAccessor from pandas.core.arrays import Categorical, ExtensionArray from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray @@ -5598,6 +5599,82 @@ def update( # ---------------------------------------------------------------------- # Data reshaping + @Appender( + """ +Examples +-------- +>>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon', +... 'Parrot', 'Parrot'], +... 'Max Speed': [380., 370., 24., 26.]}) +>>> df + Animal Max Speed +0 Falcon 380.0 +1 Falcon 370.0 +2 Parrot 24.0 +3 Parrot 26.0 +>>> df.groupby(['Animal']).mean() + Max Speed +Animal +Falcon 375.0 +Parrot 25.0 + +**Hierarchical Indexes** + +We can groupby different levels of a hierarchical index +using the `level` parameter: + +>>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'], +... ['Captive', 'Wild', 'Captive', 'Wild']] +>>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type')) +>>> df = pd.DataFrame({'Max Speed': [390., 350., 30., 20.]}, +... 
index=index) +>>> df + Max Speed +Animal Type +Falcon Captive 390.0 + Wild 350.0 +Parrot Captive 30.0 + Wild 20.0 +>>> df.groupby(level=0).mean() + Max Speed +Animal +Falcon 370.0 +Parrot 25.0 +>>> df.groupby(level="Type").mean() + Max Speed +Type +Captive 210.0 +Wild 185.0 +""" + ) + @Appender(_shared_docs["groupby"] % _shared_doc_kwargs) + def groupby( + self, + by=None, + axis=0, + level=None, + as_index: bool = True, + sort: bool = True, + group_keys: bool = True, + squeeze: bool = False, + observed: bool = False, + ) -> "groupby.DataFrameGroupBy": + + if level is None and by is None: + raise TypeError("You have to supply one of 'by' and 'level'") + axis = self._get_axis_number(axis) + + return groupby.DataFrameGroupBy( + obj=self, + keys=by, + axis=axis, + level=level, + as_index=as_index, + sort=sort, + group_keys=group_keys, + squeeze=squeeze, + observed=observed, + ) _shared_docs[ "pivot" diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b896721469f1f..d8da0af413eca 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7273,19 +7273,10 @@ def clip( return result - def groupby( - self, - by=None, - axis=0, - level=None, - as_index: bool_t = True, - sort: bool_t = True, - group_keys: bool_t = True, - squeeze: bool_t = False, - observed: bool_t = False, - ): - """ - Group DataFrame or Series using a mapper or by a Series of columns. + _shared_docs[ + "groupby" + ] = """ + Group %(klass)s using a mapper or by a Series of columns. A groupby operation involves some combination of splitting the object, applying a function, and combining the results. This can be @@ -7330,9 +7321,8 @@ def groupby( Returns ------- - DataFrameGroupBy or SeriesGroupBy - Depends on the calling object and returns groupby object that - contains information about the groups. + %(klass)sGroupBy + Returns a groupby object that contains information about the groups. See Also -------- @@ -7343,69 +7333,7 @@ def groupby( ----- See the `user guide <https://pandas.pydata.org/pandas-docs/stable/groupby.html>`_ for more. 
- - Examples - -------- - >>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon', - ... 'Parrot', 'Parrot'], - ... 'Max Speed': [380., 370., 24., 26.]}) - >>> df - Animal Max Speed - 0 Falcon 380.0 - 1 Falcon 370.0 - 2 Parrot 24.0 - 3 Parrot 26.0 - >>> df.groupby(['Animal']).mean() - Max Speed - Animal - Falcon 375.0 - Parrot 25.0 - - **Hierarchical Indexes** - - We can groupby different levels of a hierarchical index - using the `level` parameter: - - >>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'], - ... ['Captive', 'Wild', 'Captive', 'Wild']] - >>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type')) - >>> df = pd.DataFrame({'Max Speed': [390., 350., 30., 20.]}, - ... index=index) - >>> df - Max Speed - Animal Type - Falcon Captive 390.0 - Wild 350.0 - Parrot Captive 30.0 - Wild 20.0 - >>> df.groupby(level=0).mean() - Max Speed - Animal - Falcon 370.0 - Parrot 25.0 - >>> df.groupby(level=1).mean() - Max Speed - Type - Captive 210.0 - Wild 185.0 - """ - from pandas.core.groupby.groupby import get_groupby - - if level is None and by is None: - raise TypeError("You have to supply one of 'by' and 'level'") - axis = self._get_axis_number(axis) - - return get_groupby( - self, - by=by, - axis=axis, - level=level, - as_index=as_index, - sort=sort, - group_keys=group_keys, - squeeze=squeeze, - observed=observed, - ) + """ def asfreq( self, diff --git a/pandas/core/series.py b/pandas/core/series.py index 36e26e088935c..1c1c548b1f434 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -47,7 +47,7 @@ ) import pandas as pd -from pandas.core import algorithms, base, generic, nanops, ops +from pandas.core import algorithms, base, generic, groupby, nanops, ops from pandas.core.accessor import CachedAccessor from pandas.core.arrays import ExtensionArray, try_cast_to_ea from pandas.core.arrays.categorical import Categorical, CategoricalAccessor @@ -1568,6 +1568,89 @@ def _set_name(self, name, inplace=False): ser.name = name return ser + 
@Appender( + """ +Examples +-------- +>>> ser = pd.Series([390., 350., 30., 20.], +... index=['Falcon', 'Falcon', 'Parrot', 'Parrot'], name="Max Speed") +>>> ser +Falcon 390.0 +Falcon 350.0 +Parrot 30.0 +Parrot 20.0 +Name: Max Speed, dtype: float64 +>>> ser.groupby(["a", "b", "a", "b"]).mean() +a 210.0 +b 185.0 +Name: Max Speed, dtype: float64 +>>> ser.groupby(level=0).mean() +Falcon 370.0 +Parrot 25.0 +Name: Max Speed, dtype: float64 +>>> ser.groupby(ser > 100).mean() +Max Speed +False 25.0 +True 370.0 +Name: Max Speed, dtype: float64 + +**Grouping by Indexes** + +We can groupby different levels of a hierarchical index +using the `level` parameter: + +>>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'], +... ['Captive', 'Wild', 'Captive', 'Wild']] +>>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type')) +>>> ser = pd.Series([390., 350., 30., 20.], index=index, name="Max Speed") +>>> ser +Animal Type +Falcon Captive 390.0 + Wild 350.0 +Parrot Captive 30.0 + Wild 20.0 +Name: Max Speed, dtype: float64 +>>> ser.groupby(level=0).mean() +Animal +Falcon 370.0 +Parrot 25.0 +Name: Max Speed, dtype: float64 +>>> ser.groupby(level="Type").mean() +Type +Captive 210.0 +Wild 185.0 +Name: Max Speed, dtype: float64 +""" + ) + @Appender(generic._shared_docs["groupby"] % _shared_doc_kwargs) + def groupby( + self, + by=None, + axis=0, + level=None, + as_index: bool = True, + sort: bool = True, + group_keys: bool = True, + squeeze: bool = False, + observed: bool = False, + ) -> "groupby.SeriesGroupBy": + + if level is None and by is None: + raise TypeError("You have to supply one of 'by' and 'level'") + axis = self._get_axis_number(axis) + + return groupby.SeriesGroupBy( + obj=self, + keys=by, + axis=axis, + level=level, + as_index=as_index, + sort=sort, + group_keys=group_keys, + squeeze=squeeze, + observed=observed, + ) + # ---------------------------------------------------------------------- # Statistics, overridden ndarray methods From 
7a2e0837b62bbf602344926a8296168b80b359db Mon Sep 17 00:00:00 2001 From: tp Date: Wed, 18 Dec 2019 02:20:21 +0000 Subject: [PATCH 2/3] fix mypy errors --- pandas/core/frame.py | 7 ++++--- pandas/core/reshape/merge.py | 5 ++++- pandas/core/series.py | 7 ++++--- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index bef6d90f00ba8..0c084250b6fe9 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -95,12 +95,13 @@ from pandas.core.dtypes.missing import isna, notna from pandas._typing import Axes, Dtype, FilePathOrBuffer -from pandas.core import algorithms, common as com, groupby, nanops, ops +from pandas.core import algorithms, common as com, nanops, ops from pandas.core.accessor import CachedAccessor from pandas.core.arrays import Categorical, ExtensionArray from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray from pandas.core.arrays.sparse import SparseFrameAccessor from pandas.core.generic import NDFrame, _shared_docs +from pandas.core.groupby import generic as grp_generic from pandas.core.indexes import base as ibase from pandas.core.indexes.api import Index, ensure_index, ensure_index_from_sequences from pandas.core.indexes.datetimes import DatetimeIndex @@ -5658,13 +5659,13 @@ def groupby( group_keys: bool = True, squeeze: bool = False, observed: bool = False, - ) -> "groupby.DataFrameGroupBy": + ) -> "grp_generic.DataFrameGroupBy": if level is None and by is None: raise TypeError("You have to supply one of 'by' and 'level'") axis = self._get_axis_number(axis) - return groupby.DataFrameGroupBy( + return grp_generic.DataFrameGroupBy( obj=self, keys=by, axis=axis, diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 37ec05c40940e..6b46c6c534058 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -41,6 +41,8 @@ from pandas.core.dtypes.missing import isna, na_value_for_dtype from pandas import Categorical, 
Index, MultiIndex +from pandas._typing import FrameOrSeries +from pandas.core import groupby import pandas.core.algorithms as algos from pandas.core.arrays.categorical import _recode_for_categories import pandas.core.common as com @@ -113,6 +115,7 @@ def _groupby_and_merge( by = [by] lby = left.groupby(by, sort=False) + rby: Optional[groupby.DataFrameGroupBy] = None # if we can groupby the rhs # then we can get vastly better perf @@ -132,7 +135,7 @@ def _groupby_and_merge( try: rby = right.groupby(by, sort=False) except KeyError: - rby = None + pass for key, lhs in lby: diff --git a/pandas/core/series.py b/pandas/core/series.py index 1c1c548b1f434..1336c6b58e1c2 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -47,7 +47,7 @@ ) import pandas as pd -from pandas.core import algorithms, base, generic, groupby, nanops, ops +from pandas.core import algorithms, base, generic, nanops, ops from pandas.core.accessor import CachedAccessor from pandas.core.arrays import ExtensionArray, try_cast_to_ea from pandas.core.arrays.categorical import Categorical, CategoricalAccessor @@ -60,6 +60,7 @@ sanitize_array, ) from pandas.core.generic import _shared_docs +from pandas.core.groupby import generic as grp_generic from pandas.core.indexers import maybe_convert_indices from pandas.core.indexes.accessors import CombinedDatetimelikeProperties from pandas.core.indexes.api import ( @@ -1633,13 +1634,13 @@ def groupby( group_keys: bool = True, squeeze: bool = False, observed: bool = False, - ) -> "groupby.SeriesGroupBy": + ) -> "grp_generic.SeriesGroupBy": if level is None and by is None: raise TypeError("You have to supply one of 'by' and 'level'") axis = self._get_axis_number(axis) - return groupby.SeriesGroupBy( + return grp_generic.SeriesGroupBy( obj=self, keys=by, axis=axis, From 9ada87c12d855348767ec384113121f37724d051 Mon Sep 17 00:00:00 2001 From: tp Date: Wed, 18 Dec 2019 20:36:02 +0000 Subject: [PATCH 3/3] rename grp_generic to groupby_generic --- 
pandas/core/frame.py | 7 +++---- pandas/core/reshape/merge.py | 1 - pandas/core/series.py | 9 ++++----- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0c084250b6fe9..7d71c3bfb0368 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -94,14 +94,13 @@ ) from pandas.core.dtypes.missing import isna, notna -from pandas._typing import Axes, Dtype, FilePathOrBuffer from pandas.core import algorithms, common as com, nanops, ops from pandas.core.accessor import CachedAccessor from pandas.core.arrays import Categorical, ExtensionArray from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray from pandas.core.arrays.sparse import SparseFrameAccessor from pandas.core.generic import NDFrame, _shared_docs -from pandas.core.groupby import generic as grp_generic +from pandas.core.groupby import generic as groupby_generic from pandas.core.indexes import base as ibase from pandas.core.indexes.api import Index, ensure_index, ensure_index_from_sequences from pandas.core.indexes.datetimes import DatetimeIndex @@ -5659,13 +5658,13 @@ def groupby( group_keys: bool = True, squeeze: bool = False, observed: bool = False, - ) -> "grp_generic.DataFrameGroupBy": + ) -> "groupby_generic.DataFrameGroupBy": if level is None and by is None: raise TypeError("You have to supply one of 'by' and 'level'") axis = self._get_axis_number(axis) - return grp_generic.DataFrameGroupBy( + return groupby_generic.DataFrameGroupBy( obj=self, keys=by, axis=axis, diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 6b46c6c534058..a7471cc646777 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -41,7 +41,6 @@ from pandas.core.dtypes.missing import isna, na_value_for_dtype from pandas import Categorical, Index, MultiIndex -from pandas._typing import FrameOrSeries from pandas.core import groupby import pandas.core.algorithms as algos from 
pandas.core.arrays.categorical import _recode_for_categories diff --git a/pandas/core/series.py b/pandas/core/series.py index 1336c6b58e1c2..aa5af9bb893fa 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -59,8 +59,7 @@ is_empty_data, sanitize_array, ) -from pandas.core.generic import _shared_docs -from pandas.core.groupby import generic as grp_generic +from pandas.core.groupby import generic as groupby_generic from pandas.core.indexers import maybe_convert_indices from pandas.core.indexes.accessors import CombinedDatetimelikeProperties from pandas.core.indexes.api import ( @@ -1432,7 +1431,7 @@ def to_string( """ ) @Substitution(klass="Series") - @Appender(_shared_docs["to_markdown"]) + @Appender(generic._shared_docs["to_markdown"]) def to_markdown( self, buf: Optional[IO[str]] = None, mode: Optional[str] = None, **kwargs, ) -> Optional[str]: @@ -1634,13 +1633,13 @@ def groupby( group_keys: bool = True, squeeze: bool = False, observed: bool = False, - ) -> "grp_generic.SeriesGroupBy": + ) -> "groupby_generic.SeriesGroupBy": if level is None and by is None: raise TypeError("You have to supply one of 'by' and 'level'") axis = self._get_axis_number(axis) - return grp_generic.SeriesGroupBy( + return groupby_generic.SeriesGroupBy( obj=self, keys=by, axis=axis,