diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index b5a382ce24342..42f49b85bb481 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -1002,6 +1002,67 @@ See the :ref:`visualization documentation` for more. to ``df.boxplot(by="g")``. See :ref:`here` for an explanation. + +.. _groupby.pipe: + +Piping function calls +~~~~~~~~~~~~~~~~~~~~~ + +.. versionadded:: 0.17.0 + +Similar to the functionality provided by ``DataFrames`` and ``Series``, functions +that take ``GroupBy`` objects can be chained together using a ``pipe`` method to +allow for a cleaner, more readable syntax. + +Imagine that one had functions f, g, and h that each takes a ``DataFrameGroupBy`` +as well as a single argument and returns a ``DataFrameGroupBy``, and one wanted +to apply these functions in succession to a grouped DataFrame. Instead of having +to deeply compose these functions and their arguments, such as: + +.. code-block:: python + + >>> h(g(f(df.groupby('group'), arg1), arg2), arg3) + +one can write the following: + +.. code-block:: python + + >>> (df + .groupby('group') + .pipe(f, arg1) + .pipe(g, arg2) + .pipe(h, arg3)) + +For a more concrete example, imagine one wanted to group a DataFrame by column +'A' and the user wanted to take the square of the difference between the maximum +value of 'B' in each group and the overall minimum value of 'C' (across all +groups). One could write this as a pipeline of functions applied to the original +dataframe: + +.. code-block:: python + + def f(dfgb): + """ + Take a DataFrameGroupBy and return a Series + where each value corresponds to the maximum + value of column 'B' in each group minus the + global minimum of column 'C'. + """ + return dfgb.B.max() - dfgb.C.min().min() + + def square(srs): + """ + Take a Series and transform it by + squaring each value. + """ + return srs ** 2 + + res = df.groupby('A').pipe(f).pipe(square) + + +For more details on pipeline functionality, see :ref:`here`. 
+ + Examples -------- diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 3b3bf8cffe41b..9fc50219d0bc9 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -468,6 +468,9 @@ Other enhancements - ``pd.read_csv`` can now read bz2-compressed files incrementally, and the C parser can read bz2-compressed files from AWS S3 (:issue:`11070`, :issue:`11072`). +- ``GroupBy`` objects now have a ``pipe`` method, similar to the one on ``DataFrame`` and ``Series``, that allows functions that take a ``GroupBy`` to be composed in a clean, readable syntax. See the :ref:`documentation <groupby.pipe>` for more. + + .. _whatsnew_0170.api: .. _whatsnew_0170.api_breaking: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6aec297c31d2b..7c33342ef21a2 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -26,6 +26,7 @@ AbstractMethodError) import pandas.core.nanops as nanops from pandas.util.decorators import Appender, Substitution, deprecate_kwarg +from pandas.tools.util import _pipe from pandas.core import config @@ -2169,7 +2170,7 @@ def sample(self, n=None, frac=None, replace=False, weights=None, random_state=No ----- Use ``.pipe`` when chaining together functions that expect - on Series or DataFrames. Instead of writing + on Series, DataFrames, or GroupBys. 
Instead of writing >>> f(g(h(df), arg1=a), arg2=b, arg3=c) @@ -2191,6 +2192,7 @@ def sample(self, n=None, frac=None, replace=False, weights=None, random_state=No See Also -------- + pandas.GroupBy.pipe pandas.DataFrame.apply pandas.DataFrame.applymap pandas.Series.map @@ -2198,15 +2200,7 @@ def sample(self, n=None, frac=None, replace=False, weights=None, random_state=No ) @Appender(_shared_docs['pipe'] % _shared_doc_kwargs) def pipe(self, func, *args, **kwargs): - if isinstance(func, tuple): - func, target = func - if target in kwargs: - msg = '%s is both the pipe target and a keyword argument' % target - raise ValueError(msg) - kwargs[target] = self - return func(*args, **kwargs) - else: - return func(self, *args, **kwargs) + return _pipe(self, func, *args, **kwargs) #---------------------------------------------------------------------- # Attribute access diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index f34fd6e3d2575..21c70d80a9bc5 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -14,13 +14,14 @@ from pandas.core.base import PandasObject from pandas.core.categorical import Categorical from pandas.core.frame import DataFrame -from pandas.core.generic import NDFrame +from pandas.core.generic import NDFrame, _pipe from pandas.core.index import Index, MultiIndex, CategoricalIndex, _ensure_index from pandas.core.internals import BlockManager, make_block from pandas.core.series import Series from pandas.core.panel import Panel from pandas.util.decorators import (cache_readonly, Appender, make_signature, deprecate_kwarg) +from pandas.tools.util import _pipe import pandas.core.algorithms as algos import pandas.core.common as com from pandas.core.common import(_possibly_downcast_to_dtype, isnull, @@ -1076,6 +1077,59 @@ def tail(self, n=5): tail = obj[in_tail] return tail + def pipe(self, func, *args, **kwargs): + """ Apply a function with arguments to this GroupBy object + + .. 
versionadded:: 0.17.0 + + Parameters + ---------- + func : callable or tuple of (callable, string) + Function to apply to this GroupBy or, alternatively, a + ``(callable, data_keyword)`` tuple where ``data_keyword`` is a + string indicating the keyword of ``callable`` that expects the + GroupBy object. + args : iterable, optional + positional arguments passed into ``func``. + kwargs : any, dictionary + a dictionary of keyword arguments passed into ``func``. + + Returns + ------- + object : the return type of ``func``. + + Notes + ----- + + Use ``.pipe`` when chaining together functions that expect + a GroupBy, or when alternating between functions that take + a DataFrame and a GroupBy. + + Assuming that one has a function f that takes and returns + a DataFrameGroupBy, a function g that takes a DataFrameGroupBy + and returns a DataFrame, and a function h that takes a DataFrame, + instead of having to write: + + >>> h(g(f(df.groupby('group'), arg1), arg2), arg3) + + You can write + + >>> (df + ... .groupby('group') + ... .pipe(f, arg1) + ... .pipe(g, arg2) + ... 
.pipe(h, arg3)) + + + See Also + -------- + pandas.Series.pipe + pandas.DataFrame.pipe + pandas.GroupBy.apply + """ + return _pipe(self, func, *args, **kwargs) + + def _cumcount_array(self, arr=None, ascending=True): """ arr is where cumcount gets its values from diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 97b57690ccc49..752e0ed515cd3 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -5159,7 +5159,7 @@ def test_tab_completion(self): 'resample', 'cummin', 'fillna', 'cumsum', 'cumcount', 'all', 'shift', 'skew', 'bfill', 'ffill', 'take', 'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith', - 'cov', 'dtypes', 'diff', 'idxmax', 'idxmin' + 'cov', 'dtypes', 'diff', 'idxmax', 'idxmin', 'pipe' ]) self.assertEqual(results, expected) @@ -5467,6 +5467,7 @@ def test_func(x): expected = DataFrame() tm.assert_frame_equal(result, expected) + def test_first_last_max_min_on_time_data(self): # GH 10295 # Verify that NaT is not in the result of max, min, first and last on @@ -5512,6 +5513,66 @@ def test_sort(x): g.apply(test_sort) + def test_pipe(self): + # Test the pipe method of DataFrameGroupBy. + # Issue #10353 + + random_state = np.random.RandomState(1234567890) + + df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B': random_state.randn(8), + 'C': random_state.randn(8)}) + + def f(dfgb): + return dfgb.B.max() - dfgb.C.min().min() + + def square(srs): + return srs ** 2 + + # Note that the transformations are + # GroupBy -> Series + # Series -> Series + # This then chains the GroupBy.pipe and the + # NDFrame.pipe methods + res = df.groupby('A').pipe(f).pipe(square) + + index = Index([u'bar', u'foo'], dtype='object', name=u'A') + expected = pd.Series([8.99110003361, 8.17516964785], name='B', index=index) + + assert_series_equal(expected, res) + + + def test_pipe_args(self): + # Test passing args to the pipe method of DataFrameGroupBy. 
+ # Issue #10353 + + df = pd.DataFrame({'group': ['A', 'A', 'B', 'B', 'C'], + 'x': [1.0, 2.0, 3.0, 2.0, 5.0], + 'y': [10.0, 100.0, 1000.0, -100.0, -1000.0]}) + + def f(dfgb, arg1): + return dfgb.filter(lambda grp: grp.y.mean() > arg1, dropna=False).groupby(dfgb.grouper) + + def g(dfgb, arg2): + return dfgb.sum() / dfgb.sum().sum() + arg2 + + def h(df, arg3): + return df.x + df.y - arg3 + + res = (df + .groupby('group') + .pipe(f, 0) + .pipe(g, 10) + .pipe(h, 100)) + + # Assert the results here + index = pd.Index(['A', 'B', 'C'], name='group') + expected = pd.Series([-79.5160891089, -78.4839108911, None], index=index) + + assert_series_equal(expected, res) + + def assert_fp_equal(a, b): assert (np.abs(a - b) < 1e-12).all() diff --git a/pandas/tools/util.py b/pandas/tools/util.py index 0bb6b4b7f7892..54ddfd13edf70 100644 --- a/pandas/tools/util.py +++ b/pandas/tools/util.py @@ -48,3 +48,25 @@ def compose(*funcs): """Compose 2 or more callables""" assert len(funcs) > 1, 'At least 2 callables must be passed to compose' return reduce(_compose2, funcs) + + +def _pipe(obj, func, *args, **kwargs): + """ + Apply a function to an obj either by + passing the obj as the first argument + to the function or, in the case that + the func is a tuple, interpret the first + element of the tuple as a function and + pass the obj to that function as a keyword + argument whose key is the value of the + second element of the tuple + """ + if isinstance(func, tuple): + func, target = func + if target in kwargs: + msg = '%s is both the pipe target and a keyword argument' % target + raise ValueError(msg) + kwargs[target] = obj + return func(*args, **kwargs) + else: + return func(obj, *args, **kwargs)