diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index e4dd82afcdf65..4f116a42253e5 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -946,23 +946,6 @@ that is itself a series, and possibly upcast the result to a DataFrame: So depending on the path taken, and exactly what you are grouping. Thus the grouped columns(s) may be included in the output as well as set the indices. -.. warning:: - - In the current implementation apply calls func twice on the - first group to decide whether it can take a fast or slow code - path. This can lead to unexpected behavior if func has - side-effects, as they will take effect twice for the first - group. - - .. ipython:: python - - d = pd.DataFrame({"a": ["x", "y"], "b": [1, 2]}) - def identity(df): - print(df) - return df - - d.groupby("a").apply(identity) - Other useful features --------------------- diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 0c3ef237126c2..561562f367db2 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -73,6 +73,50 @@ is respected in indexing. (:issue:`24076`, :issue:`16785`) df = pd.DataFrame([0], index=pd.DatetimeIndex(['2019-01-01'], tz='US/Pacific')) df['2019-01-01 12:00:00+04:00':'2019-01-01 13:00:00+04:00'] +.. _whatsnew_0250.api_breaking.groupby_apply_first_group_once: + +GroupBy.apply on ``DataFrame`` evaluates first group only once +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The implementation of :meth:`DataFrameGroupBy.apply() ` +previously evaluated the supplied function consistently twice on the first group +to infer if it is safe to use a fast code path. Particularly for functions with +side effects, this was an undesired behavior and may have led to surprises. + +(:issue:`2936`, :issue:`2656`, :issue:`7739`, :issue:`10519`, :issue:`12155`, +:issue:`20084`, :issue:`21417`) + +Now every group is evaluated only a single time. + +.. ipython:: python + + df = pd.DataFrame({"a": ["x", "y"], "b": [1, 2]}) + df + + def func(group): + print(group.name) + return group + +*Previous Behaviour*: + +.. code-block:: python + + In [3]: df.groupby('a').apply(func) + x + x + y + Out[3]: + a b + 0 x 1 + 1 y 2 + +*New Behaviour*: + +.. ipython:: python + + df.groupby("a").apply(func) + + Concatenating Sparse Values ^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -83,14 +127,14 @@ Series or DataFrame with sparse values, rather than a ``SparseDataFrame`` (:issu df = pd.DataFrame({"A": pd.SparseArray([0, 1])}) -*Previous Behavior:* +*Previous Behavior*: .. code-block:: ipython In [2]: type(pd.concat([df, df])) pandas.core.sparse.frame.SparseDataFrame -*New Behavior:* +*New Behavior*: .. ipython:: python diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 517d59c399179..5df6e02d6a040 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -509,17 +509,6 @@ def apply_frame_axis0(object frame, object f, object names, results = [] - # Need to infer if our low-level mucking is going to cause a segfault - if n > 0: - chunk = frame.iloc[starts[0]:ends[0]] - object.__setattr__(chunk, 'name', names[0]) - try: - result = f(chunk) - if result is chunk: - raise InvalidApply('Function unsafe for fast apply') - except: - raise InvalidApply('Let this error raise above us') - slider = BlockSlider(frame) mutated = False @@ -529,13 +518,18 @@ def apply_frame_axis0(object frame, object f, object names, slider.move(starts[i], ends[i]) item_cache.clear() # ugh + chunk = slider.dummy + object.__setattr__(chunk, 'name', names[i]) - object.__setattr__(slider.dummy, 'name', names[i]) - piece = f(slider.dummy) + try: + piece = f(chunk) + except: + raise InvalidApply('Let this error raise above us') - # I'm paying the price for index-sharing, ugh + # Need to infer if low level index slider will cause segfaults + require_slow_apply = i == 0 and piece is chunk try: - if piece.index is slider.dummy.index: + if piece.index is chunk.index: piece = piece.copy(deep='all') else: mutated = True @@ -543,6 +537,12 @@ def apply_frame_axis0(object frame, object f, object names, pass results.append(piece) + + # If the data was modified inplace we need to + # take the slow path to not risk segfaults + # we have already computed the first piece + if require_slow_apply: + break finally: slider.reset() diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 525af28c27ff5..ec22548de6da3 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -165,26 +165,45 @@ def apply(self, f, data, axis=0): mutated = self.mutated splitter = self._get_splitter(data, axis=axis) group_keys = self._get_group_keys() + result_values = None # oh boy f_name = com.get_callable_name(f) if (f_name not in base.plotting_methods and hasattr(splitter, 'fast_apply') and axis == 0): try: - values, mutated = splitter.fast_apply(f, group_keys) - return group_keys, values, mutated + result_values, mutated = splitter.fast_apply(f, group_keys) + + # If the fast apply path could be used we can return here. + # Otherwise we need to fall back to the slow implementation. + if len(result_values) == len(group_keys): + return group_keys, result_values, mutated + except reduction.InvalidApply: - # we detect a mutation of some kind - # so take slow path + # Cannot fast apply on MultiIndex (_has_complex_internals). + # This Exception is also raised if `f` triggers an exception + # but it is preferable to raise the exception in Python. pass except Exception: # raise this error to the caller pass - result_values = [] for key, (i, group) in zip(group_keys, splitter): object.__setattr__(group, 'name', key) + # result_values is None if fast apply path wasn't taken + # or fast apply aborted with an unexpected exception. + # In either case, initialize the result list and perform + # the slow iteration. + if result_values is None: + result_values = [] + + # If result_values is not None we're in the case that the + # fast apply loop was broken prematurely but we have + # already the result for the first group which we can reuse. + elif i == 0: + continue + # group might be modified group_axes = _get_axes(group) res = f(group) @@ -854,10 +873,7 @@ def fast_apply(self, f, names): return [], True sdata = self._get_sorted_data() - results, mutated = reduction.apply_frame_axis0(sdata, f, names, - starts, ends) - - return results, mutated + return reduction.apply_frame_axis0(sdata, f, names, starts, ends) def _chop(self, sdata, slice_obj): if self.axis == 0: diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index dea7e518ac605..d753556f9978d 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -102,9 +102,80 @@ def f(g): group_keys = grouper._get_group_keys() values, mutated = splitter.fast_apply(f, group_keys) + assert not mutated +@pytest.mark.parametrize( + "df, group_names", + [ + (DataFrame({"a": [1, 1, 1, 2, 3], + "b": ["a", "a", "a", "b", "c"]}), + [1, 2, 3]), + (DataFrame({"a": [0, 0, 1, 1], + "b": [0, 1, 0, 1]}), + [0, 1]), + (DataFrame({"a": [1]}), + [1]), + (DataFrame({"a": [1, 1, 1, 2, 2, 1, 1, 2], + "b": range(8)}), + [1, 2]), + (DataFrame({"a": [1, 2, 3, 1, 2, 3], + "two": [4, 5, 6, 7, 8, 9]}), + [1, 2, 3]), + (DataFrame({"a": list("aaabbbcccc"), + "B": [3, 4, 3, 6, 5, 2, 1, 9, 5, 4], + "C": [4, 0, 2, 2, 2, 7, 8, 6, 2, 8]}), + ["a", "b", "c"]), + (DataFrame([[1, 2, 3], [2, 2, 3]], columns=["a", "b", "c"]), + [1, 2]), + ], ids=['GH2936', 'GH7739 & GH10519', 'GH10519', + 'GH2656', 'GH12155', 'GH20084', 'GH21417']) +def test_group_apply_once_per_group(df, group_names): + # GH2936, GH7739, GH10519, GH2656, GH12155, GH20084, GH21417 + + # This test should ensure that a function is only evaluted + # once per group. Previously the function has been evaluated twice + # on the first group to check if the Cython index slider is safe to use + # This test ensures that the side effect (append to list) is only triggered + # once per group + + names = [] + # cannot parameterize over the functions since they need external + # `names` to detect side effects + + def f_copy(group): + # this takes the fast apply path + names.append(group.name) + return group.copy() + + def f_nocopy(group): + # this takes the slow apply path + names.append(group.name) + return group + + def f_scalar(group): + # GH7739, GH2656 + names.append(group.name) + return 0 + + def f_none(group): + # GH10519, GH12155, GH21417 + names.append(group.name) + return None + + def f_constant_df(group): + # GH2936, GH20084 + names.append(group.name) + return DataFrame({"a": [1], "b": [1]}) + + for func in [f_copy, f_nocopy, f_scalar, f_none, f_constant_df]: + del names[:] + + df.groupby("a").apply(func) + assert names == group_names + + def test_apply_with_mixed_dtype(): # GH3480, apply with mixed dtype on axis=1 breaks in 0.11 df = DataFrame({'foo1': np.random.randn(6), diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 8267f38f939b1..56d428f29a38c 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1381,11 +1381,9 @@ def test_group_name_available_in_inference_pass(): def f(group): names.append(group.name) return group.copy() - df.groupby('a', sort=False, group_keys=False).apply(f) - # we expect 2 zeros because we call ``f`` once to see if a faster route - # can be used. - expected_names = [0, 0, 1, 2] + + expected_names = [0, 1, 2] assert names == expected_names