Merge pull request #3384 from jreback/groupby_mutate

jreback · jreback · commit 1380bb87685e · 2013-04-20T07:32:53.000-07:00
BUG: GH3380 groupby will handle mutation on a DataFrame group's columns
diff --git a/RELEASE.rst b/RELEASE.rst
@@ -298,6 +298,8 @@ pandas 0.11.0
   - Fix set_index segfault when passing MultiIndex (GH3308_)
   - Ensure pickles created in py2 can be read in py3
   - Insert ellipsis in MultiIndex summary repr (GH3348_)
+  - Groupby will handle mutation of an input group's columns (and fall back
+    to non-fast apply) (GH3380_)
 
 .. _GH3294: https://github.com/pydata/pandas/issues/3294
 .. _GH622: https://github.com/pydata/pandas/issues/622
@@ -409,6 +411,7 @@ pandas 0.11.0
 .. _GH2919: https://github.com/pydata/pandas/issues/2919
 .. _GH3308: https://github.com/pydata/pandas/issues/3308
 .. _GH3311: https://github.com/pydata/pandas/issues/3311
+.. _GH3380: https://github.com/pydata/pandas/issues/3380
 
 pandas 0.10.1
 =============
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -620,7 +620,9 @@ def apply(self, f, data, axis=0, keep_internal=False):
             try:
                 values, mutated = splitter.fast_apply(f, group_keys)
                 return group_keys, values, mutated
-            except lib.InvalidApply:
+            except (Exception), detail:
+                # we detect a mutation of some kind
+                # so take slow path
                 pass
 
         result_values = []
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
@@ -1491,6 +1491,30 @@ def f(group):
         for key, group in grouped:
             assert_frame_equal(result.ix[key], f(group))
 
+    def test_mutate_groups(self):
+
+        # GH3380
+
+        mydf = DataFrame({
+                'cat1' : ['a'] * 8 + ['b'] * 6,
+                'cat2' : ['c'] * 2 + ['d'] * 2 + ['e'] * 2 + ['f'] * 2 + ['c'] * 2 + ['d'] * 2 + ['e'] * 2,
+                'cat3' : map(lambda x: 'g%s' % x, range(1,15)),
+                'val' : np.random.randint(100, size=14),
+                })
+
+        def f_copy(x):
+            x = x.copy()
+            x['rank'] = x.val.rank(method='min')
+            return x.groupby('cat2')['rank'].min()
+
+        def f_no_copy(x):
+            x['rank'] = x.val.rank(method='min')
+            return x.groupby('cat2')['rank'].min()
+
+        grpby_copy    = mydf.groupby('cat1').apply(f_copy)
+        grpby_no_copy = mydf.groupby('cat1').apply(f_no_copy)
+        assert_series_equal(grpby_copy,grpby_no_copy)
+
     def test_apply_chunk_view(self):
         # Low level tinkering could be unsafe, make sure not
         df = DataFrame({'key': [1, 1, 1, 2, 2, 2, 3, 3, 3],