Merge pull request #5593 from jreback/groupby_apply

jreback · jreback · commit d5ef4eba961f · 2013-11-26T11:49:55.000-08:00
BUG: Bug in groupby returning non-consistent types when user function returns a None, (GH5592)
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -814,6 +814,7 @@ Bug Fixes
   - Bug in delitem on a Series (:issue:`5542`)
   - Bug fix in apply when using custom function and objects are not mutated (:issue:`5545`)
   - Bug in selecting from a non-unique index with ``loc`` (:issue:`5553`)
+  - Bug in groupby returning inconsistent types when a user function returns ``None`` (:issue:`5592`)
 
 pandas 0.12.0
 -------------
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -2122,11 +2122,23 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
                 else:
                     key_index = Index(keys, name=key_names[0])
 
-            if isinstance(values[0], (np.ndarray, Series)):
-                if isinstance(values[0], Series):
+
+            # make Nones an empty object
+            if com._count_not_none(*values) != len(values):
+                v = None
+                for v in values:
+                    if v is not None:
+                        break
+                if v is None:
+                    return DataFrame()
+                values = [ x if x is not None else v._constructor(**v._construct_axes_dict()) for x in values ]
+
+            v = values[0]
+
+            if isinstance(v, (np.ndarray, Series)):
+                if isinstance(v, Series):
                     applied_index = self.obj._get_axis(self.axis)
-                    all_indexed_same = _all_indexes_same([x.index
-                                                          for x in values])
+                    all_indexed_same = _all_indexes_same([x.index for x in values ])
                     singular_series = (len(values) == 1 and
                                        applied_index.nlevels == 1)
 
@@ -2165,13 +2177,13 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
 
                         stacked_values = np.vstack([np.asarray(x)
                                                     for x in values])
-                        columns = values[0].index
+                        columns = v.index
                         index = key_index
                     else:
                         stacked_values = np.vstack([np.asarray(x)
                                                     for x in values]).T
 
-                        index = values[0].index
+                        index = v.index
                         columns = key_index
 
                 except (ValueError, AttributeError):
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
@@ -7,7 +7,7 @@
 from datetime import datetime
 from numpy import nan
 
-from pandas import bdate_range, Timestamp
+from pandas import date_range,bdate_range, Timestamp
 from pandas.core.index import Index, MultiIndex, Int64Index
 from pandas.core.common import rands
 from pandas.core.api import Categorical, DataFrame
@@ -259,7 +259,7 @@ def test_groupby_bounds_check(self):
 
     def test_groupby_grouper_f_sanity_checked(self):
         import pandas as pd
-        dates = pd.date_range('01-Jan-2013', periods=12, freq='MS')
+        dates = date_range('01-Jan-2013', periods=12, freq='MS')
         ts = pd.TimeSeries(np.random.randn(12), index=dates)
 
         # GH3035
@@ -320,6 +320,34 @@ def func(dataf):
         result = df.groupby('X',squeeze=False).count()
         tm.assert_isinstance(result,DataFrame)
 
+        # GH5592
+        # inconsistent return type
+        df = DataFrame(dict(A = [ 'Tiger', 'Tiger', 'Tiger', 'Lamb', 'Lamb', 'Pony', 'Pony' ],
+                            B = np.arange(7)))
+        def f(grp):
+            return grp.iloc[0]
+        expected = df.groupby('A').first()
+        result = df.groupby('A').apply(f)[['B']]
+        assert_frame_equal(result,expected)
+
+        def f(grp):
+            if grp.name == 'Tiger':
+                return None
+            return grp.iloc[0]
+        result = df.groupby('A').apply(f)[['B']]
+        e = expected.copy()
+        e.loc['Tiger'] = np.nan
+        assert_frame_equal(result,e)
+
+        def f(grp):
+            if grp.name == 'Pony':
+                return None
+            return grp.iloc[0]
+        result = df.groupby('A').apply(f)[['B']]
+        e = expected.copy()
+        e.loc['Pony'] = np.nan
+        assert_frame_equal(result,e)
+
     def test_agg_regression1(self):
         grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month])
         result = grouped.agg(np.mean)