Merge pull request #7463 from jreback/merge_transform

jreback · jreback · commit be2db88377c7 · 2014-06-16T08:52:18.000-04:00
PERF: performance gains in DataFrame groupby.transform for ufuncs (GH7383)
diff --git a/doc/source/v0.14.1.txt b/doc/source/v0.14.1.txt
@@ -137,6 +137,7 @@ Performance
 ~~~~~~~~~~~
 - Improvements in dtype inference for numeric operations involving yielding performance gains for dtypes: ``int64``, ``timedelta64``, ``datetime64`` (:issue:`7223`)
 - Improvements in Series.transform for signifcant performance gains (:issue`6496`)
+- Improvements in DataFrame.transform with ufuncs and built-in grouper functions for significant performance gains (:issue:`7383`)
 
 
 
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -2701,27 +2701,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
             return self._concat_objects(keys, values,
                                         not_indexed_same=not_indexed_same)
 
-    def transform(self, func, *args, **kwargs):
-        """
-        Call function producing a like-indexed DataFrame on each group and
-        return a DataFrame having the same indexes as the original object
-        filled with the transformed values
-
-        Parameters
-        ----------
-        f : function
-            Function to apply to each subframe
-
-        Notes
-        -----
-        Each subframe is endowed the attribute 'name' in case you need to know
-        which group you are working on.
-
-        Examples
-        --------
-        >>> grouped = df.groupby(lambda x: mapping[x])
-        >>> grouped.transform(lambda x: (x - x.mean()) / x.std())
-        """
+    def _transform_general(self, func, *args, **kwargs):
         from pandas.tools.merge import concat
 
         applied = []
@@ -2763,6 +2743,66 @@ def transform(self, func, *args, **kwargs):
         concatenated.sort_index(inplace=True)
         return concatenated
 
+    def transform(self, func, *args, **kwargs):
+        """
+        Call function producing a like-indexed DataFrame on each group and
+        return a DataFrame having the same indexes as the original object
+        filled with the transformed values
+
+        Parameters
+        ----------
+        func : function or string
+            Function to apply to each subframe, or the name of a groupby method
+
+        Notes
+        -----
+        Each subframe is endowed the attribute 'name' in case you need to know
+        which group you are working on.
+
+        Examples
+        --------
+        >>> grouped = df.groupby(lambda x: mapping[x])
+        >>> grouped.transform(lambda x: (x - x.mean()) / x.std())
+        """
+
+        # try to do a fast transform via merge if possible
+        try:
+            obj = self._obj_with_exclusions
+            if isinstance(func, compat.string_types):
+                result = getattr(self, func)(*args, **kwargs)
+            else:
+                cyfunc = _intercept_cython(func)
+                if cyfunc and not args and not kwargs:
+                    result = getattr(self, cyfunc)()
+                else:
+                    return self._transform_general(func, *args, **kwargs)
+        except Exception:  # fast path failed; KeyboardInterrupt/SystemExit still propagate
+            return self._transform_general(func, *args, **kwargs)
+
+        # a reduction transform
+        if not isinstance(result, DataFrame):
+            return self._transform_general(func, *args, **kwargs)
+
+        # nuisance columns
+        if not result.columns.equals(obj.columns):
+            return self._transform_general(func, *args, **kwargs)
+
+        # a grouper that doesn't preserve the index, remap index based on the grouper
+        # and broadcast it
+        if not isinstance(obj.index,MultiIndex) and type(result.index) != type(obj.index):
+            results = obj.values.copy()
+            for (name, group), (i, row) in zip(self, result.iterrows()):
+                indexer = self._get_index(name)
+                results[indexer] = np.tile(row.values,len(indexer)).reshape(len(indexer),-1)
+            return DataFrame(results,columns=result.columns,index=obj.index).convert_objects()
+
+        # we can merge the result in
+        # GH 7383
+        names = result.columns
+        result = obj.merge(result, how='outer', left_index=True, right_index=True).ix[:,-result.shape[1]:]
+        result.columns = names
+        return result
+
+
     def _define_paths(self, func, *args, **kwargs):
         if isinstance(func, compat.string_types):
             fast_path = lambda group: getattr(group, func)(*args, **kwargs)
diff --git a/vb_suite/groupby.py b/vb_suite/groupby.py
@@ -376,6 +376,7 @@ def f(g):
 """
 
 groupby_transform = Benchmark("data.groupby(level='security_id').transform(f_fillna)", setup)
+groupby_transform_ufunc = Benchmark("data.groupby(level='date').transform(np.max)", setup)
 
 setup = common_setup + """
 np.random.seed(0)
@@ -393,4 +394,4 @@ def f(g):
 df = DataFrame({ 'signal' : np.random.rand(N)})
 """
 
-groupby_transform2 = Benchmark("df['signal'].groupby(g).transform(np.mean)", setup)
+groupby_transform_series = Benchmark("df['signal'].groupby(g).transform(np.mean)", setup)