diff --git a/doc/source/v0.14.1.txt b/doc/source/v0.14.1.txt
index 99fec7be42baa..271b4eb75a7b6 100644
--- a/doc/source/v0.14.1.txt
+++ b/doc/source/v0.14.1.txt
@@ -137,6 +137,7 @@ Performance
 ~~~~~~~~~~~
 - Improvements in dtype inference for numeric operations involving yielding performance gains for dtypes: ``int64``, ``timedelta64``, ``datetime64`` (:issue:`7223`)
 - Improvements in Series.transform for signifcant performance gains (:issue`6496`)
+- Improvements in DataFrame.transform with ufuncs and built-in grouper functions for significant performance gains (:issue:`7383`)
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index a90f00fd11e36..dc8b7f3bccc2a 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -2701,27 +2701,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
         return self._concat_objects(keys, values,
                                     not_indexed_same=not_indexed_same)
 
-    def transform(self, func, *args, **kwargs):
-        """
-        Call function producing a like-indexed DataFrame on each group and
-        return a DataFrame having the same indexes as the original object
-        filled with the transformed values
-
-        Parameters
-        ----------
-        f : function
-            Function to apply to each subframe
-
-        Notes
-        -----
-        Each subframe is endowed the attribute 'name' in case you need to know
-        which group you are working on.
-
-        Examples
-        --------
-        >>> grouped = df.groupby(lambda x: mapping[x])
-        >>> grouped.transform(lambda x: (x - x.mean()) / x.std())
-        """
+    def _transform_general(self, func, *args, **kwargs):
         from pandas.tools.merge import concat
 
         applied = []
@@ -2763,6 +2743,66 @@ def transform(self, func, *args, **kwargs):
             concatenated.sort_index(inplace=True)
         return concatenated
 
+    def transform(self, func, *args, **kwargs):
+        """
+        Call function producing a like-indexed DataFrame on each group and
+        return a DataFrame having the same indexes as the original object
+        filled with the transformed values
+
+        Parameters
+        ----------
+        f : function
+            Function to apply to each subframe
+
+        Notes
+        -----
+        Each subframe is endowed the attribute 'name' in case you need to know
+        which group you are working on.
+
+        Examples
+        --------
+        >>> grouped = df.groupby(lambda x: mapping[x])
+        >>> grouped.transform(lambda x: (x - x.mean()) / x.std())
+        """
+
+        # try to do a fast transform via merge if possible
+        try:
+            obj = self._obj_with_exclusions
+            if isinstance(func, compat.string_types):
+                result = getattr(self, func)(*args, **kwargs)
+            else:
+                cyfunc = _intercept_cython(func)
+                if cyfunc and not args and not kwargs:
+                    result = getattr(self, cyfunc)()
+                else:
+                    return self._transform_general(func, *args, **kwargs)
+        except:
+            return self._transform_general(func, *args, **kwargs)
+
+        # a reduction transform
+        if not isinstance(result, DataFrame):
+            return self._transform_general(func, *args, **kwargs)
+
+        # nuisance columns
+        if not result.columns.equals(obj.columns):
+            return self._transform_general(func, *args, **kwargs)
+
+        # a result that doesn't preserve the index: remap the index based on the grouper
+        # and broadcast it
+        if not isinstance(obj.index, MultiIndex) and type(result.index) != type(obj.index):
+            results = obj.values.copy()
+            for (name, group), (i, row) in zip(self, result.iterrows()):
+                indexer = self._get_index(name)
+                results[indexer] = np.tile(row.values, len(indexer)).reshape(len(indexer), -1)
+            return DataFrame(results, columns=result.columns, index=obj.index).convert_objects()
+
+        # otherwise we can merge the result back in
+        # GH 7383
+        names = result.columns
+        result = obj.merge(result, how='outer', left_index=True, right_index=True).ix[:, -result.shape[1]:]
+        result.columns = names
+        return result
+
     def _define_paths(self, func, *args, **kwargs):
         if isinstance(func, compat.string_types):
             fast_path = lambda group: getattr(group, func)(*args, **kwargs)
diff --git a/vb_suite/groupby.py b/vb_suite/groupby.py
index f61c60d939907..eac313481aca7 100644
--- a/vb_suite/groupby.py
+++ b/vb_suite/groupby.py
@@ -376,6 +376,7 @@ def f(g):
 """
 
 groupby_transform = Benchmark("data.groupby(level='security_id').transform(f_fillna)", setup)
+groupby_transform_ufunc = Benchmark("data.groupby(level='date').transform(np.max)", setup)
 
 setup = common_setup + """
 np.random.seed(0)
@@ -393,4 +394,4 @@ def f(g):
 df = DataFrame({ 'signal' : np.random.rand(N)})
 """
 
-groupby_transform2 = Benchmark("df['signal'].groupby(g).transform(np.mean)", setup)
+groupby_transform_series = Benchmark("df['signal'].groupby(g).transform(np.mean)", setup)
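
For reviewers, a minimal sketch of the two code paths the reworked ``DataFrameGroupBy.transform`` distinguishes. The frame, column names, and grouping key below are made up for illustration and are not taken from the patch or its benchmarks; only the ``np.max``/``np.mean`` interception and the string dispatch come from the change itself:

```python
import numpy as np
import pandas as pd

# toy frame; 'key', 'a' and 'b' are illustrative names only
df = pd.DataFrame({'key': ['x', 'y', 'x', 'y', 'x'],
                   'a': np.arange(5.0),
                   'b': np.arange(5.0, 10.0)})
g = df.groupby('key')

# fast path: np.max is intercepted and dispatched to the cythonized
# GroupBy.max, and the reduced per-group values are then broadcast
# back to the shape of the original frame
fast = g.transform(np.max)

# the same fast path applies to a string naming a built-in grouper function
also_fast = g.transform('mean')

# general path: an arbitrary lambda cannot be intercepted, so it falls back
# to _transform_general and is applied group by group
general = g.transform(lambda x: x - x.mean())
```

The first two calls avoid applying ``func`` to each group individually, which is where the gains measured by the new ``groupby_transform_ufunc`` benchmark come from.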