Skip to content

Commit be2db88

Browse files
committed
Merge pull request #7463 from jreback/merge_transform
PERF: performance gains in DataFrame groupby.transform for ufuncs (GH7383)
2 parents a367e9b + ae396ff commit be2db88

File tree

3 files changed

+64
-22
lines changed

3 files changed

+64
-22
lines changed

doc/source/v0.14.1.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,7 @@ Performance
137137
~~~~~~~~~~~
138138
- Improvements in dtype inference for numeric operations, yielding performance gains for dtypes: ``int64``, ``timedelta64``, ``datetime64`` (:issue:`7223`)
139139
- Improvements in Series.transform for significant performance gains (:issue:`6496`)
140+
- Improvements in DataFrame.transform with ufuncs and built-in grouper functions for significant performance gains (:issue:`7383`)
140141

141142

142143

pandas/core/groupby.py

Lines changed: 61 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -2701,27 +2701,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
27012701
return self._concat_objects(keys, values,
27022702
not_indexed_same=not_indexed_same)
27032703

2704-
def transform(self, func, *args, **kwargs):
2705-
"""
2706-
Call function producing a like-indexed DataFrame on each group and
2707-
return a DataFrame having the same indexes as the original object
2708-
filled with the transformed values
2709-
2710-
Parameters
2711-
----------
2712-
f : function
2713-
Function to apply to each subframe
2714-
2715-
Notes
2716-
-----
2717-
Each subframe is endowed the attribute 'name' in case you need to know
2718-
which group you are working on.
2719-
2720-
Examples
2721-
--------
2722-
>>> grouped = df.groupby(lambda x: mapping[x])
2723-
>>> grouped.transform(lambda x: (x - x.mean()) / x.std())
2724-
"""
2704+
def _transform_general(self, func, *args, **kwargs):
27252705
from pandas.tools.merge import concat
27262706

27272707
applied = []
@@ -2763,6 +2743,66 @@ def transform(self, func, *args, **kwargs):
27632743
concatenated.sort_index(inplace=True)
27642744
return concatenated
27652745

2746+
def transform(self, func, *args, **kwargs):
    """
    Call function producing a like-indexed DataFrame on each group and
    return a DataFrame having the same indexes as the original object
    filled with the transformed values

    Parameters
    ----------
    func : function or string
        Function to apply to each subframe. A string is looked up as a
        method name on this groupby object and called with ``*args`` and
        ``**kwargs``.

    Notes
    -----
    Each subframe is endowed the attribute 'name' in case you need to know
    which group you are working on.

    A fast path is attempted first: when ``func`` is a string, or a
    callable that ``_intercept_cython`` maps to a groupby method (and no
    extra arguments were given), that method is called once on the whole
    groupby and its result is broadcast back onto the original index.
    In every other case, or if the fast path fails, the work is delegated
    to ``_transform_general``.

    Examples
    --------
    >>> grouped = df.groupby(lambda x: mapping[x])
    >>> grouped.transform(lambda x: (x - x.mean()) / x.std())
    """

    # try to do a fast transform via merge if possible; ``result`` stays
    # None when no fast path applies or when the fast path blows up
    result = None
    try:
        obj = self._obj_with_exclusions
        if isinstance(func, compat.string_types):
            result = getattr(self, func)(*args, **kwargs)
        else:
            cyfunc = _intercept_cython(func)
            if cyfunc and not args and not kwargs:
                result = getattr(self, cyfunc)()
    except Exception:
        # Only the fast-path attempt is guarded.  The general fallback is
        # deliberately called outside this ``try`` so that an error raised
        # by it propagates directly instead of triggering a second full run,
        # and so that KeyboardInterrupt/SystemExit are never swallowed.
        result = None

    # no fast path available, the fast path failed, or it was a
    # reduction transform (did not produce a DataFrame)
    if not isinstance(result, DataFrame):
        return self._transform_general(func, *args, **kwargs)

    # nuisance columns: the fast path dropped or changed columns
    if not result.columns.equals(obj.columns):
        return self._transform_general(func, *args, **kwargs)

    # a grouped that doesn't preserve the index, remap index based on the
    # grouper and broadcast it
    if (not isinstance(obj.index, MultiIndex)
            and type(result.index) != type(obj.index)):
        results = obj.values.copy()
        for (name, _group), (_label, row) in zip(self, result.iterrows()):
            indexer = self._get_index(name)
            # repeat the single aggregated row once per member of the group
            results[indexer] = np.tile(
                row.values, len(indexer)).reshape(len(indexer), -1)
        return DataFrame(results, columns=result.columns,
                         index=obj.index).convert_objects()

    # we can merge the result in
    # GH 7383
    names = result.columns
    merged = obj.merge(result, how='outer',
                       left_index=True, right_index=True)
    # keep only the trailing columns contributed by ``result``
    result = merged.ix[:, -result.shape[1]:]
    result.columns = names
    return result
2805+
27662806
def _define_paths(self, func, *args, **kwargs):
27672807
if isinstance(func, compat.string_types):
27682808
fast_path = lambda group: getattr(group, func)(*args, **kwargs)

vb_suite/groupby.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -376,6 +376,7 @@ def f(g):
376376
"""
377377

378378
groupby_transform = Benchmark("data.groupby(level='security_id').transform(f_fillna)", setup)
379+
groupby_transform_ufunc = Benchmark("data.groupby(level='date').transform(np.max)", setup)
379380

380381
setup = common_setup + """
381382
np.random.seed(0)
@@ -393,4 +394,4 @@ def f(g):
393394
df = DataFrame({ 'signal' : np.random.rand(N)})
394395
"""
395396

396-
groupby_transform2 = Benchmark("df['signal'].groupby(g).transform(np.mean)", setup)
397+
groupby_transform_series = Benchmark("df['signal'].groupby(g).transform(np.mean)", setup)

0 commit comments

Comments
 (0)