Skip to content

PERF: performance gains in DataFrame groupby.transform for ufuncs (GH7383) #7463

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jun 16, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/v0.14.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ Performance
~~~~~~~~~~~
- Improvements in dtype inference for numeric operations, yielding performance gains for dtypes: ``int64``, ``timedelta64``, ``datetime64`` (:issue:`7223`)
- Improvements in Series.transform for significant performance gains (:issue:`6496`)
- Improvements in DataFrame.transform with ufuncs and built-in grouper functions for significant performance gains (:issue:`7383`)



Expand Down
82 changes: 61 additions & 21 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2701,27 +2701,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
return self._concat_objects(keys, values,
not_indexed_same=not_indexed_same)

def transform(self, func, *args, **kwargs):
"""
Call function producing a like-indexed DataFrame on each group and
return a DataFrame having the same indexes as the original object
filled with the transformed values

Parameters
----------
f : function
Function to apply to each subframe

Notes
-----
Each subframe is endowed the attribute 'name' in case you need to know
which group you are working on.

Examples
--------
>>> grouped = df.groupby(lambda x: mapping[x])
>>> grouped.transform(lambda x: (x - x.mean()) / x.std())
"""
def _transform_general(self, func, *args, **kwargs):
from pandas.tools.merge import concat

applied = []
Expand Down Expand Up @@ -2763,6 +2743,66 @@ def transform(self, func, *args, **kwargs):
concatenated.sort_index(inplace=True)
return concatenated

def transform(self, func, *args, **kwargs):
    """
    Call function producing a like-indexed DataFrame on each group and
    return a DataFrame having the same indexes as the original object
    filled with the transformed values

    Parameters
    ----------
    func : function or string
        Function to apply to each subframe. A string is looked up as a
        method name on this groupby object.
    *args, **kwargs
        Passed through to ``func``.

    Notes
    -----
    Each subframe is endowed the attribute 'name' in case you need to know
    which group you are working on.

    A fast path is tried first: a string ``func``, or a callable that
    ``_intercept_cython`` maps to a groupby method name (only when no extra
    arguments are given), is evaluated once on the whole groupby and the
    result is broadcast back to the original index. Anything else, or any
    failure of the fast path, is handled by ``_transform_general``.

    Examples
    --------
    >>> grouped = df.groupby(lambda x: mapping[x])
    >>> grouped.transform(lambda x: (x - x.mean()) / x.std())
    """

    # try to do a fast transform via merge if possible; ``result`` stays
    # None when no fast path applies or when the fast path raises
    result = None
    try:
        obj = self._obj_with_exclusions
        if isinstance(func, compat.string_types):
            result = getattr(self, func)(*args, **kwargs)
        else:
            cyfunc = _intercept_cython(func)
            if cyfunc and not args and not kwargs:
                result = getattr(self, cyfunc)()
    except Exception:
        # deliberately best-effort, but let KeyboardInterrupt/SystemExit
        # propagate; the general path is invoked below, outside the try,
        # so an error raised by it is not caught and retried
        result = None

    # no fast path available / fast path failed / a reduction transform
    if not isinstance(result, DataFrame):
        return self._transform_general(func, *args, **kwargs)

    # nuisance columns
    if not result.columns.equals(obj.columns):
        return self._transform_general(func, *args, **kwargs)

    # a grouped that doesn't preserve the index, remap index based on the
    # grouper and broadcast it
    if (not isinstance(obj.index, MultiIndex) and
            type(result.index) != type(obj.index)):
        results = obj.values.copy()
        for (name, _), (_, row) in zip(self, result.iterrows()):
            indexer = self._get_index(name)
            # repeat the single per-group row once per member of the group
            results[indexer] = np.tile(
                row.values, len(indexer)).reshape(len(indexer), -1)
        return DataFrame(results, columns=result.columns,
                         index=obj.index).convert_objects()

    # we can merge the result in
    # GH 7383
    names = result.columns
    # the merged frame holds obj's columns followed by result's columns;
    # keep only the trailing block that came from ``result``
    result = obj.merge(result, how='outer', left_index=True,
                       right_index=True).ix[:, -result.shape[1]:]
    result.columns = names
    return result

def _define_paths(self, func, *args, **kwargs):
if isinstance(func, compat.string_types):
fast_path = lambda group: getattr(group, func)(*args, **kwargs)
Expand Down
3 changes: 2 additions & 1 deletion vb_suite/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,6 +376,7 @@ def f(g):
"""

groupby_transform = Benchmark("data.groupby(level='security_id').transform(f_fillna)", setup)
groupby_transform_ufunc = Benchmark("data.groupby(level='date').transform(np.max)", setup)

setup = common_setup + """
np.random.seed(0)
Expand All @@ -393,4 +394,4 @@ def f(g):
df = DataFrame({ 'signal' : np.random.rand(N)})
"""

groupby_transform2 = Benchmark("df['signal'].groupby(g).transform(np.mean)", setup)
groupby_transform_series = Benchmark("df['signal'].groupby(g).transform(np.mean)", setup)