Skip to content

Commit 36fb835

Browse files
committed
BUG/API: consistency in .agg with nested dicts #9052
1 parent 3c23dc9 commit 36fb835

File tree

6 files changed

+188
-28
lines changed

6 files changed

+188
-28
lines changed

doc/source/whatsnew/v0.18.0.txt

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,55 @@ New features
2525
~~~~~~~~~~~~
2626

2727

28+
.. _whatsnew_0180.enhancements.moments:
2829

30+
Computational moments are now methods
31+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
2932

33+
Computational moments have been refactored to be methods on ``Series/DataFrame`` objects, rather than top-level functions, which are now deprecated. This allows these window-type functions to have a similar API to that of ``.groupby``. See the full documentation :ref:`here <stats.moments>` (:issue:`11603`).
3034

35+
.. ipython:: python
36+
37+
np.random.seed(1234)
38+
df = DataFrame({'A' : range(10), 'B' : np.random.randn(10)})
39+
df
40+
41+
Previous Behavior:
42+
43+
.. code-block:: python
44+
45+
In [8]: pd.rolling_mean(df,window=3)
46+
Out[8]:
47+
A B
48+
0 NaN NaN
49+
1 NaN NaN
50+
2 1 0.237722
51+
3 2 -0.023640
52+
4 3 0.133155
53+
5 4 -0.048693
54+
6 5 0.342054
55+
7 6 0.370076
56+
8 7 0.079587
57+
9 8 -0.954504
58+
59+
New Behavior:
60+
61+
.. ipython:: python
62+
63+
r = df.rolling(window=3)
64+
65+
# descriptive repr
66+
r
67+
68+
# operate on this Rolling object itself
69+
r.mean()
70+
71+
# getitem access
72+
r['A'].mean()
3173

74+
# aggregates
75+
r.agg({'A' : {'ra' : ['mean','std']},
76+
'B' : {'rb' : ['mean','std']}})
3277

3378
.. _whatsnew_0180.enhancements.other:
3479

@@ -195,6 +240,7 @@ Bug Fixes
195240
- Bug in ``Period.end_time`` when a multiple of time period is requested (:issue:`11738`)
196241
- Regression in ``.clip`` with tz-aware datetimes (:issue:`11838`)
197242
- Bug in ``date_range`` when the boundaries fell on the frequency (:issue:`11804`)
243+
- Bug in consistency of passing nested dicts to ``.groupby(...).agg(...)`` (:issue:`9052`)
198244

199245

200246

pandas/core/base.py

Lines changed: 26 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -237,7 +237,7 @@ class SelectionMixin(object):
237237
sub-classes need to define: obj, exclusions
238238
"""
239239
_selection = None
240-
_internal_names = ['_cache']
240+
_internal_names = ['_cache','__setstate__']
241241
_internal_names_set = set(_internal_names)
242242
_builtin_table = {
243243
builtins.sum: np.sum,
@@ -368,6 +368,13 @@ def _aggregate(self, arg, *args, **kwargs):
368368
"""
369369
provide an implementation for the aggregators
370370
371+
Parameters
372+
----------
373+
arg : string, dict, function
374+
*args : args to pass on to the function
375+
**kwargs : kwargs to pass on to the function
376+
377+
371378
Returns
372379
-------
373380
tuple of result, how
@@ -378,6 +385,7 @@ def _aggregate(self, arg, *args, **kwargs):
378385
None if not required
379386
"""
380387

388+
_level = kwargs.pop('_level',None)
381389
if isinstance(arg, compat.string_types):
382390
return getattr(self, arg)(*args, **kwargs), None
383391

@@ -403,24 +411,24 @@ def _aggregate(self, arg, *args, **kwargs):
403411

404412
for fname, agg_how in compat.iteritems(arg):
405413
colg = self._gotitem(self._selection, ndim=1, subset=subset)
406-
result[fname] = colg.aggregate(agg_how)
414+
result[fname] = colg.aggregate(agg_how, _level=None)
407415
keys.append(fname)
408416
else:
409417
for col, agg_how in compat.iteritems(arg):
410418
colg = self._gotitem(col, ndim=1)
411-
result[col] = colg.aggregate(agg_how)
419+
result[col] = colg.aggregate(agg_how, _level=(_level or 0) + 1)
412420
keys.append(col)
413421

414422
if isinstance(list(result.values())[0], com.ABCDataFrame):
415423
from pandas.tools.merge import concat
416-
result = concat([result[k] for k in keys], keys=keys, axis=1)
424+
result = concat([ result[k] for k in keys ], keys=keys, axis=1)
417425
else:
418426
from pandas import DataFrame
419427
result = DataFrame(result)
420428

421429
return result, True
422430
elif hasattr(arg, '__iter__'):
423-
return self._aggregate_multiple_funcs(arg), None
431+
return self._aggregate_multiple_funcs(arg, _level=_level), None
424432
else:
425433
result = None
426434

@@ -431,7 +439,7 @@ def _aggregate(self, arg, *args, **kwargs):
431439
# caller can react
432440
return result, True
433441

434-
def _aggregate_multiple_funcs(self, arg):
442+
def _aggregate_multiple_funcs(self, arg, _level):
435443
from pandas.tools.merge import concat
436444

437445
if self.axis != 0:
@@ -447,7 +455,15 @@ def _aggregate_multiple_funcs(self, arg):
447455
try:
448456
colg = self._gotitem(obj.name, ndim=1, subset=obj)
449457
results.append(colg.aggregate(a))
450-
keys.append(getattr(a,'name',a))
458+
459+
# find a good name, this could be a function that we don't recognize
460+
name = self._is_cython_func(a) or a
461+
if not isinstance(name, compat.string_types):
462+
name = getattr(a,name,a)
463+
if not isinstance(name, compat.string_types):
464+
name = getattr(a,func_name,a)
465+
466+
keys.append(name)
451467
except (TypeError, DataError):
452468
pass
453469
except SpecificationError:
@@ -464,6 +480,9 @@ def _aggregate_multiple_funcs(self, arg):
464480
pass
465481
except SpecificationError:
466482
raise
483+
484+
if _level:
485+
keys = None
467486
result = concat(results, keys=keys, axis=1)
468487

469488
return result

pandas/core/groupby.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2362,6 +2362,7 @@ def aggregate(self, func_or_funcs, *args, **kwargs):
23622362
-------
23632363
Series or DataFrame
23642364
"""
2365+
_level = kwargs.pop('_level',None)
23652366
if isinstance(func_or_funcs, compat.string_types):
23662367
return getattr(self, func_or_funcs)(*args, **kwargs)
23672368

@@ -2411,11 +2412,18 @@ def _aggregate_multiple_funcs(self, arg):
24112412

24122413
results = {}
24132414
for name, func in arg:
2415+
obj = self
24142416
if name in results:
24152417
raise SpecificationError('Function names must be unique, '
24162418
'found multiple named %s' % name)
24172419

2418-
results[name] = self.aggregate(func)
2420+
# reset the cache so that we
2421+
# only include the named selection
2422+
if name in self._selected_obj:
2423+
obj = copy.copy(obj)
2424+
obj._reset_cache()
2425+
obj._selection = name
2426+
results[name] = obj.aggregate(func)
24192427

24202428
return DataFrame(results, columns=columns)
24212429

@@ -2856,7 +2864,8 @@ def _post_process_cython_aggregate(self, obj):
28562864
@Appender(SelectionMixin._agg_doc)
28572865
def aggregate(self, arg, *args, **kwargs):
28582866

2859-
result, how = self._aggregate(arg, *args, **kwargs)
2867+
_level = kwargs.pop('_level',None)
2868+
result, how = self._aggregate(arg, _level=_level, *args, **kwargs)
28602869
if how is None:
28612870
return result
28622871

@@ -2870,7 +2879,7 @@ def aggregate(self, arg, *args, **kwargs):
28702879
# try to treat as if we are passing a list
28712880
try:
28722881
assert not args and not kwargs
2873-
result = self._aggregate_multiple_funcs([arg])
2882+
result = self._aggregate_multiple_funcs([arg], _level=_level)
28742883
result.columns = Index(result.columns.levels[0],
28752884
name=self._selected_obj.columns.name)
28762885
except:

pandas/core/window.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from collections import defaultdict
1313

1414
import pandas as pd
15+
from pandas.lib import isscalar
1516
from pandas.core.base import PandasObject, SelectionMixin, AbstractMethodError
1617
import pandas.core.common as com
1718
import pandas.algos as algos
@@ -64,11 +65,12 @@ def _gotitem(self, key, ndim, subset=None):
6465
# create a new object to prevent aliasing
6566
if subset is None:
6667
subset = self.obj
67-
new_self = self._shallow_copy(subset)
68-
if ndim==2 and key in subset:
69-
new_self._selection = key
70-
new_self._reset_cache()
71-
return new_self
68+
self = self._shallow_copy(subset)
69+
self._reset_cache()
70+
if subset.ndim==2:
71+
if isscalar(key) and key in subset or com.is_list_like(key):
72+
self._selection = key
73+
return self
7274

7375
def __getattr__(self, attr):
7476
if attr in self._internal_names_set:
@@ -191,8 +193,6 @@ def _convert_freq(self):
191193
@Appender(SelectionMixin._agg_doc)
192194
def aggregate(self, arg, *args, **kwargs):
193195
result, how = self._aggregate(arg, *args, **kwargs)
194-
if result is None:
195-
import pdb; pdb.set_trace()
196196
return result
197197

198198
class Window(_Window):

pandas/tests/test_groupby.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1443,6 +1443,48 @@ def test_frame_set_name_single(self):
14431443
result = grouped['C'].agg({'foo': np.mean, 'bar': np.std})
14441444
self.assertEqual(result.index.name, 'A')
14451445

1446+
def test_aggregate_api_consistency(self):
1447+
# GH 9052
1448+
# make sure that the aggregates via dict
1449+
# are consistent
1450+
1451+
1452+
def compare(result, expected):
1453+
# if we are passing dicts then ordering is not guaranteed for output columns
1454+
assert_frame_equal(result.reindex_like(expected), expected)
1455+
1456+
1457+
df = DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
1458+
'foo', 'bar', 'foo', 'foo'],
1459+
'B' : ['one', 'one', 'two', 'three',
1460+
'two', 'two', 'one', 'three'],
1461+
'C' : np.random.randn(8),
1462+
'D' : np.random.randn(8)})
1463+
1464+
grouped = df.groupby(['A', 'B'])
1465+
result = grouped[['D','C']].agg({'r':np.sum, 'r2':np.mean})
1466+
expected = pd.concat([grouped[['D','C']].sum(),
1467+
grouped[['D','C']].mean()],
1468+
keys=['r','r2'],
1469+
axis=1).stack(level=1)
1470+
compare(result, expected)
1471+
1472+
result = grouped[['D','C']].agg({'r': { 'C' : np.sum }, 'r2' : { 'D' : np.mean }})
1473+
expected = pd.concat([grouped[['C']].sum(),
1474+
grouped[['D']].mean()],
1475+
axis=1)
1476+
expected.columns = MultiIndex.from_tuples([('r','C'),('r2','D')])
1477+
compare(result, expected)
1478+
1479+
result = grouped[['D','C']].agg([np.sum, np.mean])
1480+
expected = pd.concat([grouped['D'].sum(),
1481+
grouped['D'].mean(),
1482+
grouped['C'].sum(),
1483+
grouped['C'].mean()],
1484+
axis=1)
1485+
expected.columns = MultiIndex.from_product([['D','C'],['sum','mean']])
1486+
compare(result, expected)
1487+
14461488
def test_multi_iter(self):
14471489
s = Series(np.arange(6))
14481490
k1 = np.array(['a', 'a', 'a', 'b', 'b', 'b'])

pandas/tests/test_window.py

Lines changed: 55 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -103,18 +103,62 @@ def test_agg(self):
103103
'B' : range(0,10,2)})
104104

105105
r = df.rolling(window=3)
106+
a_mean = r['A'].mean()
107+
a_std = r['A'].std()
108+
a_sum = r['A'].sum()
109+
b_mean = r['B'].mean()
110+
b_std = r['B'].std()
111+
b_sum = r['B'].sum()
112+
113+
def compare(result, expected):
114+
# if we are using dicts, the ordering is not guaranteed
115+
assert_frame_equal(result.reindex_like(expected), expected)
116+
117+
result = r.aggregate([np.mean, np.std])
118+
expected = pd.concat([a_mean,a_std,b_mean,b_std],axis=1)
119+
expected.columns = pd.MultiIndex.from_product([['A','B'],['mean','std']])
120+
assert_frame_equal(result, expected)
121+
122+
result = r.aggregate({'A': np.mean,
123+
'B': np.std})
124+
expected = pd.concat([a_mean,b_std],axis=1)
125+
compare(result, expected)
126+
127+
result = r.aggregate({'A': ['mean','std']})
128+
expected = pd.concat([a_mean,a_std],axis=1)
129+
expected.columns = pd.MultiIndex.from_product([['A'],['mean','std']])
130+
assert_frame_equal(result, expected)
131+
132+
result = r['A'].aggregate(['mean','sum'])
133+
expected = pd.concat([a_mean,a_sum],axis=1)
134+
expected.columns = pd.MultiIndex.from_product([['A'],['mean','sum']])
135+
assert_frame_equal(result, expected)
106136

107-
import pdb; pdb.set_trace()
108-
agged = r.aggregate([np.mean, np.std])
109-
agged = r.aggregate({'A': np.mean,
110-
'B': np.std})
111-
agged = r.aggregate({'A': ['mean','sum']})
112-
agged = r['A'].aggregate(['mean','sum'])
113-
agged = r.aggregate({'A': { 'mean' : 'mean', 'sum' : 'sum' } })
114-
agged = r.aggregate({'A': { 'mean' : 'mean', 'sum' : 'sum' },
115-
'B': { 'mean2' : 'mean', 'sum2' : 'sum' }})
116-
agged = r.aggregate({'r1': { 'A' : ['mean','sum'] },
117-
'r2' : { 'B' : ['mean','sum'] }})
137+
result = r.aggregate({'A': { 'mean' : 'mean', 'sum' : 'sum' } })
138+
expected = pd.concat([a_mean,a_sum],axis=1)
139+
expected.columns = pd.MultiIndex.from_product([['A'],['mean','sum']])
140+
compare(result, expected)
141+
142+
result = r.aggregate({'A': { 'mean' : 'mean', 'sum' : 'sum' },
143+
'B': { 'mean2' : 'mean', 'sum2' : 'sum' }})
144+
expected = pd.concat([a_mean,a_sum,b_mean,b_sum],axis=1)
145+
expected.columns = pd.MultiIndex.from_tuples([('A','mean'),('A','sum'),
146+
('B','mean2'),('B','sum2')])
147+
compare(result, expected)
148+
149+
result = r.aggregate({'r1' : { 'A' : ['mean','sum'] },
150+
'r2' : { 'B' : ['mean','sum'] }})
151+
expected = pd.concat([a_mean,a_sum,b_mean,b_sum],axis=1)
152+
expected.columns = pd.MultiIndex.from_tuples([('r1','A','mean'),('r1','A','sum'),
153+
('r2','B','mean'),('r2','B','sum')])
154+
compare(result, expected)
155+
156+
result = r.agg({'A' : {'ra' : ['mean','std']},
157+
'B' : {'rb' : ['mean','std']}})
158+
expected = pd.concat([a_mean,a_std,b_mean,b_std],axis=1)
159+
expected.columns = pd.MultiIndex.from_tuples([('A','ra','mean'),('A','ra','std'),
160+
('B','rb','mean'),('B','rb','std')])
161+
compare(result, expected)
118162

119163
class TestMoments(Base):
120164

0 commit comments

Comments
 (0)