Skip to content

Commit 21565a3

Browse files
committed
Merge pull request #6908 from sinhrks/grouper
BUG: TimeGrouper outputs different result by column order
2 parents 9681959 + 2cf3eb6 commit 21565a3

File tree

4 files changed

+165
-129
lines changed

4 files changed

+165
-129
lines changed

doc/source/release.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,7 @@ Bug Fixes
294294
- Bug in TimeGrouper/resample when presented with a non-monotonic DatetimeIndex would return invalid results. (:issue:`4161`)
295295
- Bug in index name propagation in TimeGrouper/resample (:issue:`4161`)
296296
- TimeGrouper has a more compatible API to the rest of the groupers (e.g. ``groups`` was missing) (:issue:`3881`)
297+
- Bug in multiple grouping with a TimeGrouper depending on target column order (:issue:`6764`)
297298
- Bug in ``pd.eval`` when parsing strings with possible tokens like ``'&'``
298299
(:issue:`6351`)
299300
- Bug in correctly handling placements of ``-inf`` in Panels when dividing by integer 0 (:issue:`6178`)

pandas/core/groupby.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -263,11 +263,11 @@ def _set_grouper(self, obj, sort=False):
263263
if not (level == 0 or level == ax.name):
264264
raise ValueError("The grouper level {0} is not valid".format(level))
265265

266-
# possibly sort
267-
if (self.sort or sort) and not ax.is_monotonic:
268-
indexer = self.indexer = ax.argsort(kind='quicksort')
269-
ax = ax.take(indexer)
270-
obj = obj.take(indexer, axis=self.axis, convert=False, is_copy=False)
266+
# possibly sort
267+
if (self.sort or sort) and not ax.is_monotonic:
268+
indexer = self.indexer = ax.argsort(kind='quicksort')
269+
ax = ax.take(indexer)
270+
obj = obj.take(indexer, axis=self.axis, convert=False, is_copy=False)
271271

272272
self.obj = obj
273273
self.grouper = ax

pandas/tests/test_groupby.py

Lines changed: 153 additions & 119 deletions
Original file line numberDiff line numberDiff line change
@@ -2914,7 +2914,7 @@ def test_groupby_with_timegrouper(self):
29142914
# TimeGrouper requires a sorted index
29152915
# also verifies that the resultant index has the correct name
29162916
import datetime as DT
2917-
df = DataFrame({
2917+
df_original = DataFrame({
29182918
'Buyer': 'Carl Carl Carl Carl Joe Carl'.split(),
29192919
'Quantity': [18,3,5,1,9,3],
29202920
'Date' : [
@@ -2925,29 +2925,34 @@ def test_groupby_with_timegrouper(self):
29252925
DT.datetime(2013,12,2,12,0),
29262926
DT.datetime(2013,9,2,14,0),
29272927
]})
2928-
df = df.set_index(['Date'])
2928+
2929+
# GH 6908 change target column's order
2930+
df_reordered = df_original.sort(columns='Quantity')
29292931

2930-
expected = DataFrame({ 'Quantity' : np.nan },
2931-
index=date_range('20130901 13:00:00','20131205 13:00:00',
2932-
freq='5D',name='Date',closed='left'))
2933-
expected.iloc[[0,6,18],0] = np.array([24.,6.,9.],dtype='float64')
2932+
for df in [df_original, df_reordered]:
2933+
df = df.set_index(['Date'])
29342934

2935-
result1 = df.resample('5D',how=sum)
2936-
assert_frame_equal(result1, expected)
2935+
expected = DataFrame({ 'Quantity' : np.nan },
2936+
index=date_range('20130901 13:00:00','20131205 13:00:00',
2937+
freq='5D',name='Date',closed='left'))
2938+
expected.iloc[[0,6,18],0] = np.array([24.,6.,9.],dtype='float64')
29372939

2938-
df_sorted = df.sort_index()
2939-
result2 = df_sorted.groupby(pd.TimeGrouper(freq='5D')).sum()
2940-
assert_frame_equal(result2, expected)
2940+
result1 = df.resample('5D',how=sum)
2941+
assert_frame_equal(result1, expected)
29412942

2942-
result3 = df.groupby(pd.TimeGrouper(freq='5D')).sum()
2943-
assert_frame_equal(result3, expected)
2943+
df_sorted = df.sort_index()
2944+
result2 = df_sorted.groupby(pd.TimeGrouper(freq='5D')).sum()
2945+
assert_frame_equal(result2, expected)
2946+
2947+
result3 = df.groupby(pd.TimeGrouper(freq='5D')).sum()
2948+
assert_frame_equal(result3, expected)
29442949

29452950
def test_groupby_with_timegrouper_methods(self):
29462951
# GH 3881
29472952
# make sure API of timegrouper conforms
29482953

29492954
import datetime as DT
2950-
df = pd.DataFrame({
2955+
df_original = pd.DataFrame({
29512956
'Branch' : 'A A A A A B'.split(),
29522957
'Buyer': 'Carl Mark Carl Joe Joe Carl'.split(),
29532958
'Quantity': [1,3,5,8,9,3],
@@ -2960,13 +2965,16 @@ def test_groupby_with_timegrouper_methods(self):
29602965
DT.datetime(2013,12,2,14,0),
29612966
]})
29622967

2963-
df = df.set_index('Date', drop=False)
2964-
g = df.groupby(pd.TimeGrouper('6M'))
2965-
self.assertTrue(g.group_keys)
2966-
self.assertTrue(isinstance(g.grouper,pd.core.groupby.BinGrouper))
2967-
groups = g.groups
2968-
self.assertTrue(isinstance(groups,dict))
2969-
self.assertTrue(len(groups) == 3)
2968+
df_sorted = df_original.sort(columns='Quantity', ascending=False)
2969+
2970+
for df in [df_original, df_sorted]:
2971+
df = df.set_index('Date', drop=False)
2972+
g = df.groupby(pd.TimeGrouper('6M'))
2973+
self.assertTrue(g.group_keys)
2974+
self.assertTrue(isinstance(g.grouper,pd.core.groupby.BinGrouper))
2975+
groups = g.groups
2976+
self.assertTrue(isinstance(groups,dict))
2977+
self.assertTrue(len(groups) == 3)
29702978

29712979
def test_timegrouper_with_reg_groups(self):
29722980

@@ -2975,7 +2983,7 @@ def test_timegrouper_with_reg_groups(self):
29752983

29762984
import datetime as DT
29772985

2978-
df = DataFrame({
2986+
df_original = DataFrame({
29792987
'Branch' : 'A A A A A A A B'.split(),
29802988
'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
29812989
'Quantity': [1,3,5,1,8,1,9,3],
@@ -2990,32 +2998,34 @@ def test_timegrouper_with_reg_groups(self):
29902998
DT.datetime(2013,12,2,14,0),
29912999
]}).set_index('Date')
29923000

2993-
expected = DataFrame({
2994-
'Buyer': 'Carl Joe Mark'.split(),
2995-
'Quantity': [10,18,3],
2996-
'Date' : [
2997-
DT.datetime(2013,12,31,0,0),
2998-
DT.datetime(2013,12,31,0,0),
2999-
DT.datetime(2013,12,31,0,0),
3000-
]}).set_index(['Date','Buyer'])
3001-
3002-
result = df.groupby([pd.Grouper(freq='A'),'Buyer']).sum()
3003-
assert_frame_equal(result,expected)
3004-
3005-
expected = DataFrame({
3006-
'Buyer': 'Carl Mark Carl Joe'.split(),
3007-
'Quantity': [1,3,9,18],
3008-
'Date' : [
3009-
DT.datetime(2013,1,1,0,0),
3010-
DT.datetime(2013,1,1,0,0),
3011-
DT.datetime(2013,7,1,0,0),
3012-
DT.datetime(2013,7,1,0,0),
3013-
]}).set_index(['Date','Buyer'])
3014-
3015-
result = df.groupby([pd.Grouper(freq='6MS'),'Buyer']).sum()
3016-
assert_frame_equal(result,expected)
3017-
3018-
df = DataFrame({
3001+
df_sorted = df_original.sort(columns='Quantity', ascending=False)
3002+
3003+
for df in [df_original, df_sorted]:
3004+
expected = DataFrame({
3005+
'Buyer': 'Carl Joe Mark'.split(),
3006+
'Quantity': [10,18,3],
3007+
'Date' : [
3008+
DT.datetime(2013,12,31,0,0),
3009+
DT.datetime(2013,12,31,0,0),
3010+
DT.datetime(2013,12,31,0,0),
3011+
]}).set_index(['Date','Buyer'])
3012+
3013+
result = df.groupby([pd.Grouper(freq='A'),'Buyer']).sum()
3014+
assert_frame_equal(result,expected)
3015+
3016+
expected = DataFrame({
3017+
'Buyer': 'Carl Mark Carl Joe'.split(),
3018+
'Quantity': [1,3,9,18],
3019+
'Date' : [
3020+
DT.datetime(2013,1,1,0,0),
3021+
DT.datetime(2013,1,1,0,0),
3022+
DT.datetime(2013,7,1,0,0),
3023+
DT.datetime(2013,7,1,0,0),
3024+
]}).set_index(['Date','Buyer'])
3025+
result = df.groupby([pd.Grouper(freq='6MS'),'Buyer']).sum()
3026+
assert_frame_equal(result,expected)
3027+
3028+
df_original = DataFrame({
30193029
'Branch' : 'A A A A A A A B'.split(),
30203030
'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
30213031
'Quantity': [1,3,5,1,8,1,9,3],
@@ -3030,81 +3040,105 @@ def test_timegrouper_with_reg_groups(self):
30303040
DT.datetime(2013,10,2,14,0),
30313041
]}).set_index('Date')
30323042

3033-
expected = DataFrame({
3034-
'Buyer': 'Carl Joe Mark Carl Joe'.split(),
3035-
'Quantity': [6,8,3,4,10],
3036-
'Date' : [
3037-
DT.datetime(2013,10,1,0,0),
3038-
DT.datetime(2013,10,1,0,0),
3039-
DT.datetime(2013,10,1,0,0),
3040-
DT.datetime(2013,10,2,0,0),
3041-
DT.datetime(2013,10,2,0,0),
3042-
]}).set_index(['Date','Buyer'])
3043-
3044-
result = df.groupby([pd.Grouper(freq='1D'),'Buyer']).sum()
3045-
assert_frame_equal(result,expected)
3046-
3047-
result = df.groupby([pd.Grouper(freq='1M'),'Buyer']).sum()
3048-
expected = DataFrame({
3049-
'Buyer': 'Carl Joe Mark'.split(),
3050-
'Quantity': [10,18,3],
3051-
'Date' : [
3052-
DT.datetime(2013,10,31,0,0),
3053-
DT.datetime(2013,10,31,0,0),
3054-
DT.datetime(2013,10,31,0,0),
3055-
]}).set_index(['Date','Buyer'])
3056-
assert_frame_equal(result,expected)
3057-
3058-
# passing the name
3059-
df = df.reset_index()
3060-
result = df.groupby([pd.Grouper(freq='1M',key='Date'),'Buyer']).sum()
3061-
assert_frame_equal(result,expected)
3062-
3063-
self.assertRaises(KeyError, lambda : df.groupby([pd.Grouper(freq='1M',key='foo'),'Buyer']).sum())
3064-
3065-
# passing the level
3066-
df = df.set_index('Date')
3067-
result = df.groupby([pd.Grouper(freq='1M',level='Date'),'Buyer']).sum()
3068-
assert_frame_equal(result,expected)
3069-
result = df.groupby([pd.Grouper(freq='1M',level=0),'Buyer']).sum()
3070-
assert_frame_equal(result,expected)
3071-
3072-
self.assertRaises(ValueError, lambda : df.groupby([pd.Grouper(freq='1M',level='foo'),'Buyer']).sum())
3073-
3074-
# multi names
3075-
df = df.copy()
3076-
df['Date'] = df.index + pd.offsets.MonthEnd(2)
3077-
result = df.groupby([pd.Grouper(freq='1M',key='Date'),'Buyer']).sum()
3078-
expected = DataFrame({
3079-
'Buyer': 'Carl Joe Mark'.split(),
3080-
'Quantity': [10,18,3],
3081-
'Date' : [
3082-
DT.datetime(2013,11,30,0,0),
3083-
DT.datetime(2013,11,30,0,0),
3084-
DT.datetime(2013,11,30,0,0),
3085-
]}).set_index(['Date','Buyer'])
3086-
assert_frame_equal(result,expected)
3087-
3088-
# error as we have both a level and a name!
3089-
self.assertRaises(ValueError, lambda : df.groupby([pd.Grouper(freq='1M',key='Date',level='Date'),'Buyer']).sum())
3090-
3043+
df_sorted = df_original.sort(columns='Quantity', ascending=False)
3044+
for df in [df_original, df_sorted]:
3045+
3046+
expected = DataFrame({
3047+
'Buyer': 'Carl Joe Mark Carl Joe'.split(),
3048+
'Quantity': [6,8,3,4,10],
3049+
'Date' : [
3050+
DT.datetime(2013,10,1,0,0),
3051+
DT.datetime(2013,10,1,0,0),
3052+
DT.datetime(2013,10,1,0,0),
3053+
DT.datetime(2013,10,2,0,0),
3054+
DT.datetime(2013,10,2,0,0),
3055+
]}).set_index(['Date','Buyer'])
3056+
3057+
result = df.groupby([pd.Grouper(freq='1D'),'Buyer']).sum()
3058+
assert_frame_equal(result,expected)
3059+
3060+
result = df.groupby([pd.Grouper(freq='1M'),'Buyer']).sum()
3061+
expected = DataFrame({
3062+
'Buyer': 'Carl Joe Mark'.split(),
3063+
'Quantity': [10,18,3],
3064+
'Date' : [
3065+
DT.datetime(2013,10,31,0,0),
3066+
DT.datetime(2013,10,31,0,0),
3067+
DT.datetime(2013,10,31,0,0),
3068+
]}).set_index(['Date','Buyer'])
3069+
assert_frame_equal(result,expected)
3070+
3071+
# passing the name
3072+
df = df.reset_index()
3073+
result = df.groupby([pd.Grouper(freq='1M',key='Date'),'Buyer']).sum()
3074+
assert_frame_equal(result,expected)
3075+
3076+
self.assertRaises(KeyError, lambda : df.groupby([pd.Grouper(freq='1M',key='foo'),'Buyer']).sum())
3077+
3078+
# passing the level
3079+
df = df.set_index('Date')
3080+
result = df.groupby([pd.Grouper(freq='1M',level='Date'),'Buyer']).sum()
3081+
assert_frame_equal(result,expected)
3082+
result = df.groupby([pd.Grouper(freq='1M',level=0),'Buyer']).sum()
3083+
assert_frame_equal(result,expected)
3084+
3085+
self.assertRaises(ValueError, lambda : df.groupby([pd.Grouper(freq='1M',level='foo'),'Buyer']).sum())
3086+
3087+
# multi names
3088+
df = df.copy()
3089+
df['Date'] = df.index + pd.offsets.MonthEnd(2)
3090+
result = df.groupby([pd.Grouper(freq='1M',key='Date'),'Buyer']).sum()
3091+
expected = DataFrame({
3092+
'Buyer': 'Carl Joe Mark'.split(),
3093+
'Quantity': [10,18,3],
3094+
'Date' : [
3095+
DT.datetime(2013,11,30,0,0),
3096+
DT.datetime(2013,11,30,0,0),
3097+
DT.datetime(2013,11,30,0,0),
3098+
]}).set_index(['Date','Buyer'])
3099+
assert_frame_equal(result,expected)
3100+
3101+
# error as we have both a level and a name!
3102+
self.assertRaises(ValueError, lambda : df.groupby([pd.Grouper(freq='1M',key='Date',level='Date'),'Buyer']).sum())
3103+
3104+
3105+
# single groupers
3106+
expected = DataFrame({ 'Quantity' : [31],
3107+
'Date' : [DT.datetime(2013,10,31,0,0)] }).set_index('Date')
3108+
result = df.groupby(pd.Grouper(freq='1M')).sum()
3109+
assert_frame_equal(result, expected)
30913110

3092-
# single groupers
3093-
expected = DataFrame({ 'Quantity' : [31],
3094-
'Date' : [DT.datetime(2013,10,31,0,0)] }).set_index('Date')
3095-
result = df.groupby(pd.Grouper(freq='1M')).sum()
3096-
assert_frame_equal(result, expected)
3111+
result = df.groupby([pd.Grouper(freq='1M')]).sum()
3112+
assert_frame_equal(result, expected)
30973113

3098-
result = df.groupby([pd.Grouper(freq='1M')]).sum()
3099-
assert_frame_equal(result, expected)
3114+
expected = DataFrame({ 'Quantity' : [31],
3115+
'Date' : [DT.datetime(2013,11,30,0,0)] }).set_index('Date')
3116+
result = df.groupby(pd.Grouper(freq='1M',key='Date')).sum()
3117+
assert_frame_equal(result, expected)
31003118

3101-
expected = DataFrame({ 'Quantity' : [31],
3102-
'Date' : [DT.datetime(2013,11,30,0,0)] }).set_index('Date')
3103-
result = df.groupby(pd.Grouper(freq='1M',key='Date')).sum()
3104-
assert_frame_equal(result, expected)
3119+
result = df.groupby([pd.Grouper(freq='1M',key='Date')]).sum()
3120+
assert_frame_equal(result, expected)
31053121

3106-
result = df.groupby([pd.Grouper(freq='1M',key='Date')]).sum()
3107-
assert_frame_equal(result, expected)
3122+
# GH 6764 multiple grouping with/without sort
3123+
df = DataFrame({
3124+
'date' : pd.to_datetime([
3125+
'20121002','20121007','20130130','20130202','20130305','20121002',
3126+
'20121207','20130130','20130202','20130305','20130202','20130305']),
3127+
'user_id' : [1,1,1,1,1,3,3,3,5,5,5,5],
3128+
'whole_cost' : [1790,364,280,259,201,623,90,312,359,301,359,801],
3129+
'cost1' : [12,15,10,24,39,1,0,90,45,34,1,12] }).set_index('date')
3130+
3131+
for freq in ['D', 'M', 'A', 'Q-APR']:
3132+
expected = df.groupby('user_id')['whole_cost'].resample(
3133+
freq, how='sum').dropna().reorder_levels(
3134+
['date','user_id']).sortlevel().astype('int64')
3135+
expected.name = 'whole_cost'
3136+
3137+
result1 = df.sort_index().groupby([pd.TimeGrouper(freq=freq), 'user_id'])['whole_cost'].sum()
3138+
assert_series_equal(result1, expected)
3139+
3140+
result2 = df.groupby([pd.TimeGrouper(freq=freq), 'user_id'])['whole_cost'].sum()
3141+
assert_series_equal(result2, expected)
31083142

31093143
def test_cumcount(self):
31103144
df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'])

pandas/tseries/resample.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,8 @@ def _get_binner_for_grouping(self, obj):
138138
# since we may have had to sort
139139
# may need to reorder groups here
140140
if self.indexer is not None:
141-
grouper = grouper.take(self.indexer)
141+
indexer = self.indexer.argsort(kind='quicksort')
142+
grouper = grouper.take(indexer)
142143
return grouper
143144

144145
def _get_time_bins(self, ax):
@@ -161,7 +162,7 @@ def _get_time_bins(self, ax):
161162

162163
# a little hack
163164
trimmed = False
164-
if (len(binner) > 2 and binner[-2] == ax[-1] and
165+
if (len(binner) > 2 and binner[-2] == ax.max() and
165166
self.closed == 'right'):
166167

167168
binner = binner[:-1]
@@ -204,7 +205,7 @@ def _adjust_bin_edges(self, binner, ax_values):
204205
bin_edges = bin_edges + day_nanos - 1
205206

206207
# intraday values on last day
207-
if bin_edges[-2] > ax_values[-1]:
208+
if bin_edges[-2] > ax_values.max():
208209
bin_edges = bin_edges[:-1]
209210
binner = binner[:-1]
210211

@@ -320,8 +321,8 @@ def _resample_periods(self):
320321
# Get the fill indexer
321322
indexer = memb.get_indexer(new_index, method=self.fill_method,
322323
limit=self.limit)
323-
324324
return _take_new_index(obj, indexer, new_index, axis=self.axis)
325+
325326
else:
326327
raise ValueError('Frequency %s cannot be resampled to %s'
327328
% (axlabels.freq, self.freq))
@@ -352,7 +353,7 @@ def _get_range_edges(axis, offset, closed='left', base=0):
352353
return _adjust_dates_anchored(axis[0], axis[-1], offset,
353354
closed=closed, base=base)
354355

355-
first, last = axis[0], axis[-1]
356+
first, last = axis.min(), axis.max()
356357
if not isinstance(offset, Tick): # and first.time() != last.time():
357358
# hack!
358359
first = tools.normalize_date(first)

0 commit comments

Comments
 (0)