Skip to content

BUG: TimeGrouper outputs different result by column order #6908

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 19, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,7 @@ Bug Fixes
- Bug in TimeGrouper/resample when presented with a non-monotonic DatetimeIndex would return invalid results. (:issue:`4161`)
- Bug in index name propagation in TimeGrouper/resample (:issue:`4161`)
- TimeGrouper has a more compatible API to the rest of the groupers (e.g. ``groups`` was missing) (:issue:`3881`)
- Bug in multiple grouping with a TimeGrouper depending on target column order (:issue:`6764`)
- Bug in ``pd.eval`` when parsing strings with possible tokens like ``'&'``
(:issue:`6351`)
- Bug correctly handle placements of ``-inf`` in Panels when dividing by integer 0 (:issue:`6178`)
Expand Down
10 changes: 5 additions & 5 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,11 +263,11 @@ def _set_grouper(self, obj, sort=False):
if not (level == 0 or level == ax.name):
raise ValueError("The grouper level {0} is not valid".format(level))

# possibly sort
if (self.sort or sort) and not ax.is_monotonic:
indexer = self.indexer = ax.argsort(kind='quicksort')
ax = ax.take(indexer)
obj = obj.take(indexer, axis=self.axis, convert=False, is_copy=False)
# possibly sort
if (self.sort or sort) and not ax.is_monotonic:
indexer = self.indexer = ax.argsort(kind='quicksort')
ax = ax.take(indexer)
obj = obj.take(indexer, axis=self.axis, convert=False, is_copy=False)

self.obj = obj
self.grouper = ax
Expand Down
272 changes: 153 additions & 119 deletions pandas/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2914,7 +2914,7 @@ def test_groupby_with_timegrouper(self):
# TimeGrouper requires a sorted index
# also verifies that the resultant index has the correct name
import datetime as DT
df = DataFrame({
df_original = DataFrame({
'Buyer': 'Carl Carl Carl Carl Joe Carl'.split(),
'Quantity': [18,3,5,1,9,3],
'Date' : [
Expand All @@ -2925,29 +2925,34 @@ def test_groupby_with_timegrouper(self):
DT.datetime(2013,12,2,12,0),
DT.datetime(2013,9,2,14,0),
]})
df = df.set_index(['Date'])

# GH 6908 change target column's order
df_reordered = df_original.sort(columns='Quantity')

expected = DataFrame({ 'Quantity' : np.nan },
index=date_range('20130901 13:00:00','20131205 13:00:00',
freq='5D',name='Date',closed='left'))
expected.iloc[[0,6,18],0] = np.array([24.,6.,9.],dtype='float64')
for df in [df_original, df_reordered]:
df = df.set_index(['Date'])

result1 = df.resample('5D',how=sum)
assert_frame_equal(result1, expected)
expected = DataFrame({ 'Quantity' : np.nan },
index=date_range('20130901 13:00:00','20131205 13:00:00',
freq='5D',name='Date',closed='left'))
expected.iloc[[0,6,18],0] = np.array([24.,6.,9.],dtype='float64')

df_sorted = df.sort_index()
result2 = df_sorted.groupby(pd.TimeGrouper(freq='5D')).sum()
assert_frame_equal(result2, expected)
result1 = df.resample('5D',how=sum)
assert_frame_equal(result1, expected)

result3 = df.groupby(pd.TimeGrouper(freq='5D')).sum()
assert_frame_equal(result3, expected)
df_sorted = df.sort_index()
result2 = df_sorted.groupby(pd.TimeGrouper(freq='5D')).sum()
assert_frame_equal(result2, expected)

result3 = df.groupby(pd.TimeGrouper(freq='5D')).sum()
assert_frame_equal(result3, expected)

def test_groupby_with_timegrouper_methods(self):
# GH 3881
# make sure API of timegrouper conforms

import datetime as DT
df = pd.DataFrame({
df_original = pd.DataFrame({
'Branch' : 'A A A A A B'.split(),
'Buyer': 'Carl Mark Carl Joe Joe Carl'.split(),
'Quantity': [1,3,5,8,9,3],
Expand All @@ -2960,13 +2965,16 @@ def test_groupby_with_timegrouper_methods(self):
DT.datetime(2013,12,2,14,0),
]})

df = df.set_index('Date', drop=False)
g = df.groupby(pd.TimeGrouper('6M'))
self.assertTrue(g.group_keys)
self.assertTrue(isinstance(g.grouper,pd.core.groupby.BinGrouper))
groups = g.groups
self.assertTrue(isinstance(groups,dict))
self.assertTrue(len(groups) == 3)
df_sorted = df_original.sort(columns='Quantity', ascending=False)

for df in [df_original, df_sorted]:
df = df.set_index('Date', drop=False)
g = df.groupby(pd.TimeGrouper('6M'))
self.assertTrue(g.group_keys)
self.assertTrue(isinstance(g.grouper,pd.core.groupby.BinGrouper))
groups = g.groups
self.assertTrue(isinstance(groups,dict))
self.assertTrue(len(groups) == 3)

def test_timegrouper_with_reg_groups(self):

Expand All @@ -2975,7 +2983,7 @@ def test_timegrouper_with_reg_groups(self):

import datetime as DT

df = DataFrame({
df_original = DataFrame({
'Branch' : 'A A A A A A A B'.split(),
'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
'Quantity': [1,3,5,1,8,1,9,3],
Expand All @@ -2990,32 +2998,34 @@ def test_timegrouper_with_reg_groups(self):
DT.datetime(2013,12,2,14,0),
]}).set_index('Date')

expected = DataFrame({
'Buyer': 'Carl Joe Mark'.split(),
'Quantity': [10,18,3],
'Date' : [
DT.datetime(2013,12,31,0,0),
DT.datetime(2013,12,31,0,0),
DT.datetime(2013,12,31,0,0),
]}).set_index(['Date','Buyer'])

result = df.groupby([pd.Grouper(freq='A'),'Buyer']).sum()
assert_frame_equal(result,expected)

expected = DataFrame({
'Buyer': 'Carl Mark Carl Joe'.split(),
'Quantity': [1,3,9,18],
'Date' : [
DT.datetime(2013,1,1,0,0),
DT.datetime(2013,1,1,0,0),
DT.datetime(2013,7,1,0,0),
DT.datetime(2013,7,1,0,0),
]}).set_index(['Date','Buyer'])

result = df.groupby([pd.Grouper(freq='6MS'),'Buyer']).sum()
assert_frame_equal(result,expected)

df = DataFrame({
df_sorted = df_original.sort(columns='Quantity', ascending=False)

for df in [df_original, df_sorted]:
expected = DataFrame({
'Buyer': 'Carl Joe Mark'.split(),
'Quantity': [10,18,3],
'Date' : [
DT.datetime(2013,12,31,0,0),
DT.datetime(2013,12,31,0,0),
DT.datetime(2013,12,31,0,0),
]}).set_index(['Date','Buyer'])

result = df.groupby([pd.Grouper(freq='A'),'Buyer']).sum()
assert_frame_equal(result,expected)

expected = DataFrame({
'Buyer': 'Carl Mark Carl Joe'.split(),
'Quantity': [1,3,9,18],
'Date' : [
DT.datetime(2013,1,1,0,0),
DT.datetime(2013,1,1,0,0),
DT.datetime(2013,7,1,0,0),
DT.datetime(2013,7,1,0,0),
]}).set_index(['Date','Buyer'])
result = df.groupby([pd.Grouper(freq='6MS'),'Buyer']).sum()
assert_frame_equal(result,expected)

df_original = DataFrame({
'Branch' : 'A A A A A A A B'.split(),
'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
'Quantity': [1,3,5,1,8,1,9,3],
Expand All @@ -3030,81 +3040,105 @@ def test_timegrouper_with_reg_groups(self):
DT.datetime(2013,10,2,14,0),
]}).set_index('Date')

expected = DataFrame({
'Buyer': 'Carl Joe Mark Carl Joe'.split(),
'Quantity': [6,8,3,4,10],
'Date' : [
DT.datetime(2013,10,1,0,0),
DT.datetime(2013,10,1,0,0),
DT.datetime(2013,10,1,0,0),
DT.datetime(2013,10,2,0,0),
DT.datetime(2013,10,2,0,0),
]}).set_index(['Date','Buyer'])

result = df.groupby([pd.Grouper(freq='1D'),'Buyer']).sum()
assert_frame_equal(result,expected)

result = df.groupby([pd.Grouper(freq='1M'),'Buyer']).sum()
expected = DataFrame({
'Buyer': 'Carl Joe Mark'.split(),
'Quantity': [10,18,3],
'Date' : [
DT.datetime(2013,10,31,0,0),
DT.datetime(2013,10,31,0,0),
DT.datetime(2013,10,31,0,0),
]}).set_index(['Date','Buyer'])
assert_frame_equal(result,expected)

# passing the name
df = df.reset_index()
result = df.groupby([pd.Grouper(freq='1M',key='Date'),'Buyer']).sum()
assert_frame_equal(result,expected)

self.assertRaises(KeyError, lambda : df.groupby([pd.Grouper(freq='1M',key='foo'),'Buyer']).sum())

# passing the level
df = df.set_index('Date')
result = df.groupby([pd.Grouper(freq='1M',level='Date'),'Buyer']).sum()
assert_frame_equal(result,expected)
result = df.groupby([pd.Grouper(freq='1M',level=0),'Buyer']).sum()
assert_frame_equal(result,expected)

self.assertRaises(ValueError, lambda : df.groupby([pd.Grouper(freq='1M',level='foo'),'Buyer']).sum())

# multi names
df = df.copy()
df['Date'] = df.index + pd.offsets.MonthEnd(2)
result = df.groupby([pd.Grouper(freq='1M',key='Date'),'Buyer']).sum()
expected = DataFrame({
'Buyer': 'Carl Joe Mark'.split(),
'Quantity': [10,18,3],
'Date' : [
DT.datetime(2013,11,30,0,0),
DT.datetime(2013,11,30,0,0),
DT.datetime(2013,11,30,0,0),
]}).set_index(['Date','Buyer'])
assert_frame_equal(result,expected)

# error as we have both a level and a name!
self.assertRaises(ValueError, lambda : df.groupby([pd.Grouper(freq='1M',key='Date',level='Date'),'Buyer']).sum())

df_sorted = df_original.sort(columns='Quantity', ascending=False)
for df in [df_original, df_sorted]:

expected = DataFrame({
'Buyer': 'Carl Joe Mark Carl Joe'.split(),
'Quantity': [6,8,3,4,10],
'Date' : [
DT.datetime(2013,10,1,0,0),
DT.datetime(2013,10,1,0,0),
DT.datetime(2013,10,1,0,0),
DT.datetime(2013,10,2,0,0),
DT.datetime(2013,10,2,0,0),
]}).set_index(['Date','Buyer'])

result = df.groupby([pd.Grouper(freq='1D'),'Buyer']).sum()
assert_frame_equal(result,expected)

result = df.groupby([pd.Grouper(freq='1M'),'Buyer']).sum()
expected = DataFrame({
'Buyer': 'Carl Joe Mark'.split(),
'Quantity': [10,18,3],
'Date' : [
DT.datetime(2013,10,31,0,0),
DT.datetime(2013,10,31,0,0),
DT.datetime(2013,10,31,0,0),
]}).set_index(['Date','Buyer'])
assert_frame_equal(result,expected)

# passing the name
df = df.reset_index()
result = df.groupby([pd.Grouper(freq='1M',key='Date'),'Buyer']).sum()
assert_frame_equal(result,expected)

self.assertRaises(KeyError, lambda : df.groupby([pd.Grouper(freq='1M',key='foo'),'Buyer']).sum())

# passing the level
df = df.set_index('Date')
result = df.groupby([pd.Grouper(freq='1M',level='Date'),'Buyer']).sum()
assert_frame_equal(result,expected)
result = df.groupby([pd.Grouper(freq='1M',level=0),'Buyer']).sum()
assert_frame_equal(result,expected)

self.assertRaises(ValueError, lambda : df.groupby([pd.Grouper(freq='1M',level='foo'),'Buyer']).sum())

# multi names
df = df.copy()
df['Date'] = df.index + pd.offsets.MonthEnd(2)
result = df.groupby([pd.Grouper(freq='1M',key='Date'),'Buyer']).sum()
expected = DataFrame({
'Buyer': 'Carl Joe Mark'.split(),
'Quantity': [10,18,3],
'Date' : [
DT.datetime(2013,11,30,0,0),
DT.datetime(2013,11,30,0,0),
DT.datetime(2013,11,30,0,0),
]}).set_index(['Date','Buyer'])
assert_frame_equal(result,expected)

# error as we have both a level and a name!
self.assertRaises(ValueError, lambda : df.groupby([pd.Grouper(freq='1M',key='Date',level='Date'),'Buyer']).sum())


# single groupers
expected = DataFrame({ 'Quantity' : [31],
'Date' : [DT.datetime(2013,10,31,0,0)] }).set_index('Date')
result = df.groupby(pd.Grouper(freq='1M')).sum()
assert_frame_equal(result, expected)

# single groupers
expected = DataFrame({ 'Quantity' : [31],
'Date' : [DT.datetime(2013,10,31,0,0)] }).set_index('Date')
result = df.groupby(pd.Grouper(freq='1M')).sum()
assert_frame_equal(result, expected)
result = df.groupby([pd.Grouper(freq='1M')]).sum()
assert_frame_equal(result, expected)

result = df.groupby([pd.Grouper(freq='1M')]).sum()
assert_frame_equal(result, expected)
expected = DataFrame({ 'Quantity' : [31],
'Date' : [DT.datetime(2013,11,30,0,0)] }).set_index('Date')
result = df.groupby(pd.Grouper(freq='1M',key='Date')).sum()
assert_frame_equal(result, expected)

expected = DataFrame({ 'Quantity' : [31],
'Date' : [DT.datetime(2013,11,30,0,0)] }).set_index('Date')
result = df.groupby(pd.Grouper(freq='1M',key='Date')).sum()
assert_frame_equal(result, expected)
result = df.groupby([pd.Grouper(freq='1M',key='Date')]).sum()
assert_frame_equal(result, expected)

result = df.groupby([pd.Grouper(freq='1M',key='Date')]).sum()
assert_frame_equal(result, expected)
# GH 6764 multiple grouping with/without sort
df = DataFrame({
'date' : pd.to_datetime([
'20121002','20121007','20130130','20130202','20130305','20121002',
'20121207','20130130','20130202','20130305','20130202','20130305']),
'user_id' : [1,1,1,1,1,3,3,3,5,5,5,5],
'whole_cost' : [1790,364,280,259,201,623,90,312,359,301,359,801],
'cost1' : [12,15,10,24,39,1,0,90,45,34,1,12] }).set_index('date')

for freq in ['D', 'M', 'A', 'Q-APR']:
expected = df.groupby('user_id')['whole_cost'].resample(
freq, how='sum').dropna().reorder_levels(
['date','user_id']).sortlevel().astype('int64')
expected.name = 'whole_cost'

result1 = df.sort_index().groupby([pd.TimeGrouper(freq=freq), 'user_id'])['whole_cost'].sum()
assert_series_equal(result1, expected)

result2 = df.groupby([pd.TimeGrouper(freq=freq), 'user_id'])['whole_cost'].sum()
assert_series_equal(result2, expected)

def test_cumcount(self):
df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'])
Expand Down
11 changes: 6 additions & 5 deletions pandas/tseries/resample.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,8 @@ def _get_binner_for_grouping(self, obj):
# since we may have had to sort
# may need to reorder groups here
if self.indexer is not None:
grouper = grouper.take(self.indexer)
indexer = self.indexer.argsort(kind='quicksort')
grouper = grouper.take(indexer)
return grouper

def _get_time_bins(self, ax):
Expand All @@ -161,7 +162,7 @@ def _get_time_bins(self, ax):

# a little hack
trimmed = False
if (len(binner) > 2 and binner[-2] == ax[-1] and
if (len(binner) > 2 and binner[-2] == ax.max() and
self.closed == 'right'):

binner = binner[:-1]
Expand Down Expand Up @@ -204,7 +205,7 @@ def _adjust_bin_edges(self, binner, ax_values):
bin_edges = bin_edges + day_nanos - 1

# intraday values on last day
if bin_edges[-2] > ax_values[-1]:
if bin_edges[-2] > ax_values.max():
bin_edges = bin_edges[:-1]
binner = binner[:-1]

Expand Down Expand Up @@ -320,8 +321,8 @@ def _resample_periods(self):
# Get the fill indexer
indexer = memb.get_indexer(new_index, method=self.fill_method,
limit=self.limit)

return _take_new_index(obj, indexer, new_index, axis=self.axis)

else:
raise ValueError('Frequency %s cannot be resampled to %s'
% (axlabels.freq, self.freq))
Expand Down Expand Up @@ -352,7 +353,7 @@ def _get_range_edges(axis, offset, closed='left', base=0):
return _adjust_dates_anchored(axis[0], axis[-1], offset,
closed=closed, base=base)

first, last = axis[0], axis[-1]
first, last = axis.min(), axis.max()
if not isinstance(offset, Tick): # and first.time() != last.time():
# hack!
first = tools.normalize_date(first)
Expand Down