From 2cf3eb63aed9bbeb50ca5fbcf0ed809626f68ba9 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Wed, 16 Apr 2014 19:18:37 +0900 Subject: [PATCH] BUG: TimeGrouper outputs different result by column order --- doc/source/release.rst | 1 + pandas/core/groupby.py | 10 +- pandas/tests/test_groupby.py | 272 ++++++++++++++++++++--------------- pandas/tseries/resample.py | 11 +- 4 files changed, 165 insertions(+), 129 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index cc8e271d62183..6d8f915e2ebb8 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -294,6 +294,7 @@ Bug Fixes - Bug in TimeGrouper/resample when presented with a non-monotonic DatetimeIndex would return invalid results. (:issue:`4161`) - Bug in index name propogation in TimeGrouper/resample (:issue:`4161`) - TimeGrouper has a more compatible API to the rest of the groupers (e.g. ``groups`` was missing) (:issue:`3881`) +- Bug in multiple grouping with a TimeGrouper depending on target column order (:issue:`6764`) - Bug in ``pd.eval`` when parsing strings with possible tokens like ``'&'`` (:issue:`6351`) - Bug correctly handle placements of ``-inf`` in Panels when dividing by integer 0 (:issue:`6178`) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index a32b25312d4ba..c0222ad248e0c 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -263,11 +263,11 @@ def _set_grouper(self, obj, sort=False): if not (level == 0 or level == ax.name): raise ValueError("The grouper level {0} is not valid".format(level)) - # possibly sort - if (self.sort or sort) and not ax.is_monotonic: - indexer = self.indexer = ax.argsort(kind='quicksort') - ax = ax.take(indexer) - obj = obj.take(indexer, axis=self.axis, convert=False, is_copy=False) + # possibly sort + if (self.sort or sort) and not ax.is_monotonic: + indexer = self.indexer = ax.argsort(kind='quicksort') + ax = ax.take(indexer) + obj = obj.take(indexer, axis=self.axis, convert=False, is_copy=False) self.obj = obj self.grouper = ax diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index c0b7425485cba..22d92c7b19fe1 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -2914,7 +2914,7 @@ def test_groupby_with_timegrouper(self): # TimeGrouper requires a sorted index # also verifies that the resultant index has the correct name import datetime as DT - df = DataFrame({ + df_original = DataFrame({ 'Buyer': 'Carl Carl Carl Carl Joe Carl'.split(), 'Quantity': [18,3,5,1,9,3], 'Date' : [ @@ -2925,29 +2925,34 @@ def test_groupby_with_timegrouper(self): DT.datetime(2013,12,2,12,0), DT.datetime(2013,9,2,14,0), ]}) - df = df.set_index(['Date']) + + # GH 6908 change target column's order + df_reordered = df_original.sort(columns='Quantity') - expected = DataFrame({ 'Quantity' : np.nan }, - index=date_range('20130901 13:00:00','20131205 13:00:00', - freq='5D',name='Date',closed='left')) - expected.iloc[[0,6,18],0] = np.array([24.,6.,9.],dtype='float64') + for df in [df_original, df_reordered]: + df = df.set_index(['Date']) - result1 = df.resample('5D',how=sum) - assert_frame_equal(result1, expected) + expected = DataFrame({ 'Quantity' : np.nan }, + index=date_range('20130901 13:00:00','20131205 13:00:00', + freq='5D',name='Date',closed='left')) + expected.iloc[[0,6,18],0] = np.array([24.,6.,9.],dtype='float64') - df_sorted = df.sort_index() - result2 = df_sorted.groupby(pd.TimeGrouper(freq='5D')).sum() - assert_frame_equal(result2, expected) + result1 = df.resample('5D',how=sum) + assert_frame_equal(result1, expected) - result3 = df.groupby(pd.TimeGrouper(freq='5D')).sum() - assert_frame_equal(result3, expected) + df_sorted = df.sort_index() + result2 = df_sorted.groupby(pd.TimeGrouper(freq='5D')).sum() + assert_frame_equal(result2, expected) + + result3 = df.groupby(pd.TimeGrouper(freq='5D')).sum() + assert_frame_equal(result3, expected) def test_groupby_with_timegrouper_methods(self): # GH 3881 # make sure API of timegrouper conforms import datetime as DT - df = pd.DataFrame({ + df_original = pd.DataFrame({ 'Branch' : 'A A A A A B'.split(), 'Buyer': 'Carl Mark Carl Joe Joe Carl'.split(), 'Quantity': [1,3,5,8,9,3], @@ -2960,13 +2965,16 @@ def test_groupby_with_timegrouper_methods(self): DT.datetime(2013,12,2,14,0), ]}) - df = df.set_index('Date', drop=False) - g = df.groupby(pd.TimeGrouper('6M')) - self.assertTrue(g.group_keys) - self.assertTrue(isinstance(g.grouper,pd.core.groupby.BinGrouper)) - groups = g.groups - self.assertTrue(isinstance(groups,dict)) - self.assertTrue(len(groups) == 3) + df_sorted = df_original.sort(columns='Quantity', ascending=False) + + for df in [df_original, df_sorted]: + df = df.set_index('Date', drop=False) + g = df.groupby(pd.TimeGrouper('6M')) + self.assertTrue(g.group_keys) + self.assertTrue(isinstance(g.grouper,pd.core.groupby.BinGrouper)) + groups = g.groups + self.assertTrue(isinstance(groups,dict)) + self.assertTrue(len(groups) == 3) def test_timegrouper_with_reg_groups(self): @@ -2975,7 +2983,7 @@ def test_timegrouper_with_reg_groups(self): import datetime as DT - df = DataFrame({ + df_original = DataFrame({ 'Branch' : 'A A A A A A A B'.split(), 'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(), 'Quantity': [1,3,5,1,8,1,9,3], @@ -2990,32 +2998,34 @@ def test_timegrouper_with_reg_groups(self): DT.datetime(2013,12,2,14,0), ]}).set_index('Date') - expected = DataFrame({ - 'Buyer': 'Carl Joe Mark'.split(), - 'Quantity': [10,18,3], - 'Date' : [ - DT.datetime(2013,12,31,0,0), - DT.datetime(2013,12,31,0,0), - DT.datetime(2013,12,31,0,0), - ]}).set_index(['Date','Buyer']) - - result = df.groupby([pd.Grouper(freq='A'),'Buyer']).sum() - assert_frame_equal(result,expected) - - expected = DataFrame({ - 'Buyer': 'Carl Mark Carl Joe'.split(), - 'Quantity': [1,3,9,18], - 'Date' : [ - DT.datetime(2013,1,1,0,0), - DT.datetime(2013,1,1,0,0), - DT.datetime(2013,7,1,0,0), - DT.datetime(2013,7,1,0,0), - ]}).set_index(['Date','Buyer']) - - result = df.groupby([pd.Grouper(freq='6MS'),'Buyer']).sum() - assert_frame_equal(result,expected) - - df = DataFrame({ + df_sorted = df_original.sort(columns='Quantity', ascending=False) + + for df in [df_original, df_sorted]: + expected = DataFrame({ + 'Buyer': 'Carl Joe Mark'.split(), + 'Quantity': [10,18,3], + 'Date' : [ + DT.datetime(2013,12,31,0,0), + DT.datetime(2013,12,31,0,0), + DT.datetime(2013,12,31,0,0), + ]}).set_index(['Date','Buyer']) + + result = df.groupby([pd.Grouper(freq='A'),'Buyer']).sum() + assert_frame_equal(result,expected) + + expected = DataFrame({ + 'Buyer': 'Carl Mark Carl Joe'.split(), + 'Quantity': [1,3,9,18], + 'Date' : [ + DT.datetime(2013,1,1,0,0), + DT.datetime(2013,1,1,0,0), + DT.datetime(2013,7,1,0,0), + DT.datetime(2013,7,1,0,0), + ]}).set_index(['Date','Buyer']) + result = df.groupby([pd.Grouper(freq='6MS'),'Buyer']).sum() + assert_frame_equal(result,expected) + + df_original = DataFrame({ 'Branch' : 'A A A A A A A B'.split(), 'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(), 'Quantity': [1,3,5,1,8,1,9,3], @@ -3030,81 +3040,105 @@ def test_timegrouper_with_reg_groups(self): DT.datetime(2013,10,2,14,0), ]}).set_index('Date') - expected = DataFrame({ - 'Buyer': 'Carl Joe Mark Carl Joe'.split(), - 'Quantity': [6,8,3,4,10], - 'Date' : [ - DT.datetime(2013,10,1,0,0), - DT.datetime(2013,10,1,0,0), - DT.datetime(2013,10,1,0,0), - DT.datetime(2013,10,2,0,0), - DT.datetime(2013,10,2,0,0), - ]}).set_index(['Date','Buyer']) - - result = df.groupby([pd.Grouper(freq='1D'),'Buyer']).sum() - assert_frame_equal(result,expected) - - result = df.groupby([pd.Grouper(freq='1M'),'Buyer']).sum() - expected = DataFrame({ - 'Buyer': 'Carl Joe Mark'.split(), - 'Quantity': [10,18,3], - 'Date' : [ - DT.datetime(2013,10,31,0,0), - DT.datetime(2013,10,31,0,0), - DT.datetime(2013,10,31,0,0), - ]}).set_index(['Date','Buyer']) - assert_frame_equal(result,expected) - - # passing the name - df = df.reset_index() - result = df.groupby([pd.Grouper(freq='1M',key='Date'),'Buyer']).sum() - assert_frame_equal(result,expected) - - self.assertRaises(KeyError, lambda : df.groupby([pd.Grouper(freq='1M',key='foo'),'Buyer']).sum()) - - # passing the level - df = df.set_index('Date') - result = df.groupby([pd.Grouper(freq='1M',level='Date'),'Buyer']).sum() - assert_frame_equal(result,expected) - result = df.groupby([pd.Grouper(freq='1M',level=0),'Buyer']).sum() - assert_frame_equal(result,expected) - - self.assertRaises(ValueError, lambda : df.groupby([pd.Grouper(freq='1M',level='foo'),'Buyer']).sum()) - - # multi names - df = df.copy() - df['Date'] = df.index + pd.offsets.MonthEnd(2) - result = df.groupby([pd.Grouper(freq='1M',key='Date'),'Buyer']).sum() - expected = DataFrame({ - 'Buyer': 'Carl Joe Mark'.split(), - 'Quantity': [10,18,3], - 'Date' : [ - DT.datetime(2013,11,30,0,0), - DT.datetime(2013,11,30,0,0), - DT.datetime(2013,11,30,0,0), - ]}).set_index(['Date','Buyer']) - assert_frame_equal(result,expected) - - # error as we have both a level and a name! - self.assertRaises(ValueError, lambda : df.groupby([pd.Grouper(freq='1M',key='Date',level='Date'),'Buyer']).sum()) - + df_sorted = df_original.sort(columns='Quantity', ascending=False) + for df in [df_original, df_sorted]: + + expected = DataFrame({ + 'Buyer': 'Carl Joe Mark Carl Joe'.split(), + 'Quantity': [6,8,3,4,10], + 'Date' : [ + DT.datetime(2013,10,1,0,0), + DT.datetime(2013,10,1,0,0), + DT.datetime(2013,10,1,0,0), + DT.datetime(2013,10,2,0,0), + DT.datetime(2013,10,2,0,0), + ]}).set_index(['Date','Buyer']) + + result = df.groupby([pd.Grouper(freq='1D'),'Buyer']).sum() + assert_frame_equal(result,expected) + + result = df.groupby([pd.Grouper(freq='1M'),'Buyer']).sum() + expected = DataFrame({ + 'Buyer': 'Carl Joe Mark'.split(), + 'Quantity': [10,18,3], + 'Date' : [ + DT.datetime(2013,10,31,0,0), + DT.datetime(2013,10,31,0,0), + DT.datetime(2013,10,31,0,0), + ]}).set_index(['Date','Buyer']) + assert_frame_equal(result,expected) + + # passing the name + df = df.reset_index() + result = df.groupby([pd.Grouper(freq='1M',key='Date'),'Buyer']).sum() + assert_frame_equal(result,expected) + + self.assertRaises(KeyError, lambda : df.groupby([pd.Grouper(freq='1M',key='foo'),'Buyer']).sum()) + + # passing the level + df = df.set_index('Date') + result = df.groupby([pd.Grouper(freq='1M',level='Date'),'Buyer']).sum() + assert_frame_equal(result,expected) + result = df.groupby([pd.Grouper(freq='1M',level=0),'Buyer']).sum() + assert_frame_equal(result,expected) + + self.assertRaises(ValueError, lambda : df.groupby([pd.Grouper(freq='1M',level='foo'),'Buyer']).sum()) + + # multi names + df = df.copy() + df['Date'] = df.index + pd.offsets.MonthEnd(2) + result = df.groupby([pd.Grouper(freq='1M',key='Date'),'Buyer']).sum() + expected = DataFrame({ + 'Buyer': 'Carl Joe Mark'.split(), + 'Quantity': [10,18,3], + 'Date' : [ + DT.datetime(2013,11,30,0,0), + DT.datetime(2013,11,30,0,0), + DT.datetime(2013,11,30,0,0), + ]}).set_index(['Date','Buyer']) + assert_frame_equal(result,expected) + + # error as we have both a level and a name! + self.assertRaises(ValueError, lambda : df.groupby([pd.Grouper(freq='1M',key='Date',level='Date'),'Buyer']).sum()) + + + # single groupers + expected = DataFrame({ 'Quantity' : [31], + 'Date' : [DT.datetime(2013,10,31,0,0)] }).set_index('Date') + result = df.groupby(pd.Grouper(freq='1M')).sum() + assert_frame_equal(result, expected) - # single groupers - expected = DataFrame({ 'Quantity' : [31], - 'Date' : [DT.datetime(2013,10,31,0,0)] }).set_index('Date') - result = df.groupby(pd.Grouper(freq='1M')).sum() - assert_frame_equal(result, expected) + result = df.groupby([pd.Grouper(freq='1M')]).sum() + assert_frame_equal(result, expected) - result = df.groupby([pd.Grouper(freq='1M')]).sum() - assert_frame_equal(result, expected) + expected = DataFrame({ 'Quantity' : [31], + 'Date' : [DT.datetime(2013,11,30,0,0)] }).set_index('Date') + result = df.groupby(pd.Grouper(freq='1M',key='Date')).sum() + assert_frame_equal(result, expected) - expected = DataFrame({ 'Quantity' : [31], - 'Date' : [DT.datetime(2013,11,30,0,0)] }).set_index('Date') - result = df.groupby(pd.Grouper(freq='1M',key='Date')).sum() - assert_frame_equal(result, expected) + result = df.groupby([pd.Grouper(freq='1M',key='Date')]).sum() + assert_frame_equal(result, expected) - result = df.groupby([pd.Grouper(freq='1M',key='Date')]).sum() - assert_frame_equal(result, expected) + # GH 6764 multiple grouping with/without sort + df = DataFrame({ + 'date' : pd.to_datetime([ + '20121002','20121007','20130130','20130202','20130305','20121002', + '20121207','20130130','20130202','20130305','20130202','20130305']), + 'user_id' : [1,1,1,1,1,3,3,3,5,5,5,5], + 'whole_cost' : [1790,364,280,259,201,623,90,312,359,301,359,801], + 'cost1' : [12,15,10,24,39,1,0,90,45,34,1,12] }).set_index('date') + + for freq in ['D', 'M', 'A', 'Q-APR']: + expected = df.groupby('user_id')['whole_cost'].resample( + freq, how='sum').dropna().reorder_levels( + ['date','user_id']).sortlevel().astype('int64') + expected.name = 'whole_cost' + + result1 = df.sort_index().groupby([pd.TimeGrouper(freq=freq), 'user_id'])['whole_cost'].sum() + assert_series_equal(result1, expected) + + result2 = df.groupby([pd.TimeGrouper(freq=freq), 'user_id'])['whole_cost'].sum() + assert_series_equal(result2, expected) def test_cumcount(self): df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A']) diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py index 7f243c20fe56e..23a6ae0982771 100644 --- a/pandas/tseries/resample.py +++ b/pandas/tseries/resample.py @@ -138,7 +138,8 @@ def _get_binner_for_grouping(self, obj): # since we may have had to sort # may need to reorder groups here if self.indexer is not None: - grouper = grouper.take(self.indexer) + indexer = self.indexer.argsort(kind='quicksort') + grouper = grouper.take(indexer) return grouper def _get_time_bins(self, ax): @@ -161,7 +162,7 @@ def _get_time_bins(self, ax): # a little hack trimmed = False - if (len(binner) > 2 and binner[-2] == ax[-1] and + if (len(binner) > 2 and binner[-2] == ax.max() and self.closed == 'right'): binner = binner[:-1] @@ -204,7 +205,7 @@ def _adjust_bin_edges(self, binner, ax_values): bin_edges = bin_edges + day_nanos - 1 # intraday values on last day - if bin_edges[-2] > ax_values[-1]: + if bin_edges[-2] > ax_values.max(): bin_edges = bin_edges[:-1] binner = binner[:-1] @@ -320,8 +321,8 @@ def _resample_periods(self): # Get the fill indexer indexer = memb.get_indexer(new_index, method=self.fill_method, limit=self.limit) - return _take_new_index(obj, indexer, new_index, axis=self.axis) + else: raise ValueError('Frequency %s cannot be resampled to %s' % (axlabels.freq, self.freq)) @@ -352,7 +353,7 @@ def _get_range_edges(axis, offset, closed='left', base=0): return _adjust_dates_anchored(axis[0], axis[-1], offset, closed=closed, base=base) - first, last = axis[0], axis[-1] + first, last = axis.min(), axis.max() if not isinstance(offset, Tick): # and first.time() != last.time(): # hack! first = tools.normalize_date(first)