From 035a62d67585977a05ca391436ed87b86a720a1e Mon Sep 17 00:00:00 2001 From: danielballan Date: Mon, 14 Apr 2014 16:50:28 -0400 Subject: [PATCH 1/9] ENH: Allow aggregate numeric operations on timedelta64. --- pandas/core/groupby.py | 9 +++++++-- pandas/tests/test_groupby.py | 13 +++++++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index cb5dedc887bca..efef5d94e4d52 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1084,7 +1084,8 @@ def _cython_agg_general(self, how, numeric_only=True): output = {} for name, obj in self._iterate_slices(): is_numeric = is_numeric_dtype(obj.dtype) - if numeric_only and not is_numeric: + is_timdelta64 = is_timedelta64_dtype(obj.dtype) + if numeric_only and not (is_numeric or is_timdelta64): continue try: @@ -2567,8 +2568,12 @@ def _cython_agg_blocks(self, how, numeric_only=True): data = data.get_numeric_data(copy=False) for block in data.blocks: - values = block._try_operate(block.values) + is_numeric = is_numeric_dtype(values.dtype) + is_timedelta64 = is_timedelta64_dtype(values.dtype) + + if numeric_only and not (is_numeric or is_timedelta64): + continue if block.is_numeric: values = _algos.ensure_float64(values) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 4077f468d8b1f..5b84c4ba18b00 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -4365,6 +4365,19 @@ def test_index_label_overlaps_location(self): expected = ser.take([1, 3, 4]) assert_series_equal(actual, expected) + def test_groupby_methods_on_timedelta64(self): + df = self.df.copy().iloc[:4] + df['E'] = pd.to_timedelta(['00:00:01', '00:00:02', '00:00:03', '00:00:04']) + # DataFrameGroupBy + actual = df.groupby('A').mean()['E'] + expected = pd.to_timedelta(Series(['00:00:03', '00:00:02'], index=['bar', 'foo'], name='E')) + assert_series_equal(actual, expected) + + ser = df['E'] + # SeriesGroupBy + actual = ser.groupby(df['A']).mean() + assert_series_equal(actual, expected) + def test_groupby_selection_with_methods(self): # some methods which require DatetimeIndex rng = pd.date_range('2014', periods=len(self.df)) From 5273f579ffb795164bd1ff6d0794b7b8a0d72571 Mon Sep 17 00:00:00 2001 From: danielballan Date: Tue, 15 Apr 2014 08:21:02 -0400 Subject: [PATCH 2/9] TST: More tests --- pandas/tests/test_groupby.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 5b84c4ba18b00..6738683e00c2e 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -7,7 +7,7 @@ from datetime import datetime from numpy import nan -from pandas import date_range,bdate_range, Timestamp +from pandas import date_range,bdate_range, Timestamp, _np_version_under1p7 from pandas.core.index import Index, MultiIndex, Int64Index from pandas.core.api import Categorical, DataFrame from pandas.core.groupby import (SpecificationError, DataError, @@ -603,6 +603,26 @@ def f(grp): e.name = None assert_series_equal(result,e) + # ...and with timedeltas + if not _np_version_under1p7: + df1 = df.copy() + df1['D'] = pd.to_timedelta(['00:00:01', '00:00:02', '00:00:03', + '00:00:04', '00:00:05', '00:00:06', '00:00:07']) + result = df1.groupby('A').apply(f)[['D']] + e = df1.groupby('A').first()[['D']] + e.loc['Pony'] = np.nan + assert_frame_equal(result, e) + + def f(grp): + if grp.name == 'Pony': + return None + return grp.iloc[0].loc['D'] + result = df1.groupby('A').apply(f) + e = df1.groupby('A').first()['D'].copy() + e.loc['Pony'] = np.nan + e.name = None + assert_series_equal(result,e) + def test_agg_api(self): # GH 6337 From 00cbc23047404a39824d86a325a775f9be50ad2c Mon Sep 17 00:00:00 2001 From: danielballan Date: Thu, 17 Apr 2014 09:12:50 -0400 Subject: [PATCH 3/9] WIP: Skip timedelta tests if np < 1.7 --- pandas/tests/test_groupby.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 6738683e00c2e..a96afd57a06dc 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -41,6 +41,12 @@ def _skip_if_mpl_not_installed(): except ImportError: raise nose.SkipTest("matplotlib not installed") + +def _skip_if_np_version_under1p7(): + if _np_version_under1p7: + raise nose.SkipTest("numpy version 1.7 has throughly broken timedelta") + + def commonSetUp(self): self.dateRange = bdate_range('1/1/2005', periods=250) self.stringIndex = Index([rands(8).upper() for x in range(250)]) @@ -607,21 +613,24 @@ def f(grp): if not _np_version_under1p7: df1 = df.copy() df1['D'] = pd.to_timedelta(['00:00:01', '00:00:02', '00:00:03', - '00:00:04', '00:00:05', '00:00:06', '00:00:07']) + '00:00:04', '00:00:05', '00:00:06', + '00:00:07']) result = df1.groupby('A').apply(f)[['D']] e = df1.groupby('A').first()[['D']] e.loc['Pony'] = np.nan + print(type(result)) + print(type(e)) assert_frame_equal(result, e) def f(grp): if grp.name == 'Pony': return None return grp.iloc[0].loc['D'] - result = df1.groupby('A').apply(f) + result = df1.groupby('A').apply(f)['D'] e = df1.groupby('A').first()['D'].copy() e.loc['Pony'] = np.nan e.name = None - assert_series_equal(result,e) + assert_series_equal(result, e) def test_agg_api(self): @@ -4386,6 +4395,7 @@ def test_index_label_overlaps_location(self): assert_series_equal(actual, expected) def test_groupby_methods_on_timedelta64(self): + _skip_if_np_version_under1p7() df = self.df.copy().iloc[:4] df['E'] = pd.to_timedelta(['00:00:01', '00:00:02', '00:00:03', '00:00:04']) # DataFrameGroupBy From 0974841d205b89bf69450d93c921fa282f06dc4b Mon Sep 17 00:00:00 2001 From: danielballan Date: Sun, 22 Jun 2014 10:00:15 -0400 Subject: [PATCH 4/9] WIP: Improved approach, but all is broken --- pandas/core/groupby.py | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index efef5d94e4d52..d339888b30a68 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1083,13 +1083,24 @@ def _try_cast(self, result, obj): def _cython_agg_general(self, how, numeric_only=True): output = {} for name, obj in self._iterate_slices(): - is_numeric = is_numeric_dtype(obj.dtype) - is_timdelta64 = is_timedelta64_dtype(obj.dtype) - if numeric_only and not (is_numeric or is_timdelta64): + if is_numeric_dtype(obj.dtype): + obj = com.ensure_float(obj) + is_numeric = True + out_dtype = 'f%d' % obj.dtype.itemsize + else: + is_numeric = issubclass(obj.dtype.type, (np.datetime64, + np.timedelta64)) + out_dtype = 'float64' + if is_numeric: + values = obj.view('int64') + else: + values = obj.astype(object) + + if numeric_only and not is_numeric: continue try: - result, names = self.grouper.aggregate(obj.values, how) + result, names = self.grouper.aggregate(values, how) except AssertionError as e: raise GroupByError(str(e)) output[name] = self._try_cast(result, obj) @@ -2569,12 +2580,22 @@ def _cython_agg_blocks(self, how, numeric_only=True): for block in data.blocks: values = block._try_operate(block.values) - is_numeric = is_numeric_dtype(values.dtype) - is_timedelta64 = is_timedelta64_dtype(values.dtype) - if numeric_only and not (is_numeric or is_timedelta64): + if is_numeric_dtype(values.dtype): + values = com.ensure_float(values) + is_numeric = True + else: + is_numeric = issubclass(values.dtype.type, (np.datetime64, + np.timedelta64)) + if is_numeric: + values = values.view('int64') + else: + values = values.astype(object) + + if numeric_only and not is_numeric: continue + # TODO DAN if block.is_numeric: values = _algos.ensure_float64(values) From f5ff0618a477278c19e5c3741b18ba43feacbde2 Mon Sep 17 00:00:00 2001 From: danielballan Date: Sun, 22 Jun 2014 22:32:33 -0400 Subject: [PATCH 5/9] WIP: Improved approach, 6 tests failing. --- pandas/core/groupby.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index d339888b30a68..36bf87c6aba8d 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1087,6 +1087,7 @@ def _cython_agg_general(self, how, numeric_only=True): obj = com.ensure_float(obj) is_numeric = True out_dtype = 'f%d' % obj.dtype.itemsize + values = obj.values else: is_numeric = issubclass(obj.dtype.type, (np.datetime64, np.timedelta64)) From 95d67ead28820c5752607c9cc7ac35f0a0597636 Mon Sep 17 00:00:00 2001 From: danielballan Date: Mon, 23 Jun 2014 22:07:36 -0400 Subject: [PATCH 6/9] FIX: Fix downcasting of float to timedelta. --- pandas/core/common.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index f8f5928ca7d51..171ce9462452f 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -1271,14 +1271,23 @@ def _possibly_downcast_to_dtype(result, dtype): dtype = np.dtype(dtype) try: - # don't allow upcasts here (except if empty) + print dtype.kind, result.dtype.kind if dtype.kind == result.dtype.kind: if result.dtype.itemsize <= dtype.itemsize and np.prod(result.shape): return result if issubclass(dtype.type, np.floating): return result.astype(dtype) + + # a datetimelike + elif ((dtype.kind == 'M' and result.dtype.kind == 'i') or + dtype.kind == 'm'): + try: + result = result.astype(dtype) + except: + pass + elif dtype == np.bool_ or issubclass(dtype.type, np.integer): # if we don't have any elements, just astype it @@ -1309,13 +1318,6 @@ def _possibly_downcast_to_dtype(result, dtype): if (new_result == result).all(): return new_result - # a datetimelike - elif dtype.kind in ['M','m'] and result.dtype.kind in ['i']: - try: - result = result.astype(dtype) - except: - pass - except: pass From c1e81a4a0f7fda0d362f1de910a11562689574a0 Mon Sep 17 00:00:00 2001 From: danielballan Date: Mon, 23 Jun 2014 22:15:53 -0400 Subject: [PATCH 7/9] CLN: Removed unused variable. PEP8. --- pandas/core/groupby.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 36bf87c6aba8d..629b64d17bf77 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1090,8 +1090,7 @@ def _cython_agg_general(self, how, numeric_only=True): values = obj.values else: is_numeric = issubclass(obj.dtype.type, (np.datetime64, - np.timedelta64)) - out_dtype = 'float64' + np.timedelta64)) if is_numeric: values = obj.view('int64') else: From b47d3c8ef46668ecb7aee8e703a11f9adc66e49b Mon Sep 17 00:00:00 2001 From: danielballan Date: Thu, 26 Jun 2014 15:35:04 -0400 Subject: [PATCH 8/9] WIP: Rebased after #7560 --- pandas/core/groupby.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 629b64d17bf77..4b55b8cced559 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2581,20 +2581,6 @@ def _cython_agg_blocks(self, how, numeric_only=True): for block in data.blocks: values = block._try_operate(block.values) - if is_numeric_dtype(values.dtype): - values = com.ensure_float(values) - is_numeric = True - else: - is_numeric = issubclass(values.dtype.type, (np.datetime64, - np.timedelta64)) - if is_numeric: - values = values.view('int64') - else: - values = values.astype(object) - - if numeric_only and not is_numeric: - continue - # TODO DAN if block.is_numeric: values = _algos.ensure_float64(values) From a89ab8b4b4ca177d6e0a59a708a8a1985e046c9b Mon Sep 17 00:00:00 2001 From: danielballan Date: Tue, 27 Jan 2015 12:40:32 -0500 Subject: [PATCH 9/9] MNT: Remove checks for numpy < 1.7. --- pandas/tests/test_groupby.py | 49 ++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 28 deletions(-) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index a96afd57a06dc..734287baaa50d 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -7,7 +7,7 @@ from datetime import datetime from numpy import nan -from pandas import date_range,bdate_range, Timestamp, _np_version_under1p7 +from pandas import date_range,bdate_range, Timestamp from pandas.core.index import Index, MultiIndex, Int64Index from pandas.core.api import Categorical, DataFrame from pandas.core.groupby import (SpecificationError, DataError, @@ -42,11 +42,6 @@ def _skip_if_mpl_not_installed(): raise nose.SkipTest("matplotlib not installed") -def _skip_if_np_version_under1p7(): - if _np_version_under1p7: - raise nose.SkipTest("numpy version 1.7 has throughly broken timedelta") - - def commonSetUp(self): self.dateRange = bdate_range('1/1/2005', periods=250) self.stringIndex = Index([rands(8).upper() for x in range(250)]) @@ -610,27 +605,26 @@ def f(grp): assert_series_equal(result,e) # ...and with timedeltas - if not _np_version_under1p7: - df1 = df.copy() - df1['D'] = pd.to_timedelta(['00:00:01', '00:00:02', '00:00:03', - '00:00:04', '00:00:05', '00:00:06', - '00:00:07']) - result = df1.groupby('A').apply(f)[['D']] - e = df1.groupby('A').first()[['D']] - e.loc['Pony'] = np.nan - print(type(result)) - print(type(e)) - assert_frame_equal(result, e) - - def f(grp): - if grp.name == 'Pony': - return None - return grp.iloc[0].loc['D'] - result = df1.groupby('A').apply(f)['D'] - e = df1.groupby('A').first()['D'].copy() - e.loc['Pony'] = np.nan - e.name = None - assert_series_equal(result, e) + df1 = df.copy() + df1['D'] = pd.to_timedelta(['00:00:01', '00:00:02', '00:00:03', + '00:00:04', '00:00:05', '00:00:06', + '00:00:07']) + result = df1.groupby('A').apply(f)[['D']] + e = df1.groupby('A').first()[['D']] + e.loc['Pony'] = np.nan + print(type(result)) + print(type(e)) + assert_frame_equal(result, e) + + def f(grp): + if grp.name == 'Pony': + return None + return grp.iloc[0].loc['D'] + result = df1.groupby('A').apply(f)['D'] + e = df1.groupby('A').first()['D'].copy() + e.loc['Pony'] = np.nan + e.name = None + assert_series_equal(result, e) def test_agg_api(self): @@ -4395,7 +4389,6 @@ def test_index_label_overlaps_location(self): assert_series_equal(actual, expected) def test_groupby_methods_on_timedelta64(self): - _skip_if_np_version_under1p7() df = self.df.copy().iloc[:4] df['E'] = pd.to_timedelta(['00:00:01', '00:00:02', '00:00:03', '00:00:04']) # DataFrameGroupBy