From 717afb4c394fdf09778234c7f0369eaf0b45be58 Mon Sep 17 00:00:00 2001
From: Matt Roeschke
Date: Mon, 16 Jan 2017 13:32:50 -0800
Subject: [PATCH] BUG: cummin/cummax nans inf values for int64

migrate accum into cython

move datetime check into cython

mask int64 inf values before entering cython functions

Edit whatsnew and alter condition

add missing methods

edit space

Linting & whatsnew edit
---
 doc/source/whatsnew/v0.20.0.txt        |  2 +-
 pandas/core/groupby.py                 | 35 +++++++++++++++-----------
 pandas/src/algos_groupby_helper.pxi.in | 22 ++++++++++------
 pandas/tests/groupby/test_groupby.py   | 30 ++++++++--------------
 4 files changed, 47 insertions(+), 42 deletions(-)

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 9895afdc4c7b5..9c4bb0980b569 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -365,7 +365,7 @@ Performance Improvements
 
 - Increased performance of ``pd.factorize()`` by releasing the GIL with ``object`` dtype when inferred as strings (:issue:`14859`)
 - Improved performance of timeseries plotting with an irregular DatetimeIndex (or with ``compat_x=True``) (:issue:`15073`).
-- Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`)
+- Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`, :issue:`15109`)
 
 - When reading buffer object in ``read_sas()`` method without specified format, filepath string is inferred rather than buffer object.
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index e4edbcacd5b0a..99220232114ce 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -18,13 +18,13 @@
                                  is_timedelta64_dtype, is_datetime64_dtype,
                                  is_categorical_dtype,
                                  is_datetimelike,
-                                 is_datetime_or_timedelta_dtype,
                                  is_datetime64_any_dtype,
                                  is_bool, is_integer_dtype,
                                  is_complex_dtype,
                                  is_bool_dtype,
                                  is_scalar,
                                  is_list_like,
+                                 needs_i8_conversion,
                                  _ensure_float64,
                                  _ensure_platform_int,
                                  _ensure_int64,
@@ -1876,15 +1876,21 @@ def _cython_operation(self, kind, values, how, axis):
                                       "supported for the 'how' argument")
 
         out_shape = (self.ngroups,) + values.shape[1:]
+        is_datetimelike = needs_i8_conversion(values.dtype)
         is_numeric = is_numeric_dtype(values.dtype)
 
-        if is_datetime_or_timedelta_dtype(values.dtype):
+        if is_datetimelike:
             values = values.view('int64')
             is_numeric = True
         elif is_bool_dtype(values.dtype):
             values = _ensure_float64(values)
         elif is_integer_dtype(values):
-            values = values.astype('int64', copy=False)
+            # we use iNaT for the missing value on ints
+            # so pre-convert to guard this condition
+            if (values == tslib.iNaT).any():
+                values = _ensure_float64(values)
+            else:
+                values = values.astype('int64', copy=False)
         elif is_numeric and not is_complex_dtype(values):
             values = _ensure_float64(values)
         else:
@@ -1913,20 +1919,20 @@ def _cython_operation(self, kind, values, how, axis):
                                  fill_value=np.nan)
             counts = np.zeros(self.ngroups, dtype=np.int64)
             result = self._aggregate(
-                result, counts, values, labels, func, is_numeric)
+                result, counts, values, labels, func, is_numeric,
+                is_datetimelike)
         elif kind == 'transform':
             result = _maybe_fill(np.empty_like(values, dtype=out_dtype),
                                  fill_value=np.nan)
 
-            # temporary storange for running-total type tranforms
-            accum = np.empty(out_shape, dtype=out_dtype)
             result = self._transform(
-                result, accum, values, labels, func, is_numeric)
+                result, values, labels, func, is_numeric, is_datetimelike)
 
         if is_integer_dtype(result):
-            if len(result[result == tslib.iNaT]) > 0:
+            mask = result == tslib.iNaT
+            if mask.any():
                 result = result.astype('float64')
-                result[result == tslib.iNaT] = np.nan
+                result[mask] = np.nan
 
         if kind == 'aggregate' and \
                 self._filter_empty_groups and not counts.all():
@@ -1962,7 +1968,7 @@ def transform(self, values, how, axis=0):
         return self._cython_operation('transform', values, how, axis)
 
     def _aggregate(self, result, counts, values, comp_ids, agg_func,
-                   is_numeric):
+                   is_numeric, is_datetimelike):
         if values.ndim > 3:
             # punting for now
             raise NotImplementedError("number of dimensions is currently "
@@ -1977,8 +1983,9 @@ def _aggregate(self, result, counts, values, comp_ids, agg_func,
 
         return result
 
-    def _transform(self, result, accum, values, comp_ids, transform_func,
-                   is_numeric):
+    def _transform(self, result, values, comp_ids, transform_func,
+                   is_numeric, is_datetimelike):
+
         comp_ids, _, ngroups = self.group_info
         if values.ndim > 3:
             # punting for now
@@ -1989,9 +1996,9 @@ def _transform(self, result, accum, values, comp_ids, transform_func,
                 chunk = chunk.squeeze()
 
                 transform_func(result[:, :, i], values,
-                               comp_ids, accum)
+                               comp_ids, is_datetimelike)
         else:
-            transform_func(result, values, comp_ids, accum)
+            transform_func(result, values, comp_ids, is_datetimelike)
 
         return result
diff --git a/pandas/src/algos_groupby_helper.pxi.in b/pandas/src/algos_groupby_helper.pxi.in
index 4b031afed3d43..fda1e51bd2b1f 100644
--- a/pandas/src/algos_groupby_helper.pxi.in
+++ b/pandas/src/algos_groupby_helper.pxi.in
@@ -574,16 +574,18 @@ def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
 def group_cummin_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                           ndarray[{{dest_type2}}, ndim=2] values,
                           ndarray[int64_t] labels,
-                          ndarray[{{dest_type2}}, ndim=2] accum):
+                          bint is_datetimelike):
     """
    Only transforms on axis=0
    """
    cdef:
        Py_ssize_t i, j, N, K, size
        {{dest_type2}} val, min_val = 0
+        ndarray[{{dest_type2}}, ndim=2] accum
        int64_t lab
 
    N, K = (<object> values).shape
+    accum = np.empty_like(values)
    accum.fill({{inf_val}})
 
    with nogil:
@@ -600,7 +602,7 @@ def group_cummin_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                         accum[lab, j] = min_val
                     out[i, j] = accum[lab, j]
                 # val = nan
-                else:
+                elif is_datetimelike:
                     out[i, j] = {{nan_val}}
 
 
@@ -609,16 +611,18 @@ def group_cummin_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
 def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                           ndarray[{{dest_type2}}, ndim=2] values,
                           ndarray[int64_t] labels,
-                          ndarray[{{dest_type2}}, ndim=2] accum):
+                          bint is_datetimelike):
     """
    Only transforms on axis=0
    """
    cdef:
        Py_ssize_t i, j, N, K, size
        {{dest_type2}} val, max_val = 0
+        ndarray[{{dest_type2}}, ndim=2] accum
        int64_t lab
 
    N, K = (<object> values).shape
+    accum = np.empty_like(values)
    accum.fill(-{{inf_val}})
 
    with nogil:
@@ -635,7 +639,7 @@ def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                         accum[lab, j] = max_val
                     out[i, j] = accum[lab, j]
                 # val = nan
-                else:
+                elif is_datetimelike:
                     out[i, j] = {{nan_val}}
 
 {{endfor}}
@@ -682,17 +686,18 @@ def group_median_float64(ndarray[float64_t, ndim=2] out,
 def group_cumprod_float64(float64_t[:, :] out,
                           float64_t[:, :] values,
                           int64_t[:] labels,
-                          float64_t[:, :] accum):
+                          bint is_datetimelike):
     """
     Only transforms on axis=0
     """
     cdef:
         Py_ssize_t i, j, N, K, size
         float64_t val
+        float64_t[:, :] accum
         int64_t lab
 
     N, K = (<object> values).shape
-    accum = np.ones_like(accum)
+    accum = np.ones_like(values)
 
     with nogil:
         for i in range(N):
@@ -712,17 +717,18 @@ def group_cumprod_float64(float64_t[:, :] out,
 def group_cumsum(numeric[:, :] out,
                  numeric[:, :] values,
                  int64_t[:] labels,
-                 numeric[:, :] accum):
+                 is_datetimelike):
     """
     Only transforms on axis=0
     """
     cdef:
         Py_ssize_t i, j, N, K, size
         numeric val
+        numeric[:, :] accum
         int64_t lab
 
     N, K = (<object> values).shape
-    accum = np.zeros_like(accum)
+    accum = np.zeros_like(values)
 
     with nogil:
         for i in range(N):
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 68689189a92b2..ffb6025163a6b 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -5507,13 +5507,13 @@ def test_cython_group_transform_algos(self):
         ops = [(pd.algos.group_cumprod_float64, np.cumproduct, [np.float64]),
                (pd.algos.group_cumsum, np.cumsum, dtypes)]
 
+        is_datetimelike = False
         for pd_op, np_op, dtypes in ops:
             for dtype in dtypes:
                 data = np.array([[1], [2], [3], [4]], dtype=dtype)
                 ans = np.zeros_like(data)
-                accum = np.array([[0]], dtype=dtype)
                 labels = np.array([0, 0, 0, 0], dtype=np.int64)
-                pd_op(ans, data, labels, accum)
+                pd_op(ans, data, labels, is_datetimelike)
                 self.assert_numpy_array_equal(np_op(data), ans[:, 0],
                                               check_dtype=False)
 
@@ -5521,25 +5521,24 @@ def test_cython_group_transform_algos(self):
         labels = np.array([0, 0, 0, 0, 0], dtype=np.int64)
 
         data = np.array([[1], [2], [3], [np.nan], [4]], dtype='float64')
-        accum = np.array([[0.0]])
         actual = np.zeros_like(data)
         actual.fill(np.nan)
-        pd.algos.group_cumprod_float64(actual, data, labels, accum)
+        pd.algos.group_cumprod_float64(actual, data, labels, is_datetimelike)
         expected = np.array([1, 2, 6, np.nan, 24], dtype='float64')
         self.assert_numpy_array_equal(actual[:, 0], expected)
 
-        accum = np.array([[0.0]])
         actual = np.zeros_like(data)
         actual.fill(np.nan)
-        pd.algos.group_cumsum(actual, data, labels, accum)
+        pd.algos.group_cumsum(actual, data, labels, is_datetimelike)
         expected = np.array([1, 3, 6, np.nan, 10], dtype='float64')
         self.assert_numpy_array_equal(actual[:, 0], expected)
 
         # timedelta
+        is_datetimelike = True
         data = np.array([np.timedelta64(1, 'ns')] * 5, dtype='m8[ns]')[:, None]
-        accum = np.array([[0]], dtype='int64')
         actual = np.zeros_like(data, dtype='int64')
-        pd.algos.group_cumsum(actual, data.view('int64'), labels, accum)
+        pd.algos.group_cumsum(actual, data.view('int64'), labels,
+                              is_datetimelike)
         expected = np.array([np.timedelta64(1, 'ns'), np.timedelta64(
             2, 'ns'), np.timedelta64(3, 'ns'), np.timedelta64(4, 'ns'),
             np.timedelta64(5, 'ns')])
@@ -5965,12 +5964,9 @@ def test_cummin_cummax(self):
             df.loc[[2, 6], 'B'] = min_val
             expected.loc[[2, 3, 6, 7], 'B'] = min_val
             result = df.groupby('A').cummin()
-
-            # TODO: GH 15019
-            # overwriting NaNs
-            # tm.assert_frame_equal(result, expected)
+            tm.assert_frame_equal(result, expected)
             expected = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame()
-            # tm.assert_frame_equal(result, expected)
+            tm.assert_frame_equal(result, expected)
 
             # cummax
             expected = pd.DataFrame({'B': expected_maxs}).astype(dtype)
@@ -5983,13 +5979,9 @@ def test_cummin_cummax(self):
             df.loc[[2, 6], 'B'] = max_val
             expected.loc[[2, 3, 6, 7], 'B'] = max_val
             result = df.groupby('A').cummax()
-
-            # TODO: GH 15019
-            # overwriting NaNs
-            # tm.assert_frame_equal(result, expected)
-
+            tm.assert_frame_equal(result, expected)
             expected = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame()
-            # tm.assert_frame_equal(result, expected)
+            tm.assert_frame_equal(result, expected)
 
             # Test nan in some values
             base_df.loc[[0, 2, 4, 6], 'B'] = np.nan
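
A minimal, hypothetical repro sketch of the case this change guards; it is
not part of the patch, and ``sentinel`` and the tiny frame below are made
up for illustration. ``np.iinfo('int64').min`` has the same bit pattern as
``tslib.iNaT``, the sentinel pandas uses for missing datetime-like values,
so the new guard in ``_cython_operation`` pre-converts such int64 input to
float64 before the Cython cummin/cummax kernels run, rather than letting
the iNaT post-processing turn a legitimate integer into NaN:

    import numpy as np
    import pandas as pd

    # Same bit pattern as tslib.iNaT, pandas' missing-datetime sentinel.
    sentinel = np.iinfo(np.int64).min

    df = pd.DataFrame({'A': [1, 1, 2, 2],
                       'B': [3, sentinel, 5, 4]}, dtype='int64')

    # With this patch applied, 'B' is routed through float64 (the
    # sentinel is exactly representable there, being -2**63) and the
    # running min/max keep the value instead of reporting NaN.
    print(df.groupby('A').cummin())
    print(df.groupby('A').cummax())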