From 717afb4c394fdf09778234c7f0369eaf0b45be58 Mon Sep 17 00:00:00 2001
From: Matt Roeschke
Date: Mon, 16 Jan 2017 13:32:50 -0800
Subject: [PATCH] BUG: cummin/cummax nans inf values for int64

migrate accum into cython

move datetime check into cython

mask int64 inf values before entering cython functions

Edit whatsnew and alter condition

add missing methods

edit space

Linting & whatsnew edit
---
 doc/source/whatsnew/v0.20.0.txt        |  2 +-
 pandas/core/groupby.py                 | 35 +++++++++++++++-----------
 pandas/src/algos_groupby_helper.pxi.in | 22 ++++++++++------
 pandas/tests/groupby/test_groupby.py   | 30 ++++++++--------------
 4 files changed, 47 insertions(+), 42 deletions(-)

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 9895afdc4c7b5..9c4bb0980b569 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -365,7 +365,7 @@ Performance Improvements
 
 - Increased performance of ``pd.factorize()`` by releasing the GIL with ``object`` dtype when inferred as strings (:issue:`14859`)
 - Improved performance of timeseries plotting with an irregular DatetimeIndex (or with ``compat_x=True``) (:issue:`15073`).
-- Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`)
+- Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`, :issue:`15109`)
 
 - When reading buffer object in ``read_sas()`` method without specified format, filepath string is inferred rather than buffer object.
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index e4edbcacd5b0a..99220232114ce 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -18,13 +18,13 @@
                                  is_timedelta64_dtype, is_datetime64_dtype,
                                  is_categorical_dtype,
                                  is_datetimelike,
-                                 is_datetime_or_timedelta_dtype,
                                  is_datetime64_any_dtype,
                                  is_bool, is_integer_dtype,
                                  is_complex_dtype,
                                  is_bool_dtype,
                                  is_scalar,
                                  is_list_like,
+                                 needs_i8_conversion,
                                  _ensure_float64,
                                  _ensure_platform_int,
                                  _ensure_int64,
@@ -1876,15 +1876,21 @@ def _cython_operation(self, kind, values, how, axis):
                                       "supported for the 'how' argument")
 
         out_shape = (self.ngroups,) + values.shape[1:]
+        is_datetimelike = needs_i8_conversion(values.dtype)
         is_numeric = is_numeric_dtype(values.dtype)
 
-        if is_datetime_or_timedelta_dtype(values.dtype):
+        if is_datetimelike:
             values = values.view('int64')
             is_numeric = True
         elif is_bool_dtype(values.dtype):
             values = _ensure_float64(values)
         elif is_integer_dtype(values):
-            values = values.astype('int64', copy=False)
+            # we use iNaT for the missing value on ints
+            # so pre-convert to guard this condition
+            if (values == tslib.iNaT).any():
+                values = _ensure_float64(values)
+            else:
+                values = values.astype('int64', copy=False)
         elif is_numeric and not is_complex_dtype(values):
             values = _ensure_float64(values)
         else:
@@ -1913,20 +1919,20 @@ def _cython_operation(self, kind, values, how, axis):
                                  fill_value=np.nan)
             counts = np.zeros(self.ngroups, dtype=np.int64)
             result = self._aggregate(
-                result, counts, values, labels, func, is_numeric)
+                result, counts, values, labels, func, is_numeric,
+                is_datetimelike)
         elif kind == 'transform':
             result = _maybe_fill(np.empty_like(values, dtype=out_dtype),
                                  fill_value=np.nan)
 
-            # temporary storange for running-total type tranforms
-            accum = np.empty(out_shape, dtype=out_dtype)
             result = self._transform(
-                result, accum, values, labels, func, is_numeric)
+                result, values, labels, func, is_numeric, is_datetimelike)
 
         if is_integer_dtype(result):
-            if len(result[result == tslib.iNaT]) > 0:
+            mask = result == tslib.iNaT
+            if mask.any():
                 result = result.astype('float64')
-                result[result == tslib.iNaT] = np.nan
+                result[mask] = np.nan
 
         if kind == 'aggregate' and \
                 self._filter_empty_groups and not counts.all():
@@ -1962,7 +1968,7 @@ def transform(self, values, how, axis=0):
         return self._cython_operation('transform', values, how, axis)
 
     def _aggregate(self, result, counts, values, comp_ids, agg_func,
-                   is_numeric):
+                   is_numeric, is_datetimelike):
         if values.ndim > 3:
             # punting for now
             raise NotImplementedError("number of dimensions is currently "
@@ -1977,8 +1983,9 @@ def _aggregate(self, result, counts, values, comp_ids, agg_func,
 
         return result
 
-    def _transform(self, result, accum, values, comp_ids, transform_func,
-                   is_numeric):
+    def _transform(self, result, values, comp_ids, transform_func,
+                   is_numeric, is_datetimelike):
+
         comp_ids, _, ngroups = self.group_info
         if values.ndim > 3:
             # punting for now
@@ -1989,9 +1996,9 @@ def _transform(self, result, accum, values, comp_ids, transform_func,
                 chunk = chunk.squeeze()
 
                 transform_func(result[:, :, i], values,
-                               comp_ids, accum)
+                               comp_ids, is_datetimelike)
         else:
-            transform_func(result, values, comp_ids, accum)
+            transform_func(result, values, comp_ids, is_datetimelike)
 
         return result
diff --git a/pandas/src/algos_groupby_helper.pxi.in b/pandas/src/algos_groupby_helper.pxi.in
index 4b031afed3d43..fda1e51bd2b1f 100644
--- a/pandas/src/algos_groupby_helper.pxi.in
+++ b/pandas/src/algos_groupby_helper.pxi.in
@@ -574,16 +574,18 @@ def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
 def group_cummin_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                           ndarray[{{dest_type2}}, ndim=2] values,
                           ndarray[int64_t] labels,
-                          ndarray[{{dest_type2}}, ndim=2] accum):
+                          bint is_datetimelike):
     """
    Only transforms on axis=0
    """
    cdef:
        Py_ssize_t i, j, N, K, size
        {{dest_type2}} val, min_val = 0
+        ndarray[{{dest_type2}}, ndim=2] accum
        int64_t lab
 
    N, K = (<object> values).shape
+    accum = np.empty_like(values)
    accum.fill({{inf_val}})
 
    with nogil:
@@ -600,7 +602,7 @@ def group_cummin_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                         accum[lab, j] = min_val
                     out[i, j] = accum[lab, j]
                 # val = nan
-                else:
+                elif is_datetimelike:
                     out[i, j] = {{nan_val}}
 
 
@@ -609,16 +611,18 @@ def group_cummin_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
 def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                           ndarray[{{dest_type2}}, ndim=2] values,
                           ndarray[int64_t] labels,
-                          ndarray[{{dest_type2}}, ndim=2] accum):
+                          bint is_datetimelike):
     """
    Only transforms on axis=0
    """
    cdef:
        Py_ssize_t i, j, N, K, size
        {{dest_type2}} val, max_val = 0
+        ndarray[{{dest_type2}}, ndim=2] accum
        int64_t lab
 
    N, K = (<object> values).shape
+    accum = np.empty_like(values)
    accum.fill(-{{inf_val}})
 
    with nogil:
@@ -635,7 +639,7 @@ def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                         accum[lab, j] = max_val
                     out[i, j] = accum[lab, j]
                 # val = nan
-                else:
+                elif is_datetimelike:
                     out[i, j] = {{nan_val}}
 
 {{endfor}}
@@ -682,17 +686,18 @@ def group_median_float64(ndarray[float64_t, ndim=2] out,
 def group_cumprod_float64(float64_t[:, :] out,
                           float64_t[:, :] values,
                           int64_t[:] labels,
-                          float64_t[:, :] accum):
+                          bint is_datetimelike):
     """
     Only transforms on axis=0
     """
     cdef:
         Py_ssize_t i, j, N, K, size
         float64_t val
+        float64_t[:, :] accum
         int64_t lab
 
     N, K = (<object> values).shape
-    accum = np.ones_like(accum)
+    accum = np.ones_like(values)
 
     with nogil:
         for i in range(N):
@@ -712,17 +717,18 @@ def group_cumprod_float64(float64_t[:, :] out,
 def group_cumsum(numeric[:, :] out,
                  numeric[:, :] values,
                  int64_t[:] labels,
-                 numeric[:, :] accum):
+                 is_datetimelike):
     """
     Only transforms on axis=0
     """
     cdef:
         Py_ssize_t i, j, N, K, size
         numeric val
+        numeric[:, :] accum
         int64_t lab
 
     N, K = (<object> values).shape
-    accum = np.zeros_like(accum)
+    accum = np.zeros_like(values)
 
     with nogil:
         for i in range(N):
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 68689189a92b2..ffb6025163a6b 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -5507,13 +5507,13 @@ def test_cython_group_transform_algos(self):
         ops = [(pd.algos.group_cumprod_float64, np.cumproduct, [np.float64]),
                (pd.algos.group_cumsum, np.cumsum, dtypes)]
 
+        is_datetimelike = False
         for pd_op, np_op, dtypes in ops:
             for dtype in dtypes:
                 data = np.array([[1], [2], [3], [4]], dtype=dtype)
                 ans = np.zeros_like(data)
-                accum = np.array([[0]], dtype=dtype)
                 labels = np.array([0, 0, 0, 0], dtype=np.int64)
-                pd_op(ans, data, labels, accum)
+                pd_op(ans, data, labels, is_datetimelike)
                 self.assert_numpy_array_equal(np_op(data), ans[:, 0],
                                               check_dtype=False)
 
@@ -5521,25 +5521,24 @@ def test_cython_group_transform_algos(self):
         labels = np.array([0, 0, 0, 0, 0], dtype=np.int64)
 
         data = np.array([[1], [2], [3], [np.nan], [4]], dtype='float64')
-        accum = np.array([[0.0]])
         actual = np.zeros_like(data)
         actual.fill(np.nan)
-        pd.algos.group_cumprod_float64(actual, data, labels, accum)
+        pd.algos.group_cumprod_float64(actual, data, labels, is_datetimelike)
         expected = np.array([1, 2, 6, np.nan, 24], dtype='float64')
         self.assert_numpy_array_equal(actual[:, 0], expected)
 
-        accum = np.array([[0.0]])
         actual = np.zeros_like(data)
         actual.fill(np.nan)
-        pd.algos.group_cumsum(actual, data, labels, accum)
+        pd.algos.group_cumsum(actual, data, labels, is_datetimelike)
         expected = np.array([1, 3, 6, np.nan, 10], dtype='float64')
         self.assert_numpy_array_equal(actual[:, 0], expected)
 
         # timedelta
+        is_datetimelike = True
         data = np.array([np.timedelta64(1, 'ns')] * 5, dtype='m8[ns]')[:, None]
-        accum = np.array([[0]], dtype='int64')
         actual = np.zeros_like(data, dtype='int64')
-        pd.algos.group_cumsum(actual, data.view('int64'), labels, accum)
+        pd.algos.group_cumsum(actual, data.view('int64'), labels,
+                              is_datetimelike)
         expected = np.array([np.timedelta64(1, 'ns'), np.timedelta64(
             2, 'ns'), np.timedelta64(3, 'ns'), np.timedelta64(4, 'ns'),
             np.timedelta64(5, 'ns')])
@@ -5965,12 +5964,9 @@ def test_cummin_cummax(self):
             df.loc[[2, 6], 'B'] = min_val
             expected.loc[[2, 3, 6, 7], 'B'] = min_val
             result = df.groupby('A').cummin()
-
-            # TODO: GH 15019
-            # overwriting NaNs
-            # tm.assert_frame_equal(result, expected)
+            tm.assert_frame_equal(result, expected)
             expected = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame()
-            # tm.assert_frame_equal(result, expected)
+            tm.assert_frame_equal(result, expected)
 
             # cummax
             expected = pd.DataFrame({'B': expected_maxs}).astype(dtype)
@@ -5983,13 +5979,9 @@ def test_cummin_cummax(self):
             df.loc[[2, 6], 'B'] = max_val
             expected.loc[[2, 3, 6, 7], 'B'] = max_val
             result = df.groupby('A').cummax()
-
-            # TODO: GH 15019
-            # overwriting NaNs
-            # tm.assert_frame_equal(result, expected)
-
+            tm.assert_frame_equal(result, expected)
             expected = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame()
-            # tm.assert_frame_equal(result, expected)
+            tm.assert_frame_equal(result, expected)
 
             # Test nan in some values
             base_df.loc[[0, 2, 4, 6], 'B'] = np.nan
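
A minimal, hypothetical repro sketch of the case this change guards; it is
not part of the patch, and ``sentinel`` and the tiny frame below are made
up for illustration. ``np.iinfo('int64').min`` has the same bit pattern as
``tslib.iNaT``, the sentinel pandas uses for missing datetime-like values,
so the new guard in ``_cython_operation`` pre-converts such int64 input to
float64 before the Cython cummin/cummax kernels run, rather than letting
the iNaT post-processing turn a legitimate integer into NaN:

    import numpy as np
    import pandas as pd

    # Same bit pattern as tslib.iNaT, pandas' missing-datetime sentinel.
    sentinel = np.iinfo(np.int64).min

    df = pd.DataFrame({'A': [1, 1, 2, 2],
                       'B': [3, sentinel, 5, 4]}, dtype='int64')

    # With this patch applied, 'B' is routed through float64 (the
    # sentinel is exactly representable there, being -2**63) and the
    # running min/max keep the value instead of reporting NaN.
    print(df.groupby('A').cummin())
    print(df.groupby('A').cummax())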