Add tests for the inf and NaN fixes in groupby cumsum and sum

CloseChoice · CloseChoice · commit c6b104714819 · 2021-11-19T17:58:54.000+01:00
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
@@ -240,33 +240,39 @@ def group_cumsum(numeric_t[:, ::1] out,
     accum = np.zeros((ngroups, K), dtype=np.asarray(values).dtype)
     compensation = np.zeros((ngroups, K), dtype=np.asarray(values).dtype)
 
-    with nogil:
-        for i in range(N):
-            lab = labels[i]
+    for i in range(N):
+        lab = labels[i]
 
-            if lab < 0:
-                continue
-            for j in range(K):
-                val = values[i, j]
+        if lab < 0:
+            continue
+        for j in range(K):
+            val = values[i, j]
 
-                # For floats, use Kahan summation to reduce floating-point
-                # error (https://en.wikipedia.org/wiki/Kahan_summation_algorithm)
-                if numeric_t == float32_t or numeric_t == float64_t:
-                    if val == val:
-                        y = val - compensation[lab, j]
-                        t = accum[lab, j] + y
-                        compensation[lab, j] = t - accum[lab, j] - y
-                        accum[lab, j] = t
-                        out[i, j] = t
-                    else:
-                        out[i, j] = NaN
-                        if not skipna:
-                            accum[lab, j] = NaN
-                            break
-                else:
-                    t = val + accum[lab, j]
+            # For floats, use Kahan summation to reduce floating-point
+            # error (https://en.wikipedia.org/wiki/Kahan_summation_algorithm)
+            if numeric_t == float32_t or numeric_t == float64_t:
+                if np.isinf(val):
+                    out[i, j] = val
+                    accum[lab, j] = val
+                    break
+                elif val == val:
+                    y = val - compensation[lab, j]
+                    t = accum[lab, j] + y
+                    compensation[lab, j] = t - accum[lab, j] - y
                     accum[lab, j] = t
                     out[i, j] = t
+                    if np.isinf(t):
+                        compensation[lab, j] = 0
+                        break
+                else:
+                    out[i, j] = NaN
+                    if not skipna:
+                        accum[lab, j] = NaN
+                        break
+            else:
+                t = val + accum[lab, j]
+                accum[lab, j] = t
+                out[i, j] = t
 
 
 @cython.boundscheck(False)
@@ -517,7 +523,6 @@ def group_add(add_t[:, ::1] out,
     N, K = (<object>values).shape
 
     if add_t is object:
-        print('IN GROUPADD: Val')
         # NB: this does not use 'compensation' like the non-object track does.
         for i in range(N):
             lab = labels[i]
@@ -547,46 +552,34 @@ def group_add(add_t[:, ::1] out,
                 else:
                     out[i, j] = sumx[i, j]
     else:
-        # print('IN GROUPADD wihtout gil: Val. THIS IS N ' + str(N))
         for i in range(N):
             lab = labels[i]
-            # print('IN GROUPADD without gil: lab ' + str(lab) + ' WITH MINCOUNT ' + str(min_count))
             if lab < 0:
                 continue
 
             counts[lab] += 1
             for j in range(K):
                 val = values[i, j]
 
-                # not nan
-                # print('this is val ' + str(val))
-                # print('this is val == val ' + str(val == val))
+                # TODO: np.isinf cannot be called without the GIL; find a nogil
+                #  alternative so the whole block can release the GIL again
                 if np.isinf(val):
-                    # print('val is INF or nan')
                     sumx[lab, j] = val
                     break
-                elif val == val:
+                # not nan
+                if val == val:
                     nobs[lab, j] += 1
-                    # print('before adding val ' + str(val))
                     y = val - compensation[lab, j]
                     t = sumx[lab, j] + y
                     compensation[lab, j] = t - sumx[lab, j] - y
                     sumx[lab, j] = t
-                    # print('after adding val ' + str(sumx[lab, j]))
-                # val is nan
-                else:
-                    sumx[lab, j] = val
 
-        # print('ncounts ' + str(ncounts))
         for i in range(ncounts):
             for j in range(K):
                 if nobs[i, j] < min_count:
-                    # print('CATCH CASE WHERE nan is given, MIN_COUNT: ' + str(min_count) + ' nobs[i, j] ' + str(nobs[i, j]))
                     out[i, j] = NAN
                 else:
-                    # print('THIS IS OUT in ELSE CASE ' + str(sumx[i, j]))
                     out[i, j] = sumx[i, j]
-                # print('THIS IS OUT ' + str(out[i, j]))
 
 
 @cython.wraparound(False)
diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx
@@ -87,7 +87,7 @@ cdef inline float64_t calc_sum(int64_t minp, int64_t nobs, float64_t sum_x) nogi
 
 
 cdef inline void add_sum(float64_t val, int64_t *nobs, float64_t *sum_x,
-                         float64_t *compensation) nogil:
+                         float64_t *compensation):
     """ add a value from the sum calc using Kahan summation """
 
     cdef:
@@ -100,10 +100,14 @@ cdef inline void add_sum(float64_t val, int64_t *nobs, float64_t *sum_x,
         t = sum_x[0] + y
         compensation[0] = t - sum_x[0] - y
         sum_x[0] = t
+    if np.isinf(val):
+        sum_x[0] = val
+        nobs[0] = nobs[0] + 1
+        compensation[0] = 0
 
 
 cdef inline void remove_sum(float64_t val, int64_t *nobs, float64_t *sum_x,
-                            float64_t *compensation) nogil:
+                            float64_t *compensation):
     """ remove a value from the sum calc using Kahan summation """
 
     cdef:
@@ -116,6 +120,10 @@ cdef inline void remove_sum(float64_t val, int64_t *nobs, float64_t *sum_x,
         t = sum_x[0] + y
         compensation[0] = t - sum_x[0] - y
         sum_x[0] = t
+    if np.isinf(val):
+        sum_x[0] = val
+        nobs[0] = nobs[0] - 1
+        compensation[0] = 0
 
 
 def roll_sum(const float64_t[:] values, ndarray[int64_t] start,
@@ -133,35 +141,32 @@ def roll_sum(const float64_t[:] values, ndarray[int64_t] start,
     )
     output = np.empty(N, dtype=np.float64)
 
-    with nogil:
-
-        for i in range(0, N):
-            s = start[i]
-            e = end[i]
-
-            if i == 0 or not is_monotonic_increasing_bounds:
-
-                # setup
+    for i in range(0, N):
+        s = start[i]
+        e = end[i]
 
-                for j in range(s, e):
-                    add_sum(values[j], &nobs, &sum_x, &compensation_add)
+        if i == 0 or not is_monotonic_increasing_bounds:
 
-            else:
+            # setup
+            for j in range(s, e):
+                print('go into add_sum, values[j] ' + str(values[j]))
+                add_sum(values[j], &nobs, &sum_x, &compensation_add)
 
-                # calculate deletes
-                for j in range(start[i - 1], s):
-                    remove_sum(values[j], &nobs, &sum_x, &compensation_remove)
+        else:
+            # calculate deletes
+            for j in range(start[i - 1], s):
+                remove_sum(values[j], &nobs, &sum_x, &compensation_remove)
 
-                # calculate adds
-                for j in range(end[i - 1], e):
-                    add_sum(values[j], &nobs, &sum_x, &compensation_add)
+            # calculate adds
+            for j in range(end[i - 1], e):
+                add_sum(values[j], &nobs, &sum_x, &compensation_add)
 
-            output[i] = calc_sum(minp, nobs, sum_x)
+        output[i] = calc_sum(minp, nobs, sum_x)
 
-            if not is_monotonic_increasing_bounds:
-                nobs = 0
-                sum_x = 0.0
-                compensation_remove = 0.0
+        if not is_monotonic_increasing_bounds:
+            nobs = 0
+            sum_x = 0.0
+            compensation_remove = 0.0
 
     return output
 
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -1490,7 +1490,6 @@ def _agg_general(
 
         with self._group_selection_context():
             # try a cython aggregation if we can
-            # #import pdb; pdb.set_trace()
             result = self._cython_agg_general(
                 how=alias,
                 alt=npfunc,
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
@@ -514,7 +514,6 @@ def _call_cython_op(
                 result_mask = result_mask.T
 
         out_shape = self._get_output_shape(ngroups, values)
-        #import pdb; pdb.set_trace()
         func, values = self.get_cython_func_and_vals(values, is_numeric)
         out_dtype = self.get_out_dtype(values.dtype)
 
@@ -926,7 +925,6 @@ def _cython_operation(
 
         ids, _, _ = self.group_info
         ngroups = self.ngroups
-        #import pdb; pdb.set_trace()
         return cy_op.cython_operation(
             values=values,
             axis=axis,
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
@@ -1259,7 +1259,6 @@ def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T:
         dropped_any = False
 
         for blk in self.blocks:
-            #import pdb; pdb.set_trace()
             if blk.is_object:
                 # split on object-dtype blocks bc some columns may raise
                 #  while others do not.
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
@@ -58,7 +58,6 @@ def test_intercept_builtin_sum():
     result = grouped.agg(builtins.sum)
     result2 = grouped.apply(builtins.sum)
     expected = grouped.sum()
-    import pdb; pdb.set_trace()
     tm.assert_series_equal(result, expected)
     tm.assert_series_equal(result2, expected)
 
@@ -1163,3 +1162,22 @@ def test_mean_on_timedelta():
         pd.to_timedelta([4, 5]), name="time", index=Index(["A", "B"], name="cat")
     )
     tm.assert_series_equal(result, expected)
+
+
+def test_sum_with_nan_inf():
+    df = DataFrame(
+        {"a": ["hello", "hello", "world", "world"], "b": [np.inf, 10, np.nan, 10]}
+    )
+    gb = df.groupby("a")
+    result = gb.sum()
+    expected = DataFrame(
+        [np.inf, 10], index=Index(["hello", "world"], name="a"), columns=["b"]
+    )
+    tm.assert_frame_equal(result, expected)
+
+def test_cumsum_inf():
+    ser = Series([np.inf, 1, 1])
+
+    result = ser.groupby([1, 1, 1]).cumsum()
+    expected = Series([np.inf, np.inf, np.inf])
+    tm.assert_series_equal(result, expected)