Skip to content

Commit c6b1047

Browse files
committed
add tests for inf and nan fixes on cumsum and sum
1 parent 344d53f commit c6b1047

File tree

6 files changed

+82
-70
lines changed

6 files changed

+82
-70
lines changed

pandas/_libs/groupby.pyx

Lines changed: 33 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -240,33 +240,39 @@ def group_cumsum(numeric_t[:, ::1] out,
240240
accum = np.zeros((ngroups, K), dtype=np.asarray(values).dtype)
241241
compensation = np.zeros((ngroups, K), dtype=np.asarray(values).dtype)
242242

243-
with nogil:
244-
for i in range(N):
245-
lab = labels[i]
243+
for i in range(N):
244+
lab = labels[i]
246245

247-
if lab < 0:
248-
continue
249-
for j in range(K):
250-
val = values[i, j]
246+
if lab < 0:
247+
continue
248+
for j in range(K):
249+
val = values[i, j]
251250

252-
# For floats, use Kahan summation to reduce floating-point
253-
# error (https://en.wikipedia.org/wiki/Kahan_summation_algorithm)
254-
if numeric_t == float32_t or numeric_t == float64_t:
255-
if val == val:
256-
y = val - compensation[lab, j]
257-
t = accum[lab, j] + y
258-
compensation[lab, j] = t - accum[lab, j] - y
259-
accum[lab, j] = t
260-
out[i, j] = t
261-
else:
262-
out[i, j] = NaN
263-
if not skipna:
264-
accum[lab, j] = NaN
265-
break
266-
else:
267-
t = val + accum[lab, j]
251+
# For floats, use Kahan summation to reduce floating-point
252+
# error (https://en.wikipedia.org/wiki/Kahan_summation_algorithm)
253+
if numeric_t == float32_t or numeric_t == float64_t:
254+
if np.isinf(val):
255+
out[i, j] = val
256+
accum[lab, j] = val
257+
break
258+
elif val == val:
259+
y = val - compensation[lab, j]
260+
t = accum[lab, j] + y
261+
compensation[lab, j] = t - accum[lab, j] - y
268262
accum[lab, j] = t
269263
out[i, j] = t
264+
if np.isinf(t):
265+
compensation[lab, j] = 0
266+
break
267+
else:
268+
out[i, j] = NaN
269+
if not skipna:
270+
accum[lab, j] = NaN
271+
break
272+
else:
273+
t = val + accum[lab, j]
274+
accum[lab, j] = t
275+
out[i, j] = t
270276

271277

272278
@cython.boundscheck(False)
@@ -517,7 +523,6 @@ def group_add(add_t[:, ::1] out,
517523
N, K = (<object>values).shape
518524

519525
if add_t is object:
520-
print('IN GROUPADD: Val')
521526
# NB: this does not use 'compensation' like the non-object track does.
522527
for i in range(N):
523528
lab = labels[i]
@@ -547,46 +552,34 @@ def group_add(add_t[:, ::1] out,
547552
else:
548553
out[i, j] = sumx[i, j]
549554
else:
550-
# print('IN GROUPADD wihtout gil: Val. THIS IS N ' + str(N))
551555
for i in range(N):
552556
lab = labels[i]
553-
# print('IN GROUPADD without gil: lab ' + str(lab) + ' WITH MINCOUNT ' + str(min_count))
554557
if lab < 0:
555558
continue
556559

557560
counts[lab] += 1
558561
for j in range(K):
559562
val = values[i, j]
560563

561-
# not nan
562-
# print('this is val ' + str(val))
563-
# print('this is val == val ' + str(val == val))
564+
# TODO: np.isinf cannot be called without the GIL; find a nogil-safe
565+
# alternative so the whole block can run with the GIL released
564566
if np.isinf(val):
565-
# print('val is INF or nan')
566567
sumx[lab, j] = val
567568
break
568-
elif val == val:
569+
# not nan
570+
if val == val:
569571
nobs[lab, j] += 1
570-
# print('before adding val ' + str(val))
571572
y = val - compensation[lab, j]
572573
t = sumx[lab, j] + y
573574
compensation[lab, j] = t - sumx[lab, j] - y
574575
sumx[lab, j] = t
575-
# print('after adding val ' + str(sumx[lab, j]))
576-
# val is nan
577-
else:
578-
sumx[lab, j] = val
579576

580-
# print('ncounts ' + str(ncounts))
581577
for i in range(ncounts):
582578
for j in range(K):
583579
if nobs[i, j] < min_count:
584-
# print('CATCH CASE WHERE nan is given, MIN_COUNT: ' + str(min_count) + ' nobs[i, j] ' + str(nobs[i, j]))
585580
out[i, j] = NAN
586581
else:
587-
# print('THIS IS OUT in ELSE CASE ' + str(sumx[i, j]))
588582
out[i, j] = sumx[i, j]
589-
# print('THIS IS OUT ' + str(out[i, j]))
590583

591584

592585
@cython.wraparound(False)

pandas/_libs/window/aggregations.pyx

Lines changed: 30 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ cdef inline float64_t calc_sum(int64_t minp, int64_t nobs, float64_t sum_x) nogi
8787

8888

8989
cdef inline void add_sum(float64_t val, int64_t *nobs, float64_t *sum_x,
90-
float64_t *compensation) nogil:
90+
float64_t *compensation):
9191
""" add a value from the sum calc using Kahan summation """
9292

9393
cdef:
@@ -100,10 +100,14 @@ cdef inline void add_sum(float64_t val, int64_t *nobs, float64_t *sum_x,
100100
t = sum_x[0] + y
101101
compensation[0] = t - sum_x[0] - y
102102
sum_x[0] = t
103+
if np.isinf(val):
104+
sum_x[0] = val
105+
nobs[0] = nobs[0] + 1
106+
compensation[0] = 0
103107

104108

105109
cdef inline void remove_sum(float64_t val, int64_t *nobs, float64_t *sum_x,
106-
float64_t *compensation) nogil:
110+
float64_t *compensation):
107111
""" remove a value from the sum calc using Kahan summation """
108112

109113
cdef:
@@ -116,6 +120,10 @@ cdef inline void remove_sum(float64_t val, int64_t *nobs, float64_t *sum_x,
116120
t = sum_x[0] + y
117121
compensation[0] = t - sum_x[0] - y
118122
sum_x[0] = t
123+
if np.isinf(val):
124+
sum_x[0] = val
125+
nobs[0] = nobs[0] - 1
126+
compensation[0] = 0
119127

120128

121129
def roll_sum(const float64_t[:] values, ndarray[int64_t] start,
@@ -133,35 +141,32 @@ def roll_sum(const float64_t[:] values, ndarray[int64_t] start,
133141
)
134142
output = np.empty(N, dtype=np.float64)
135143

136-
with nogil:
137-
138-
for i in range(0, N):
139-
s = start[i]
140-
e = end[i]
141-
142-
if i == 0 or not is_monotonic_increasing_bounds:
143-
144-
# setup
144+
for i in range(0, N):
145+
s = start[i]
146+
e = end[i]
145147

146-
for j in range(s, e):
147-
add_sum(values[j], &nobs, &sum_x, &compensation_add)
148+
if i == 0 or not is_monotonic_increasing_bounds:
148149

149-
else:
150+
# setup
151+
for j in range(s, e):
152+
print('go into add_sum, values[j] ' + str(values[j]))
153+
add_sum(values[j], &nobs, &sum_x, &compensation_add)
150154

151-
# calculate deletes
152-
for j in range(start[i - 1], s):
153-
remove_sum(values[j], &nobs, &sum_x, &compensation_remove)
155+
else:
156+
# calculate deletes
157+
for j in range(start[i - 1], s):
158+
remove_sum(values[j], &nobs, &sum_x, &compensation_remove)
154159

155-
# calculate adds
156-
for j in range(end[i - 1], e):
157-
add_sum(values[j], &nobs, &sum_x, &compensation_add)
160+
# calculate adds
161+
for j in range(end[i - 1], e):
162+
add_sum(values[j], &nobs, &sum_x, &compensation_add)
158163

159-
output[i] = calc_sum(minp, nobs, sum_x)
164+
output[i] = calc_sum(minp, nobs, sum_x)
160165

161-
if not is_monotonic_increasing_bounds:
162-
nobs = 0
163-
sum_x = 0.0
164-
compensation_remove = 0.0
166+
if not is_monotonic_increasing_bounds:
167+
nobs = 0
168+
sum_x = 0.0
169+
compensation_remove = 0.0
165170

166171
return output
167172

pandas/core/groupby/groupby.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1490,7 +1490,6 @@ def _agg_general(
14901490

14911491
with self._group_selection_context():
14921492
# try a cython aggregation if we can
1493-
# #import pdb; pdb.set_trace()
14941493
result = self._cython_agg_general(
14951494
how=alias,
14961495
alt=npfunc,

pandas/core/groupby/ops.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -514,7 +514,6 @@ def _call_cython_op(
514514
result_mask = result_mask.T
515515

516516
out_shape = self._get_output_shape(ngroups, values)
517-
#import pdb; pdb.set_trace()
518517
func, values = self.get_cython_func_and_vals(values, is_numeric)
519518
out_dtype = self.get_out_dtype(values.dtype)
520519

@@ -926,7 +925,6 @@ def _cython_operation(
926925

927926
ids, _, _ = self.group_info
928927
ngroups = self.ngroups
929-
#import pdb; pdb.set_trace()
930928
return cy_op.cython_operation(
931929
values=values,
932930
axis=axis,

pandas/core/internals/managers.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1259,7 +1259,6 @@ def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T:
12591259
dropped_any = False
12601260

12611261
for blk in self.blocks:
1262-
#import pdb; pdb.set_trace()
12631262
if blk.is_object:
12641263
# split on object-dtype blocks bc some columns may raise
12651264
# while others do not.

pandas/tests/groupby/test_function.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,6 @@ def test_intercept_builtin_sum():
5858
result = grouped.agg(builtins.sum)
5959
result2 = grouped.apply(builtins.sum)
6060
expected = grouped.sum()
61-
import pdb; pdb.set_trace()
6261
tm.assert_series_equal(result, expected)
6362
tm.assert_series_equal(result2, expected)
6463

@@ -1163,3 +1162,22 @@ def test_mean_on_timedelta():
11631162
pd.to_timedelta([4, 5]), name="time", index=Index(["A", "B"], name="cat")
11641163
)
11651164
tm.assert_series_equal(result, expected)
1165+
1166+
1167+
def test_sum_with_nan_inf():
1168+
df = DataFrame(
1169+
{"a": ["hello", "hello", "world", "world"], "b": [np.inf, 10, np.nan, 10]}
1170+
)
1171+
gb = df.groupby("a")
1172+
result = gb.sum()
1173+
expected = DataFrame(
1174+
[np.inf, 10], index=Index(["hello", "world"], name="a"), columns=["b"]
1175+
)
1176+
tm.assert_frame_equal(result, expected)
1177+
1178+
def test_cumsum_inf():
1179+
ser = Series([np.inf, 1, 1])
1180+
1181+
result = ser.groupby([1, 1, 1]).cumsum()
1182+
expected = Series([np.inf, np.inf, np.inf])
1183+
tm.assert_series_equal(result, expected)

0 commit comments

Comments
 (0)