-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
[BUG]: Implement Kahan summation for rolling().mean() to avoid numerical issues #36348
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
f12c078
46f499a
6d67d19
971fb26
5c8d7cc
58cc035
a4e3884
9cd1ebe
4c5b351
c70c91c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -277,32 +277,42 @@ cdef inline float64_t calc_mean(int64_t minp, Py_ssize_t nobs, | |
|
||
|
||
cdef inline void add_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x, | ||
Py_ssize_t *neg_ct) nogil: | ||
""" add a value from the mean calc """ | ||
Py_ssize_t *neg_ct, float64_t *c) nogil: | ||
""" add a value from the mean calc using Kahan summation """ | ||
cdef: | ||
float64_t y, t | ||
|
||
# Not NaN | ||
if notnan(val): | ||
nobs[0] = nobs[0] + 1 | ||
sum_x[0] = sum_x[0] + val | ||
y = val - c[0] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add a comment that using kahan summation There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I already added a comment in the function header. Is this sufficient? |
||
t = sum_x[0] + y | ||
c[0] = t - sum_x[0] - y | ||
sum_x[0] = t | ||
if signbit(val): | ||
neg_ct[0] = neg_ct[0] + 1 | ||
|
||
|
||
cdef inline void remove_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x, | ||
Py_ssize_t *neg_ct) nogil: | ||
""" remove a value from the mean calc """ | ||
Py_ssize_t *neg_ct, float64_t *c) nogil: | ||
""" remove a value from the mean calc using Kahan summation """ | ||
cdef: | ||
float64_t y, t | ||
|
||
if notnan(val): | ||
nobs[0] = nobs[0] - 1 | ||
sum_x[0] = sum_x[0] - val | ||
y = - val - c[0] | ||
t = sum_x[0] + y | ||
c[0] = t - sum_x[0] - y | ||
sum_x[0] = t | ||
if signbit(val): | ||
neg_ct[0] = neg_ct[0] - 1 | ||
|
||
|
||
def roll_mean_fixed(ndarray[float64_t] values, ndarray[int64_t] start, | ||
ndarray[int64_t] end, int64_t minp, int64_t win): | ||
cdef: | ||
float64_t val, prev_x, sum_x = 0 | ||
float64_t val, prev_x, sum_x = 0, c_add = 0, c_remove = 0 | ||
Py_ssize_t nobs = 0, i, neg_ct = 0, N = len(values) | ||
ndarray[float64_t] output | ||
|
||
|
@@ -311,16 +321,16 @@ def roll_mean_fixed(ndarray[float64_t] values, ndarray[int64_t] start, | |
with nogil: | ||
for i in range(minp - 1): | ||
val = values[i] | ||
add_mean(val, &nobs, &sum_x, &neg_ct) | ||
add_mean(val, &nobs, &sum_x, &neg_ct, &c_add) | ||
output[i] = NaN | ||
|
||
for i in range(minp - 1, N): | ||
val = values[i] | ||
add_mean(val, &nobs, &sum_x, &neg_ct) | ||
add_mean(val, &nobs, &sum_x, &neg_ct, &c_add) | ||
|
||
if i > win - 1: | ||
prev_x = values[i - win] | ||
remove_mean(prev_x, &nobs, &sum_x, &neg_ct) | ||
remove_mean(prev_x, &nobs, &sum_x, &neg_ct, &c_remove) | ||
|
||
output[i] = calc_mean(minp, nobs, neg_ct, sum_x) | ||
|
||
|
@@ -330,7 +340,7 @@ def roll_mean_fixed(ndarray[float64_t] values, ndarray[int64_t] start, | |
def roll_mean_variable(ndarray[float64_t] values, ndarray[int64_t] start, | ||
ndarray[int64_t] end, int64_t minp): | ||
cdef: | ||
float64_t val, sum_x = 0 | ||
float64_t val, c_add = 0, c_remove = 0, sum_x = 0 | ||
int64_t s, e | ||
Py_ssize_t nobs = 0, i, j, neg_ct = 0, N = len(values) | ||
ndarray[float64_t] output | ||
|
@@ -350,26 +360,26 @@ def roll_mean_variable(ndarray[float64_t] values, ndarray[int64_t] start, | |
# setup | ||
for j in range(s, e): | ||
val = values[j] | ||
add_mean(val, &nobs, &sum_x, &neg_ct) | ||
add_mean(val, &nobs, &sum_x, &neg_ct, &c_add) | ||
|
||
else: | ||
|
||
# calculate deletes | ||
for j in range(start[i - 1], s): | ||
val = values[j] | ||
remove_mean(val, &nobs, &sum_x, &neg_ct) | ||
remove_mean(val, &nobs, &sum_x, &neg_ct, &c_remove) | ||
|
||
# calculate adds | ||
for j in range(end[i - 1], e): | ||
val = values[j] | ||
add_mean(val, &nobs, &sum_x, &neg_ct) | ||
add_mean(val, &nobs, &sum_x, &neg_ct, &c_add) | ||
|
||
output[i] = calc_mean(minp, nobs, neg_ct, sum_x) | ||
|
||
if not is_monotonic_bounds: | ||
for j in range(s, e): | ||
val = values[j] | ||
remove_mean(val, &nobs, &sum_x, &neg_ct) | ||
remove_mean(val, &nobs, &sum_x, &neg_ct, &c_remove) | ||
return output | ||
|
||
# ---------------------------------------------------------------------- | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -696,3 +696,21 @@ def scaled_sum(*args): | |
expected = DataFrame(data={"X": [0.0, 0.5, 1.0, 1.5, 2.0]}, index=_index) | ||
result = df.groupby(**grouping).rolling(1).apply(scaled_sum, raw=raw, args=(2,)) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
|
||
def test_rolling_numerical_accuracy_kahan(): | ||
# GH: 36031 implementing kahan summation | ||
phofl marked this conversation as resolved.
Show resolved
Hide resolved
|
||
df = pd.DataFrame( | ||
{ | ||
"A": [3002399751580331.0, -0.0, -0.0] | ||
}, # First value is a single digit longer. | ||
index=[ | ||
pd.Timestamp("19700101 09:00:00"), | ||
pd.Timestamp("19700101 09:00:03"), | ||
pd.Timestamp("19700101 09:00:06"), | ||
], | ||
) | ||
result = ( | ||
df.resample("1s").ffill().rolling("3s", closed="left", min_periods=3).mean() | ||
) | ||
assert result.values[-1] == 0.0 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could we compare the entire DataFrame result for this test as well? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
Uh oh!
There was an error while loading. Please reload this page.