Skip to content

Commit 7fe619f

Browse files
committed
Merge pull request #6954 from jreback/overflow
BUG: Bug in sum/mean on 32-bit platforms on overflows (GH6915)
2 parents 682ac7f + ff7bb2c commit 7fe619f

File tree

3 files changed

+80
-18
lines changed

3 files changed

+80
-18
lines changed

doc/source/release.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -422,6 +422,7 @@ Bug Fixes
422422
- Bug in ``Series.rank`` and ``DataFrame.rank`` that caused small floats (<1e-13) to all receive the same rank (:issue:`6886`)
423423
- Bug in ``DataFrame.apply`` with functions that used ``*args`` or ``**kwargs`` and returned
424424
an empty result (:issue:`6952`)
425+
- Bug in ``sum``/``mean`` on 32-bit platforms where the result could overflow (:issue:`6915`)
425426
426427
pandas 0.13.1
427428
-------------

pandas/core/nanops.py

Lines changed: 36 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ def f(values, axis=None, skipna=True, **kwds):
7575
result.fill(0)
7676
return result
7777

78-
if _USE_BOTTLENECK and skipna and _bn_ok_dtype(values.dtype):
78+
if _USE_BOTTLENECK and skipna and _bn_ok_dtype(values.dtype, bn_name):
7979
result = bn_func(values, axis=axis, **kwds)
8080

8181
# prefer to treat inf/-inf as NA, but must compute the func
@@ -92,11 +92,18 @@ def f(values, axis=None, skipna=True, **kwds):
9292
return f
9393

9494

95-
def _bn_ok_dtype(dt):
95+
def _bn_ok_dtype(dt, name):
9696
# Bottleneck chokes on datetime64
97-
time_types = np.datetime64, np.timedelta64
98-
return dt != np.object_ and not issubclass(dt.type, time_types)
97+
if dt != np.object_ and not issubclass(dt.type, (np.datetime64, np.timedelta64)):
9998

99+
# bottleneck does not properly upcast during the sum
100+
# so can overflow
101+
if name == 'nansum':
102+
if dt != np.bool_ and dt.itemsize < 8:
103+
return False
104+
105+
return True
106+
return False
100107

101108
def _has_infs(result):
102109
if isinstance(result, np.ndarray):
@@ -165,7 +172,18 @@ def _get_values(values, skipna, fill_value=None, fill_value_typ=None,
165172
values = values.copy()
166173

167174
values = _view_if_needed(values)
168-
return values, mask, dtype
175+
176+
# return a platform independent precision dtype
177+
dtype_max = dtype
178+
if dtype.kind == 'i' and not issubclass(
179+
dtype.type, (np.bool, np.datetime64, np.timedelta64)):
180+
dtype_max = np.int64
181+
elif dtype.kind in ['b'] or issubclass(dtype.type, np.bool):
182+
dtype_max = np.int64
183+
elif dtype.kind in ['f']:
184+
dtype_max = np.float64
185+
186+
return values, mask, dtype, dtype_max
169187

170188

171189
def _isfinite(values):
@@ -216,20 +234,20 @@ def _wrap_results(result, dtype):
216234

217235

218236
def nanany(values, axis=None, skipna=True):
    """Return ``any`` of ``values`` along ``axis``.

    The input is first passed through ``_get_values`` with ``False`` as the
    fill value (presumably what missing entries are replaced with when
    ``skipna`` is set -- confirm against ``_get_values``); a copy is only
    requested when ``skipna`` is true.
    """
    prepared, _mask, _dtype, _ = _get_values(
        values, skipna, False, copy=skipna)
    return prepared.any(axis)
221239

222240

223241
def nanall(values, axis=None, skipna=True):
    """Return ``all`` of ``values`` along ``axis``.

    The input is first passed through ``_get_values`` with ``True`` as the
    fill value (presumably what missing entries are replaced with when
    ``skipna`` is set -- confirm against ``_get_values``); a copy is only
    requested when ``skipna`` is true.
    """
    prepared, _mask, _dtype, _ = _get_values(
        values, skipna, True, copy=skipna)
    return prepared.all(axis)
226244

227245

228246
@disallow('M8')
@bottleneck_switch(zero_value=0)
def nansum(values, axis=None, skipna=True):
    """Sum ``values`` along ``axis``.

    ``_get_values`` is called with a fill value of 0 and also supplies the
    accumulator dtype (``dtype_max``) that is handed to ``ndarray.sum`` so
    the reduction does not run in a narrower dtype.  The raw total is then
    passed through ``_maybe_null_out`` with the mask and finally through
    ``_wrap_results`` with the original dtype.
    """
    filled, mask, dtype, dtype_max = _get_values(values, skipna, 0)
    total = filled.sum(axis, dtype=dtype_max)
    total = _maybe_null_out(total, axis, mask)
    return _wrap_results(total, dtype)
@@ -238,8 +256,8 @@ def nansum(values, axis=None, skipna=True):
238256
@disallow('M8')
239257
@bottleneck_switch()
240258
def nanmean(values, axis=None, skipna=True):
241-
values, mask, dtype = _get_values(values, skipna, 0)
242-
the_sum = _ensure_numeric(values.sum(axis))
259+
values, mask, dtype, dtype_max = _get_values(values, skipna, 0)
260+
the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_max))
243261
count = _get_counts(mask, axis)
244262

245263
if axis is not None:
@@ -257,7 +275,7 @@ def nanmean(values, axis=None, skipna=True):
257275
@bottleneck_switch()
258276
def nanmedian(values, axis=None, skipna=True):
259277

260-
values, mask, dtype = _get_values(values, skipna)
278+
values, mask, dtype, dtype_max = _get_values(values, skipna)
261279

262280
def get_median(x):
263281
mask = notnull(x)
@@ -325,7 +343,7 @@ def nanvar(values, axis=None, skipna=True, ddof=1):
325343

326344
@bottleneck_switch()
327345
def nanmin(values, axis=None, skipna=True):
328-
values, mask, dtype = _get_values(values, skipna, fill_value_typ='+inf')
346+
values, mask, dtype, dtype_max = _get_values(values, skipna, fill_value_typ='+inf')
329347

330348
# numpy 1.6.1 workaround in Python 3.x
331349
if (values.dtype == np.object_ and compat.PY3):
@@ -341,7 +359,7 @@ def nanmin(values, axis=None, skipna=True):
341359
if ((axis is not None and values.shape[axis] == 0)
342360
or values.size == 0):
343361
try:
344-
result = com.ensure_float(values.sum(axis))
362+
result = com.ensure_float(values.sum(axis,dtype=dtype_max))
345363
result.fill(np.nan)
346364
except:
347365
result = np.nan
@@ -354,7 +372,7 @@ def nanmin(values, axis=None, skipna=True):
354372

355373
@bottleneck_switch()
356374
def nanmax(values, axis=None, skipna=True):
357-
values, mask, dtype = _get_values(values, skipna, fill_value_typ='-inf')
375+
values, mask, dtype, dtype_max = _get_values(values, skipna, fill_value_typ='-inf')
358376

359377
# numpy 1.6.1 workaround in Python 3.x
360378
if (values.dtype == np.object_ and compat.PY3):
@@ -371,7 +389,7 @@ def nanmax(values, axis=None, skipna=True):
371389
if ((axis is not None and values.shape[axis] == 0)
372390
or values.size == 0):
373391
try:
374-
result = com.ensure_float(values.sum(axis))
392+
result = com.ensure_float(values.sum(axis, dtype=dtype_max))
375393
result.fill(np.nan)
376394
except:
377395
result = np.nan
@@ -386,7 +404,7 @@ def nanargmax(values, axis=None, skipna=True):
386404
"""
387405
Returns -1 in the NA case
388406
"""
389-
values, mask, dtype = _get_values(values, skipna, fill_value_typ='-inf',
407+
values, mask, dtype, _ = _get_values(values, skipna, fill_value_typ='-inf',
390408
isfinite=True)
391409
result = values.argmax(axis)
392410
result = _maybe_arg_null_out(result, axis, mask, skipna)
@@ -397,7 +415,7 @@ def nanargmin(values, axis=None, skipna=True):
397415
"""
398416
Returns -1 in the NA case
399417
"""
400-
values, mask, dtype = _get_values(values, skipna, fill_value_typ='+inf',
418+
values, mask, dtype, _ = _get_values(values, skipna, fill_value_typ='+inf',
401419
isfinite=True)
402420
result = values.argmin(axis)
403421
result = _maybe_arg_null_out(result, axis, mask, skipna)

pandas/tests/test_series.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,49 @@ def test_nansum_buglet(self):
314314
result = np.nansum(s)
315315
assert_almost_equal(result, 1)
316316

317+
def test_overflow(self):
    """Check sum/min/max on large Series of each numeric dtype (GH 6915).

    The sum of ``arange(5000000)`` does not fit in an int32, so the
    expected total is accumulated explicitly in the 64-bit dtype of the
    same kind and compared with what ``Series.sum`` returns.
    """
    # skipna=False -> no bottleneck; default skipna -> use bottleneck
    # if available
    skipna_variants = [{'skipna': False}, {}]

    # overflowing on the smaller int dtypes
    for dtype in ['int32', 'int64']:
        v = np.arange(5000000, dtype=dtype)
        s = Series(v)
        expected_sum = v.sum(dtype='int64')

        for kwargs in skipna_variants:
            self.assertEqual(int(s.sum(**kwargs)), expected_sum)
            self.assertEqual(int(s.min(**kwargs)), 0)
            self.assertEqual(int(s.max(**kwargs)), v[-1])

    for dtype in ['float32', 'float64']:
        v = np.arange(5000000, dtype=dtype)
        s = Series(v)
        expected_sum = v.sum(dtype='float64')

        for kwargs in skipna_variants:
            self.assertTrue(
                np.allclose(float(s.sum(**kwargs)), expected_sum))
            self.assertTrue(np.allclose(float(s.min(**kwargs)), 0.0))
            self.assertTrue(np.allclose(float(s.max(**kwargs)), v[-1]))
317360

318361
class SafeForSparse(object):
319362
pass

0 commit comments

Comments
 (0)