From 2dc49e46f0251213bb724b0968af7d51792134fc Mon Sep 17 00:00:00 2001 From: gfyoung Date: Thu, 29 Sep 2016 13:57:23 -0400 Subject: [PATCH] BUG: Catch overflow in both directions for checked add 1) Add checks to ensure that add overflow does not occur both in the positive or negative directions. 2) Add benchmarks to ensure that operations involving this checked add function are significantly impacted. --- asv_bench/benchmarks/algorithms.py | 26 +++++++++++++++++ asv_bench/benchmarks/timedelta.py | 13 ++++++++- doc/source/whatsnew/v0.19.1.txt | 1 + pandas/core/nanops.py | 37 ++++++++++++++++++++++--- pandas/tests/test_nanops.py | 9 +++++- pandas/tseries/tests/test_timedeltas.py | 2 ++ 6 files changed, 82 insertions(+), 6 deletions(-) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 6eac7b4831f0f..9807639143ddb 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -15,6 +15,14 @@ def setup(self): self.int = pd.Int64Index(np.arange(N).repeat(5)) self.float = pd.Float64Index(np.random.randn(N).repeat(5)) + # Convenience naming. + self.checked_add = pd.core.nanops._checked_add_with_arr + + self.arr = np.arange(1000000) + self.arrpos = np.arange(1000000) + self.arrneg = np.arange(-1000000, 0) + self.arrmixed = np.array([1, -1]).repeat(500000) + def time_int_factorize(self): self.int.factorize() @@ -29,3 +37,21 @@ def time_int_duplicated(self): def time_float_duplicated(self): self.float.duplicated() + + def time_add_overflow_pos_scalar(self): + self.checked_add(self.arr, 1) + + def time_add_overflow_neg_scalar(self): + self.checked_add(self.arr, -1) + + def time_add_overflow_zero_scalar(self): + self.checked_add(self.arr, 0) + + def time_add_overflow_pos_arr(self): + self.checked_add(self.arr, self.arrpos) + + def time_add_overflow_neg_arr(self): + self.checked_add(self.arr, self.arrneg) + + def time_add_overflow_mixed_arr(self): + self.checked_add(self.arr, self.arrmixed) diff --git a/asv_bench/benchmarks/timedelta.py b/asv_bench/benchmarks/timedelta.py index 9719fd87dfb2e..8470525dd01fa 100644 --- a/asv_bench/benchmarks/timedelta.py +++ b/asv_bench/benchmarks/timedelta.py @@ -1,5 +1,5 @@ from .pandas_vb_common import * -from pandas import to_timedelta +from pandas import to_timedelta, Timestamp class timedelta_convert_int(object): @@ -47,3 +47,14 @@ def time_timedelta_convert_coerce(self): def time_timedelta_convert_ignore(self): to_timedelta(self.arr, errors='ignore') + + +class timedelta_add_overflow(object): + goal_time = 0.2 + + def setup(self): + self.td = to_timedelta(np.arange(1000000)) + self.ts = Timestamp('2000') + + def test_add_td_ts(self): + self.td + self.ts diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index 5180b9a092f6c..3605387d767ec 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -35,6 +35,7 @@ Bug Fixes - Bug in localizing an ambiguous timezone when a boolean is passed (:issue:`14402`) +- Bug in ``TimedeltaIndex`` addition with a Datetime-like object where addition overflow in the negative direction was not being caught (:issue:`14068`, :issue:`14453`) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 564586eec5a8e..d7d68ad536be5 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -11,6 +11,7 @@ import pandas.hashtable as _hash from pandas import compat, lib, algos, tslib +from pandas.compat.numpy import _np_version_under1p10 from pandas.types.common import (_ensure_int64, _ensure_object, _ensure_float64, _get_dtype, is_float, is_scalar, @@ -829,9 +830,37 @@ def _checked_add_with_arr(arr, b): Raises ------ - OverflowError if any x + y exceeds the maximum int64 value. + OverflowError if any x + y exceeds the maximum or minimum int64 value. """ - if (np.iinfo(np.int64).max - b < arr).any(): - raise OverflowError("Python int too large to " - "convert to C long") + # For performance reasons, we broadcast 'b' to the new array 'b2' + # so that it has the same size as 'arr'. + if _np_version_under1p10: + if lib.isscalar(b): + b2 = np.empty(arr.shape) + b2.fill(b) + else: + b2 = b + else: + b2 = np.broadcast_to(b, arr.shape) + + # gh-14324: For each element in 'arr' and its corresponding element + # in 'b2', we check the sign of the element in 'b2'. If it is positive, + # we then check whether its sum with the element in 'arr' exceeds + # np.iinfo(np.int64).max. If so, we have an overflow error. If it + # it is negative, we then check whether its sum with the element in + # 'arr' exceeds np.iinfo(np.int64).min. If so, we have an overflow + # error as well. + mask1 = b2 > 0 + mask2 = b2 < 0 + + if not mask1.any(): + to_raise = (np.iinfo(np.int64).min - b2 > arr).any() + elif not mask2.any(): + to_raise = (np.iinfo(np.int64).max - b2 < arr).any() + else: + to_raise = ((np.iinfo(np.int64).max - b2[mask1] < arr[mask1]).any() or + (np.iinfo(np.int64).min - b2[mask2] > arr[mask2]).any()) + + if to_raise: + raise OverflowError("Overflow in int64 addition") return arr + b diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index f00fdd196abea..be634228b1b6e 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -1004,13 +1004,20 @@ def prng(self): def test_int64_add_overflow(): # see gh-14068 - msg = "too (big|large) to convert" + msg = "Overflow in int64 addition" m = np.iinfo(np.int64).max + n = np.iinfo(np.int64).min with tm.assertRaisesRegexp(OverflowError, msg): nanops._checked_add_with_arr(np.array([m, m]), m) with tm.assertRaisesRegexp(OverflowError, msg): nanops._checked_add_with_arr(np.array([m, m]), np.array([m, m])) + with tm.assertRaisesRegexp(OverflowError, msg): + nanops._checked_add_with_arr(np.array([n, n]), n) + with tm.assertRaisesRegexp(OverflowError, msg): + nanops._checked_add_with_arr(np.array([n, n]), np.array([n, n])) + with tm.assertRaisesRegexp(OverflowError, msg): + nanops._checked_add_with_arr(np.array([m, n]), np.array([n, n])) with tm.assertRaisesRegexp(OverflowError, msg): with tm.assert_produces_warning(RuntimeWarning): nanops._checked_add_with_arr(np.array([m, m]), diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py index 38e210d698035..f0d14014d6559 100644 --- a/pandas/tseries/tests/test_timedeltas.py +++ b/pandas/tseries/tests/test_timedeltas.py @@ -1957,6 +1957,8 @@ def test_add_overflow(self): to_timedelta(106580, 'D') + Timestamp('2000') with tm.assertRaisesRegexp(OverflowError, msg): Timestamp('2000') + to_timedelta(106580, 'D') + + msg = "Overflow in int64 addition" with tm.assertRaisesRegexp(OverflowError, msg): to_timedelta([106580], 'D') + Timestamp('2000') with tm.assertRaisesRegexp(OverflowError, msg):