From 6ef619ec9fec9d0f12f8ffbd4338e3b23dd7a82b Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 15 Jan 2023 15:33:17 +0100 Subject: [PATCH 1/9] BUG: Series constructor overflowing for UInt64 --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/core/arrays/numeric.py | 15 +++++++++++++++ pandas/tests/series/test_constructors.py | 22 ++++++++++++++++++++++ 3 files changed, 38 insertions(+) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 6c8dd2f8da938..951116264daec 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -1075,6 +1075,7 @@ ExtensionArray - Bug in :meth:`array.PandasArray.to_numpy` raising with ``NA`` value when ``na_value`` is specified (:issue:`40638`) - Bug in :meth:`api.types.is_numeric_dtype` where a custom :class:`ExtensionDtype` would not return ``True`` if ``_is_numeric`` returned ``True`` (:issue:`50563`) - Bug in :meth:`api.types.is_integer_dtype`, :meth:`api.types.is_unsigned_integer_dtype`, :meth:`api.types.is_signed_integer_dtype`, :meth:`api.types.is_float_dtype` where a custom :class:`ExtensionDtype` would not return ``True`` if ``kind`` returned the corresponding NumPy type (:issue:`50667`) +- Bug in :class:`Series` constructor unnecessarily overflowing for nullable unsigned integer dtypes (:issue:`38798`) Styler ^^^^^^ diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py index 0a30c28acb090..bbe273bd34806 100644 --- a/pandas/core/arrays/numeric.py +++ b/pandas/core/arrays/numeric.py @@ -168,6 +168,7 @@ def _coerce_to_data_and_mask(values, mask, dtype, copy, dtype_cls, default_dtype mask = mask.copy() return values, mask, dtype, inferred_type + original = values values = np.array(values, copy=copy) inferred_type = None if is_object_dtype(values.dtype) or is_string_dtype(values.dtype): @@ -204,6 +205,20 @@ def _coerce_to_data_and_mask(values, mask, dtype, copy, dtype_cls, default_dtype else: dtype = dtype.type + if ( + is_integer_dtype(dtype) + and 
is_float_dtype(values.dtype) + and len(values) > 0 + and np.max(values) > np.iinfo(np.intp).max + ): + # we either have floats and int targets which will raise below + # or ints that lose precision with float, so keep object for now + inferred_type = lib.infer_dtype(original, skipna=True) + if inferred_type not in ["floating", "mixed-integer-float"]: + values = np.array(original, dtype=dtype, copy=False) + else: + values = np.array(original, dtype="object", copy=False) + # we copy as need to coerce here if mask.any(): values = values.copy() diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 05e40e20f1226..a7c291f845744 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -49,6 +49,7 @@ import pandas._testing as tm from pandas.core.api import Int64Index from pandas.core.arrays import ( + IntegerArray, IntervalArray, period_array, ) @@ -2007,6 +2008,27 @@ def test_series_constructor_ea_int_from_string_bool(self): with pytest.raises(ValueError, match="invalid literal"): Series(["True", "False", "True", pd.NA], dtype="Int64") + @pytest.mark.parametrize("val", [1, 1.0]) + def test_series_constructor_overflow_uint_ea(self, val): + # GH#38798 + max_val = np.iinfo(np.uint64).max - 1 + result = Series([max_val, val], dtype="UInt64") + expected = Series(np.array([max_val, 1], dtype="uint64"), dtype="UInt64") + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("val", [1, 1.0]) + def test_series_constructor_overflow_uint_ea_with_na(self, val): + # GH#38798 + max_val = np.iinfo(np.uint64).max - 1 + result = Series([max_val, val, pd.NA], dtype="UInt64") + expected = Series( + IntegerArray( + np.array([max_val, 1, 0], dtype="uint64"), + np.array([0, 0, 1], dtype=np.bool_), + ) + ) + tm.assert_series_equal(result, expected) + class TestSeriesConstructorIndexCoercion: def test_series_constructor_datetimelike_index_coercion(self): From 
e02724520096e7a8b23b7bcd384baaf76fc1e5c1 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 15 Jan 2023 16:30:29 +0100 Subject: [PATCH 2/9] Remove xfail --- pandas/tests/tools/test_to_numeric.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py index 1c0a8301d65cc..d3701c30aa50c 100644 --- a/pandas/tests/tools/test_to_numeric.py +++ b/pandas/tests/tools/test_to_numeric.py @@ -4,8 +4,6 @@ from numpy import iinfo import pytest -from pandas.compat import is_platform_arm - import pandas as pd from pandas import ( DataFrame, @@ -755,13 +753,7 @@ def test_to_numeric_from_nullable_string(values, nullable_string_dtype, expected ([1.0, 1.1], "Float64", "signed", "Float64"), ([1, pd.NA], "Int64", "signed", "Int8"), ([450, -300], "Int64", "signed", "Int16"), - pytest.param( - [np.iinfo(np.uint64).max - 1, 1], - "UInt64", - "signed", - "UInt64", - marks=pytest.mark.xfail(not is_platform_arm(), reason="GH38798"), - ), + ([np.iinfo(np.uint64).max - 1, 1], "UInt64", "signed", "UInt64"), ([1, 1], "Int64", "unsigned", "UInt8"), ([1.0, 1.0], "Float32", "unsigned", "UInt8"), ([1.0, 1.1], "Float64", "unsigned", "Float64"), From 94a5c6a548611449e1e848d01c6be0e996b96725 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 15 Jan 2023 20:00:19 +0100 Subject: [PATCH 3/9] Fix other issue --- doc/source/whatsnew/v2.0.0.rst | 2 +- pandas/core/arrays/numeric.py | 25 ++++++++++++------------ pandas/tests/series/test_constructors.py | 12 ++++++++++++ 3 files changed, 25 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 951116264daec..e20a6a04ce417 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -1075,7 +1075,7 @@ ExtensionArray - Bug in :meth:`array.PandasArray.to_numpy` raising with ``NA`` value when ``na_value`` is specified (:issue:`40638`) - Bug in 
:meth:`api.types.is_numeric_dtype` where a custom :class:`ExtensionDtype` would not return ``True`` if ``_is_numeric`` returned ``True`` (:issue:`50563`) - Bug in :meth:`api.types.is_integer_dtype`, :meth:`api.types.is_unsigned_integer_dtype`, :meth:`api.types.is_signed_integer_dtype`, :meth:`api.types.is_float_dtype` where a custom :class:`ExtensionDtype` would not return ``True`` if ``kind`` returned the corresponding NumPy type (:issue:`50667`) -- Bug in :class:`Series` constructor unnecessarily overflowing for nullable unsigned integer dtypes (:issue:`38798`) +- Bug in :class:`Series` constructor unnecessarily overflowing for nullable unsigned integer dtypes (:issue:`38798`, :issue:`25880`) Styler ^^^^^^ diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py index bbe273bd34806..e4ab92335aacf 100644 --- a/pandas/core/arrays/numeric.py +++ b/pandas/core/arrays/numeric.py @@ -205,19 +205,18 @@ def _coerce_to_data_and_mask(values, mask, dtype, copy, dtype_cls, default_dtype else: dtype = dtype.type - if ( - is_integer_dtype(dtype) - and is_float_dtype(values.dtype) - and len(values) > 0 - and np.max(values) > np.iinfo(np.intp).max - ): - # we either have floats and int targets which will raise below - # or ints that lose precision with float, so keep object for now - inferred_type = lib.infer_dtype(original, skipna=True) - if inferred_type not in ["floating", "mixed-integer-float"]: - values = np.array(original, dtype=dtype, copy=False) - else: - values = np.array(original, dtype="object", copy=False) + if is_integer_dtype(dtype) and is_float_dtype(values.dtype) and len(values) > 0: + idx = np.argmax(values) + if values[idx] != original[idx]: + # We have ints that lost precision during the cast. 
+ inferred_type = lib.infer_dtype(original, skipna=True) + if ( + inferred_type not in ["floating", "mixed-integer-float"] + and not mask.any() + ): + values = np.array(original, dtype=dtype, copy=False) + else: + values = np.array(original, dtype="object", copy=False) # we copy as need to coerce here if mask.any(): diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index a7c291f845744..8c05eeac39033 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -2029,6 +2029,18 @@ def test_series_constructor_overflow_uint_ea_with_na(self, val): ) tm.assert_series_equal(result, expected) + def test_series_constructor_overflow_int_with_nan(self): + # GH#38798 + max_val = np.iinfo(np.uint64).max - 1 + result = Series([max_val, np.nan], dtype="UInt64") + expected = Series( + IntegerArray( + np.array([max_val, 1], dtype="uint64"), + np.array([0, 1], dtype=np.bool_), + ) + ) + tm.assert_series_equal(result, expected) + class TestSeriesConstructorIndexCoercion: def test_series_constructor_datetimelike_index_coercion(self): From adcaa7f9611c0f6e9a82fc8d5ad4d3996d628c17 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 15 Jan 2023 22:22:32 +0100 Subject: [PATCH 4/9] Cast max value to int before comparing with original --- pandas/core/arrays/numeric.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py index e4ab92335aacf..b5ef10256a80e 100644 --- a/pandas/core/arrays/numeric.py +++ b/pandas/core/arrays/numeric.py @@ -207,7 +207,7 @@ def _coerce_to_data_and_mask(values, mask, dtype, copy, dtype_cls, default_dtype if is_integer_dtype(dtype) and is_float_dtype(values.dtype) and len(values) > 0: idx = np.argmax(values) - if values[idx] != original[idx]: + if int(values[idx]) != original[idx]: # We have ints that lost precision during the cast. 
inferred_type = lib.infer_dtype(original, skipna=True) if ( From b3707476b45c708b530f68d66371d3480647c5c9 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 15 Jan 2023 23:36:36 +0100 Subject: [PATCH 5/9] Skip precision check when max value is NaN --- pandas/core/arrays/numeric.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py index b5ef10256a80e..b3f78e5133324 100644 --- a/pandas/core/arrays/numeric.py +++ b/pandas/core/arrays/numeric.py @@ -207,7 +207,7 @@ def _coerce_to_data_and_mask(values, mask, dtype, copy, dtype_cls, default_dtype if is_integer_dtype(dtype) and is_float_dtype(values.dtype) and len(values) > 0: idx = np.argmax(values) - if int(values[idx]) != original[idx]: + if not np.isnan(values[idx]) and int(values[idx]) != original[idx]: # We have ints that lost precision during the cast. inferred_type = lib.infer_dtype(original, skipna=True) if ( From 944df67de411e1c10122e3708875350319a5925f Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 16 Jan 2023 01:07:19 +0100 Subject: [PATCH 6/9] Use nanargmax to ignore NaN when locating max value --- pandas/core/arrays/numeric.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py index b3f78e5133324..0bfbd68be48d3 100644 --- a/pandas/core/arrays/numeric.py +++ b/pandas/core/arrays/numeric.py @@ -206,7 +206,7 @@ def _coerce_to_data_and_mask(values, mask, dtype, copy, dtype_cls, default_dtype dtype = dtype.type if is_integer_dtype(dtype) and is_float_dtype(values.dtype) and len(values) > 0: - idx = np.argmax(values) + idx = np.nanargmax(values) if not np.isnan(values[idx]) and int(values[idx]) != original[idx]: # We have ints that lost precision during the cast. 
inferred_type = lib.infer_dtype(original, skipna=True) From 1d91cca85c0d65cb69d17e3f50b126a88716286a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 16 Jan 2023 01:17:35 +0100 Subject: [PATCH 7/9] Check for all na --- pandas/core/arrays/numeric.py | 9 +++++++-- pandas/tests/series/test_constructors.py | 11 +++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py index 0bfbd68be48d3..df8c2e4dab1eb 100644 --- a/pandas/core/arrays/numeric.py +++ b/pandas/core/arrays/numeric.py @@ -205,9 +205,14 @@ def _coerce_to_data_and_mask(values, mask, dtype, copy, dtype_cls, default_dtype else: dtype = dtype.type - if is_integer_dtype(dtype) and is_float_dtype(values.dtype) and len(values) > 0: + if ( + is_integer_dtype(dtype) + and is_float_dtype(values.dtype) + and len(values) > 0 + and not mask.all() + ): idx = np.nanargmax(values) - if not np.isnan(values[idx]) and int(values[idx]) != original[idx]: + if int(values[idx]) != original[idx]: # We have ints that lost precision during the cast. 
inferred_type = lib.infer_dtype(original, skipna=True) if ( diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 8c05eeac39033..0c48457eb656e 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -2041,6 +2041,17 @@ def test_series_constructor_overflow_int_with_nan(self): ) tm.assert_series_equal(result, expected) + def test_series_constructor_ea_all_na(self): + # GH#38798 + result = Series([np.nan, np.nan], dtype="UInt64") + expected = Series( + IntegerArray( + np.array([1, 1], dtype="uint64"), + np.array([1, 1], dtype=np.bool_), + ) + ) + tm.assert_series_equal(result, expected) + class TestSeriesConstructorIndexCoercion: def test_series_constructor_datetimelike_index_coercion(self): From 0b1e3f82f2313b052624b9275bb15e93ea63a119 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 16 Jan 2023 01:39:19 +0100 Subject: [PATCH 8/9] Fix all na case --- pandas/core/arrays/numeric.py | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py index df8c2e4dab1eb..6ae2c3a2e2749 100644 --- a/pandas/core/arrays/numeric.py +++ b/pandas/core/arrays/numeric.py @@ -205,23 +205,21 @@ def _coerce_to_data_and_mask(values, mask, dtype, copy, dtype_cls, default_dtype else: dtype = dtype.type - if ( - is_integer_dtype(dtype) - and is_float_dtype(values.dtype) - and len(values) > 0 - and not mask.all() - ): - idx = np.nanargmax(values) - if int(values[idx]) != original[idx]: - # We have ints that lost precision during the cast. 
- inferred_type = lib.infer_dtype(original, skipna=True) - if ( - inferred_type not in ["floating", "mixed-integer-float"] - and not mask.any() - ): - values = np.array(original, dtype=dtype, copy=False) - else: - values = np.array(original, dtype="object", copy=False) + if is_integer_dtype(dtype) and is_float_dtype(values.dtype) and len(values) > 0: + if mask.all(): + values = np.ones(values.shape, dtype=dtype) + else: + idx = np.nanargmax(values) + if int(values[idx]) != original[idx]: + # We have ints that lost precision during the cast. + inferred_type = lib.infer_dtype(original, skipna=True) + if ( + inferred_type not in ["floating", "mixed-integer-float"] + and not mask.any() + ): + values = np.array(original, dtype=dtype, copy=False) + else: + values = np.array(original, dtype="object", copy=False) # we copy as need to coerce here if mask.any(): From 9c9a68a688a28f2495f9755d6618e6525028f565 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Wed, 18 Jan 2023 12:14:06 +0100 Subject: [PATCH 9/9] Update test_constructors.py --- pandas/tests/series/test_constructors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 0c48457eb656e..bd53e38f4ce28 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -2029,7 +2029,7 @@ def test_series_constructor_overflow_uint_ea_with_na(self, val): ) tm.assert_series_equal(result, expected) - def test_series_constructor_overflow_int_with_nan(self): + def test_series_constructor_overflow_uint_with_nan(self): # GH#38798 max_val = np.iinfo(np.uint64).max - 1 result = Series([max_val, np.nan], dtype="UInt64")