diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 75ca21e3e9f72..9bcffdfefb359 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -470,6 +470,7 @@ Other API changes - :func:`read_stata` with parameter ``index_col`` set to ``None`` (the default) will now set the index on the returned :class:`DataFrame` to a :class:`RangeIndex` instead of a :class:`Int64Index` (:issue:`49745`) - Changed behavior of :class:`Index`, :class:`Series`, and :class:`DataFrame` arithmetic methods when working with object-dtypes, the results no longer do type inference on the result of the array operations, use ``result.infer_objects()`` to do type inference on the result (:issue:`49999`) - Changed behavior of :class:`Index` constructor with an object-dtype ``numpy.ndarray`` containing all-``bool`` values or all-complex values, this will now retain object dtype, consistent with the :class:`Series` behavior (:issue:`49594`) +- Changed behavior of :class:`Series` and :class:`DataFrame` constructors when given an integer dtype and floating-point data that is not round numbers, this now raises ``ValueError`` instead of silently retaining the float dtype; do ``Series(data)`` or ``DataFrame(data)`` to get the old behavior, and ``Series(data).astype(dtype)`` or ``DataFrame(data).astype(dtype)`` to get the specified dtype (:issue:`49599`) - Changed behavior of :meth:`DataFrame.shift` with ``axis=1``, an integer ``fill_value``, and homogeneous datetime-like dtype, this now fills new columns with integer dtypes instead of casting to datetimelike (:issue:`49842`) - Files are now closed when encountering an exception in :func:`read_json` (:issue:`49921`) - Changed behavior of :func:`read_csv`, :func:`read_json` & :func:`read_fwf`, where the index will now always be a :class:`RangeIndex`, when no index is specified. 
Previously the index would be a :class:`Index` with dtype ``object`` if the new DataFrame/Series has length 0 (:issue:`49572`) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 8984c54e39071..5a80fdb6d9e0e 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -27,7 +27,6 @@ DtypeObj, T, ) -from pandas.errors import IntCastingNaNError from pandas.core.dtypes.base import ( ExtensionDtype, @@ -46,7 +45,6 @@ is_datetime64_ns_dtype, is_dtype_equal, is_extension_array_dtype, - is_float_dtype, is_integer_dtype, is_list_like, is_object_dtype, @@ -503,7 +501,6 @@ def sanitize_array( copy: bool = False, *, allow_2d: bool = False, - strict_ints: bool = False, ) -> ArrayLike: """ Sanitize input data to an ndarray or ExtensionArray, copy if specified, @@ -517,8 +514,6 @@ def sanitize_array( copy : bool, default False allow_2d : bool, default False If False, raise if we have a 2D Arraylike. - strict_ints : bool, default False - If False, silently ignore failures to cast float data to int dtype. Returns ------- @@ -571,32 +566,7 @@ def sanitize_array( if isinstance(data, np.matrix): data = data.A - if dtype is not None and is_float_dtype(data.dtype) and is_integer_dtype(dtype): - # possibility of nan -> garbage - try: - # GH 47391 numpy > 1.24 will raise a RuntimeError for nan -> int - # casting aligning with IntCastingNaNError below - with np.errstate(invalid="ignore"): - # GH#15832: Check if we are requesting a numeric dtype and - # that we can convert the data to the requested dtype. - subarr = maybe_cast_to_integer_array(data, dtype) - - except IntCastingNaNError: - raise - except ValueError: - # Pre-2.0, we would have different behavior for Series vs DataFrame. - # DataFrame would call np.array(data, dtype=dtype, copy=copy), - # which would cast to the integer dtype even if the cast is lossy. - # See GH#40110. - if strict_ints: - raise - - # We ignore the dtype arg and return floating values, - # e.g. 
test_constructor_floating_data_int_dtype - # TODO: where is the discussion that documents the reason for this? - subarr = np.array(data, copy=copy) - - elif dtype is None: + if dtype is None: subarr = data if data.dtype == object: subarr = maybe_infer_to_datetimelike(data) @@ -629,27 +599,8 @@ def sanitize_array( subarr = np.array([], dtype=np.float64) elif dtype is not None: - try: - subarr = _try_cast(data, dtype, copy) - except ValueError: - if is_integer_dtype(dtype): - if strict_ints: - raise - casted = np.array(data, copy=False) - if casted.dtype.kind == "f": - # GH#40110 match the behavior we have if we passed - # a ndarray[float] to begin with - return sanitize_array( - casted, - index, - dtype, - copy=False, - allow_2d=allow_2d, - ) - else: - raise - else: - raise + subarr = _try_cast(data, dtype, copy) + else: subarr = maybe_convert_platform(data) if subarr.dtype == object: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index d8e48a755ab26..53491d12e7172 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -499,7 +499,7 @@ def __new__( data = com.asarray_tuplesafe(data, dtype=_dtype_obj) try: - arr = sanitize_array(data, None, dtype=dtype, copy=copy, strict_ints=True) + arr = sanitize_array(data, None, dtype=dtype, copy=copy) except ValueError as err: if "index must be specified when data is not list-like" in str(err): raise cls._raise_scalar_data_error(data) from err diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 8051fff7b329d..2f6063290b0c3 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2706,11 +2706,12 @@ def test_floating_values_integer_dtype(self): arr = np.random.randn(10, 5) - # as of 2.0, we match Series behavior by retaining float dtype instead - # of doing a lossy conversion here. Below we _do_ do the conversion - # since it is lossless. 
- df = DataFrame(arr, dtype="i8") - assert (df.dtypes == "f8").all() + # GH#49599 in 2.0 we raise instead of either + # a) silently ignoring dtype and returning float (the old Series behavior) or + # b) rounding (the old DataFrame behavior) + msg = "Trying to coerce float values to integers" + with pytest.raises(ValueError, match=msg): + DataFrame(arr, dtype="i8") df = DataFrame(arr.round(), dtype="i8") assert (df.dtypes == "i8").all() diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 054be774c2308..0d320b6c4e5d4 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -801,11 +801,13 @@ def test_constructor_floating_data_int_dtype(self, frame_or_series): # not clear if this is what we want long-term expected = frame_or_series(arr) - res = frame_or_series(arr, dtype="i8") - tm.assert_equal(res, expected) + # GH#49599 as of 2.0 we raise instead of silently retaining float dtype + msg = "Trying to coerce float values to integer" + with pytest.raises(ValueError, match=msg): + frame_or_series(arr, dtype="i8") - res = frame_or_series(list(arr), dtype="i8") - tm.assert_equal(res, expected) + with pytest.raises(ValueError, match=msg): + frame_or_series(list(arr), dtype="i8") # pre-2.0, when we had NaNs, we silently ignored the integer dtype arr[0] = np.nan @@ -815,7 +817,12 @@ def test_constructor_floating_data_int_dtype(self, frame_or_series): with pytest.raises(IntCastingNaNError, match=msg): frame_or_series(arr, dtype="i8") - with pytest.raises(IntCastingNaNError, match=msg): + exc = IntCastingNaNError + if frame_or_series is Series: + # TODO: try to align these + exc = ValueError + msg = "cannot convert float NaN to integer" + with pytest.raises(exc, match=msg): # same behavior if we pass list instead of the ndarray frame_or_series(list(arr), dtype="i8") @@ -833,13 +840,14 @@ def test_constructor_coerce_float_fail(self, any_int_numpy_dtype): # see gh-15832 # Updated: 
make sure we treat this list the same as we would treat # the equivalent ndarray + # GH#49599 pre-2.0 we silently retained float dtype, in 2.0 we raise vals = [1, 2, 3.5] - res = Series(vals, dtype=any_int_numpy_dtype) - expected = Series(np.array(vals), dtype=any_int_numpy_dtype) - tm.assert_series_equal(res, expected) - alt = Series(np.array(vals)) # i.e. we ignore the dtype kwd - tm.assert_series_equal(alt, expected) + msg = "Trying to coerce float values to integer" + with pytest.raises(ValueError, match=msg): + Series(vals, dtype=any_int_numpy_dtype) + with pytest.raises(ValueError, match=msg): + Series(np.array(vals), dtype=any_int_numpy_dtype) def test_constructor_coerce_float_valid(self, float_numpy_dtype): s = Series([1, 2, 3.5], dtype=float_numpy_dtype) @@ -853,9 +861,10 @@ def test_constructor_invalid_coerce_ints_with_float_nan(self, any_int_numpy_dtyp vals = [1, 2, np.nan] # pre-2.0 this would return with a float dtype, in 2.0 we raise - msg = r"Cannot convert non-finite values \(NA or inf\) to integer" - with pytest.raises(IntCastingNaNError, match=msg): + msg = "cannot convert float NaN to integer" + with pytest.raises(ValueError, match=msg): Series(vals, dtype=any_int_numpy_dtype) + msg = r"Cannot convert non-finite values \(NA or inf\) to integer" with pytest.raises(IntCastingNaNError, match=msg): Series(np.array(vals), dtype=any_int_numpy_dtype) diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index e5f40aa07d9e8..99b4d127e37ca 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -95,9 +95,10 @@ def test_construct_dask_float_array_int_dtype_match_ndarray(): expected = Series(arr) tm.assert_series_equal(res, expected) - res = Series(darr, dtype="i8") - expected = Series(arr, dtype="i8") - tm.assert_series_equal(res, expected) + # GH#49599 in 2.0 we raise instead of silently ignoring the dtype + msg = "Trying to coerce float values to integers" + with pytest.raises(ValueError, match=msg): 
+ Series(darr, dtype="i8") msg = r"Cannot convert non-finite values \(NA or inf\) to integer" arr[2] = np.nan