From 8002bece71b819126f7736009f24e27aed50385d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 9 May 2024 10:03:21 -0700 Subject: [PATCH] CLN: Stopped dtype inference in sanitize_array with Index[object] --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/construction.py | 14 ++------------ pandas/core/frame.py | 17 +---------------- pandas/tests/frame/indexing/test_setitem.py | 12 +++++------- 4 files changed, 9 insertions(+), 35 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 53b6179dbae93..6d511da22e44a 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -243,6 +243,7 @@ Removal of prior version deprecations/changes - Removed extension test classes ``BaseNoReduceTests``, ``BaseNumericReduceTests``, ``BaseBooleanReduceTests`` (:issue:`54663`) - Removed the "closed" and "normalize" keywords in :meth:`DatetimeIndex.__new__` (:issue:`52628`) - Require :meth:`SparseDtype.fill_value` to be a valid value for the :meth:`SparseDtype.subtype` (:issue:`53043`) +- Stopped performing dtype inference when setting an :class:`Index` into a :class:`DataFrame` (:issue:`56102`) - Stopped performing dtype inference with in :meth:`Index.insert` with object-dtype index; this often affects the index/columns that result when setting new entries into an empty :class:`Series` or :class:`DataFrame` (:issue:`51363`) - Removed the "closed" and "unit" keywords in :meth:`TimedeltaIndex.__new__` (:issue:`52628`, :issue:`55499`) - All arguments in :meth:`Index.sort_values` are now keyword only (:issue:`56493`) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 2718e9819cdf8..f01d8822241c9 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -38,7 +38,6 @@ ensure_object, is_list_like, is_object_dtype, - is_string_dtype, pandas_dtype, ) from pandas.core.dtypes.dtypes import NumpyEADtype @@ -555,9 +554,7 @@ def 
sanitize_array( # Avoid ending up with a NumpyExtensionArray dtype = dtype.numpy_dtype - object_index = False - if isinstance(data, ABCIndex) and data.dtype == object and dtype is None: - object_index = True + data_was_index = isinstance(data, ABCIndex) # extract ndarray or ExtensionArray, ensure we have no NumpyExtensionArray data = extract_array(data, extract_numpy=True, extract_range=True) @@ -610,15 +607,8 @@ def sanitize_array( if dtype is None: subarr = data - if data.dtype == object: + if data.dtype == object and not data_was_index: subarr = maybe_infer_to_datetimelike(data) - if ( - object_index - and using_pyarrow_string_dtype() - and is_string_dtype(subarr) - ): - # Avoid inference when string option is set - subarr = data elif data.dtype.kind == "U" and using_pyarrow_string_dtype(): from pandas.core.arrays.string_ import StringDtype diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a4decab6e8a2b..9ede2c301c85e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5059,22 +5059,7 @@ def _sanitize_column(self, value) -> tuple[ArrayLike, BlockValuesRefs | None]: if is_list_like(value): com.require_length_match(value, self.index) - arr = sanitize_array(value, self.index, copy=True, allow_2d=True) - if ( - isinstance(value, Index) - and value.dtype == "object" - and arr.dtype != value.dtype - ): # - # TODO: Remove kludge in sanitize_array for string mode when enforcing - # this deprecation - warnings.warn( - "Setting an Index with object dtype into a DataFrame will stop " - "inferring another dtype in a future version. 
Cast the Index " - "explicitly before setting it into the DataFrame.", - FutureWarning, - stacklevel=find_stack_level(), - ) - return arr, None + return sanitize_array(value, self.index, copy=True, allow_2d=True), None @property def _series(self): diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index ed81e8c8b8129..1fe11c62188e8 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -782,20 +782,18 @@ def test_loc_setitem_ea_dtype(self): df.iloc[:, 0] = Series([11], dtype="Int64") tm.assert_frame_equal(df, expected) - def test_setitem_object_inferring(self): + def test_setitem_index_object_dtype_not_inferring(self): # GH#56102 idx = Index([Timestamp("2019-12-31")], dtype=object) df = DataFrame({"a": [1]}) - with tm.assert_produces_warning(FutureWarning, match="infer"): - df.loc[:, "b"] = idx - with tm.assert_produces_warning(FutureWarning, match="infer"): - df["c"] = idx + df.loc[:, "b"] = idx + df["c"] = idx expected = DataFrame( { "a": [1], - "b": Series([Timestamp("2019-12-31")], dtype="datetime64[ns]"), - "c": Series([Timestamp("2019-12-31")], dtype="datetime64[ns]"), + "b": idx, + "c": idx, } ) tm.assert_frame_equal(df, expected)