From ac09146878898d083603915807ac5c29ce40e878 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 18 Nov 2021 08:42:58 -0800 Subject: [PATCH 01/19] BUG: IntegerArray/FloatingArray constructors mismatched NAs --- pandas/_libs/missing.pyx | 24 +++++++ pandas/core/arrays/floating.py | 15 +++-- pandas/core/arrays/integer.py | 11 ++- pandas/core/internals/blocks.py | 9 +++ .../arrays/floating/test_construction.py | 12 ++-- pandas/tests/frame/indexing/test_indexing.py | 67 +++++++++++++++++++ pandas/tests/series/methods/test_clip.py | 11 ++- pandas/tests/series/test_constructors.py | 18 +++++ 8 files changed, 149 insertions(+), 18 deletions(-) diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index b77db2aec4a08..7cd5f78a70914 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -368,6 +368,30 @@ cdef bint checknull_with_nat_and_na(object obj): return checknull_with_nat(obj) or obj is C_NA +@cython.wraparound(False) +@cython.boundscheck(False) +def is_numeric_na(ndarray values): + """ + Check for NA values consistent with IntegerArray/FloatingArray. + + Similar to a vectorized is_valid_na_for_dtype restricted to numeric dtypes. + """ + cdef: + ndarray[uint8_t] result + Py_ssize_t i, N + object val + + N = len(values) + result = np.zeros(N, dtype=np.uint8) + + for i in range(N): + val = values[i] + if val is None or val is C_NA or util.is_nan(val) or is_decimal_na(val): + result[i] = True + + return result.view(bool) + + # ----------------------------------------------------------------------------- # Implementation of NA singleton diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index 6d6cc03a1c83e..e08d7fb92e912 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -135,8 +135,7 @@ def coerce_to_array( if is_object_dtype(values): inferred_type = lib.infer_dtype(values, skipna=True) if inferred_type == "empty": - values = np.empty(len(values)) - values.fill(np.nan) + pass elif inferred_type not in [ "floating", "integer", @@ -152,13 +151,19 @@ def coerce_to_array( elif not (is_integer_dtype(values) or is_float_dtype(values)): raise TypeError(f"{values.dtype} cannot be converted to a FloatingDtype") + if values.ndim != 1: + raise TypeError("values must be a 1D list-like") + if mask is None: - mask = isna(values) + mask = libmissing.is_numeric_na(values) + mask2 = isna(values) + if not (mask == mask2).all(): + # e.g. if we have a timedelta64("NaT") + raise TypeError(f"{values.dtype} cannot be converted to a FloatingDtype") + else: assert len(mask) == len(values) - if not values.ndim == 1: - raise TypeError("values must be a 1D list-like") if not mask.ndim == 1: raise TypeError("mask must be a 1D list-like") diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index d8b7bf2b86d2c..9723b040b4e13 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -35,7 +35,6 @@ is_string_dtype, pandas_dtype, ) -from pandas.core.dtypes.missing import isna from pandas.core.arrays import ExtensionArray from pandas.core.arrays.masked import ( @@ -190,8 +189,7 @@ def coerce_to_array( if is_object_dtype(values) or is_string_dtype(values): inferred_type = lib.infer_dtype(values, skipna=True) if inferred_type == "empty": - values = np.empty(len(values)) - values.fill(np.nan) + pass elif inferred_type not in [ "floating", "integer", @@ -209,13 +207,14 @@ def coerce_to_array( elif not (is_integer_dtype(values) or is_float_dtype(values)): raise TypeError(f"{values.dtype} cannot be converted to an IntegerDtype") + if values.ndim != 1: + raise TypeError("values must be a 1D list-like") + if mask is None: - mask = isna(values) + mask = libmissing.is_numeric_na(values) else: assert len(mask) == len(values) - if not values.ndim == 1: - raise TypeError("values must be a 1D list-like") if not mask.ndim == 1: raise TypeError("mask must be a 1D list-like") diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 7b6a76f0a5d10..1af59bb61e0c9 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1480,6 +1480,15 @@ def setitem(self, indexer, value): # we are always 1-D indexer = indexer[0] + # TODO(EA2D): not needed with 2D EAS + if isinstance(value, (np.ndarray, ExtensionArray)) and value.ndim == 2: + assert value.shape[1] == 1 + value = value[:, 0] + elif isinstance(value, ABCDataFrame): + # TODO: should we avoid getting here with DataFrame? + assert value.shape[1] == 1 + value = value._ixs(0, axis=1)._values + check_setitem_lengths(indexer, value, self.values) self.values[indexer] = value return self diff --git a/pandas/tests/arrays/floating/test_construction.py b/pandas/tests/arrays/floating/test_construction.py index 4ce3dd35b538b..484c269d52333 100644 --- a/pandas/tests/arrays/floating/test_construction.py +++ b/pandas/tests/arrays/floating/test_construction.py @@ -97,14 +97,18 @@ def test_to_array_mixed_integer_float(): np.array(["foo"]), [[1, 2], [3, 4]], [np.nan, {"a": 1}], + # all-NA case used to get quietly swapped out before checking ndim + np.array([pd.NA] * 6, dtype=object).reshape(3, 2), ], ) def test_to_array_error(values): # error in converting existing arrays to FloatingArray - msg = ( - r"(:?.* cannot be converted to a FloatingDtype)" - r"|(:?values must be a 1D list-like)" - r"|(:?Cannot pass scalar)" + msg = "|".join( + [ + "cannot be converted to a FloatingDtype", + "values must be a 1D list-like", + "Cannot pass scalar", + ] ) with pytest.raises((TypeError, ValueError), match=msg): pd.array(values, dtype="Float64") diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 942da38dc5a26..9ee18dcd87a3e 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1217,6 +1217,73 @@ def test_setitem_array_as_cell_value(self): expected = DataFrame({"a": [np.zeros((2,))], "b": [np.zeros((2, 2))]}) tm.assert_frame_equal(df, expected) + def test_iloc_setitem_nullable_2d_values(self): + + df = DataFrame({"A": [1, 2, 3]}, dtype="Int64") + orig = df.copy() + + df.loc[:] = df.values[:, ::-1] + tm.assert_frame_equal(df, orig) + + df.loc[:] = pd.core.arrays.PandasArray(df.values[:, ::-1]) + tm.assert_frame_equal(df, orig) + + df.iloc[:] = df.iloc[:, :] + tm.assert_frame_equal(df, orig) + + @pytest.mark.parametrize( + "null", [pd.NaT, pd.NaT.to_numpy("M8[ns]"), pd.NaT.to_numpy("m8[ns]")] + ) + def test_setting_mismatched_na_into_nullable_fails( + self, null, any_numeric_ea_dtype + ): + # don't cast mismatched nulls to pd.NA + df = DataFrame({"A": [1, 2, 3]}, dtype=any_numeric_ea_dtype) + ser = df["A"] + arr = ser._values + + msg = "|".join( + [ + r"int\(\) argument must be a string, a bytes-like object or a " + "number, not 'NaTType'", + r"timedelta64\[ns\] cannot be converted to an? (Floating|Integer)Dtype", + r"datetime64\[ns\] cannot be converted to an? (Floating|Integer)Dtype", + "object cannot be converted to a FloatingDtype", + ] + ) + with pytest.raises(TypeError, match=msg): + arr[0] = null + + with pytest.raises(TypeError, match=msg): + arr[:2] = [null, null] + + with pytest.raises(TypeError, match=msg): + ser[0] = null + + with pytest.raises(TypeError, match=msg): + ser[:2] = [null, null] + + with pytest.raises(TypeError, match=msg): + ser.iloc[0] = null + + with pytest.raises(TypeError, match=msg): + ser.iloc[:2] = [null, null] + + with pytest.raises(TypeError, match=msg): + df.iloc[0, 0] = null + + with pytest.raises(TypeError, match=msg): + df.iloc[:2, 0] = [null, null] + + # Multi-Block + df2 = df.copy() + df2["B"] = ser.copy() + with pytest.raises(TypeError, match=msg): + df2.iloc[0, 0] = null + + with pytest.raises(TypeError, match=msg): + df2.iloc[:2, 0] = [null, null] + class TestDataFrameIndexingUInt64: def test_setitem(self, uint64_frame): diff --git a/pandas/tests/series/methods/test_clip.py b/pandas/tests/series/methods/test_clip.py index 247f0d50772ce..bc6d5aeb0a581 100644 --- a/pandas/tests/series/methods/test_clip.py +++ b/pandas/tests/series/methods/test_clip.py @@ -46,9 +46,14 @@ def test_series_clipping_with_na_values(self, any_numeric_ea_dtype, nulls_fixtur # Ensure that clipping method can handle NA values with out failing # GH#40581 - s = Series([nulls_fixture, 1.0, 3.0], dtype=any_numeric_ea_dtype) - s_clipped_upper = s.clip(upper=2.0) - s_clipped_lower = s.clip(lower=2.0) + if nulls_fixture is pd.NaT: + # constructor will raise, see + # test_constructor_mismatched_null_nullable_dtype + return + + ser = Series([nulls_fixture, 1.0, 3.0], dtype=any_numeric_ea_dtype) + s_clipped_upper = ser.clip(upper=2.0) + s_clipped_lower = ser.clip(lower=2.0) expected_upper = Series([nulls_fixture, 1.0, 2.0], dtype=any_numeric_ea_dtype) expected_lower = Series([nulls_fixture, 2.0, 3.0], dtype=any_numeric_ea_dtype) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 692c040a33ff8..c83f57981e11d 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -1817,6 +1817,24 @@ def test_constructor_bool_dtype_missing_values(self): expected = Series(True, index=[0], dtype="bool") tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("func", [Series, DataFrame, Index, pd.array]) + def test_constructor_mismatched_null_nullable_dtype( + self, func, any_numeric_ea_dtype + ): + msg = "|".join( + [ + "cannot safely cast non-equivalent object", + r"int\(\) argument must be a string, a bytes-like object or a number", + r"Cannot cast array data from dtype\('O'\) to dtype\('float64'\) " + "according to the rule 'safe'", + "object cannot be converted to a FloatingDtype", + ] + ) + + for null in tm.NP_NAT_OBJECTS + [NaT]: + with pytest.raises(TypeError, match=msg): + func([null, 1.0, 3.0], dtype=any_numeric_ea_dtype) + class TestSeriesConstructorIndexCoercion: def test_series_constructor_datetimelike_index_coercion(self): From 1166725f07644fefb2f94dddf4422beb92ecefb7 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 18 Nov 2021 08:52:27 -0800 Subject: [PATCH 02/19] Whatsnew, GH ref --- doc/source/whatsnew/v1.4.0.rst | 3 +++ .../tests/arrays/floating/test_construction.py | 2 +- pandas/tests/extension/base/setitem.py | 17 +++++++++++++++++ pandas/tests/frame/indexing/test_indexing.py | 2 +- pandas/tests/series/test_constructors.py | 1 + 5 files changed, 23 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 2456406f0eca3..5db79665354f6 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -610,6 +610,8 @@ Indexing - Bug in :meth:`Series.__setitem__` with a boolean mask indexer setting a listlike value of length 1 incorrectly broadcasting that value (:issue:`44265`) - Bug in :meth:`DataFrame.loc.__setitem__` and :meth:`DataFrame.iloc.__setitem__` with mixed dtypes sometimes failing to operate in-place (:issue:`44345`) - Bug in :meth:`DataFrame.loc.__getitem__` incorrectly raising ``KeyError`` when selecting a single column with a boolean key (:issue:`44322`). +- Bug in setting :meth:`DataFrame.iloc` with a single ``ExtensionDtype`` column and setting 2D values e.g. ``df.iloc[:] = df.values`` incorrectly raising (:issue:`44514`) +- Missing ^^^^^^^ @@ -709,6 +711,7 @@ ExtensionArray - Bug in :func:`array` failing to preserve :class:`PandasArray` (:issue:`43887`) - NumPy ufuncs ``np.abs``, ``np.positive``, ``np.negative`` now correctly preserve dtype when called on ExtensionArrays that implement ``__abs__, __pos__, __neg__``, respectively. In particular this is fixed for :class:`TimedeltaArray` (:issue:`43899`) - Avoid raising ``PerformanceWarning`` about fragmented DataFrame when using many columns with an extension dtype (:issue:`44098`) +- Bug in :class:`IntegerArray` and :class:`FloatingArray` construction incorrectly coercing mismatched NA values (e.g. ``np.timedelta64("NaT")``) to numeric NA (:issue:`44514`) - Styler diff --git a/pandas/tests/arrays/floating/test_construction.py b/pandas/tests/arrays/floating/test_construction.py index 484c269d52333..4b7b237d2eb7c 100644 --- a/pandas/tests/arrays/floating/test_construction.py +++ b/pandas/tests/arrays/floating/test_construction.py @@ -97,7 +97,7 @@ def test_to_array_mixed_integer_float(): np.array(["foo"]), [[1, 2], [3, 4]], [np.nan, {"a": 1}], - # all-NA case used to get quietly swapped out before checking ndim + # GH#44514 all-NA case used to get quietly swapped out before checking ndim np.array([pd.NA] * 6, dtype=object).reshape(3, 2), ], ) diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index a2d100db81a2c..221710fbffca1 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -357,6 +357,23 @@ def test_setitem_series(self, data, full_indexer): ) self.assert_series_equal(result, expected) + def test_setitem_frame_2d_values(self, data): + # GH#44514 + df = pd.DataFrame({"A": data}) + orig = df.copy() + + df.iloc[:] = df + self.assert_frame_equal(df, orig) + + df.iloc[:-1] = df.iloc[:-1] + self.assert_frame_equal(df, orig) + + df.iloc[:] = df.values + self.assert_frame_equal(df, orig) + + df.iloc[:-1] = df.values[:-1] + self.assert_frame_equal(df, orig) + def test_delitem_series(self, data): # GH#40763 ser = pd.Series(data, name="data") diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 9ee18dcd87a3e..1ab9e15ea889b 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1237,7 +1237,7 @@ def test_iloc_setitem_nullable_2d_values(self): def test_setting_mismatched_na_into_nullable_fails( self, null, any_numeric_ea_dtype ): - # don't cast mismatched nulls to pd.NA + # GH#44514 don't cast mismatched nulls to pd.NA df = DataFrame({"A": [1, 2, 3]}, dtype=any_numeric_ea_dtype) ser = df["A"] arr = ser._values diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index c83f57981e11d..defa1af21a60d 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -1821,6 +1821,7 @@ def test_constructor_bool_dtype_missing_values(self): def test_constructor_mismatched_null_nullable_dtype( self, func, any_numeric_ea_dtype ): + # GH#44514 msg = "|".join( [ "cannot safely cast non-equivalent object", From 21b6977f8445edd300cad1491d78f811201bd566 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 18 Nov 2021 13:26:10 -0800 Subject: [PATCH 03/19] mypy fixup --- pandas/core/internals/blocks.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 1af59bb61e0c9..05a3a30acd21a 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1483,7 +1483,9 @@ def setitem(self, indexer, value): # TODO(EA2D): not needed with 2D EAS if isinstance(value, (np.ndarray, ExtensionArray)) and value.ndim == 2: assert value.shape[1] == 1 - value = value[:, 0] + # error: No overload variant of "__getitem__" of "ExtensionArray" + # matches argument type "Tuple[slice, int]" + value = value[:, 0] # type: ignore[call-overload] elif isinstance(value, ABCDataFrame): # TODO: should we avoid getting here with DataFrame? assert value.shape[1] == 1 From a4d89ce4e92bf825e789db491d188b21baf5f458 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 20 Nov 2021 13:40:07 -0800 Subject: [PATCH 04/19] xfail on old numpy --- pandas/tests/series/test_constructors.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index defa1af21a60d..3bde2e2a7cb01 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -1817,6 +1817,9 @@ def test_constructor_bool_dtype_missing_values(self): expected = Series(True, index=[0], dtype="bool") tm.assert_series_equal(result, expected) + @pytest.mark.xfail( + np_version_under1p19, reason="np.array([td64nat, float, float]) raises" + ) @pytest.mark.parametrize("func", [Series, DataFrame, Index, pd.array]) def test_constructor_mismatched_null_nullable_dtype( self, func, any_numeric_ea_dtype From d322af3ee10be6cd8e06cd87a16bb30633e06b35 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 20 Nov 2021 13:55:59 -0800 Subject: [PATCH 05/19] xfail ArrayManager --- pandas/tests/extension/base/setitem.py | 17 ++++++++++++++++- pandas/tests/frame/indexing/test_indexing.py | 5 ++++- pandas/tests/series/test_constructors.py | 6 +++++- 3 files changed, 25 insertions(+), 3 deletions(-) diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index 221710fbffca1..68b356ec32c25 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -1,6 +1,13 @@ import numpy as np import pytest +from pandas.core.dtypes.dtypes import ( + DatetimeTZDtype, + IntervalDtype, + PandasDtype, + PeriodDtype, +) + import pandas as pd import pandas._testing as tm from pandas.tests.extension.base.base import BaseExtensionTests @@ -357,8 +364,16 @@ def test_setitem_series(self, data, full_indexer): ) self.assert_series_equal(result, expected) - def test_setitem_frame_2d_values(self, data): + def test_setitem_frame_2d_values(self, data, using_array_manager, request): # GH#44514 + if using_array_manager: + if not isinstance( + data.dtype, (PandasDtype, PeriodDtype, IntervalDtype, DatetimeTZDtype) + ): + # These dtypes have non-broken implementations of _can_hold_element + mark = pytest.mark.xfail(reason="Goes through split path, loses dtype") + request.node.add_marker(mark) + df = pd.DataFrame({"A": data}) orig = df.copy() diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 1ab9e15ea889b..8a1b17e4f9432 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1217,7 +1217,10 @@ def test_setitem_array_as_cell_value(self): expected = DataFrame({"a": [np.zeros((2,))], "b": [np.zeros((2, 2))]}) tm.assert_frame_equal(df, expected) - def test_iloc_setitem_nullable_2d_values(self): + def test_iloc_setitem_nullable_2d_values(self, using_array_manager, request): + if using_array_manager: + mark = pytest.mark.xfail(reason="Goes through split path, loses dtype") + request.node.add_marker(mark) df = DataFrame({"A": [1, 2, 3]}, dtype="Int64") orig = df.copy() diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 3bde2e2a7cb01..0322eb1973142 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -1817,6 +1817,9 @@ def test_constructor_bool_dtype_missing_values(self): expected = Series(True, index=[0], dtype="bool") tm.assert_series_equal(result, expected) + @pytest.mark.filterwarnings( + "ignore:elementwise comparison failed:DeprecationWarning" + ) @pytest.mark.xfail( np_version_under1p19, reason="np.array([td64nat, float, float]) raises" ) @@ -1828,7 +1831,8 @@ def test_constructor_mismatched_null_nullable_dtype( msg = "|".join( [ "cannot safely cast non-equivalent object", - r"int\(\) argument must be a string, a bytes-like object or a number", + r"int\(\) argument must be a string, a bytes-like object " + "or a (real )?number", r"Cannot cast array data from dtype\('O'\) to dtype\('float64'\) " "according to the rule 'safe'", "object cannot be converted to a FloatingDtype", From 67d615da901226a9c9e2a84bd90a71ee5f319255 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 20 Nov 2021 14:47:12 -0800 Subject: [PATCH 06/19] update tested expception message for py310 --- pandas/tests/frame/indexing/test_indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 8a1b17e4f9432..1250d5c80e017 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1248,7 +1248,7 @@ def test_setting_mismatched_na_into_nullable_fails( msg = "|".join( [ r"int\(\) argument must be a string, a bytes-like object or a " - "number, not 'NaTType'", + "(real )?number, not 'NaTType'", r"timedelta64\[ns\] cannot be converted to an? (Floating|Integer)Dtype", r"datetime64\[ns\] cannot be converted to an? (Floating|Integer)Dtype", "object cannot be converted to a FloatingDtype", From 117aef7b7d6ffcf2a506640dae875e28d4fddcd9 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 20 Nov 2021 16:20:46 -0800 Subject: [PATCH 07/19] xfail on later numpy --- pandas/tests/series/test_constructors.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 0322eb1973142..ebe47b88a24d4 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -13,7 +13,10 @@ iNaT, lib, ) -from pandas.compat.numpy import np_version_under1p19 +from pandas.compat.numpy import ( + np_version_under1p19, + np_version_under1p20, +) import pandas.util._test_decorators as td from pandas.core.dtypes.common import ( @@ -1821,7 +1824,7 @@ def test_constructor_bool_dtype_missing_values(self): "ignore:elementwise comparison failed:DeprecationWarning" ) @pytest.mark.xfail( - np_version_under1p19, reason="np.array([td64nat, float, float]) raises" + np_version_under1p20, reason="np.array([td64nat, float, float]) raises" ) @pytest.mark.parametrize("func", [Series, DataFrame, Index, pd.array]) def test_constructor_mismatched_null_nullable_dtype( From 2a2f8d2afbe28c71306f8efb13fc7db85caf45e3 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 22 Nov 2021 12:39:35 -0800 Subject: [PATCH 08/19] use decorator --- pandas/core/arrays/floating.py | 5 ++++- pandas/core/arrays/integer.py | 1 + pandas/tests/extension/base/setitem.py | 7 ++++++- pandas/tests/frame/indexing/test_indexing.py | 7 ++----- pandas/util/_test_decorators.py | 5 +++++ 5 files changed, 18 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index 8fe8ab28040b8..0561c0bfa1a07 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -4,7 +4,10 @@ import numpy as np -from pandas._libs import lib +from pandas._libs import ( + lib, + missing as libmissing, +) from pandas._typing import ( ArrayLike, AstypeArg, diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index d60c3de119466..0e82ef731bb63 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -7,6 +7,7 @@ from pandas._libs import ( iNaT, lib, + missing as libmissing, ) from pandas._typing import ( ArrayLike, diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index 68b356ec32c25..208a1a1757be2 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -364,8 +364,13 @@ def test_setitem_series(self, data, full_indexer): ) self.assert_series_equal(result, expected) - def test_setitem_frame_2d_values(self, data, using_array_manager, request): + def test_setitem_frame_2d_values(self, data, request): # GH#44514 + df = pd.DataFrame({"A": data}) + + # Avoiding using_array_manager fixture + # https://github.com/pandas-dev/pandas/pull/44514#discussion_r754002410 + using_array_manager = isinstance(df._mgr, pd.core.internals.ArrayManager) if using_array_manager: if not isinstance( data.dtype, (PandasDtype, PeriodDtype, IntervalDtype, DatetimeTZDtype) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 1250d5c80e017..418408324d6f2 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1217,11 +1217,8 @@ def test_setitem_array_as_cell_value(self): expected = DataFrame({"a": [np.zeros((2,))], "b": [np.zeros((2, 2))]}) tm.assert_frame_equal(df, expected) - def test_iloc_setitem_nullable_2d_values(self, using_array_manager, request): - if using_array_manager: - mark = pytest.mark.xfail(reason="Goes through split path, loses dtype") - request.node.add_marker(mark) - + @td.xfail_array_manager # with AM goes through split-path, loses dtype + def test_iloc_setitem_nullable_2d_values(self): df = DataFrame({"A": [1, 2, 3]}, dtype="Int64") orig = df.copy() diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index d5ffca36d325f..4aee5b0fb18ef 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -285,6 +285,11 @@ def async_mark(): return async_mark +xfail_array_manager = pytest.mark.skipif( + get_option("mode.data_manager") == "array", + reason="Fails with ArrayManager", +) + skip_array_manager_not_yet_implemented = pytest.mark.skipif( get_option("mode.data_manager") == "array", reason="Not yet implemented for ArrayManager", From 48a453107cf1da170b6bfa5b279052a2cc13dd39 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 26 Nov 2021 19:06:12 -0800 Subject: [PATCH 09/19] raise in is_numeric_na --- pandas/_libs/missing.pyx | 8 +++++--- pandas/core/arrays/floating.py | 4 ---- pandas/tests/frame/indexing/test_indexing.py | 1 + pandas/tests/series/test_constructors.py | 1 + 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index c3a4a3bed739b..585b535775397 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -270,9 +270,11 @@ def is_numeric_na(values: ndarray) -> ndarray: for i in range(N): val = values[i] - if val is None or val is C_NA or util.is_nan(val) or is_decimal_na(val): - result[i] = True - + if checknull(val): + if val is None or val is C_NA or util.is_nan(val) or is_decimal_na(val): + result[i] = True + else: + raise TypeError(f"'values' contains non-numeric NA {val}") return result.view(bool) diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index 0561c0bfa1a07..1144e8907a8b1 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -153,10 +153,6 @@ def coerce_to_array( if mask is None: mask = libmissing.is_numeric_na(values) - mask2 = isna(values) - if not (mask == mask2).all(): - # e.g. if we have a timedelta64("NaT") - raise TypeError(f"{values.dtype} cannot be converted to a FloatingDtype") else: assert len(mask) == len(values) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 0a77862be1f73..b102bcdae57d9 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1250,6 +1250,7 @@ def test_setting_mismatched_na_into_nullable_fails( r"timedelta64\[ns\] cannot be converted to an? (Floating|Integer)Dtype", r"datetime64\[ns\] cannot be converted to an? (Floating|Integer)Dtype", "object cannot be converted to a FloatingDtype", + "'values' contains non-numeric NA", ] ) with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index ebe47b88a24d4..43e4c8364c06c 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -1839,6 +1839,7 @@ def test_constructor_mismatched_null_nullable_dtype( r"Cannot cast array data from dtype\('O'\) to dtype\('float64'\) " "according to the rule 'safe'", "object cannot be converted to a FloatingDtype", + "'values' contains non-numeric NA", ] ) From 745d24f6e2cf076c63320e3395e2040e2d3bc9aa Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 26 Nov 2021 19:14:13 -0800 Subject: [PATCH 10/19] fixup unused import --- pandas/core/arrays/floating.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index 1144e8907a8b1..5e55715ee0e97 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -30,7 +30,6 @@ ExtensionDtype, register_extension_dtype, ) -from pandas.core.dtypes.missing import isna from pandas.core.arrays import ExtensionArray from pandas.core.arrays.numeric import ( From b357a95c195828146210fbf0ea8842f6df4cf71b Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 18 Nov 2021 08:42:58 -0800 Subject: [PATCH 11/19] BUG: IntegerArray/FloatingArray constructors mismatched NAs --- pandas/_libs/missing.pyx | 24 +++++++ pandas/core/arrays/floating.py | 15 +++-- pandas/core/arrays/integer.py | 11 ++- pandas/core/internals/blocks.py | 9 +++ .../arrays/floating/test_construction.py | 12 ++-- pandas/tests/frame/indexing/test_indexing.py | 67 +++++++++++++++++++ pandas/tests/series/methods/test_clip.py | 11 ++- pandas/tests/series/test_constructors.py | 18 +++++ 8 files changed, 149 insertions(+), 18 deletions(-) diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index cd04f4f6e4b3a..8ae41241b366e 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -248,6 +248,30 @@ cdef bint checknull_with_nat_and_na(object obj): return checknull_with_nat(obj) or obj is C_NA +@cython.wraparound(False) +@cython.boundscheck(False) +def is_numeric_na(ndarray values): + """ + Check for NA values consistent with IntegerArray/FloatingArray. + + Similar to a vectorized is_valid_na_for_dtype restricted to numeric dtypes. + """ + cdef: + ndarray[uint8_t] result + Py_ssize_t i, N + object val + + N = len(values) + result = np.zeros(N, dtype=np.uint8) + + for i in range(N): + val = values[i] + if val is None or val is C_NA or util.is_nan(val) or is_decimal_na(val): + result[i] = True + + return result.view(bool) + + # ----------------------------------------------------------------------------- # Implementation of NA singleton diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index 1e7f1aff52d2e..8fe8ab28040b8 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -129,8 +129,7 @@ def coerce_to_array( if is_object_dtype(values): inferred_type = lib.infer_dtype(values, skipna=True) if inferred_type == "empty": - values = np.empty(len(values)) - values.fill(np.nan) + pass elif inferred_type not in [ "floating", "integer", @@ -146,13 +145,19 @@ def coerce_to_array( elif not (is_integer_dtype(values) or is_float_dtype(values)): raise TypeError(f"{values.dtype} cannot be converted to a FloatingDtype") + if values.ndim != 1: + raise TypeError("values must be a 1D list-like") + if mask is None: - mask = isna(values) + mask = libmissing.is_numeric_na(values) + mask2 = isna(values) + if not (mask == mask2).all(): + # e.g. if we have a timedelta64("NaT") + raise TypeError(f"{values.dtype} cannot be converted to a FloatingDtype") + else: assert len(mask) == len(values) - if not values.ndim == 1: - raise TypeError("values must be a 1D list-like") if not mask.ndim == 1: raise TypeError("mask must be a 1D list-like") diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 12bef068ef44b..d60c3de119466 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -32,7 +32,6 @@ is_string_dtype, pandas_dtype, ) -from pandas.core.dtypes.missing import isna from pandas.core.arrays import ExtensionArray from pandas.core.arrays.masked import BaseMaskedDtype @@ -183,8 +182,7 @@ def coerce_to_array( if is_object_dtype(values) or is_string_dtype(values): inferred_type = lib.infer_dtype(values, skipna=True) if inferred_type == "empty": - values = np.empty(len(values)) - values.fill(np.nan) + pass elif inferred_type not in [ "floating", "integer", @@ -202,13 +200,14 @@ def coerce_to_array( elif not (is_integer_dtype(values) or is_float_dtype(values)): raise TypeError(f"{values.dtype} cannot be converted to an IntegerDtype") + if values.ndim != 1: + raise TypeError("values must be a 1D list-like") + if mask is None: - mask = isna(values) + mask = libmissing.is_numeric_na(values) else: assert len(mask) == len(values) - if values.ndim != 1: - raise TypeError("values must be a 1D list-like") if mask.ndim != 1: raise TypeError("mask must be a 1D list-like") diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 3654f77825ab4..22801c2c1af42 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1511,6 +1511,15 @@ def setitem(self, indexer, value): # we are always 1-D indexer = indexer[0] + # TODO(EA2D): not needed with 2D EAS + if isinstance(value, (np.ndarray, ExtensionArray)) and value.ndim == 2: + assert value.shape[1] == 1 + value = value[:, 0] + elif isinstance(value, ABCDataFrame): + # TODO: should we avoid getting here with DataFrame? + assert value.shape[1] == 1 + value = value._ixs(0, axis=1)._values + check_setitem_lengths(indexer, value, self.values) self.values[indexer] = value return self diff --git a/pandas/tests/arrays/floating/test_construction.py b/pandas/tests/arrays/floating/test_construction.py index 4ce3dd35b538b..484c269d52333 100644 --- a/pandas/tests/arrays/floating/test_construction.py +++ b/pandas/tests/arrays/floating/test_construction.py @@ -97,14 +97,18 @@ def test_to_array_mixed_integer_float(): np.array(["foo"]), [[1, 2], [3, 4]], [np.nan, {"a": 1}], + # all-NA case used to get quietly swapped out before checking ndim + np.array([pd.NA] * 6, dtype=object).reshape(3, 2), ], ) def test_to_array_error(values): # error in converting existing arrays to FloatingArray - msg = ( - r"(:?.* cannot be converted to a FloatingDtype)" - r"|(:?values must be a 1D list-like)" - r"|(:?Cannot pass scalar)" + msg = "|".join( + [ + "cannot be converted to a FloatingDtype", + "values must be a 1D list-like", + "Cannot pass scalar", + ] ) with pytest.raises((TypeError, ValueError), match=msg): pd.array(values, dtype="Float64") diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 40e6500fce64b..10b14f6663aed 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1211,6 +1211,73 @@ def test_setitem_array_as_cell_value(self): expected = DataFrame({"a": [np.zeros((2,))], "b": [np.zeros((2, 2))]}) tm.assert_frame_equal(df, expected) + def test_iloc_setitem_nullable_2d_values(self): + + df = DataFrame({"A": [1, 2, 3]}, dtype="Int64") + orig = df.copy() + + df.loc[:] = df.values[:, ::-1] + tm.assert_frame_equal(df, orig) + + df.loc[:] = pd.core.arrays.PandasArray(df.values[:, ::-1]) + tm.assert_frame_equal(df, orig) + + df.iloc[:] = df.iloc[:, :] + tm.assert_frame_equal(df, orig) + + @pytest.mark.parametrize( + "null", [pd.NaT, pd.NaT.to_numpy("M8[ns]"), pd.NaT.to_numpy("m8[ns]")] + ) + def test_setting_mismatched_na_into_nullable_fails( + self, null, any_numeric_ea_dtype + ): + # don't cast mismatched nulls to pd.NA + df = DataFrame({"A": [1, 2, 3]}, dtype=any_numeric_ea_dtype) + ser = df["A"] + arr = ser._values + + msg = "|".join( + [ + r"int\(\) argument must be a string, a bytes-like object or a " + "number, not 'NaTType'", + r"timedelta64\[ns\] cannot be converted to an? (Floating|Integer)Dtype", + r"datetime64\[ns\] cannot be converted to an? (Floating|Integer)Dtype", + "object cannot be converted to a FloatingDtype", + ] + ) + with pytest.raises(TypeError, match=msg): + arr[0] = null + + with pytest.raises(TypeError, match=msg): + arr[:2] = [null, null] + + with pytest.raises(TypeError, match=msg): + ser[0] = null + + with pytest.raises(TypeError, match=msg): + ser[:2] = [null, null] + + with pytest.raises(TypeError, match=msg): + ser.iloc[0] = null + + with pytest.raises(TypeError, match=msg): + ser.iloc[:2] = [null, null] + + with pytest.raises(TypeError, match=msg): + df.iloc[0, 0] = null + + with pytest.raises(TypeError, match=msg): + df.iloc[:2, 0] = [null, null] + + # Multi-Block + df2 = df.copy() + df2["B"] = ser.copy() + with pytest.raises(TypeError, match=msg): + df2.iloc[0, 0] = null + + with pytest.raises(TypeError, match=msg): + df2.iloc[:2, 0] = [null, null] + class TestDataFrameIndexingUInt64: def test_setitem(self, uint64_frame): diff --git a/pandas/tests/series/methods/test_clip.py b/pandas/tests/series/methods/test_clip.py index 247f0d50772ce..bc6d5aeb0a581 100644 --- a/pandas/tests/series/methods/test_clip.py +++ b/pandas/tests/series/methods/test_clip.py @@ -46,9 +46,14 @@ def test_series_clipping_with_na_values(self, any_numeric_ea_dtype, nulls_fixtur # Ensure that clipping method can handle NA values with out failing # GH#40581 - s = Series([nulls_fixture, 1.0, 3.0], dtype=any_numeric_ea_dtype) - s_clipped_upper = s.clip(upper=2.0) - s_clipped_lower = s.clip(lower=2.0) + if nulls_fixture is pd.NaT: + # constructor will raise, see + # test_constructor_mismatched_null_nullable_dtype + return + + ser = Series([nulls_fixture, 1.0, 3.0], dtype=any_numeric_ea_dtype) + s_clipped_upper = ser.clip(upper=2.0) + s_clipped_lower = ser.clip(lower=2.0) expected_upper = Series([nulls_fixture, 1.0, 2.0], dtype=any_numeric_ea_dtype) expected_lower = Series([nulls_fixture, 2.0, 3.0], dtype=any_numeric_ea_dtype) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 8023713dfcf39..fc8b63182593c 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -1811,6 +1811,24 @@ def test_constructor_bool_dtype_missing_values(self): expected = Series(True, index=[0], dtype="bool") tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("func", [Series, DataFrame, Index, pd.array]) + def test_constructor_mismatched_null_nullable_dtype( + self, func, any_numeric_ea_dtype + ): + msg = "|".join( + [ + "cannot safely cast non-equivalent object", + r"int\(\) argument must be a string, a bytes-like object or a number", + r"Cannot cast array data from dtype\('O'\) to dtype\('float64'\) " + "according to the rule 'safe'", + "object cannot be converted to a FloatingDtype", + ] + ) + + for null in tm.NP_NAT_OBJECTS + [NaT]: + with pytest.raises(TypeError, match=msg): + func([null, 1.0, 3.0], dtype=any_numeric_ea_dtype) + class TestSeriesConstructorIndexCoercion: def test_series_constructor_datetimelike_index_coercion(self): From df0d4c5afade785175549b07a2e01176e2ea64bb Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 18 Nov 2021 08:52:27 -0800 Subject: [PATCH 12/19] Whatsnew, GH ref --- doc/source/whatsnew/v1.4.0.rst | 2 ++ .../tests/arrays/floating/test_construction.py | 2 +- pandas/tests/extension/base/setitem.py | 17 +++++++++++++++++ pandas/tests/frame/indexing/test_indexing.py | 2 +- pandas/tests/series/test_constructors.py | 1 + 5 files changed, 22 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index bcebe3ab024ba..f197acdbdb000 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -678,6 +678,7 @@ Indexing - Bug in :meth:`DataFrame.loc.__setitem__` and :meth:`DataFrame.iloc.__setitem__` with mixed dtypes sometimes failing to operate in-place (:issue:`44345`) - Bug in :meth:`DataFrame.loc.__getitem__` incorrectly raising ``KeyError`` when selecting a single column with a boolean key (:issue:`44322`). - Bug in indexing on columns with ``loc`` or ``iloc`` using a slice with a negative step with ``ExtensionDtype`` columns incorrectly raising (:issue:`44551`) +- Bug in setting :meth:`DataFrame.iloc` with a single ``ExtensionDtype`` column and setting 2D values e.g. ``df.iloc[:] = df.values`` incorrectly raising (:issue:`44514`) - Bug in :meth:`IntervalIndex.get_indexer_non_unique` returning boolean mask instead of array of integers for a non unique and non monotonic index (:issue:`44084`) - Bug in :meth:`IntervalIndex.get_indexer_non_unique` not handling targets of ``dtype`` 'object' with NaNs correctly (:issue:`44482`) - @@ -797,6 +798,7 @@ ExtensionArray - NumPy ufuncs ``np.minimum.reduce`` and ``np.maximum.reduce`` now work correctly instead of raising ``NotImplementedError`` on :class:`Series` with ``IntegerDtype`` or ``FloatDtype`` (:issue:`43923`) - Avoid raising ``PerformanceWarning`` about fragmented DataFrame when using many columns with an extension dtype (:issue:`44098`) - Bug in :meth:`BooleanArray.__eq__` and :meth:`BooleanArray.__ne__` raising ``TypeError`` on comparison with an incompatible type (like a string). This caused :meth:`DataFrame.replace` to sometimes raise a ``TypeError`` if a nullable boolean column was included (:issue:`44499`) +- Bug in :class:`IntegerArray` and :class:`FloatingArray` construction incorrectly coercing mismatched NA values (e.g. ``np.timedelta64("NaT")``) to numeric NA (:issue:`44514`) - Styler diff --git a/pandas/tests/arrays/floating/test_construction.py b/pandas/tests/arrays/floating/test_construction.py index 484c269d52333..4b7b237d2eb7c 100644 --- a/pandas/tests/arrays/floating/test_construction.py +++ b/pandas/tests/arrays/floating/test_construction.py @@ -97,7 +97,7 @@ def test_to_array_mixed_integer_float(): np.array(["foo"]), [[1, 2], [3, 4]], [np.nan, {"a": 1}], - # all-NA case used to get quietly swapped out before checking ndim + # GH#44514 all-NA case used to get quietly swapped out before checking ndim np.array([pd.NA] * 6, dtype=object).reshape(3, 2), ], ) diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index a2d100db81a2c..221710fbffca1 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -357,6 +357,23 @@ def test_setitem_series(self, data, full_indexer): ) self.assert_series_equal(result, expected) + def test_setitem_frame_2d_values(self, data): + # GH#44514 + df = pd.DataFrame({"A": data}) + orig = df.copy() + + df.iloc[:] = df + self.assert_frame_equal(df, orig) + + df.iloc[:-1] = df.iloc[:-1] + self.assert_frame_equal(df, orig) + + df.iloc[:] = df.values + self.assert_frame_equal(df, orig) + + df.iloc[:-1] = df.values[:-1] + self.assert_frame_equal(df, orig) + def test_delitem_series(self, data): # GH#40763 ser = pd.Series(data, name="data") diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 10b14f6663aed..5e022567c9c8a 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1231,7 +1231,7 @@ def test_iloc_setitem_nullable_2d_values(self): def test_setting_mismatched_na_into_nullable_fails( self, null, any_numeric_ea_dtype ): - # don't cast mismatched nulls to pd.NA + # GH#44514 don't cast mismatched nulls to pd.NA df = DataFrame({"A": [1, 2, 3]}, dtype=any_numeric_ea_dtype) ser = df["A"] arr = ser._values diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index fc8b63182593c..eb29cb10e2b84 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -1815,6 +1815,7 @@ def test_constructor_bool_dtype_missing_values(self): def test_constructor_mismatched_null_nullable_dtype( self, func, any_numeric_ea_dtype ): + # GH#44514 msg = "|".join( [ "cannot safely cast non-equivalent object", From 421c4febe2405df8e8e2a61e3ea5ed352a95e3f4 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 18 Nov 2021 13:26:10 -0800 Subject: [PATCH 13/19] mypy fixup --- pandas/core/internals/blocks.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 22801c2c1af42..6509c5fcaf596 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1514,7 +1514,9 @@ def setitem(self, indexer, value): # TODO(EA2D): not needed with 2D EAS if isinstance(value, (np.ndarray, ExtensionArray)) and value.ndim == 2: assert value.shape[1] == 1 - value = value[:, 0] + # error: No overload variant of "__getitem__" of "ExtensionArray" + # matches argument type "Tuple[slice, int]" + value = value[:, 0] # type: ignore[call-overload] elif isinstance(value, ABCDataFrame): # TODO: should we avoid getting here with DataFrame? assert value.shape[1] == 1 From d967af5772a189cc02515ab1552e53fd2e6b8ad4 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 30 Nov 2021 16:00:48 -0800 Subject: [PATCH 14/19] BUG: Series.where with incompatible NA value --- doc/source/whatsnew/v1.4.0.rst | 1 + pandas/core/arrays/floating.py | 5 +- pandas/core/arrays/integer.py | 1 + pandas/core/frame.py | 2 +- pandas/core/generic.py | 2 +- pandas/core/internals/blocks.py | 17 +++--- pandas/core/series.py | 2 +- .../tests/arrays/integer/test_construction.py | 12 +++-- pandas/tests/frame/indexing/test_where.py | 52 +++++++++++++++++++ 9 files changed, 78 insertions(+), 16 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index f197acdbdb000..daf149e1858ab 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -799,6 +799,7 @@ ExtensionArray - Avoid raising ``PerformanceWarning`` about fragmented DataFrame when using many columns with an extension dtype (:issue:`44098`) - Bug in :meth:`BooleanArray.__eq__` and :meth:`BooleanArray.__ne__` raising ``TypeError`` on comparison with an incompatible type (like a string). This caused :meth:`DataFrame.replace` to sometimes raise a ``TypeError`` if a nullable boolean column was included (:issue:`44499`) - Bug in :class:`IntegerArray` and :class:`FloatingArray` construction incorrectly coercing mismatched NA values (e.g. ``np.timedelta64("NaT")``) to numeric NA (:issue:`44514`) +- Bug in :meth:`Series.where` with ``ExtensionDtype`` when ``other`` is a NA scalar incompatible with the series dtype (e.g. ``NaT`` with a numeric dtype) incorrectly casting to a compatible NA value (:issue:`??`) - Styler diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index 8fe8ab28040b8..0561c0bfa1a07 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -4,7 +4,10 @@ import numpy as np -from pandas._libs import lib +from pandas._libs import ( + lib, + missing as libmissing, +) from pandas._typing import ( ArrayLike, AstypeArg, diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index d60c3de119466..0e82ef731bb63 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -7,6 +7,7 @@ from pandas._libs import ( iNaT, lib, + missing as libmissing, ) from pandas._typing import ( ArrayLike, diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7a0e945d917de..5073f2a2369f5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10855,7 +10855,7 @@ def interpolate( def where( self, cond, - other=np.nan, + other=lib.no_default, inplace=False, axis=None, level=None, diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 4aff7acc4c6fb..b0fd343084ef6 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8937,7 +8937,7 @@ def _align_series( def _where( self, cond, - other=np.nan, + other=lib.no_default, inplace=False, axis=None, level=None, diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 6509c5fcaf596..720969f6853a3 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -962,6 +962,9 @@ def putmask(self, mask, new) -> list[Block]: mask, noop = validate_putmask(values.T, mask) assert not isinstance(new, (ABCIndex, ABCSeries, ABCDataFrame)) + if new is lib.no_default: + new = self.fill_value + # if we are passed a scalar None, convert it here if not self.is_object and is_valid_na_for_dtype(new, self.dtype): new = self.fill_value @@ -1178,6 +1181,9 @@ def where(self, other, cond) -> list[Block]: icond, noop = validate_putmask(values, ~cond) + if other is lib.no_default: + other = self.fill_value + if is_valid_na_for_dtype(other, self.dtype) and self.dtype != _dtype_obj: other = self.fill_value @@ -1640,13 +1646,8 @@ def where(self, other, cond) -> list[Block]: other = self._maybe_squeeze_arg(other) cond = self._maybe_squeeze_arg(cond) - if lib.is_scalar(other) and isna(other): - # The default `other` for Series / Frame is np.nan - # we want to replace that with the correct NA value - # for the type - # error: Item "dtype[Any]" of "Union[dtype[Any], ExtensionDtype]" has no - # attribute "na_value" - other = self.dtype.na_value # type: ignore[union-attr] + if other is lib.no_default: + other = self.fill_value icond, noop = validate_putmask(self.values, ~cond) if noop: @@ -1741,6 +1742,8 @@ def where(self, other, cond) -> list[Block]: arr = self.values cond = extract_bool_array(cond) + if other is lib.no_default: + other = self.fill_value try: res_values = arr.T._where(cond, other).T diff --git a/pandas/core/series.py b/pandas/core/series.py index ffa31b4f66211..dd6f50b886f0a 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -5485,7 +5485,7 @@ def interpolate( def where( self, cond, - other=np.nan, + other=lib.no_default, inplace=False, axis=None, level=None, diff --git a/pandas/tests/arrays/integer/test_construction.py b/pandas/tests/arrays/integer/test_construction.py index be9230175bb5d..e5fd4977ec2b8 100644 --- a/pandas/tests/arrays/integer/test_construction.py +++ b/pandas/tests/arrays/integer/test_construction.py @@ -133,11 +133,13 @@ def test_to_integer_array_none_is_nan(a, b): ) def test_to_integer_array_error(values): # error in converting existing arrays to IntegerArrays - msg = ( - r"(:?.* cannot be converted to an IntegerDtype)" - r"|(invalid literal for int\(\) with base 10: .*)" - r"|(:?values must be a 1D list-like)" - r"|(Cannot pass scalar)" + msg = "|".join( + [ + r"cannot be converted to an IntegerDtype", + r"invalid literal for int\(\) with base 10:", + r"values must be a 1D list-like", + r"Cannot pass scalar", + ] ) with pytest.raises((ValueError, TypeError), match=msg): pd.array(values, dtype="Int64") diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index 0906186418c0a..d6b3570dfe3e6 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -797,3 +797,55 @@ def test_where_columns_casting(): result = df.where(pd.notnull(df), None) # make sure dtypes don't change tm.assert_frame_equal(expected, result) + + +@pytest.mark.parametrize("as_cat", [True, False]) +def test_where_period_invalid_na(frame_or_series, as_cat): + idx = pd.period_range("2016-01-01", periods=3, freq="D") + if as_cat: + idx = idx.astype("category") + obj = frame_or_series(idx) + + # NA value that we should *not* cast to Period dtype + tdnat = pd.NaT.to_numpy("m8[ns]") + + mask = np.array([True, True, False], ndmin=obj.ndim).T + + if as_cat: + msg = ( + r"Cannot setitem on a Categorical with a new category \(NaT\), " + "set the categories first" + ) + else: + msg = "value should be a 'Period'" + + with pytest.raises(TypeError, match=msg): + obj.where(mask, tdnat) + + with pytest.raises(TypeError, match=msg): + obj.mask(mask, tdnat) + + +def test_where_nullable_invalid_na(frame_or_series, any_numeric_ea_dtype): + arr = pd.array([1, 2, 3], dtype=any_numeric_ea_dtype) + obj = frame_or_series(arr) + + mask = np.array([True, True, False], ndmin=obj.ndim).T + + msg = "|".join( + [ + r"datetime64\[.{1,2}\] cannot be converted to an? (Integer|Floating)Dtype", + r"timedelta64\[.{1,2}\] cannot be converted to an? (Integer|Floating)Dtype", + r"int\(\) argument must be a string, a bytes-like object or a number, " + "not 'NaTType'", + "object cannot be converted to a FloatingDtype", + ] + ) + + for null in tm.NP_NAT_OBJECTS + [pd.NaT]: + # NaT is an NA value that we should *not* cast to pd.NA dtype + with pytest.raises(TypeError, match=msg): + obj.where(mask, null) + + with pytest.raises(TypeError, match=msg): + obj.mask(mask, null) From 771356a5e8fb3d555012414b17c5cd52d059c848 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 30 Nov 2021 16:06:17 -0800 Subject: [PATCH 15/19] GH refs --- doc/source/whatsnew/v1.4.0.rst | 2 +- pandas/tests/frame/indexing/test_where.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 2f246b0dd9a63..4fecc2f923900 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -801,7 +801,7 @@ ExtensionArray - Bug in :class:`IntegerArray` and :class:`FloatingArray` construction incorrectly coercing mismatched NA values (e.g. ``np.timedelta64("NaT")``) to numeric NA (:issue:`44514`) - Bug in :meth:`BooleanArray.__eq__` and :meth:`BooleanArray.__ne__` raising ``TypeError`` on comparison with an incompatible type (like a string). This caused :meth:`DataFrame.replace` to sometimes raise a ``TypeError`` if a nullable boolean column was included (:issue:`44499`) - Bug in :class:`IntegerArray` and :class:`FloatingArray` construction incorrectly coercing mismatched NA values (e.g. ``np.timedelta64("NaT")``) to numeric NA (:issue:`44514`) -- Bug in :meth:`Series.where` with ``ExtensionDtype`` when ``other`` is a NA scalar incompatible with the series dtype (e.g. ``NaT`` with a numeric dtype) incorrectly casting to a compatible NA value (:issue:`??`) +- Bug in :meth:`Series.where` with ``ExtensionDtype`` when ``other`` is a NA scalar incompatible with the series dtype (e.g. ``NaT`` with a numeric dtype) incorrectly casting to a compatible NA value (:issue:`44697`) - Styler diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index d6b3570dfe3e6..11cd328f85a1a 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -801,6 +801,7 @@ def test_where_columns_casting(): @pytest.mark.parametrize("as_cat", [True, False]) def test_where_period_invalid_na(frame_or_series, as_cat): + # GH#44697 idx = pd.period_range("2016-01-01", periods=3, freq="D") if as_cat: idx = idx.astype("category") @@ -827,6 +828,7 @@ def test_where_period_invalid_na(frame_or_series, as_cat): def test_where_nullable_invalid_na(frame_or_series, any_numeric_ea_dtype): + # GH#44697 arr = pd.array([1, 2, 3], dtype=any_numeric_ea_dtype) obj = frame_or_series(arr) From 34f65d6a13ab81bd6644a84324977ebb307a5458 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 30 Nov 2021 18:55:45 -0800 Subject: [PATCH 16/19] update exception message --- pandas/tests/frame/indexing/test_where.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index 11cd328f85a1a..723b5f4598557 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -841,6 +841,7 @@ def test_where_nullable_invalid_na(frame_or_series, any_numeric_ea_dtype): r"int\(\) argument must be a string, a bytes-like object or a number, " "not 'NaTType'", "object cannot be converted to a FloatingDtype", + "'values' contains non-numeric NA", ] ) From e6f6331c25fdfdb4c2a92757d1de5653b613cca1 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 1 Dec 2021 10:42:19 -0800 Subject: [PATCH 17/19] xfail on old numpy --- pandas/tests/frame/indexing/test_where.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index 723b5f4598557..9d113660c29e2 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas.compat import np_version_under1p19 + from pandas.core.dtypes.common import is_scalar import pandas as pd @@ -800,7 +802,7 @@ def test_where_columns_casting(): @pytest.mark.parametrize("as_cat", [True, False]) -def test_where_period_invalid_na(frame_or_series, as_cat): +def test_where_period_invalid_na(frame_or_series, as_cat, request): # GH#44697 idx = pd.period_range("2016-01-01", periods=3, freq="D") if as_cat: @@ -817,6 +819,13 @@ def test_where_period_invalid_na(frame_or_series, as_cat): r"Cannot setitem on a Categorical with a new category \(NaT\), " "set the categories first" ) + if np_version_under1p19: + mark = pytest.mark.xfail( + reason="When evaluating the f-string to generate the exception " + "message, numpy somehow ends up trying to cast None to int, so " + "ends up raising TypeError but with an unrelated message." + ) + request.node.add_marker(mark) else: msg = "value should be a 'Period'" From 06e44faa7511a1e5ab5fc606c4a71a61fdbed22b Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 1 Dec 2021 16:05:01 -0800 Subject: [PATCH 18/19] remove duplicated note --- doc/source/whatsnew/v1.4.0.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 3d6bebbcabd23..bb0b3e42b7825 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -682,7 +682,6 @@ Indexing - Bug in :meth:`DataFrame.loc.__getitem__` incorrectly raising ``KeyError`` when selecting a single column with a boolean key (:issue:`44322`). - Bug in setting :meth:`DataFrame.iloc` with a single ``ExtensionDtype`` column and setting 2D values e.g. ``df.iloc[:] = df.values`` incorrectly raising (:issue:`44514`) - Bug in indexing on columns with ``loc`` or ``iloc`` using a slice with a negative step with ``ExtensionDtype`` columns incorrectly raising (:issue:`44551`) -- Bug in setting :meth:`DataFrame.iloc` with a single ``ExtensionDtype`` column and setting 2D values e.g. ``df.iloc[:] = df.values`` incorrectly raising (:issue:`44514`) - Bug in :meth:`IntervalIndex.get_indexer_non_unique` returning boolean mask instead of array of integers for a non unique and non monotonic index (:issue:`44084`) - Bug in :meth:`IntervalIndex.get_indexer_non_unique` not handling targets of ``dtype`` 'object' with NaNs correctly (:issue:`44482`) - From fc158a62dae879c94c772caa7c7ab28318d63c67 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 1 Dec 2021 16:05:17 -0800 Subject: [PATCH 19/19] remove duplicated note --- doc/source/whatsnew/v1.4.0.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index bb0b3e42b7825..9dd05eae750db 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -802,7 +802,6 @@ ExtensionArray - Avoid raising ``PerformanceWarning`` about fragmented DataFrame when using many columns with an extension dtype (:issue:`44098`) - Bug in :class:`IntegerArray` and :class:`FloatingArray` construction incorrectly coercing mismatched NA values (e.g. ``np.timedelta64("NaT")``) to numeric NA (:issue:`44514`) - Bug in :meth:`BooleanArray.__eq__` and :meth:`BooleanArray.__ne__` raising ``TypeError`` on comparison with an incompatible type (like a string). This caused :meth:`DataFrame.replace` to sometimes raise a ``TypeError`` if a nullable boolean column was included (:issue:`44499`) -- Bug in :class:`IntegerArray` and :class:`FloatingArray` construction incorrectly coercing mismatched NA values (e.g. ``np.timedelta64("NaT")``) to numeric NA (:issue:`44514`) - Bug in :meth:`Series.where` with ``ExtensionDtype`` when ``other`` is a NA scalar incompatible with the series dtype (e.g. ``NaT`` with a numeric dtype) incorrectly casting to a compatible NA value (:issue:`44697`) -