diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index fd7cb6a69d955..12b44a2bcbd5a 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -803,6 +803,7 @@ ExtensionArray - Avoid raising ``PerformanceWarning`` about fragmented DataFrame when using many columns with an extension dtype (:issue:`44098`) - Bug in :class:`IntegerArray` and :class:`FloatingArray` construction incorrectly coercing mismatched NA values (e.g. ``np.timedelta64("NaT")``) to numeric NA (:issue:`44514`) - Bug in :meth:`BooleanArray.__eq__` and :meth:`BooleanArray.__ne__` raising ``TypeError`` on comparison with an incompatible type (like a string). This caused :meth:`DataFrame.replace` to sometimes raise a ``TypeError`` if a nullable boolean column was included (:issue:`44499`) +- Bug in :meth:`Series.where` with ``ExtensionDtype`` when ``other`` is a NA scalar incompatible with the series dtype (e.g. ``NaT`` with a numeric dtype) incorrectly casting to a compatible NA value (:issue:`44697`) - Styler diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 12c6eaa86552f..f0e35a8914870 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10857,7 +10857,7 @@ def interpolate( def where( self, cond, - other=np.nan, + other=lib.no_default, inplace=False, axis=None, level=None, diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 4aff7acc4c6fb..b0fd343084ef6 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8937,7 +8937,7 @@ def _align_series( def _where( self, cond, - other=np.nan, + other=lib.no_default, inplace=False, axis=None, level=None, diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index a172cfa49e5b3..f93f386261005 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -962,6 +962,9 @@ def putmask(self, mask, new) -> list[Block]: mask, noop = validate_putmask(values.T, mask) assert not isinstance(new, (ABCIndex, ABCSeries, ABCDataFrame)) + if new is lib.no_default: + new = self.fill_value + # if we are passed a scalar None, convert it here if not self.is_object and is_valid_na_for_dtype(new, self.dtype): new = self.fill_value @@ -1173,6 +1176,9 @@ def where(self, other, cond) -> list[Block]: icond, noop = validate_putmask(values, ~cond) + if other is lib.no_default: + other = self.fill_value + if is_valid_na_for_dtype(other, self.dtype) and self.dtype != _dtype_obj: other = self.fill_value @@ -1640,13 +1646,8 @@ def where(self, other, cond) -> list[Block]: other = self._maybe_squeeze_arg(other) cond = self._maybe_squeeze_arg(cond) - if lib.is_scalar(other) and isna(other): - # The default `other` for Series / Frame is np.nan - # we want to replace that with the correct NA value - # for the type - # error: Item "dtype[Any]" of "Union[dtype[Any], ExtensionDtype]" has no - # attribute "na_value" - other = self.dtype.na_value # type: ignore[union-attr] + if other is lib.no_default: + other = self.fill_value icond, noop = validate_putmask(self.values, ~cond) if noop: @@ -1741,6 +1742,8 @@ def where(self, other, cond) -> list[Block]: arr = self.values cond = extract_bool_array(cond) + if other is lib.no_default: + other = self.fill_value try: res_values = arr.T._where(cond, other).T diff --git a/pandas/core/series.py b/pandas/core/series.py index f7b3d36bb82e7..5e111ccc727fc 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -5510,7 +5510,7 @@ def interpolate( def where( self, cond, - other=np.nan, + other=lib.no_default, inplace=False, axis=None, level=None, diff --git a/pandas/tests/arrays/integer/test_construction.py b/pandas/tests/arrays/integer/test_construction.py index be9230175bb5d..e5fd4977ec2b8 100644 --- a/pandas/tests/arrays/integer/test_construction.py +++ b/pandas/tests/arrays/integer/test_construction.py @@ -133,11 +133,13 @@ def test_to_integer_array_none_is_nan(a, b): ) def test_to_integer_array_error(values): # error in converting existing arrays to IntegerArrays - msg = ( - r"(:?.* cannot be converted to an IntegerDtype)" - r"|(invalid literal for int\(\) with base 10: .*)" - r"|(:?values must be a 1D list-like)" - r"|(Cannot pass scalar)" + msg = "|".join( + [ + r"cannot be converted to an IntegerDtype", + r"invalid literal for int\(\) with base 10:", + r"values must be a 1D list-like", + r"Cannot pass scalar", + ] ) with pytest.raises((ValueError, TypeError), match=msg): pd.array(values, dtype="Int64") diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index 525bf75476fc7..a079a3a7921d7 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -7,6 +7,8 @@ import numpy as np import pytest +from pandas.compat import np_version_under1p19 + from pandas.core.dtypes.common import is_scalar import pandas as pd @@ -810,6 +812,68 @@ def test_where_columns_casting(): tm.assert_frame_equal(expected, result) +@pytest.mark.parametrize("as_cat", [True, False]) +def test_where_period_invalid_na(frame_or_series, as_cat, request): + # GH#44697 + idx = pd.period_range("2016-01-01", periods=3, freq="D") + if as_cat: + idx = idx.astype("category") + obj = frame_or_series(idx) + + # NA value that we should *not* cast to Period dtype + tdnat = pd.NaT.to_numpy("m8[ns]") + + mask = np.array([True, True, False], ndmin=obj.ndim).T + + if as_cat: + msg = ( + r"Cannot setitem on a Categorical with a new category \(NaT\), " + "set the categories first" + ) + if np_version_under1p19: + mark = pytest.mark.xfail( + reason="When evaluating the f-string to generate the exception " + "message, numpy somehow ends up trying to cast None to int, so " + "ends up raising TypeError but with an unrelated message." + ) + request.node.add_marker(mark) + else: + msg = "value should be a 'Period'" + + with pytest.raises(TypeError, match=msg): + obj.where(mask, tdnat) + + with pytest.raises(TypeError, match=msg): + obj.mask(mask, tdnat) + + +def test_where_nullable_invalid_na(frame_or_series, any_numeric_ea_dtype): + # GH#44697 + arr = pd.array([1, 2, 3], dtype=any_numeric_ea_dtype) + obj = frame_or_series(arr) + + mask = np.array([True, True, False], ndmin=obj.ndim).T + + msg = "|".join( + [ + r"datetime64\[.{1,2}\] cannot be converted to an? (Integer|Floating)Dtype", + r"timedelta64\[.{1,2}\] cannot be converted to an? (Integer|Floating)Dtype", + r"int\(\) argument must be a string, a bytes-like object or a number, " + "not 'NaTType'", + "object cannot be converted to a FloatingDtype", + "'values' contains non-numeric NA", + ] + ) + + for null in tm.NP_NAT_OBJECTS + [pd.NaT]: + # NaT is an NA value that we should *not* cast to pd.NA dtype + with pytest.raises(TypeError, match=msg): + obj.where(mask, null) + + with pytest.raises(TypeError, match=msg): + obj.mask(mask, null) + + @given( data=st.one_of( OPTIONAL_DICTS, OPTIONAL_FLOATS, OPTIONAL_INTS, OPTIONAL_LISTS, OPTIONAL_TEXT