Skip to content

Commit 6b84ee7

Browse files
authored
BUG: IntegerArray/FloatingArray constructors mismatched NAs (#44514)
1 parent ede6234 commit 6b84ee7

File tree

10 files changed

+212
-21
lines changed

10 files changed

+212
-21
lines changed

doc/source/whatsnew/v1.4.0.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -680,6 +680,7 @@ Indexing
680680
- Bug in :meth:`Series.reset_index` not ignoring ``name`` argument when ``drop`` and ``inplace`` are set to ``True`` (:issue:`44575`)
681681
- Bug in :meth:`DataFrame.loc.__setitem__` and :meth:`DataFrame.iloc.__setitem__` with mixed dtypes sometimes failing to operate in-place (:issue:`44345`)
682682
- Bug in :meth:`DataFrame.loc.__getitem__` incorrectly raising ``KeyError`` when selecting a single column with a boolean key (:issue:`44322`).
683+
- Bug in setting with :meth:`DataFrame.iloc` on a DataFrame with a single ``ExtensionDtype`` column incorrectly raising when the new values are 2D, e.g. ``df.iloc[:] = df.values`` (:issue:`44514`)
683684
- Bug in indexing on columns with ``loc`` or ``iloc`` using a slice with a negative step with ``ExtensionDtype`` columns incorrectly raising (:issue:`44551`)
684685
- Bug in :meth:`IntervalIndex.get_indexer_non_unique` returning boolean mask instead of array of integers for a non unique and non monotonic index (:issue:`44084`)
685686
- Bug in :meth:`IntervalIndex.get_indexer_non_unique` not handling targets of ``dtype`` 'object' with NaNs correctly (:issue:`44482`)
@@ -799,6 +800,7 @@ ExtensionArray
799800
- NumPy ufuncs ``np.abs``, ``np.positive``, ``np.negative`` now correctly preserve dtype when called on ExtensionArrays that implement ``__abs__, __pos__, __neg__``, respectively. In particular this is fixed for :class:`TimedeltaArray` (:issue:`43899`)
800801
- NumPy ufuncs ``np.minimum.reduce`` and ``np.maximum.reduce`` now work correctly instead of raising ``NotImplementedError`` on :class:`Series` with ``IntegerDtype`` or ``FloatDtype`` (:issue:`43923`)
801802
- Avoid raising ``PerformanceWarning`` about fragmented DataFrame when using many columns with an extension dtype (:issue:`44098`)
803+
- Bug in :class:`IntegerArray` and :class:`FloatingArray` construction incorrectly coercing mismatched NA values (e.g. ``np.timedelta64("NaT")``) to numeric NA (:issue:`44514`)
802804
- Bug in :meth:`BooleanArray.__eq__` and :meth:`BooleanArray.__ne__` raising ``TypeError`` on comparison with an incompatible type (like a string). This caused :meth:`DataFrame.replace` to sometimes raise a ``TypeError`` if a nullable boolean column was included (:issue:`44499`)
803805
-
804806

pandas/_libs/missing.pyx

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,36 @@ cdef bint checknull_with_nat_and_na(object obj):
248248
return checknull_with_nat(obj) or obj is C_NA
249249

250250

251+
@cython.wraparound(False)
252+
@cython.boundscheck(False)
253+
def is_numeric_na(values: ndarray) -> ndarray:
254+
"""
255+
Check for NA values consistent with IntegerArray/FloatingArray.
256+
257+
Similar to a vectorized is_valid_na_for_dtype restricted to numeric dtypes.
258+
259+
Returns
260+
-------
261+
ndarray[bool]
262+
"""
263+
cdef:
264+
ndarray[uint8_t] result
265+
Py_ssize_t i, N
266+
object val
267+
268+
N = len(values)
269+
result = np.zeros(N, dtype=np.uint8)
270+
271+
for i in range(N):
272+
val = values[i]
273+
if checknull(val):
274+
if val is None or val is C_NA or util.is_nan(val) or is_decimal_na(val):
275+
result[i] = True
276+
else:
277+
raise TypeError(f"'values' contains non-numeric NA {val}")
278+
return result.view(bool)
279+
280+
251281
# -----------------------------------------------------------------------------
252282
# Implementation of NA singleton
253283

pandas/core/arrays/floating.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,10 @@
44

55
import numpy as np
66

7-
from pandas._libs import lib
7+
from pandas._libs import (
8+
lib,
9+
missing as libmissing,
10+
)
811
from pandas._typing import (
912
ArrayLike,
1013
AstypeArg,
@@ -27,7 +30,6 @@
2730
ExtensionDtype,
2831
register_extension_dtype,
2932
)
30-
from pandas.core.dtypes.missing import isna
3133

3234
from pandas.core.arrays import ExtensionArray
3335
from pandas.core.arrays.numeric import (
@@ -129,8 +131,7 @@ def coerce_to_array(
129131
if is_object_dtype(values):
130132
inferred_type = lib.infer_dtype(values, skipna=True)
131133
if inferred_type == "empty":
132-
values = np.empty(len(values))
133-
values.fill(np.nan)
134+
pass
134135
elif inferred_type not in [
135136
"floating",
136137
"integer",
@@ -146,13 +147,15 @@ def coerce_to_array(
146147
elif not (is_integer_dtype(values) or is_float_dtype(values)):
147148
raise TypeError(f"{values.dtype} cannot be converted to a FloatingDtype")
148149

150+
if values.ndim != 1:
151+
raise TypeError("values must be a 1D list-like")
152+
149153
if mask is None:
150-
mask = isna(values)
154+
mask = libmissing.is_numeric_na(values)
155+
151156
else:
152157
assert len(mask) == len(values)
153158

154-
if not values.ndim == 1:
155-
raise TypeError("values must be a 1D list-like")
156159
if not mask.ndim == 1:
157160
raise TypeError("mask must be a 1D list-like")
158161

pandas/core/arrays/integer.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from pandas._libs import (
88
iNaT,
99
lib,
10+
missing as libmissing,
1011
)
1112
from pandas._typing import (
1213
ArrayLike,
@@ -32,7 +33,6 @@
3233
is_string_dtype,
3334
pandas_dtype,
3435
)
35-
from pandas.core.dtypes.missing import isna
3636

3737
from pandas.core.arrays import ExtensionArray
3838
from pandas.core.arrays.masked import BaseMaskedDtype
@@ -183,8 +183,7 @@ def coerce_to_array(
183183
if is_object_dtype(values) or is_string_dtype(values):
184184
inferred_type = lib.infer_dtype(values, skipna=True)
185185
if inferred_type == "empty":
186-
values = np.empty(len(values))
187-
values.fill(np.nan)
186+
pass
188187
elif inferred_type not in [
189188
"floating",
190189
"integer",
@@ -202,13 +201,14 @@ def coerce_to_array(
202201
elif not (is_integer_dtype(values) or is_float_dtype(values)):
203202
raise TypeError(f"{values.dtype} cannot be converted to an IntegerDtype")
204203

204+
if values.ndim != 1:
205+
raise TypeError("values must be a 1D list-like")
206+
205207
if mask is None:
206-
mask = isna(values)
208+
mask = libmissing.is_numeric_na(values)
207209
else:
208210
assert len(mask) == len(values)
209211

210-
if values.ndim != 1:
211-
raise TypeError("values must be a 1D list-like")
212212
if mask.ndim != 1:
213213
raise TypeError("mask must be a 1D list-like")
214214

pandas/core/internals/blocks.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1511,6 +1511,17 @@ def setitem(self, indexer, value):
15111511
# we are always 1-D
15121512
indexer = indexer[0]
15131513

1514+
# TODO(EA2D): not needed with 2D EAs
1515+
if isinstance(value, (np.ndarray, ExtensionArray)) and value.ndim == 2:
1516+
assert value.shape[1] == 1
1517+
# error: No overload variant of "__getitem__" of "ExtensionArray"
1518+
# matches argument type "Tuple[slice, int]"
1519+
value = value[:, 0] # type: ignore[call-overload]
1520+
elif isinstance(value, ABCDataFrame):
1521+
# TODO: should we avoid getting here with DataFrame?
1522+
assert value.shape[1] == 1
1523+
value = value._ixs(0, axis=1)._values
1524+
15141525
check_setitem_lengths(indexer, value, self.values)
15151526
self.values[indexer] = value
15161527
return self

pandas/tests/arrays/floating/test_construction.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -97,14 +97,18 @@ def test_to_array_mixed_integer_float():
9797
np.array(["foo"]),
9898
[[1, 2], [3, 4]],
9999
[np.nan, {"a": 1}],
100+
# GH#44514 all-NA case used to get quietly swapped out before checking ndim
101+
np.array([pd.NA] * 6, dtype=object).reshape(3, 2),
100102
],
101103
)
102104
def test_to_array_error(values):
103105
# error in converting existing arrays to FloatingArray
104-
msg = (
105-
r"(:?.* cannot be converted to a FloatingDtype)"
106-
r"|(:?values must be a 1D list-like)"
107-
r"|(:?Cannot pass scalar)"
106+
msg = "|".join(
107+
[
108+
"cannot be converted to a FloatingDtype",
109+
"values must be a 1D list-like",
110+
"Cannot pass scalar",
111+
]
108112
)
109113
with pytest.raises((TypeError, ValueError), match=msg):
110114
pd.array(values, dtype="Float64")

pandas/tests/extension/base/setitem.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,13 @@
11
import numpy as np
22
import pytest
33

4+
from pandas.core.dtypes.dtypes import (
5+
DatetimeTZDtype,
6+
IntervalDtype,
7+
PandasDtype,
8+
PeriodDtype,
9+
)
10+
411
import pandas as pd
512
import pandas._testing as tm
613
from pandas.tests.extension.base.base import BaseExtensionTests
@@ -357,6 +364,36 @@ def test_setitem_series(self, data, full_indexer):
357364
)
358365
self.assert_series_equal(result, expected)
359366

367+
def test_setitem_frame_2d_values(self, data, request):
368+
# GH#44514
369+
df = pd.DataFrame({"A": data})
370+
371+
# Avoiding using_array_manager fixture
372+
# https://github.com/pandas-dev/pandas/pull/44514#discussion_r754002410
373+
using_array_manager = isinstance(df._mgr, pd.core.internals.ArrayManager)
374+
if using_array_manager:
375+
if not isinstance(
376+
data.dtype, (PandasDtype, PeriodDtype, IntervalDtype, DatetimeTZDtype)
377+
):
378+
# These dtypes have non-broken implementations of _can_hold_element
379+
mark = pytest.mark.xfail(reason="Goes through split path, loses dtype")
380+
request.node.add_marker(mark)
381+
382+
df = pd.DataFrame({"A": data})
383+
orig = df.copy()
384+
385+
df.iloc[:] = df
386+
self.assert_frame_equal(df, orig)
387+
388+
df.iloc[:-1] = df.iloc[:-1]
389+
self.assert_frame_equal(df, orig)
390+
391+
df.iloc[:] = df.values
392+
self.assert_frame_equal(df, orig)
393+
394+
df.iloc[:-1] = df.values[:-1]
395+
self.assert_frame_equal(df, orig)
396+
360397
def test_delitem_series(self, data):
361398
# GH#40763
362399
ser = pd.Series(data, name="data")

pandas/tests/frame/indexing/test_indexing.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1211,6 +1211,75 @@ def test_setitem_array_as_cell_value(self):
12111211
expected = DataFrame({"a": [np.zeros((2,))], "b": [np.zeros((2, 2))]})
12121212
tm.assert_frame_equal(df, expected)
12131213

1214+
# with AM goes through split-path, loses dtype
1215+
@td.skip_array_manager_not_yet_implemented
1216+
def test_iloc_setitem_nullable_2d_values(self):
1217+
df = DataFrame({"A": [1, 2, 3]}, dtype="Int64")
1218+
orig = df.copy()
1219+
1220+
df.loc[:] = df.values[:, ::-1]
1221+
tm.assert_frame_equal(df, orig)
1222+
1223+
df.loc[:] = pd.core.arrays.PandasArray(df.values[:, ::-1])
1224+
tm.assert_frame_equal(df, orig)
1225+
1226+
df.iloc[:] = df.iloc[:, :]
1227+
tm.assert_frame_equal(df, orig)
1228+
1229+
@pytest.mark.parametrize(
1230+
"null", [pd.NaT, pd.NaT.to_numpy("M8[ns]"), pd.NaT.to_numpy("m8[ns]")]
1231+
)
1232+
def test_setting_mismatched_na_into_nullable_fails(
1233+
self, null, any_numeric_ea_dtype
1234+
):
1235+
# GH#44514 don't cast mismatched nulls to pd.NA
1236+
df = DataFrame({"A": [1, 2, 3]}, dtype=any_numeric_ea_dtype)
1237+
ser = df["A"]
1238+
arr = ser._values
1239+
1240+
msg = "|".join(
1241+
[
1242+
r"int\(\) argument must be a string, a bytes-like object or a "
1243+
"(real )?number, not 'NaTType'",
1244+
r"timedelta64\[ns\] cannot be converted to an? (Floating|Integer)Dtype",
1245+
r"datetime64\[ns\] cannot be converted to an? (Floating|Integer)Dtype",
1246+
"object cannot be converted to a FloatingDtype",
1247+
"'values' contains non-numeric NA",
1248+
]
1249+
)
1250+
with pytest.raises(TypeError, match=msg):
1251+
arr[0] = null
1252+
1253+
with pytest.raises(TypeError, match=msg):
1254+
arr[:2] = [null, null]
1255+
1256+
with pytest.raises(TypeError, match=msg):
1257+
ser[0] = null
1258+
1259+
with pytest.raises(TypeError, match=msg):
1260+
ser[:2] = [null, null]
1261+
1262+
with pytest.raises(TypeError, match=msg):
1263+
ser.iloc[0] = null
1264+
1265+
with pytest.raises(TypeError, match=msg):
1266+
ser.iloc[:2] = [null, null]
1267+
1268+
with pytest.raises(TypeError, match=msg):
1269+
df.iloc[0, 0] = null
1270+
1271+
with pytest.raises(TypeError, match=msg):
1272+
df.iloc[:2, 0] = [null, null]
1273+
1274+
# Multi-Block
1275+
df2 = df.copy()
1276+
df2["B"] = ser.copy()
1277+
with pytest.raises(TypeError, match=msg):
1278+
df2.iloc[0, 0] = null
1279+
1280+
with pytest.raises(TypeError, match=msg):
1281+
df2.iloc[:2, 0] = [null, null]
1282+
12141283

12151284
class TestDataFrameIndexingUInt64:
12161285
def test_setitem(self, uint64_frame):

pandas/tests/series/methods/test_clip.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,14 @@ def test_series_clipping_with_na_values(self, any_numeric_ea_dtype, nulls_fixtur
4646
# Ensure that clipping method can handle NA values without failing
4747
# GH#40581
4848

49-
s = Series([nulls_fixture, 1.0, 3.0], dtype=any_numeric_ea_dtype)
50-
s_clipped_upper = s.clip(upper=2.0)
51-
s_clipped_lower = s.clip(lower=2.0)
49+
if nulls_fixture is pd.NaT:
50+
# constructor will raise, see
51+
# test_constructor_mismatched_null_nullable_dtype
52+
return
53+
54+
ser = Series([nulls_fixture, 1.0, 3.0], dtype=any_numeric_ea_dtype)
55+
s_clipped_upper = ser.clip(upper=2.0)
56+
s_clipped_lower = ser.clip(lower=2.0)
5257

5358
expected_upper = Series([nulls_fixture, 1.0, 2.0], dtype=any_numeric_ea_dtype)
5459
expected_lower = Series([nulls_fixture, 2.0, 3.0], dtype=any_numeric_ea_dtype)

pandas/tests/series/test_constructors.py

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,10 @@
1313
iNaT,
1414
lib,
1515
)
16-
from pandas.compat.numpy import np_version_under1p19
16+
from pandas.compat.numpy import (
17+
np_version_under1p19,
18+
np_version_under1p20,
19+
)
1720
import pandas.util._test_decorators as td
1821

1922
from pandas.core.dtypes.common import (
@@ -1811,6 +1814,33 @@ def test_constructor_bool_dtype_missing_values(self):
18111814
expected = Series(True, index=[0], dtype="bool")
18121815
tm.assert_series_equal(result, expected)
18131816

1817+
@pytest.mark.filterwarnings(
1818+
"ignore:elementwise comparison failed:DeprecationWarning"
1819+
)
1820+
@pytest.mark.xfail(
1821+
np_version_under1p20, reason="np.array([td64nat, float, float]) raises"
1822+
)
1823+
@pytest.mark.parametrize("func", [Series, DataFrame, Index, pd.array])
1824+
def test_constructor_mismatched_null_nullable_dtype(
1825+
self, func, any_numeric_ea_dtype
1826+
):
1827+
# GH#44514
1828+
msg = "|".join(
1829+
[
1830+
"cannot safely cast non-equivalent object",
1831+
r"int\(\) argument must be a string, a bytes-like object "
1832+
"or a (real )?number",
1833+
r"Cannot cast array data from dtype\('O'\) to dtype\('float64'\) "
1834+
"according to the rule 'safe'",
1835+
"object cannot be converted to a FloatingDtype",
1836+
"'values' contains non-numeric NA",
1837+
]
1838+
)
1839+
1840+
for null in tm.NP_NAT_OBJECTS + [NaT]:
1841+
with pytest.raises(TypeError, match=msg):
1842+
func([null, 1.0, 3.0], dtype=any_numeric_ea_dtype)
1843+
18141844

18151845
class TestSeriesConstructorIndexCoercion:
18161846
def test_series_constructor_datetimelike_index_coercion(self):

0 commit comments

Comments
 (0)