From 6ee5c5f5e1613c1fc0e76642e1954dec277feb81 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 24 Dec 2020 13:47:46 -0800 Subject: [PATCH 1/8] TST: implement tm.check_setitem_equivalents --- pandas/_testing.py | 75 +++++++++++++++++++ pandas/tests/indexing/test_indexing.py | 37 +++------ pandas/tests/series/indexing/test_indexing.py | 26 ------- pandas/tests/series/indexing/test_setitem.py | 40 +++++++--- 4 files changed, 116 insertions(+), 62 deletions(-) diff --git a/pandas/_testing.py b/pandas/_testing.py index 7786eeeb46797..2256156ad222f 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -3143,3 +3143,78 @@ def get_op_from_name(op_name: str) -> Callable: op = lambda x, y: rop(y, x) return op + + +# ----------------------------------------------------------------------------- +# Indexing test helpers + + +def getitem(x): + return x + + +def setitem(x): + return x + + +def loc(x): + return x.loc + + +def iloc(x): + return x.iloc + + +def check_setitem_equivalents(obj: Series, key: Union[int, slice], expected: Series): + """ + Check each of several methods that _should_ be equivalent to `obj[key] = np.nan` + + We assume that + - obj.index is the default Index(range(len(obj))) + - the setitem does not expand the obj + """ + orig = obj.copy() + + if isinstance(key, int): + for indexer in [setitem, loc, iloc]: + obj = orig.copy() + indexer(obj)[key] = np.nan + assert_series_equal(obj, expected) + + key = slice(key, key + 1) + + # setitem with slice + for indexer in [setitem, iloc]: + # Note: no .loc because that handles slice edges differently + obj = orig.copy() + indexer(obj)[key] = np.nan + assert_series_equal(obj, expected) + + # list of ints + ilkey = list(range(len(obj)))[key] + for indexer in [setitem, loc, iloc]: + obj = orig.copy() + indexer(obj)[ilkey] = np.nan + assert_series_equal(obj, expected) + + # setitem with boolean mask + mask = np.zeros(obj.shape, dtype=bool) + mask[key] = True + for indexer in [setitem, loc, iloc]: + obj = orig.copy() + 
indexer(obj)[mask] = np.nan + assert_series_equal(obj, expected) + + # Series.where + obj = orig.copy() + res = obj.where(~mask, np.nan) + assert_equal(res, expected) + + # Index equivalents + if Index(orig).dtype == orig.dtype: + obj = orig.copy() + res = Index(obj).where(~mask, np.nan) + assert_index_equal(res, Index(expected)) + + # TODO: implement the same for Index(obj).putmask(mask, np.nan) + # once that behavior matches diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index e8c4a834bdeb1..e59d6de1b2a4c 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -17,23 +17,6 @@ from .test_floats import gen_obj - -def getitem(x): - return x - - -def setitem(x): - return x - - -def loc(x): - return x.loc - - -def iloc(x): - return x.iloc - - # ------------------------------------------------------------------------ # Indexing test cases @@ -72,7 +55,7 @@ def test_setitem_ndarray_1d(self): with pytest.raises(ValueError, match=msg): df[2:5] = np.arange(1, 4) * 1j - @pytest.mark.parametrize("idxr", [getitem, loc, iloc]) + @pytest.mark.parametrize("idxr", [tm.getitem, tm.loc, tm.iloc]) def test_getitem_ndarray_3d(self, index, frame_or_series, idxr): # GH 25567 obj = gen_obj(frame_or_series, index) @@ -95,7 +78,7 @@ def test_getitem_ndarray_3d(self, index, frame_or_series, idxr): with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): idxr[nd3] - @pytest.mark.parametrize("indexer", [setitem, loc, iloc]) + @pytest.mark.parametrize("indexer", [tm.setitem, tm.loc, tm.iloc]) def test_setitem_ndarray_3d(self, index, frame_or_series, indexer): # GH 25567 obj = gen_obj(frame_or_series, index) @@ -297,7 +280,7 @@ def test_dups_fancy_indexing2(self): result = df.loc[[1, 2], ["a", "b"]] tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("case", [getitem, loc]) + @pytest.mark.parametrize("case", [tm.getitem, tm.loc]) def test_duplicate_int_indexing(self, 
case): # GH 17347 s = Series(range(3), index=[1, 1, 3]) @@ -594,7 +577,7 @@ def test_astype_assignment(self): expected = DataFrame({"A": [1, 2, 3, 4]}) tm.assert_frame_equal(df, expected) - @pytest.mark.parametrize("indexer", [getitem, loc]) + @pytest.mark.parametrize("indexer", [tm.getitem, tm.loc]) def test_index_type_coercion(self, indexer): # GH 11836 @@ -967,7 +950,7 @@ def test_none_coercion_mixed_dtypes(self): class TestDatetimelikeCoercion: - @pytest.mark.parametrize("indexer", [setitem, loc, iloc]) + @pytest.mark.parametrize("indexer", [tm.setitem, tm.loc, tm.iloc]) def test_setitem_dt64_string_scalar(self, tz_naive_fixture, indexer): # dispatching _can_hold_element to underling DatetimeArray tz = tz_naive_fixture @@ -993,12 +976,12 @@ def test_setitem_dt64_string_scalar(self, tz_naive_fixture, indexer): @pytest.mark.parametrize( "key", [[0, 1], slice(0, 2), np.array([True, True, False])] ) - @pytest.mark.parametrize("indexer", [setitem, loc, iloc]) + @pytest.mark.parametrize("indexer", [tm.setitem, tm.loc, tm.iloc]) def test_setitem_dt64_string_values(self, tz_naive_fixture, indexer, key, box): # dispatching _can_hold_element to underling DatetimeArray tz = tz_naive_fixture - if isinstance(key, slice) and indexer is loc: + if isinstance(key, slice) and indexer is tm.loc: key = slice(0, 1) dti = date_range("2016-01-01", periods=3, tz=tz) @@ -1019,7 +1002,7 @@ def test_setitem_dt64_string_values(self, tz_naive_fixture, indexer, key, box): assert ser._values is values @pytest.mark.parametrize("scalar", ["3 Days", offsets.Hour(4)]) - @pytest.mark.parametrize("indexer", [setitem, loc, iloc]) + @pytest.mark.parametrize("indexer", [tm.setitem, tm.loc, tm.iloc]) def test_setitem_td64_scalar(self, indexer, scalar): # dispatching _can_hold_element to underling TimedeltaArray tdi = timedelta_range("1 Day", periods=3) @@ -1035,10 +1018,10 @@ def test_setitem_td64_scalar(self, indexer, scalar): @pytest.mark.parametrize( "key", [[0, 1], slice(0, 2), np.array([True, 
True, False])] ) - @pytest.mark.parametrize("indexer", [setitem, loc, iloc]) + @pytest.mark.parametrize("indexer", [tm.setitem, tm.loc, tm.iloc]) def test_setitem_td64_string_values(self, indexer, key, box): # dispatching _can_hold_element to underling TimedeltaArray - if isinstance(key, slice) and indexer is loc: + if isinstance(key, slice) and indexer is tm.loc: key = slice(0, 1) tdi = timedelta_range("1 Day", periods=3) diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index 159b42621f970..dbc751dd614a1 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -271,32 +271,6 @@ def test_setitem(datetime_series, string_series): tm.assert_series_equal(s, expected) -def test_setitem_dtypes(): - # change dtypes - # GH 4463 - expected = Series([np.nan, 2, 3]) - - s = Series([1, 2, 3]) - s.iloc[0] = np.nan - tm.assert_series_equal(s, expected) - - s = Series([1, 2, 3]) - s.loc[0] = np.nan - tm.assert_series_equal(s, expected) - - s = Series([1, 2, 3]) - s[0] = np.nan - tm.assert_series_equal(s, expected) - - s = Series([False]) - s.loc[0] = np.nan - tm.assert_series_equal(s, Series([np.nan])) - - s = Series([False, True]) - s.loc[0] = np.nan - tm.assert_series_equal(s, Series([np.nan, 1.0])) - - def test_setslice(datetime_series): sl = datetime_series[5:20] assert len(sl) == len(sl.index) diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 5f09283249fe3..8abcf3f7ceead 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -241,21 +241,43 @@ def test_setitem_callable_other(self): class TestSetitemCasting: def test_setitem_nan_casts(self): # these induce dtype changes - expected = Series([np.nan, 3, np.nan, 5, np.nan, 7, np.nan, 9, np.nan]) + ser = Series([2, 3, 4, 5, 6, 7, 8, 9, 10]) - ser[::2] = np.nan - tm.assert_series_equal(ser, expected) + 
expected = Series([np.nan, 3, np.nan, 5, np.nan, 7, np.nan, 9, np.nan]) + key = slice(None, None, 2) + tm.check_setitem_equivalents(ser, key, expected) # gets coerced to float, right? - expected = Series([np.nan, 1, np.nan, 0]) ser = Series([True, True, False, False]) - ser[::2] = np.nan - tm.assert_series_equal(ser, expected) + expected = Series([np.nan, 1, np.nan, 0]) + key = slice(None, None, 2) + tm.check_setitem_equivalents(ser, key, expected) - expected = Series([np.nan, np.nan, np.nan, np.nan, np.nan, 5, 6, 7, 8, 9]) ser = Series(np.arange(10)) - ser[:5] = np.nan - tm.assert_series_equal(ser, expected) + expected = Series([np.nan, np.nan, np.nan, np.nan, np.nan, 5, 6, 7, 8, 9]) + key = slice(None, 5) + tm.check_setitem_equivalents(ser, key, expected) + + def test_setitem_nan_into_int(self): + # change dtypes + # GH#4463 + ser = Series([1, 2, 3]) + expected = Series([np.nan, 2, 3]) + key = 0 + tm.check_setitem_equivalents(ser, key, expected) + + def test_setitem_nan_into_bool(self): + # change dtypes + # GH#4463 + ser = Series([False]) + expected = Series([np.nan]) + key = 0 + tm.check_setitem_equivalents(ser, key, expected) + + ser = Series([False, True]) + expected = Series([np.nan, 1.0]) + key = 0 + tm.check_setitem_equivalents(ser, key, expected) class TestSetitemWithExpansion: From 33e39f9a2dd50f12251cc6e3774a012f7314cefe Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 25 Dec 2020 16:01:50 -0800 Subject: [PATCH 2/8] re-write as fixturized --- pandas/_testing.py | 55 ------- pandas/conftest.py | 16 ++ pandas/tests/series/indexing/test_setitem.py | 148 ++++++++++++++----- 3 files changed, 124 insertions(+), 95 deletions(-) diff --git a/pandas/_testing.py b/pandas/_testing.py index 2256156ad222f..bc153bcba1faa 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -3163,58 +3163,3 @@ def loc(x): def iloc(x): return x.iloc - - -def check_setitem_equivalents(obj: Series, key: Union[int, slice], expected: Series): - """ - Check each of several methods 
that _should_ be equivalent to `obj[key] = np.nan` - - We assume that - - obj.index is the default Index(range(len(obj))) - - the setitem does not expand the obj - """ - orig = obj.copy() - - if isinstance(key, int): - for indexer in [setitem, loc, iloc]: - obj = orig.copy() - indexer(obj)[key] = np.nan - assert_series_equal(obj, expected) - - key = slice(key, key + 1) - - # setitem with slice - for indexer in [setitem, iloc]: - # Note: no .loc because that handles slice edges differently - obj = orig.copy() - indexer(obj)[key] = np.nan - assert_series_equal(obj, expected) - - # list of ints - ilkey = list(range(len(obj)))[key] - for indexer in [setitem, loc, iloc]: - obj = orig.copy() - indexer(obj)[ilkey] = np.nan - assert_series_equal(obj, expected) - - # setitem with boolean mask - mask = np.zeros(obj.shape, dtype=bool) - mask[key] = True - for indexer in [setitem, loc, iloc]: - obj = orig.copy() - indexer(obj)[mask] = np.nan - assert_series_equal(obj, expected) - - # Series.where - obj = orig.copy() - res = obj.where(~mask, np.nan) - assert_equal(res, expected) - - # Index equivalents - if Index(orig).dtype == orig.dtype: - obj = orig.copy() - res = Index(obj).where(~mask, np.nan) - assert_index_equal(res, Index(expected)) - - # TODO: implement the same for Index(obj).putmask(mask, np.nan) - # once that behavior matches diff --git a/pandas/conftest.py b/pandas/conftest.py index d84a72d4cc7a8..2862f7c957abc 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1446,3 +1446,19 @@ def names(request): A 3-tuple of names, the first two for operands, the last for a result. 
""" return request.param + + +@pytest.fixture(params=[tm.setitem, tm.loc, tm.iloc]) +def indexer_sli(request): + """ + Parametrize over __setitem__, loc.__setitem__, iloc.__setitem__ + """ + return request.param + + +@pytest.fixture(params=[tm.setitem, tm.iloc]) +def indexer_si(request): + """ + Parametrize over __setitem__, iloc.__setitem__ + """ + return request.param diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 8abcf3f7ceead..d6d0723bee0e8 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -5,6 +5,7 @@ from pandas import ( DatetimeIndex, + Index, MultiIndex, NaT, Series, @@ -238,46 +239,113 @@ def test_setitem_callable_other(self): tm.assert_series_equal(ser, expected) -class TestSetitemCasting: - def test_setitem_nan_casts(self): - # these induce dtype changes - - ser = Series([2, 3, 4, 5, 6, 7, 8, 9, 10]) - expected = Series([np.nan, 3, np.nan, 5, np.nan, 7, np.nan, 9, np.nan]) - key = slice(None, None, 2) - tm.check_setitem_equivalents(ser, key, expected) - - # gets coerced to float, right? 
- ser = Series([True, True, False, False]) - expected = Series([np.nan, 1, np.nan, 0]) - key = slice(None, None, 2) - tm.check_setitem_equivalents(ser, key, expected) - - ser = Series(np.arange(10)) - expected = Series([np.nan, np.nan, np.nan, np.nan, np.nan, 5, 6, 7, 8, 9]) - key = slice(None, 5) - tm.check_setitem_equivalents(ser, key, expected) - - def test_setitem_nan_into_int(self): - # change dtypes - # GH#4463 - ser = Series([1, 2, 3]) - expected = Series([np.nan, 2, 3]) - key = 0 - tm.check_setitem_equivalents(ser, key, expected) - - def test_setitem_nan_into_bool(self): - # change dtypes - # GH#4463 - ser = Series([False]) - expected = Series([np.nan]) - key = 0 - tm.check_setitem_equivalents(ser, key, expected) - - ser = Series([False, True]) - expected = Series([np.nan, 1.0]) - key = 0 - tm.check_setitem_equivalents(ser, key, expected) +@pytest.mark.parametrize( + "obj,expected,key", + [ + ( + # these induce dtype changes + Series([2, 3, 4, 5, 6, 7, 8, 9, 10]), + Series([np.nan, 3, np.nan, 5, np.nan, 7, np.nan, 9, np.nan]), + slice(None, None, 2), + ), + ( + # gets coerced to float, right? 
+ Series([True, True, False, False]), + Series([np.nan, 1, np.nan, 0]), + slice(None, None, 2), + ), + ( + # these induce dtype changes + Series(np.arange(10)), + Series([np.nan, np.nan, np.nan, np.nan, np.nan, 5, 6, 7, 8, 9]), + slice(None, 5), + ), + ( + # changes dtype GH#4463 + Series([1, 2, 3]), + Series([np.nan, 2, 3]), + 0, + ), + ( + # changes dtype GH#4463 + Series([False]), + Series([np.nan]), + 0, + ), + ( + # changes dtype GH#4463 + Series([False, True]), + Series([np.nan, 1.0]), + 0, + ), + ], +) +class TestSetitemCastingEquivalents: + """ + Check each of several methods that _should_ be equivalent to `obj[key] = np.nan` + + We assume that + - obj.index is the default Index(range(len(obj))) + - the setitem does not expand the obj + """ + + def test_int_key(self, obj, key, expected, indexer_sli): + if not isinstance(key, int): + return + + obj = obj.copy() + indexer_sli(obj)[key] = np.nan + tm.assert_series_equal(obj, expected) + + def test_slice_key(self, obj, key, expected, indexer_si): + # Note: no .loc because that handles slice edges differently + obj = obj.copy() + indexer_si(obj)[key] = np.nan + tm.assert_series_equal(obj, expected) + + def test_intlist_key(self, obj, key, expected, indexer_sli): + ilkey = list(range(len(obj)))[key] + + obj = obj.copy() + indexer_sli(obj)[ilkey] = np.nan + tm.assert_series_equal(obj, expected) + + def test_mask_key(self, obj, key, expected, indexer_sli): + # setitem with boolean mask + mask = np.zeros(obj.shape, dtype=bool) + mask[key] = True + + obj = obj.copy() + indexer_sli(obj)[mask] = np.nan + tm.assert_series_equal(obj, expected) + + def test_series_where(self, obj, key, expected): + mask = np.zeros(obj.shape, dtype=bool) + mask[key] = True + + obj = obj.copy() + res = obj.where(~mask, np.nan) + tm.assert_series_equal(res, expected) + + def test_index_where(self, obj, key, expected, request): + if obj.dtype == bool: + msg = "Index/Series casting behavior inconsistent GH#38692" + mark = 
pytest.xfail(reason=msg) + request.node.add_marker(mark) + + mask = np.zeros(obj.shape, dtype=bool) + mask[key] = True + + res = Index(obj).where(~mask, np.nan) + tm.assert_index_equal(res, Index(expected)) + + @pytest.mark.xfail(reason="Index/Series casting behavior inconsistent GH#38692") + def test_index_putmask(self, obj, key, expected): + mask = np.zeros(obj.shape, dtype=bool) + mask[key] = True + + res = Index(obj).putmask(mask, np.nan) + tm.assert_index_equal(res, Index(expected)) class TestSetitemWithExpansion: From 58966157d46f32129aac6b400f52e83bde157b5d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 24 Dec 2020 12:30:13 -0800 Subject: [PATCH 3/8] CLN: algos.searchsorted (#38686) --- pandas/core/algorithms.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 1061eb087318b..2098392cf70a9 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -11,7 +11,7 @@ import numpy as np -from pandas._libs import Timestamp, algos, hashtable as htable, iNaT, lib +from pandas._libs import algos, hashtable as htable, iNaT, lib from pandas._typing import AnyArrayLike, ArrayLike, DtypeObj, FrameOrSeriesUnion from pandas.util._decorators import doc @@ -59,7 +59,11 @@ ) from pandas.core.dtypes.missing import isna, na_value_for_dtype -from pandas.core.construction import array, extract_array +from pandas.core.construction import ( + array, + ensure_wrapped_if_datetimelike, + extract_array, +) from pandas.core.indexers import validate_indices if TYPE_CHECKING: @@ -1906,10 +1910,7 @@ def searchsorted(arr, value, side="left", sorter=None) -> np.ndarray: ): # E.g. 
if `arr` is an array with dtype='datetime64[ns]' # and `value` is a pd.Timestamp, we may need to convert value - value_ser = array([value]) if is_scalar(value) else array(value) - value = value_ser[0] if is_scalar(value) else value_ser - if isinstance(value, Timestamp) and value.tzinfo is None: - value = value.to_datetime64() + arr = ensure_wrapped_if_datetimelike(arr) result = arr.searchsorted(value, side=side, sorter=sorter) return result From 90a76cec2d80d52d94f647db4cec03977b762ef2 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 24 Dec 2020 12:32:09 -0800 Subject: [PATCH 4/8] REF: simplify coerce_to_target_dtype (#38683) --- pandas/core/internals/blocks.py | 33 +++------------------------------ 1 file changed, 3 insertions(+), 30 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 138a19779b831..aca7373983a61 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -37,9 +37,7 @@ from pandas.core.dtypes.common import ( DT64NS_DTYPE, TD64NS_DTYPE, - is_bool_dtype, is_categorical_dtype, - is_datetime64_any_dtype, is_datetime64_dtype, is_datetime64tz_dtype, is_dtype_equal, @@ -53,7 +51,6 @@ is_re, is_re_compilable, is_sparse, - is_timedelta64_dtype, pandas_dtype, ) from pandas.core.dtypes.dtypes import CategoricalDtype, ExtensionDtype @@ -927,7 +924,7 @@ def setitem(self, indexer, value): else: # current dtype cannot store value, coerce to common dtype - + # TODO: can we just use coerce_to_target_dtype for all this if hasattr(value, "dtype"): dtype = value.dtype @@ -1164,33 +1161,9 @@ def coerce_to_target_dtype(self, other): # if we cannot then coerce to object dtype, _ = infer_dtype_from(other, pandas_dtype=True) - if is_dtype_equal(self.dtype, dtype): - return self - - if self.is_bool or is_object_dtype(dtype) or is_bool_dtype(dtype): - # we don't upcast to bool - return self.astype(object) - - elif (self.is_float or self.is_complex) and ( - is_integer_dtype(dtype) or 
is_float_dtype(dtype) - ): - # don't coerce float/complex to int - return self + new_dtype = find_common_type([self.dtype, dtype]) - elif self.is_datetime or is_datetime64_any_dtype(dtype): - # The is_dtype_equal check above ensures that at most one of - # these two conditions hold, so we must cast to object. - return self.astype(object) - - elif self.is_timedelta or is_timedelta64_dtype(dtype): - # The is_dtype_equal check above ensures that at most one of - # these two conditions hold, so we must cast to object. - return self.astype(object) - - try: - return self.astype(dtype) - except (ValueError, TypeError, OverflowError): - return self.astype(object) + return self.astype(new_dtype, copy=False) def interpolate( self, From 0827cd098c51b48ba1a22b843b7940d0543521de Mon Sep 17 00:00:00 2001 From: Maxim Ivanov <41443370+ivanovmg@users.noreply.github.com> Date: Fri, 25 Dec 2020 03:40:40 +0700 Subject: [PATCH 5/8] PERF: fix assert_frame_equal can be very slow (#38202) --- pandas/_testing.py | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/pandas/_testing.py b/pandas/_testing.py index bc153bcba1faa..56e53c6ec1cb0 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -1294,6 +1294,8 @@ def assert_series_equal( rtol=1.0e-5, atol=1.0e-8, obj="Series", + *, + check_index=True, ): """ Check that left and right Series are equal. @@ -1353,6 +1355,10 @@ def assert_series_equal( obj : str, default 'Series' Specify object name being compared, internally used to show appropriate assertion message. + check_index : bool, default True + Whether to check index equivalence. If False, then compare only values. + + .. 
versionadded:: 1.3.0 Examples -------- @@ -1388,18 +1394,20 @@ def assert_series_equal( if check_flags: assert left.flags == right.flags, f"{repr(left.flags)} != {repr(right.flags)}" - # index comparison - assert_index_equal( - left.index, - right.index, - exact=check_index_type, - check_names=check_names, - check_exact=check_exact, - check_categorical=check_categorical, - rtol=rtol, - atol=atol, - obj=f"{obj}.index", - ) + if check_index: + # GH #38183 + assert_index_equal( + left.index, + right.index, + exact=check_index_type, + check_names=check_names, + check_exact=check_exact, + check_categorical=check_categorical, + rtol=rtol, + atol=atol, + obj=f"{obj}.index", + ) + if check_freq and isinstance(left.index, (pd.DatetimeIndex, pd.TimedeltaIndex)): lidx = left.index ridx = right.index @@ -1704,6 +1712,10 @@ def assert_frame_equal( assert col in right lcol = left.iloc[:, i] rcol = right.iloc[:, i] + # GH #38183 + # use check_index=False, because we do not want to run + # assert_index_equal for each column, + # as we already checked it for the whole dataframe before. 
assert_series_equal( lcol, rcol, @@ -1717,6 +1729,7 @@ def assert_frame_equal( obj=f'{obj}.iloc[:, {i}] (column name="{col}")', rtol=rtol, atol=atol, + check_index=False, ) From 803f49520bed9a466f9e854f331d066dfc354532 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 26 Dec 2020 16:12:34 -0800 Subject: [PATCH 6/8] API: cast to object when setting np.nan into Series[bool] --- pandas/core/internals/blocks.py | 45 +++----------------- pandas/tests/frame/indexing/test_mask.py | 4 +- pandas/tests/series/indexing/test_setitem.py | 35 +++++++++------ 3 files changed, 29 insertions(+), 55 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index aca7373983a61..cc597dd0bab9b 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -25,7 +25,6 @@ convert_scalar_for_putitemlike, find_common_type, infer_dtype_from, - infer_dtype_from_scalar, maybe_box_datetimelike, maybe_downcast_numeric, maybe_downcast_to_dtype, @@ -924,24 +923,7 @@ def setitem(self, indexer, value): else: # current dtype cannot store value, coerce to common dtype - # TODO: can we just use coerce_to_target_dtype for all this - if hasattr(value, "dtype"): - dtype = value.dtype - - elif lib.is_scalar(value) and not isna(value): - dtype, _ = infer_dtype_from_scalar(value, pandas_dtype=True) - - else: - # e.g. 
we are bool dtype and value is nan - # TODO: watch out for case with listlike value and scalar/empty indexer - dtype, _ = maybe_promote(np.array(value).dtype) - return self.astype(dtype).setitem(indexer, value) - - dtype = find_common_type([values.dtype, dtype]) - assert not is_dtype_equal(self.dtype, dtype) - # otherwise should have _can_hold_element - - return self.astype(dtype).setitem(indexer, value) + return self.coerce_to_target_dtype(value).setitem(indexer, value) # value must be storable at this moment if is_extension_array_dtype(getattr(value, "dtype", None)): @@ -1403,16 +1385,7 @@ def where( else: # see if we can operate on the entire block, or need item-by-item # or if we are a single block (ndim == 1) - if ( - (self.is_integer or self.is_bool) - and lib.is_float(other) - and np.isnan(other) - ): - # GH#3733 special case to avoid object-dtype casting - # and go through numexpr path instead. - # In integer case, np.where will cast to floats - pass - elif not self._can_hold_element(other): + if not self._can_hold_element(other): # we cannot coerce, return a compat dtype # we are explicitly ignoring errors block = self.coerce_to_target_dtype(other) @@ -1421,13 +1394,8 @@ def where( ) return self._maybe_downcast(blocks, "infer") - if not ( - (self.is_integer or self.is_bool) - and lib.is_float(other) - and np.isnan(other) - ): - # convert datetime to datetime64, timedelta to timedelta64 - other = convert_scalar_for_putitemlike(other, values.dtype) + # convert datetime to datetime64, timedelta to timedelta64 + other = convert_scalar_for_putitemlike(other, values.dtype) # By the time we get here, we should have all Series/Index # args extracted to ndarray @@ -2733,9 +2701,8 @@ def _putmask_preserve(nv, n): return _putmask_preserve(v, n) # change the dtype if needed - dtype, _ = maybe_promote(n.dtype) - - v = v.astype(dtype) + dtype = find_common_type([n.dtype, v.dtype]) + v = v.astype(dtype, copy=False) return _putmask_preserve(v, n) diff --git 
a/pandas/tests/frame/indexing/test_mask.py b/pandas/tests/frame/indexing/test_mask.py index 23f3a18881782..51ca2600275d6 100644 --- a/pandas/tests/frame/indexing/test_mask.py +++ b/pandas/tests/frame/indexing/test_mask.py @@ -74,12 +74,12 @@ def test_mask_callable(self): tm.assert_frame_equal(result, exp) tm.assert_frame_equal(result, (df + 2).mask((df + 2) > 8, (df + 2) + 10)) - def test_mask_dtype_conversion(self): + def test_mask_dtype_bool_conversion(self): # GH#3733 df = DataFrame(data=np.random.randn(100, 50)) df = df.where(df > 0) # create nans bools = df > 0 mask = isna(df) - expected = bools.astype(float).mask(mask) + expected = bools.astype(object).mask(mask) result = bools.mask(mask) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index d6d0723bee0e8..656b5f89bd7a4 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -242,41 +242,47 @@ def test_setitem_callable_other(self): @pytest.mark.parametrize( "obj,expected,key", [ - ( + pytest.param( # these induce dtype changes Series([2, 3, 4, 5, 6, 7, 8, 9, 10]), Series([np.nan, 3, np.nan, 5, np.nan, 7, np.nan, 9, np.nan]), slice(None, None, 2), + id="int_series_slice_key_step", ), - ( - # gets coerced to float, right? 
+ pytest.param( Series([True, True, False, False]), - Series([np.nan, 1, np.nan, 0]), + Series([np.nan, True, np.nan, False], dtype=object), slice(None, None, 2), + id="bool_series_slice_key_step", ), - ( + pytest.param( # these induce dtype changes Series(np.arange(10)), Series([np.nan, np.nan, np.nan, np.nan, np.nan, 5, 6, 7, 8, 9]), slice(None, 5), + id="int_series_slice_key", ), - ( + pytest.param( # changes dtype GH#4463 Series([1, 2, 3]), Series([np.nan, 2, 3]), 0, + id="int_series_int_key", ), - ( + pytest.param( # changes dtype GH#4463 Series([False]), - Series([np.nan]), + Series([np.nan], dtype=object), + # TODO: maybe go to float64 since we are changing the _whole_ Series? 0, + id="bool_series_int_key_change_all", ), - ( + pytest.param( # changes dtype GH#4463 Series([False, True]), - Series([np.nan, 1.0]), + Series([np.nan, True], dtype=object), 0, + id="bool_series_int_key", ), ], ) @@ -328,10 +334,8 @@ def test_series_where(self, obj, key, expected): tm.assert_series_equal(res, expected) def test_index_where(self, obj, key, expected, request): - if obj.dtype == bool: - msg = "Index/Series casting behavior inconsistent GH#38692" - mark = pytest.xfail(reason=msg) - request.node.add_marker(mark) + if Index(obj).dtype != obj.dtype: + pytest.skip("test not applicable for this dtype") mask = np.zeros(obj.shape, dtype=bool) mask[key] = True @@ -341,6 +345,9 @@ def test_index_where(self, obj, key, expected, request): @pytest.mark.xfail(reason="Index/Series casting behavior inconsistent GH#38692") def test_index_putmask(self, obj, key, expected): + if Index(obj).dtype != obj.dtype: + pytest.skip("test not applicable for this dtype") + mask = np.zeros(obj.shape, dtype=bool) mask[key] = True From c01661d1b41e960f0bb70f8b6f57a3215dd9ff82 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 17 Jan 2021 11:07:11 -0800 Subject: [PATCH 7/8] fix putmask --- pandas/core/array_algos/putmask.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git 
a/pandas/core/array_algos/putmask.py b/pandas/core/array_algos/putmask.py index 2a1b6f784a1f2..97998948f342e 100644 --- a/pandas/core/array_algos/putmask.py +++ b/pandas/core/array_algos/putmask.py @@ -106,8 +106,11 @@ def putmask_smart(values: np.ndarray, mask: np.ndarray, new) -> np.ndarray: # preserves dtype if possible return _putmask_preserve(values, new, mask) - # change the dtype if needed - dtype, _ = maybe_promote(new.dtype) + if values.dtype == bool and new.dtype.kind == "f": + dtype = object + else: + # change the dtype if needed + dtype, _ = maybe_promote(new.dtype) values = values.astype(dtype) From d2ac2b30169b330b46fd3d6ea7264d4e5a3a10e5 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 27 Jan 2021 08:34:31 -0800 Subject: [PATCH 8/8] whatsnew, test, maybe_promote->find_common_type --- doc/source/whatsnew/v1.3.0.rst | 39 ++++++++++++++++++++ pandas/core/array_algos/putmask.py | 9 +---- pandas/tests/series/indexing/test_setitem.py | 36 +++++++++++------- 3 files changed, 63 insertions(+), 21 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 82cbf1397031d..28f1102ab738e 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -96,6 +96,45 @@ Preserve dtypes in :meth:`~pandas.DataFrame.combine_first` combined.dtypes +.. _whatsnew_130.notable_bug_fixes.setitem_with_bool_casting: + +Consistent Casting With Setting Into Boolean Series +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Setting non-boolean values into a :class:`Series` with ``dtype=bool`` now consistently +casts to ``dtype=object`` (:issue:`38709`) + +.. ipython:: python + + orig = pd.Series([True, False]) + ser = orig.copy() + ser.iloc[1] = np.nan + ser2 = orig.copy() + ser2.iloc[1] = 2.0 + +*pandas 1.2.x* + +.. code-block:: ipython + + In [1]: ser + Out [1]: + 0 1.0 + 1 NaN + dtype: float64 + + In [2]: ser2 + Out [2]: + 0 True + 1 2.0 + dtype: object + +*pandas 1.3.0* + +.. ipython:: python + + ser + ser2 + ..
_whatsnew_130.api_breaking.deps: Increased minimum versions for dependencies diff --git a/pandas/core/array_algos/putmask.py b/pandas/core/array_algos/putmask.py index 97998948f342e..ca83692ad7ca4 100644 --- a/pandas/core/array_algos/putmask.py +++ b/pandas/core/array_algos/putmask.py @@ -9,7 +9,7 @@ from pandas._libs import lib from pandas._typing import ArrayLike -from pandas.core.dtypes.cast import convert_scalar_for_putitemlike, maybe_promote +from pandas.core.dtypes.cast import convert_scalar_for_putitemlike, find_common_type from pandas.core.dtypes.common import is_float_dtype, is_integer_dtype, is_list_like from pandas.core.dtypes.missing import isna_compat @@ -106,12 +106,7 @@ def putmask_smart(values: np.ndarray, mask: np.ndarray, new) -> np.ndarray: # preserves dtype if possible return _putmask_preserve(values, new, mask) - if values.dtype == bool and new.dtype.kind == "f": - dtype = object - else: - # change the dtype if needed - dtype, _ = maybe_promote(new.dtype) - + dtype = find_common_type([values.dtype, new.dtype]) values = values.astype(dtype) return _putmask_preserve(values, new, mask) diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index b773f8a87c0f8..a49fafe8e90bd 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -295,45 +295,53 @@ class TestSetitemCastingEquivalents: - the setitem does not expand the obj """ - def test_int_key(self, obj, key, expected, indexer_sli): + @pytest.fixture(params=[np.nan, np.float64("NaN")]) + def val(self, request): + """ + One python float NaN, one np.float64. Only np.float64 has a `dtype` + attribute. 
+ """ + return request.param + + def test_int_key(self, obj, key, expected, val, indexer_sli): if not isinstance(key, int): return obj = obj.copy() - indexer_sli(obj)[key] = np.nan + indexer_sli(obj)[key] = val tm.assert_series_equal(obj, expected) - def test_slice_key(self, obj, key, expected, indexer_si): + def test_slice_key(self, obj, key, expected, val, indexer_si): # Note: no .loc because that handles slice edges differently obj = obj.copy() - indexer_si(obj)[key] = np.nan + indexer_si(obj)[key] = val tm.assert_series_equal(obj, expected) - def test_intlist_key(self, obj, key, expected, indexer_sli): + def test_intlist_key(self, obj, key, expected, val, indexer_sli): ilkey = list(range(len(obj)))[key] obj = obj.copy() - indexer_sli(obj)[ilkey] = np.nan + indexer_sli(obj)[ilkey] = val tm.assert_series_equal(obj, expected) - def test_mask_key(self, obj, key, expected, indexer_sli): + def test_mask_key(self, obj, key, expected, val, indexer_sli): # setitem with boolean mask mask = np.zeros(obj.shape, dtype=bool) mask[key] = True obj = obj.copy() - indexer_sli(obj)[mask] = np.nan + indexer_sli(obj)[mask] = val tm.assert_series_equal(obj, expected) - def test_series_where(self, obj, key, expected): + def test_series_where(self, obj, key, expected, val): mask = np.zeros(obj.shape, dtype=bool) mask[key] = True obj = obj.copy() - res = obj.where(~mask, np.nan) + res = obj.where(~mask, val) tm.assert_series_equal(res, expected) - def test_index_where(self, obj, key, expected, request): + def test_index_where(self, obj, key, expected, val, request): if Index(obj).dtype != obj.dtype: pytest.skip("test not applicable for this dtype") @@ -346,18 +354,18 @@ def test_index_where(self, obj, key, expected, request): mark = pytest.mark.xfail(reason=msg) request.node.add_marker(mark) - res = Index(obj).where(~mask, np.nan) + res = Index(obj).where(~mask, val) tm.assert_index_equal(res, Index(expected)) @pytest.mark.xfail(reason="Index/Series casting behavior inconsistent 
GH#38692") - def test_index_putmask(self, obj, key, expected): + def test_index_putmask(self, obj, key, expected, val): if Index(obj).dtype != obj.dtype: pytest.skip("test not applicable for this dtype") mask = np.zeros(obj.shape, dtype=bool) mask[key] = True - res = Index(obj).putmask(mask, np.nan) + res = Index(obj).putmask(mask, val) tm.assert_index_equal(res, Index(expected))