From 47dab457d6efeef7a7721e78feb3a0a8ee5be022 Mon Sep 17 00:00:00 2001
From: Brock
Date: Wed, 3 May 2023 16:05:01 -0700
Subject: [PATCH 1/4] REF: avoid object dtype in mask_missing

---
 pandas/core/missing.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/pandas/core/missing.py b/pandas/core/missing.py
index 585ad50ad9069..c0a6fc4c0c08a 100644
--- a/pandas/core/missing.py
+++ b/pandas/core/missing.py
@@ -79,12 +79,15 @@ def mask_missing(arr: ArrayLike, values_to_mask) -> npt.NDArray[np.bool_]:
     # When called from Block.replace/replace_list, values_to_mask is a scalar
     # known to be holdable by arr.
     # When called from Series._single_replace, values_to_mask is tuple or list
-    dtype, values_to_mask = infer_dtype_from(values_to_mask)
-    # error: Argument "dtype" to "array" has incompatible type "Union[dtype[Any],
-    # ExtensionDtype]"; expected "Union[dtype[Any], None, type, _SupportsDType, str,
-    # Union[Tuple[Any, int], Tuple[Any, Union[int, Sequence[int]]], List[Any],
-    # _DTypeDict, Tuple[Any, Any]]]"
-    values_to_mask = np.array(values_to_mask, dtype=dtype)  # type: ignore[arg-type]
+    dtype, values_to_mask = infer_dtype_from(values_to_mask, pandas_dtype=True)
+
+    if isinstance(dtype, np.dtype):
+        values_to_mask = np.array(values_to_mask, dtype=dtype)
+    else:
+        cls = dtype.construct_array_type()
+        if not lib.is_list_like(values_to_mask):
+            values_to_mask = [values_to_mask]
+        values_to_mask = cls._from_sequence(values_to_mask, dtype=dtype, copy=False)

     potential_na = False
     if is_object_dtype(arr.dtype):

From 930ab29b3b961d671857fe3b3815e7d3decc7482 Mon Sep 17 00:00:00 2001
From: Brock
Date: Wed, 3 May 2023 16:52:40 -0700
Subject: [PATCH 2/4] REF: remove pandas_dtype kwarg from infer_dtype_from

---
 pandas/core/array_algos/putmask.py           |  2 +-
 pandas/core/dtypes/cast.py                   | 32 +++++---------
 pandas/core/indexes/base.py                  |  2 +-
 pandas/core/missing.py                       |  2 +-
 pandas/tests/dtypes/cast/test_infer_dtype.py | 30 ++++++++----------
 5 files changed, 23 insertions(+), 45 deletions(-)

diff --git a/pandas/core/array_algos/putmask.py b/pandas/core/array_algos/putmask.py
index 8c8d962e7782b..74cc30a4e030d 100644
--- a/pandas/core/array_algos/putmask.py
+++ b/pandas/core/array_algos/putmask.py
@@ -136,7 +136,7 @@ def setitem_datetimelike_compat(values: np.ndarray, num_set: int, other):
     other : Any
     """
     if values.dtype == object:
-        dtype, _ = infer_dtype_from(other, pandas_dtype=True)
+        dtype, _ = infer_dtype_from(other)

         if isinstance(dtype, np.dtype) and dtype.kind in "mM":
             # https://github.com/numpy/numpy/issues/12550
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index db8bdd08ee112..74906143c87cf 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -735,21 +735,17 @@ def _ensure_dtype_type(value, dtype: np.dtype):
     return dtype.type(value)


-def infer_dtype_from(val, pandas_dtype: bool = False) -> tuple[DtypeObj, Any]:
+def infer_dtype_from(val) -> tuple[DtypeObj, Any]:
     """
     Interpret the dtype from a scalar or array.

     Parameters
     ----------
     val : object
-    pandas_dtype : bool, default False
-        whether to infer dtype including pandas extension types.
-        If False, scalar/array belongs to pandas extension types
-        is inferred as object
     """
     if not is_list_like(val):
-        return infer_dtype_from_scalar(val, pandas_dtype=pandas_dtype)
-    return infer_dtype_from_array(val, pandas_dtype=pandas_dtype)
+        return infer_dtype_from_scalar(val, pandas_dtype=True)
+    return infer_dtype_from_array(val)


 def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> tuple[DtypeObj, Any]:
@@ -859,32 +855,18 @@ def dict_compat(d: dict[Scalar, Scalar]) -> dict[Scalar, Scalar]:
     return {maybe_box_datetimelike(key): value for key, value in d.items()}


-def infer_dtype_from_array(
-    arr, pandas_dtype: bool = False
-) -> tuple[DtypeObj, ArrayLike]:
+def infer_dtype_from_array(arr) -> tuple[DtypeObj, ArrayLike]:
     """
     Infer the dtype from an array.

     Parameters
     ----------
     arr : array
-    pandas_dtype : bool, default False
-        whether to infer dtype including pandas extension types.
-        If False, array belongs to pandas extension types
-        is inferred as object

     Returns
     -------
-    tuple (numpy-compat/pandas-compat dtype, array)
-
-    Notes
-    -----
-    if pandas_dtype=False. these infer to numpy dtypes
-    exactly with the exception that mixed / object dtypes
-    are not coerced by stringifying or conversion
+    tuple (pandas-compat dtype, array)

-    if pandas_dtype=True. datetime64tz-aware/categorical
-    types will retain there character.

     Examples
     --------
@@ -901,7 +883,7 @@ def infer_dtype_from_array(
         raise TypeError("'arr' must be list-like")

     arr_dtype = getattr(arr, "dtype", None)
-    if pandas_dtype and isinstance(arr_dtype, ExtensionDtype):
+    if isinstance(arr_dtype, ExtensionDtype):
         return arr.dtype, arr

     elif isinstance(arr, ABCSeries):
@@ -1303,7 +1285,7 @@ def find_result_type(left: ArrayLike, right: Any) -> DtypeObj:
         new_dtype = ensure_dtype_can_hold_na(left.dtype)

     else:
-        dtype, _ = infer_dtype_from(right, pandas_dtype=True)
+        dtype, _ = infer_dtype_from(right)

         new_dtype = find_common_type([left.dtype, dtype])

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index c977e4dc25584..841d8bb0749d0 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -6088,7 +6088,7 @@ def _find_common_type_compat(self, target) -> DtypeObj:
         Implementation of find_common_type that adjusts for Index-specific
         special cases.
         """
-        target_dtype, _ = infer_dtype_from(target, pandas_dtype=True)
+        target_dtype, _ = infer_dtype_from(target)

         # special case: if one dtype is uint64 and the other a signed int, return object
         # See https://github.com/pandas-dev/pandas/issues/26778 for discussion
diff --git a/pandas/core/missing.py b/pandas/core/missing.py
index c0a6fc4c0c08a..7762ba8e2c730 100644
--- a/pandas/core/missing.py
+++ b/pandas/core/missing.py
@@ -79,7 +79,7 @@ def mask_missing(arr: ArrayLike, values_to_mask) -> npt.NDArray[np.bool_]:
     # When called from Block.replace/replace_list, values_to_mask is a scalar
     # known to be holdable by arr.
     # When called from Series._single_replace, values_to_mask is tuple or list
-    dtype, values_to_mask = infer_dtype_from(values_to_mask, pandas_dtype=True)
+    dtype, values_to_mask = infer_dtype_from(values_to_mask)

     if isinstance(dtype, np.dtype):
         values_to_mask = np.array(values_to_mask, dtype=dtype)
diff --git a/pandas/tests/dtypes/cast/test_infer_dtype.py b/pandas/tests/dtypes/cast/test_infer_dtype.py
index 902130bf93d54..f1064260a1499 100644
--- a/pandas/tests/dtypes/cast/test_infer_dtype.py
+++ b/pandas/tests/dtypes/cast/test_infer_dtype.py
@@ -160,37 +160,33 @@ def test_infer_dtype_from_scalar(value, expected, pandas_dtype):
     assert is_dtype_equal(dtype, expected)

     with pytest.raises(TypeError, match="must be list-like"):
-        infer_dtype_from_array(value, pandas_dtype=pandas_dtype)
+        infer_dtype_from_array(value)


 @pytest.mark.parametrize(
-    "arr, expected, pandas_dtype",
+    "arr, expected",
     [
-        ([1], np.int_, False),
-        (np.array([1], dtype=np.int64), np.int64, False),
-        ([np.nan, 1, ""], np.object_, False),
-        (np.array([[1.0, 2.0]]), np.float_, False),
-        (Categorical(list("aabc")), np.object_, False),
-        (Categorical([1, 2, 3]), np.int64, False),
-        (Categorical(list("aabc")), "category", True),
-        (Categorical([1, 2, 3]), "category", True),
-        (date_range("20160101", periods=3), np.dtype("=M8[ns]"), False),
+        ([1], np.int_),
+        (np.array([1], dtype=np.int64), np.int64),
+        ([np.nan, 1, ""], np.object_),
+        (np.array([[1.0, 2.0]]), np.float_),
+        (Categorical(list("aabc")), "category"),
+        (Categorical([1, 2, 3]), "category"),
+        (date_range("20160101", periods=3), np.dtype("=M8[ns]")),
         (
             date_range("20160101", periods=3, tz="US/Eastern"),
             "datetime64[ns, US/Eastern]",
-            True,
         ),
-        (Series([1.0, 2, 3]), np.float64, False),
-        (Series(list("abc")), np.object_, False),
+        (Series([1.0, 2, 3]), np.float64),
+        (Series(list("abc")), np.object_),
         (
             Series(date_range("20160101", periods=3, tz="US/Eastern")),
             "datetime64[ns, US/Eastern]",
-            True,
         ),
     ],
 )
-def test_infer_dtype_from_array(arr, expected, pandas_dtype):
-    dtype, _ = infer_dtype_from_array(arr, pandas_dtype=pandas_dtype)
+def test_infer_dtype_from_array(arr, expected):
+    dtype, _ = infer_dtype_from_array(arr)
     assert is_dtype_equal(dtype, expected)



From 672eca85e04887164c6ef0238780fe19843878be Mon Sep 17 00:00:00 2001
From: Brock
Date: Wed, 3 May 2023 17:28:08 -0700
Subject: [PATCH 3/4] REF: remove pandas_dtype kwarg from infer_dtype_from_scalar

---
 pandas/core/dtypes/cast.py                   | 32 ++++------
 pandas/core/frame.py                         |  2 +-
 pandas/core/indexes/interval.py              |  2 +-
 pandas/core/internals/array_manager.py       | 10 +---
 pandas/core/internals/managers.py            |  2 +-
 pandas/tests/dtypes/cast/test_infer_dtype.py | 62 ++++++++------------
 6 files changed, 44 insertions(+), 66 deletions(-)

diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index 74906143c87cf..21c490d360f0d 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -631,7 +631,7 @@ def _maybe_promote(dtype: np.dtype, fill_value=np.nan):
     # returns tuple of (dtype, fill_value)

     if issubclass(dtype.type, np.datetime64):
-        inferred, fv = infer_dtype_from_scalar(fill_value, pandas_dtype=True)
+        inferred, fv = infer_dtype_from_scalar(fill_value)
         if inferred == dtype:
             return dtype, fv

@@ -645,7 +645,7 @@ def _maybe_promote(dtype: np.dtype, fill_value=np.nan):
             return _dtype_obj, fill_value

     elif issubclass(dtype.type, np.timedelta64):
-        inferred, fv = infer_dtype_from_scalar(fill_value, pandas_dtype=True)
+        inferred, fv = infer_dtype_from_scalar(fill_value)
         if inferred == dtype:
            return dtype, fv

@@ -744,20 +744,17 @@ def infer_dtype_from(val) -> tuple[DtypeObj, Any]:
     val : object
     """
     if not is_list_like(val):
-        return infer_dtype_from_scalar(val, pandas_dtype=True)
+        return infer_dtype_from_scalar(val)
     return infer_dtype_from_array(val)


-def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> tuple[DtypeObj, Any]:
+def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]:
     """
     Interpret the dtype from a scalar.

     Parameters
     ----------
-    pandas_dtype : bool, default False
-        whether to infer dtype including pandas extension types.
-        If False, scalar belongs to pandas extension types is inferred as
-        object
+    val : object
     """
     dtype: DtypeObj = _dtype_obj

@@ -792,11 +789,7 @@ def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> tuple[DtypeObj,
             dtype = val.dtype
             # TODO: test with datetime(2920, 10, 1) based on test_replace_dtypes
         else:
-            if pandas_dtype:
-                dtype = DatetimeTZDtype(unit="ns", tz=val.tz)
-            else:
-                # return datetimetz as object
-                return _dtype_obj, val
+            dtype = DatetimeTZDtype(unit="ns", tz=val.tz)

     elif isinstance(val, (np.timedelta64, dt.timedelta)):
         try:
@@ -830,12 +823,11 @@ def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> tuple[DtypeObj,
     elif is_complex(val):
         dtype = np.dtype(np.complex_)

-    elif pandas_dtype:
-        if lib.is_period(val):
-            dtype = PeriodDtype(freq=val.freq)
-        elif lib.is_interval(val):
-            subtype = infer_dtype_from_scalar(val.left, pandas_dtype=True)[0]
-            dtype = IntervalDtype(subtype=subtype, closed=val.closed)
+    if lib.is_period(val):
+        dtype = PeriodDtype(freq=val.freq)
+    elif lib.is_interval(val):
+        subtype = infer_dtype_from_scalar(val.left)[0]
+        dtype = IntervalDtype(subtype=subtype, closed=val.closed)

     return dtype, val

@@ -1448,7 +1440,7 @@ def construct_1d_arraylike_from_scalar(

     if dtype is None:
         try:
-            dtype, value = infer_dtype_from_scalar(value, pandas_dtype=True)
+            dtype, value = infer_dtype_from_scalar(value)
         except OutOfBoundsDatetime:
             dtype = _dtype_obj

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 636267e54027f..3ab548aec06e2 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -824,7 +824,7 @@ def __init__(
             columns = ensure_index(columns)

             if not dtype:
-                dtype, _ = infer_dtype_from_scalar(data, pandas_dtype=True)
+                dtype, _ = infer_dtype_from_scalar(data)

             # For data is a scalar extension dtype
             if isinstance(dtype, ExtensionDtype):
diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py
index 72c88d2967b35..50838f8c65881 100644
--- a/pandas/core/indexes/interval.py
+++ b/pandas/core/indexes/interval.py
@@ -558,7 +558,7 @@ def _maybe_convert_i8(self, key):

         if scalar:
             # Timestamp/Timedelta
-            key_dtype, key_i8 = infer_dtype_from_scalar(key, pandas_dtype=True)
+            key_dtype, key_i8 = infer_dtype_from_scalar(key)
             if lib.is_period(key):
                 key_i8 = key.ordinal
             elif isinstance(key_i8, Timestamp):
diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py
index c3be914aa095d..a38f4a5900e58 100644
--- a/pandas/core/internals/array_manager.py
+++ b/pandas/core/internals/array_manager.py
@@ -84,6 +84,7 @@
     new_block,
     to_native_types,
 )
+from pandas.core.internals.managers import make_na_array

 if TYPE_CHECKING:
     from pandas._typing import (
@@ -665,13 +666,8 @@ def _make_na_array(self, fill_value=None, use_na_proxy: bool = False):
             fill_value = np.nan

         dtype, fill_value = infer_dtype_from_scalar(fill_value)
-        # error: Argument "dtype" to "empty" has incompatible type "Union[dtype[Any],
-        # ExtensionDtype]"; expected "Union[dtype[Any], None, type, _SupportsDType, str,
-        # Union[Tuple[Any, int], Tuple[Any, Union[int, Sequence[int]]], List[Any],
-        # _DTypeDict, Tuple[Any, Any]]]"
-        values = np.empty(self.shape_proper[0], dtype=dtype)  # type: ignore[arg-type]
-        values.fill(fill_value)
-        return values
+        array_values = make_na_array(dtype, self.shape_proper[0], fill_value)
+        return array_values

     def _equal_values(self, other) -> bool:
         """
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
index 36dd0cece0f20..567ad2b741317 100644
--- a/pandas/core/internals/managers.py
+++ b/pandas/core/internals/managers.py
@@ -921,7 +921,7 @@ def _make_na_block(

         shape = (len(placement), self.shape[1])

-        dtype, fill_value = infer_dtype_from_scalar(fill_value, pandas_dtype=True)
+        dtype, fill_value = infer_dtype_from_scalar(fill_value)
         block_values = make_na_array(dtype, shape, fill_value)
         return new_block_2d(block_values, placement=placement)

diff --git a/pandas/tests/dtypes/cast/test_infer_dtype.py b/pandas/tests/dtypes/cast/test_infer_dtype.py
index f1064260a1499..53d0656a11f81 100644
--- a/pandas/tests/dtypes/cast/test_infer_dtype.py
+++ b/pandas/tests/dtypes/cast/test_infer_dtype.py
@@ -25,11 +25,6 @@
 )


-@pytest.fixture(params=[True, False])
-def pandas_dtype(request):
-    return request.param
-
-
 def test_infer_dtype_from_int_scalar(any_int_numpy_dtype):
     # Test that infer_dtype_from_scalar is
     # returning correct dtype for int and float.
@@ -81,36 +76,32 @@ def test_infer_dtype_from_timedelta(data):


 @pytest.mark.parametrize("freq", ["M", "D"])
-def test_infer_dtype_from_period(freq, pandas_dtype):
+def test_infer_dtype_from_period(freq):
     p = Period("2011-01-01", freq=freq)
-    dtype, val = infer_dtype_from_scalar(p, pandas_dtype=pandas_dtype)
+    dtype, val = infer_dtype_from_scalar(p)

-    if pandas_dtype:
-        exp_dtype = f"period[{freq}]"
-    else:
-        exp_dtype = np.object_
+    exp_dtype = f"period[{freq}]"

     assert dtype == exp_dtype
     assert val == p


-@pytest.mark.parametrize(
-    "data", [date(2000, 1, 1), "foo", Timestamp(1, tz="US/Eastern")]
-)
-def test_infer_dtype_misc(data):
-    dtype, val = infer_dtype_from_scalar(data)
+def test_infer_dtype_misc():
+    dt = date(2000, 1, 1)
+    dtype, val = infer_dtype_from_scalar(dt)
     assert dtype == np.object_

+    ts = Timestamp(1, tz="US/Eastern")
+    dtype, val = infer_dtype_from_scalar(ts)
+    assert dtype == "datetime64[ns, US/Eastern]"
+

 @pytest.mark.parametrize("tz", ["UTC", "US/Eastern", "Asia/Tokyo"])
-def test_infer_from_scalar_tz(tz, pandas_dtype):
+def test_infer_from_scalar_tz(tz):
     dt = Timestamp(1, tz=tz)
-    dtype, val = infer_dtype_from_scalar(dt, pandas_dtype=pandas_dtype)
+    dtype, val = infer_dtype_from_scalar(dt)

-    if pandas_dtype:
-        exp_dtype = f"datetime64[ns, {tz}]"
-    else:
-        exp_dtype = np.object_
+    exp_dtype = f"datetime64[ns, {tz}]"

     assert dtype == exp_dtype
     assert val == dt
@@ -126,11 +117,11 @@ def test_infer_from_scalar_tz(tz, pandas_dtype):
         (Timedelta(0), Timedelta(1), "timedelta64[ns]"),
     ],
 )
-def test_infer_from_interval(left, right, subtype, closed, pandas_dtype):
+def test_infer_from_interval(left, right, subtype, closed):
     # GH 30337
     interval = Interval(left, right, closed)
-    result_dtype, result_value = infer_dtype_from_scalar(interval, pandas_dtype)
-    expected_dtype = f"interval[{subtype}, {closed}]" if pandas_dtype else np.object_
+    result_dtype, result_value = infer_dtype_from_scalar(interval)
+    expected_dtype = f"interval[{subtype}, {closed}]"

     assert result_dtype == expected_dtype
     assert result_value == interval
@@ -143,20 +134,19 @@ def test_infer_dtype_from_scalar_errors():


 @pytest.mark.parametrize(
-    "value, expected, pandas_dtype",
+    "value, expected",
     [
-        ("foo", np.object_, False),
-        (b"foo", np.object_, False),
-        (1, np.int64, False),
-        (1.5, np.float_, False),
-        (np.datetime64("2016-01-01"), np.dtype("M8[ns]"), False),
-        (Timestamp("20160101"), np.dtype("M8[ns]"), False),
-        (Timestamp("20160101", tz="UTC"), np.object_, False),
-        (Timestamp("20160101", tz="UTC"), "datetime64[ns, UTC]", True),
+        ("foo", np.object_),
+        (b"foo", np.object_),
+        (1, np.int64),
+        (1.5, np.float_),
+        (np.datetime64("2016-01-01"), np.dtype("M8[ns]")),
+        (Timestamp("20160101"), np.dtype("M8[ns]")),
+        (Timestamp("20160101", tz="UTC"), "datetime64[ns, UTC]"),
     ],
 )
-def test_infer_dtype_from_scalar(value, expected, pandas_dtype):
-    dtype, _ = infer_dtype_from_scalar(value, pandas_dtype=pandas_dtype)
+def test_infer_dtype_from_scalar(value, expected):
+    dtype, _ = infer_dtype_from_scalar(value)
     assert is_dtype_equal(dtype, expected)

     with pytest.raises(TypeError, match="must be list-like"):

From 043efad5e2c860dde1a91576b96f39d92135796b Mon Sep 17 00:00:00 2001
From: Brock
Date: Wed, 3 May 2023 18:23:20 -0700
Subject: [PATCH 4/4] mypy fixup

---
 pandas/core/internals/array_manager.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py
index a38f4a5900e58..dbc80a69a5f69 100644
--- a/pandas/core/internals/array_manager.py
+++ b/pandas/core/internals/array_manager.py
@@ -666,7 +666,7 @@ def _make_na_array(self, fill_value=None, use_na_proxy: bool = False):
             fill_value = np.nan

         dtype, fill_value = infer_dtype_from_scalar(fill_value)
-        array_values = make_na_array(dtype, self.shape_proper[0], fill_value)
+        array_values = make_na_array(dtype, self.shape_proper[:1], fill_value)
        return array_values

     def _equal_values(self, other) -> bool:
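
A minimal usage sketch of the behaviour the series converges on (not part of the patches; it assumes a pandas build that already contains them, reaches into the internal module pandas.core.dtypes.cast, and the wrapper name build_values_to_mask is illustrative only, mirroring the branch PATCH 1/4 adds to mask_missing):

# Sketch: the new dispatch mask_missing uses for values_to_mask (pandas internals).
import numpy as np
import pandas as pd
from pandas.api.types import is_list_like
from pandas.core.dtypes.cast import infer_dtype_from  # internal, post-patch signature

def build_values_to_mask(values_to_mask):
    # infer_dtype_from no longer takes pandas_dtype; it always returns a
    # pandas-aware dtype (e.g. DatetimeTZDtype for a tz-aware Timestamp).
    dtype, values_to_mask = infer_dtype_from(values_to_mask)
    if isinstance(dtype, np.dtype):
        # plain numpy dtype: same np.array path as before
        return np.array(values_to_mask, dtype=dtype)
    # ExtensionDtype: build the matching ExtensionArray instead of an
    # object-dtype ndarray
    cls = dtype.construct_array_type()
    if not is_list_like(values_to_mask):
        values_to_mask = [values_to_mask]
    return cls._from_sequence(values_to_mask, dtype=dtype, copy=False)

# e.g. yields dtype datetime64[ns, UTC] rather than object
print(build_values_to_mask(pd.Timestamp("2016-01-01", tz="UTC")).dtype)

With that branch in place, replace-style masking of tz-aware, period, interval, or categorical values compares against a properly typed array, which is the point of avoiding object dtype in mask_missing.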