From cddc939715cd8a169ad7058e6a3fd5ff230ea488 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 15 May 2020 15:47:27 +0200 Subject: [PATCH 01/26] ENH: nullable Float32/64 ExtensionArray --- pandas/__init__.py | 2 + pandas/arrays/__init__.py | 2 + pandas/core/api.py | 1 + pandas/core/arrays/__init__.py | 2 + pandas/core/arrays/boolean.py | 5 +- pandas/core/arrays/floating.py | 638 ++++++++++++++++++ pandas/core/arrays/integer.py | 12 +- pandas/core/construction.py | 10 + pandas/core/dtypes/common.py | 2 + pandas/core/groupby/ops.py | 3 +- pandas/tests/api/test_api.py | 2 + pandas/tests/arrays/floating/__init__.py | 0 pandas/tests/arrays/floating/conftest.py | 36 + .../tests/arrays/floating/test_arithmetic.py | 234 +++++++ pandas/tests/arrays/floating/test_astype.py | 71 ++ .../tests/arrays/floating/test_comparison.py | 117 ++++ pandas/tests/arrays/floating/test_concat.py | 21 + .../arrays/floating/test_construction.py | 211 ++++++ pandas/tests/arrays/floating/test_function.py | 154 +++++ pandas/tests/arrays/floating/test_repr.py | 45 ++ pandas/tests/arrays/floating/test_to_numpy.py | 123 ++++ pandas/tests/arrays/integer/test_concat.py | 3 +- pandas/tests/arrays/test_array.py | 18 +- pandas/tests/extension/test_floating.py | 220 ++++++ 24 files changed, 1924 insertions(+), 8 deletions(-) create mode 100644 pandas/core/arrays/floating.py create mode 100644 pandas/tests/arrays/floating/__init__.py create mode 100644 pandas/tests/arrays/floating/conftest.py create mode 100644 pandas/tests/arrays/floating/test_arithmetic.py create mode 100644 pandas/tests/arrays/floating/test_astype.py create mode 100644 pandas/tests/arrays/floating/test_comparison.py create mode 100644 pandas/tests/arrays/floating/test_concat.py create mode 100644 pandas/tests/arrays/floating/test_construction.py create mode 100644 pandas/tests/arrays/floating/test_function.py create mode 100644 pandas/tests/arrays/floating/test_repr.py create mode 100644 
pandas/tests/arrays/floating/test_to_numpy.py create mode 100644 pandas/tests/extension/test_floating.py diff --git a/pandas/__init__.py b/pandas/__init__.py index d6584bf4f1c4f..3ef0876a4b5ae 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -59,6 +59,8 @@ UInt16Dtype, UInt32Dtype, UInt64Dtype, + Float32Dtype, + Float64Dtype, CategoricalDtype, PeriodDtype, IntervalDtype, diff --git a/pandas/arrays/__init__.py b/pandas/arrays/__init__.py index 61832a8b6d621..0fa070b6e4fc4 100644 --- a/pandas/arrays/__init__.py +++ b/pandas/arrays/__init__.py @@ -7,6 +7,7 @@ BooleanArray, Categorical, DatetimeArray, + FloatingArray, IntegerArray, IntervalArray, PandasArray, @@ -20,6 +21,7 @@ "BooleanArray", "Categorical", "DatetimeArray", + "FloatingArray", "IntegerArray", "IntervalArray", "PandasArray", diff --git a/pandas/core/api.py b/pandas/core/api.py index b0b65f9d0be34..27d4b44b35d0f 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -14,6 +14,7 @@ from pandas.core.algorithms import factorize, unique, value_counts from pandas.core.arrays import Categorical from pandas.core.arrays.boolean import BooleanDtype +from pandas.core.arrays.floating import Float32Dtype, Float64Dtype from pandas.core.arrays.integer import ( Int8Dtype, Int16Dtype, diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py index 1d538824e6d82..76077069cd2b1 100644 --- a/pandas/core/arrays/__init__.py +++ b/pandas/core/arrays/__init__.py @@ -6,6 +6,7 @@ from pandas.core.arrays.boolean import BooleanArray from pandas.core.arrays.categorical import Categorical from pandas.core.arrays.datetimes import DatetimeArray +from pandas.core.arrays.floating import FloatingArray from pandas.core.arrays.integer import IntegerArray, integer_array from pandas.core.arrays.interval import IntervalArray from pandas.core.arrays.numpy_ import PandasArray, PandasDtype @@ -21,6 +22,7 @@ "BooleanArray", "Categorical", "DatetimeArray", + "FloatingArray", "IntegerArray", "integer_array", 
"IntervalArray", diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 9bc3d1276c79d..ba077a0f1049e 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -604,10 +604,11 @@ def logical_method(self, other): @classmethod def _create_comparison_method(cls, op): def cmp_method(self, other): - from pandas.arrays import IntegerArray + from pandas.arrays import IntegerArray, FloatingArray if isinstance( - other, (ABCDataFrame, ABCSeries, ABCIndexClass, IntegerArray) + other, + (ABCDataFrame, ABCSeries, ABCIndexClass, IntegerArray, FloatingArray), ): # Rely on pandas to unbox and dispatch to us. return NotImplemented diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py new file mode 100644 index 0000000000000..1e8e5feea0617 --- /dev/null +++ b/pandas/core/arrays/floating.py @@ -0,0 +1,638 @@ +import numbers +from typing import TYPE_CHECKING, List, Optional, Tuple, Type, Union +import warnings + +import numpy as np + +from pandas._libs import lib, missing as libmissing +from pandas._typing import ArrayLike, DtypeObj +from pandas.compat import set_function_name +from pandas.compat.numpy import function as nv +from pandas.util._decorators import cache_readonly + +from pandas.core.dtypes.cast import astype_nansafe +from pandas.core.dtypes.common import ( + is_bool_dtype, + is_datetime64_dtype, + is_float, + is_float_dtype, + is_integer, + is_integer_dtype, + is_list_like, + is_object_dtype, + pandas_dtype, +) +from pandas.core.dtypes.dtypes import register_extension_dtype +from pandas.core.dtypes.missing import isna + +from pandas.core import nanops, ops +from pandas.core.array_algos import masked_reductions +from pandas.core.ops import invalid_comparison +from pandas.core.ops.common import unpack_zerodim_and_defer +from pandas.core.tools.numeric import to_numeric + +from .masked import BaseMaskedArray, BaseMaskedDtype + +if TYPE_CHECKING: + import pyarrow # noqa: F401 + + +class 
_FloatingDtype(BaseMaskedDtype): + """ + An ExtensionDtype to hold a single size of floating dtype. + + These specific implementations are subclasses of the non-public + _FloatingDtype. For example we have Float32Dtype to represent float32. + + The attributes name & type are set when these subclasses are created. + """ + + name: str + base = None + type: Type + + def __repr__(self) -> str: + return f"{self.name}Dtype()" + + @property + def _is_numeric(self) -> bool: + return True + + @cache_readonly + def numpy_dtype(self) -> np.dtype: + """ Return an instance of our numpy dtype """ + return np.dtype(self.type) + + @cache_readonly + def kind(self) -> str: + return self.numpy_dtype.kind + + @cache_readonly + def itemsize(self) -> int: + """ Return the number of bytes in this dtype """ + return self.numpy_dtype.itemsize + + @classmethod + def construct_array_type(cls) -> Type["FloatingArray"]: + """ + Return the array type associated with this dtype. + + Returns + ------- + type + """ + return FloatingArray + + def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: + # for now only handle other floating types + if not all(isinstance(t, _FloatingDtype) for t in dtypes): + return None + np_dtype = np.find_common_type( + [t.numpy_dtype for t in dtypes], [] # type: ignore + ) + if np.issubdtype(np_dtype, np.floating): + return _dtypes[str(np_dtype)] + return None + + def __from_arrow__( + self, array: Union["pyarrow.Array", "pyarrow.ChunkedArray"] + ) -> "FloatingArray": + """ + Construct FloatingArray from pyarrow Array/ChunkedArray. 
+ """ + import pyarrow # noqa: F811 + from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask + + pyarrow_type = pyarrow.from_numpy_dtype(self.type) + if not array.type.equals(pyarrow_type): + array = array.cast(pyarrow_type) + + if isinstance(array, pyarrow.Array): + chunks = [array] + else: + # pyarrow.ChunkedArray + chunks = array.chunks + + results = [] + for arr in chunks: + data, mask = pyarrow_array_to_numpy_and_mask(arr, dtype=self.type) + float_arr = FloatingArray(data.copy(), ~mask, copy=False) + results.append(float_arr) + + return FloatingArray._concat_same_type(results) + + +def coerce_to_array( + values, dtype=None, mask=None, copy: bool = False, +) -> Tuple[np.ndarray, np.ndarray]: + """ + Coerce the input values array to numpy arrays with a mask + + Parameters + ---------- + values : 1D list-like + dtype : float dtype + mask : bool 1D array, optional + copy : bool, default False + if True, copy the input + + Returns + ------- + tuple of (values, mask) + """ + # if values is floating numpy array, preserve it's dtype + if dtype is None and hasattr(values, "dtype"): + if is_float_dtype(values.dtype): + dtype = values.dtype + + if dtype is not None: + if isinstance(dtype, str) and dtype.startswith("Float"): + # Avoid DeprecationWarning from NumPy about np.dtype("Float64") + # https://github.com/numpy/numpy/pull/7476 + dtype = dtype.lower() + + if not issubclass(type(dtype), _FloatingDtype): + try: + dtype = _dtypes[str(np.dtype(dtype))] + except KeyError as err: + raise ValueError(f"invalid dtype specified {dtype}") from err + + if isinstance(values, FloatingArray): + values, mask = values._data, values._mask + if dtype is not None: + values = values.astype(dtype.numpy_dtype, copy=False) + + if copy: + values = values.copy() + mask = mask.copy() + return values, mask + + values = np.array(values, copy=copy) + if is_object_dtype(values): + inferred_type = lib.infer_dtype(values, skipna=True) + if inferred_type == "empty": + values = 
np.empty(len(values)) + values.fill(np.nan) + elif inferred_type not in [ + "floating", + "integer", + "mixed-integer", + "integer-na", + "mixed-integer-float", + ]: + raise TypeError(f"{values.dtype} cannot be converted to a FloatingDtype") + + elif is_bool_dtype(values) and is_float_dtype(dtype): + values = np.array(values, dtype=float, copy=copy) + + elif not (is_integer_dtype(values) or is_float_dtype(values)): + raise TypeError(f"{values.dtype} cannot be converted to a FloatingDtype") + + if mask is None: + mask = isna(values) + else: + assert len(mask) == len(values) + + if not values.ndim == 1: + raise TypeError("values must be a 1D list-like") + if not mask.ndim == 1: + raise TypeError("mask must be a 1D list-like") + + # infer dtype if needed + if dtype is None: + dtype = np.dtype("float64") + else: + dtype = dtype.type + + # if we are float, let's make sure that we can + # safely cast + + # we copy as need to coerce here + # TODO should this be a safe cast? + if mask.any(): + values = values.copy() + values[mask] = np.nan + values = values.astype(dtype, copy=False) # , casting="safe") + else: + values = values.astype(dtype, copy=False) # , casting="safe") + + return values, mask + + +class FloatingArray(BaseMaskedArray): + """ + Array of floating (optional missing) values. + + .. versionadded:: 1.1.0 + + .. warning:: + + FloatingArray is currently experimental, and its API or internal + implementation may change without warning. Especially the behaviour + regarding NaN (distinct from NA missing values) is subject to change. + + We represent a FloatingArray with 2 numpy arrays: + + - data: contains a numpy float array of the appropriate dtype + - mask: a boolean array holding a mask on the data, True is missing + + To construct a FloatingArray from generic array-like input, use + :func:`pandas.array` with one of the float dtypes (see examples). + + See :ref:`integer_na` for more. 
+ + Parameters + ---------- + values : numpy.ndarray + A 1-d float-dtype array. + mask : numpy.ndarray + A 1-d boolean-dtype array indicating missing values. + copy : bool, default False + Whether to copy the `values` and `mask`. + + Attributes + ---------- + None + + Methods + ------- + None + + Returns + ------- + FloatingArray + + Examples + -------- + Create a FloatingArray with :func:`pandas.array`: + + >>> pd.array([0.1, None, 0.3], dtype=pd.Float32Dtype()) + <FloatingArray> + [0.1, <NA>, 0.3] + Length: 3, dtype: Float32 + + String aliases for the dtypes are also available. They are capitalized. + + >>> pd.array([0.1, None, 0.3], dtype="Float32") + <FloatingArray> + [0.1, <NA>, 0.3] + Length: 3, dtype: Float32 + """ + + # The value used to fill '_data' to avoid upcasting + _internal_fill_value = 0.0 + + @cache_readonly + def dtype(self) -> _FloatingDtype: + return _dtypes[str(self._data.dtype)] + + def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): + if not (isinstance(values, np.ndarray) and values.dtype.kind == "f"): + raise TypeError( + "values should be floating numpy array. Use " + "the 'pd.array' function instead" + ) + super().__init__(values, mask, copy=copy) + + @classmethod + def _from_sequence(cls, scalars, dtype=None, copy: bool = False) -> "FloatingArray": + values, mask = coerce_to_array(scalars, dtype=dtype, copy=copy) + return FloatingArray(values, mask) + + @classmethod + def _from_sequence_of_strings( + cls, strings, dtype=None, copy: bool = False + ) -> "FloatingArray": + scalars = to_numeric(strings, errors="raise") + return cls._from_sequence(scalars, dtype, copy) + + _HANDLED_TYPES = (np.ndarray, numbers.Number) + + def __array_ufunc__(self, ufunc, method: str, *inputs, **kwargs): + # For FloatingArray inputs, we apply the ufunc to ._data + # and mask the result. + if method == "reduce": + # Not clear how to handle missing values in reductions. Raise. 
+ raise NotImplementedError("The 'reduce' method is not supported.") + out = kwargs.get("out", ()) + + for x in inputs + out: + if not isinstance(x, self._HANDLED_TYPES + (FloatingArray,)): + return NotImplemented + + # for binary ops, use our custom dunder methods + result = ops.maybe_dispatch_ufunc_to_dunder_op( + self, ufunc, method, *inputs, **kwargs + ) + if result is not NotImplemented: + return result + + mask = np.zeros(len(self), dtype=bool) + inputs2 = [] + for x in inputs: + if isinstance(x, FloatingArray): + mask |= x._mask + inputs2.append(x._data) + else: + inputs2.append(x) + + def reconstruct(x): + # we don't worry about scalar `x` here, since we + # raise for reduce up above. + + # TODO + if is_float_dtype(x.dtype): + m = mask.copy() + return FloatingArray(x, m) + else: + x[mask] = np.nan + return x + + result = getattr(ufunc, method)(*inputs2, **kwargs) + if isinstance(result, tuple): + return tuple(reconstruct(x) for x in result) # e.g. np.modf + else: + return reconstruct(result) + + def _coerce_to_array(self, value) -> Tuple[np.ndarray, np.ndarray]: + return coerce_to_array(value, dtype=self.dtype) + + def astype(self, dtype, copy: bool = True) -> ArrayLike: + """ + Cast to a NumPy array or ExtensionArray with 'dtype'. + + Parameters + ---------- + dtype : str or dtype + Typecode or data-type to which the array is cast. + copy : bool, default True + Whether to copy the data, even if not necessary. If False, + a copy is made only if the old dtype does not match the + new dtype. + + Returns + ------- + ndarray or ExtensionArray + NumPy ndarray, or BooleanArray, IntegerArray or FloatingArray with + 'dtype' for its dtype. 
+ + Raises + ------ + TypeError + if incompatible type with an FloatingDtype, equivalent of same_kind + casting + """ + from pandas.core.arrays.boolean import BooleanArray, BooleanDtype + from pandas.core.arrays.integer import _IntegerDtype, IntegerArray + + dtype = pandas_dtype(dtype) + + # if we are astyping to an existing FloatingDtype we can fastpath + if isinstance(dtype, _FloatingDtype): + result = self._data.astype(dtype.numpy_dtype, copy=False) + return type(self)(result, mask=self._mask, copy=False) + # astyping to other known masked dtypes + elif isinstance(dtype, _IntegerDtype): + # TODO deal with NaNs + result = self._data.astype(dtype.numpy_dtype, copy=False) + # TODO should mask be copied here? + return IntegerArray(result, mask=self._mask, copy=False) + elif isinstance(dtype, BooleanDtype): + result = self._data.astype("bool", copy=False) + return BooleanArray(result, mask=self._mask, copy=False) + + # coerce + if is_float_dtype(dtype): + # In astype, we consider dtype=float to also mean na_value=np.nan + kwargs = dict(na_value=np.nan) + elif is_datetime64_dtype(dtype): + kwargs = dict(na_value=np.datetime64("NaT")) + else: + kwargs = {} + + data = self.to_numpy(dtype=dtype, **kwargs) + return astype_nansafe(data, dtype, copy=False) + + def _values_for_argsort(self) -> np.ndarray: + return self._data + + @classmethod + def _create_comparison_method(cls, op): + op_name = op.__name__ + + @unpack_zerodim_and_defer(op.__name__) + def cmp_method(self, other): + from pandas.arrays import BooleanArray, IntegerArray + + mask = None + + if isinstance(other, (BooleanArray, IntegerArray, FloatingArray)): + other, mask = other._data, other._mask + + elif is_list_like(other): + other = np.asarray(other) + if other.ndim > 1: + raise NotImplementedError( + "can only perform ops with 1-d structures" + ) + + if other is libmissing.NA: + # numpy does not handle pd.NA well as "other" scalar (it returns + # a scalar False instead of an array) + # This may be fixed by 
NA.__array_ufunc__. Revisit this check + # once that's implemented. + result = np.zeros(self._data.shape, dtype="bool") + mask = np.ones(self._data.shape, dtype="bool") + else: + with warnings.catch_warnings(): + # numpy may show a FutureWarning: + # elementwise comparison failed; returning scalar instead, + # but in the future will perform elementwise comparison + # before returning NotImplemented. We fall back to the correct + # behavior today, so that should be fine to ignore. + warnings.filterwarnings("ignore", "elementwise", FutureWarning) + with np.errstate(all="ignore"): + method = getattr(self._data, f"__{op_name}__") + result = method(other) + + if result is NotImplemented: + result = invalid_comparison(self._data, other, op) + + # nans propagate + if mask is None: + mask = self._mask.copy() + else: + mask = self._mask | mask + + return BooleanArray(result, mask) + + name = f"__{op.__name__}__" + return set_function_name(cmp_method, name, cls) + + def _reduce(self, name: str, skipna: bool = True, **kwargs): + data = self._data + mask = self._mask + + if name in {"sum", "prod", "min", "max"}: + op = getattr(masked_reductions, name) + return op(data, mask, skipna=skipna, **kwargs) + + # coerce to a nan-aware float if needed + # (we explicitly use NaN within reductions) + if self._hasna: + data = self.to_numpy("float64", na_value=np.nan) + + op = getattr(nanops, "nan" + name) + result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs) + + if np.isnan(result): + return libmissing.NA + + return result + + def sum(self, skipna=True, min_count=0, **kwargs): + nv.validate_sum((), kwargs) + result = masked_reductions.sum( + values=self._data, mask=self._mask, skipna=skipna, min_count=min_count + ) + return result + + def _maybe_mask_result(self, result, mask, other, op_name: str): + """ + Parameters + ---------- + result : array-like + mask : array-like bool + other : scalar or array-like + op_name : str + """ + # TODO are there cases we don't end up with 
float? + # if we have a float operand we are by-definition + # a float result + # or our op is a divide + # if (is_float_dtype(other) or is_float(other)) or ( + # op_name in ["rtruediv", "truediv"] + # ): + # result[mask] = np.nan + # return result + + return type(self)(result, mask, copy=False) + + @classmethod + def _create_arithmetic_method(cls, op): + op_name = op.__name__ + + @unpack_zerodim_and_defer(op.__name__) + def floating_arithmetic_method(self, other): + from pandas.arrays import IntegerArray + + omask = None + + if getattr(other, "ndim", 0) > 1: + raise NotImplementedError("can only perform ops with 1-d structures") + + if isinstance(other, (IntegerArray, FloatingArray)): + other, omask = other._data, other._mask + + elif is_list_like(other): + other = np.asarray(other) + if other.ndim > 1: + raise NotImplementedError( + "can only perform ops with 1-d structures" + ) + if len(self) != len(other): + raise ValueError("Lengths must match") + if not (is_float_dtype(other) or is_integer_dtype(other)): + raise TypeError("can only perform ops with numeric values") + + else: + if not (is_float(other) or is_integer(other) or other is libmissing.NA): + raise TypeError("can only perform ops with numeric values") + + if omask is None: + mask = self._mask.copy() + if other is libmissing.NA: + mask |= True + else: + mask = self._mask | omask + + if op_name == "pow": + # 1 ** x is 1. + mask = np.where((self._data == 1) & ~self._mask, False, mask) + # x ** 0 is 1. + if omask is not None: + mask = np.where((other == 0) & ~omask, False, mask) + elif other is not libmissing.NA: + mask = np.where(other == 0, False, mask) + + elif op_name == "rpow": + # 1 ** x is 1. + if omask is not None: + mask = np.where((other == 1) & ~omask, False, mask) + elif other is not libmissing.NA: + mask = np.where(other == 1, False, mask) + # x ** 0 is 1. 
+ mask = np.where((self._data == 0) & ~self._mask, False, mask) + + if other is libmissing.NA: + result = np.ones_like(self._data) + else: + with np.errstate(all="ignore"): + result = op(self._data, other) + + # divmod returns a tuple + if op_name == "divmod": + div, mod = result + return ( + self._maybe_mask_result(div, mask, other, "floordiv"), + self._maybe_mask_result(mod, mask, other, "mod"), + ) + + return self._maybe_mask_result(result, mask, other, op_name) + + name = f"__{op.__name__}__" + return set_function_name(floating_arithmetic_method, name, cls) + + +FloatingArray._add_arithmetic_ops() +FloatingArray._add_comparison_ops() + + +_dtype_docstring = """ +An ExtensionDtype for {dtype} data. + +.. versionchanged:: 1.0.0 + + Now uses :attr:`pandas.NA` as its missing value, + rather than :attr:`numpy.nan`. + +Attributes +---------- +None + +Methods +------- +None +""" + +# create the Dtype + + +@register_extension_dtype +class Float32Dtype(_FloatingDtype): + type = np.float32 + name = "Float32" + __doc__ = _dtype_docstring.format(dtype="float32") + + +@register_extension_dtype +class Float64Dtype(_FloatingDtype): + type = np.float64 + name = "Float64" + __doc__ = _dtype_docstring.format(dtype="float64") + + +_dtypes = { + "float32": Float32Dtype(), + "float64": Float64Dtype(), +} diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 59954f548fd33..f0c38783457bf 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -101,6 +101,10 @@ def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: ) if np.issubdtype(np_dtype, np.integer): return _dtypes[str(np_dtype)] + if np.issubdtype(np_dtype, np.floating): + from pandas.core.arrays.floating import _dtypes as float_dtypes + + return float_dtypes[str(np_dtype)] return None def __from_arrow__( @@ -443,6 +447,7 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike: casting """ from pandas.core.arrays.boolean import BooleanArray, 
BooleanDtype + from pandas.core.arrays.floating import FloatingArray, _FloatingDtype dtype = pandas_dtype(dtype) @@ -453,6 +458,9 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike: elif isinstance(dtype, BooleanDtype): result = self._data.astype("bool", copy=False) return BooleanArray(result, mask=self._mask, copy=False) + elif isinstance(dtype, _FloatingDtype): + result = self._data.astype(dtype.numpy_dtype, copy=False) + return FloatingArray(result, mask=self._mask, copy=False) # coerce if is_float_dtype(dtype): @@ -491,11 +499,11 @@ def _create_comparison_method(cls, op): @unpack_zerodim_and_defer(op.__name__) def cmp_method(self, other): - from pandas.arrays import BooleanArray + from pandas.arrays import BooleanArray, FloatingArray mask = None - if isinstance(other, (BooleanArray, IntegerArray)): + if isinstance(other, (BooleanArray, IntegerArray, FloatingArray)): other, mask = other._data, other._mask elif is_list_like(other): diff --git a/pandas/core/construction.py b/pandas/core/construction.py index b110a316a76d9..1204dfc144cd1 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -101,6 +101,7 @@ def array( :class:`datetime.datetime` :class:`pandas.arrays.DatetimeArray` :class:`datetime.timedelta` :class:`pandas.arrays.TimedeltaArray` :class:`int` :class:`pandas.arrays.IntegerArray` + :class:`float` :class:`pandas.arrays.FloatingArray` :class:`str` :class:`pandas.arrays.StringArray` :class:`bool` :class:`pandas.arrays.BooleanArray` ============================== ===================================== @@ -113,6 +114,11 @@ def array( string dtype for string data, and nullable-boolean dtype for boolean data. + .. versionchanged:: 1.1.0 + + Pandas now also infers nullable-floating dtype for float-like + input data + copy : bool, default True Whether to copy the data, even if not necessary. 
Depending on the type of `data`, creating the new array may require @@ -257,6 +263,7 @@ def array( from pandas.core.arrays import ( period_array, BooleanArray, + FloatingArray, IntegerArray, IntervalArray, PandasArray, @@ -319,6 +326,9 @@ def array( elif inferred_dtype == "integer": return IntegerArray._from_sequence(data, copy=copy) + elif inferred_dtype == "floating": + return FloatingArray._from_sequence(data, copy=copy) + elif inferred_dtype == "boolean": return BooleanArray._from_sequence(data, copy=copy) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index a4a5ae1bfefff..868337f92f65a 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -83,6 +83,8 @@ def ensure_float(arr): float_arr : The original array cast to the float dtype if possible. Otherwise, the original array is returned. """ + if is_extension_array_dtype(arr.dtype): + return arr.to_numpy(dtype="float64", na_value=np.nan) if issubclass(arr.dtype.type, (np.integer, np.bool_)): arr = arr.astype(float) return arr diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 74db87f46c5e2..2b8944d715e5f 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -20,6 +20,7 @@ from pandas.core.dtypes.cast import maybe_cast_result from pandas.core.dtypes.common import ( + ensure_float, ensure_float64, ensure_int64, ensure_int_or_float, @@ -494,7 +495,7 @@ def _cython_operation( else: values = ensure_int_or_float(values) elif is_numeric and not is_complex_dtype(values): - values = ensure_float64(values) + values = ensure_float64(ensure_float(values)) else: values = values.astype(object) diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index ecd20796b6f21..9a89476886ceb 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -91,6 +91,8 @@ class TestPDApi(Base): "UInt16Dtype", "UInt32Dtype", "UInt64Dtype", + "Float32Dtype", + "Float64Dtype", "NamedAgg", ] diff --git 
a/pandas/tests/arrays/floating/__init__.py b/pandas/tests/arrays/floating/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/arrays/floating/conftest.py b/pandas/tests/arrays/floating/conftest.py new file mode 100644 index 0000000000000..1e80518e15941 --- /dev/null +++ b/pandas/tests/arrays/floating/conftest.py @@ -0,0 +1,36 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas.core.arrays.floating import Float32Dtype, Float64Dtype + + +@pytest.fixture(params=[Float32Dtype, Float64Dtype]) +def dtype(request): + return request.param() + + +@pytest.fixture +def data(dtype): + return pd.array( + list(np.arange(0.1, 0.9, 0.1)) + + [pd.NA] + + list(np.arange(1, 9.8, 0.1)) + + [pd.NA] + + [9.9, 10.0], + dtype=dtype, + ) + + +@pytest.fixture +def data_missing(dtype): + return pd.array([np.nan, 0.1], dtype=dtype) + + +@pytest.fixture(params=["data", "data_missing"]) +def all_data(request, data, data_missing): + """Parametrized fixture giving 'data' and 'data_missing'""" + if request.param == "data": + return data + elif request.param == "data_missing": + return data_missing diff --git a/pandas/tests/arrays/floating/test_arithmetic.py b/pandas/tests/arrays/floating/test_arithmetic.py new file mode 100644 index 0000000000000..29813c2fbfa50 --- /dev/null +++ b/pandas/tests/arrays/floating/test_arithmetic.py @@ -0,0 +1,234 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.core.arrays import FloatingArray, IntegerArray +from pandas.tests.extension.base import BaseOpsUtil + + +class TestArithmeticOps(BaseOpsUtil): + def _check_divmod_op(self, s, op, other, exc=None): + super()._check_divmod_op(s, op, other, None) + + def _check_op(self, s, op_name, other, exc=None): + op = self.get_op_from_name(op_name) + result = op(s, other) + + # compute expected + mask = s.isna() + + # if s is a DataFrame, squeeze to a Series + # for comparison + if isinstance(s, 
pd.DataFrame): + result = result.squeeze() + s = s.squeeze() + mask = mask.squeeze() + + # other array is an Integer + if isinstance(other, (IntegerArray, FloatingArray)): + omask = getattr(other, "mask", None) + mask = getattr(other, "data", other) + if omask is not None: + mask |= omask + + # 1 ** na is na, so need to unmask those + if op_name == "__pow__": + mask = np.where(~s.isna() & (s == 1), False, mask) + + elif op_name == "__rpow__": + other_is_one = other == 1 + if isinstance(other_is_one, pd.Series): + other_is_one = other_is_one.fillna(False) + mask = np.where(other_is_one, False, mask) + + rs = s.astype(s.dtype.numpy_dtype) + expected = op(rs, other).astype(s.dtype) + expected[mask] = np.nan + if "floordiv" in op_name: + # Series op sets 1//0 to np.inf, which IntegerArray does not do (yet) + mask2 = np.isinf(expected) & np.isnan(result) + expected[mask2] = np.nan + tm.assert_series_equal(result, expected) + + def test_arith_floating_array(self, data, all_arithmetic_operators): + # we operate with a rhs of an floating array + + op = all_arithmetic_operators + + s = pd.Series(data) + rhs = pd.Series([1] * len(data), dtype=data.dtype) + rhs.iloc[-1] = np.nan + + self._check_op(s, op, rhs) + + def test_arith_series_with_scalar(self, data, all_arithmetic_operators): + # scalar + op = all_arithmetic_operators + s = pd.Series(data) + self._check_op(s, op, 1, exc=TypeError) + + def test_arith_frame_with_scalar(self, data, all_arithmetic_operators): + # frame & scalar + op = all_arithmetic_operators + df = pd.DataFrame({"A": data}) + self._check_op(df, op, 1, exc=TypeError) + + def test_arith_series_with_array(self, data, all_arithmetic_operators): + # ndarray & other series + op = all_arithmetic_operators + s = pd.Series(data) + other = np.ones(len(s), dtype=s.dtype.type) + self._check_op(s, op, other, exc=TypeError) + + def test_arith_len_mismatch(self, all_arithmetic_operators): + # operating with a list-like with non-matching length raises + op = 
self.get_op_from_name(all_arithmetic_operators) + other = np.array([1.0]) + + s = pd.Series([1, 2, 3], dtype="Float64") + with pytest.raises(ValueError, match="Lengths must match"): + op(s, other) + + @pytest.mark.parametrize("other", [0, 0.5]) + def test_arith_zero_dim_ndarray(self, other): + arr = pd.array([1, None, 2], dtype="Float64") + result = arr + np.array(other) + expected = arr + other + tm.assert_equal(result, expected) + + def test_error(self, data, all_arithmetic_operators): + # invalid ops + + op = all_arithmetic_operators + s = pd.Series(data) + ops = getattr(s, op) + opa = getattr(data, op) + + # invalid scalars + msg = ( + r"(:?can only perform ops with numeric values)" + r"|(:?FloatingArray cannot perform the operation mod)" + ) + with pytest.raises(TypeError, match=msg): + ops("foo") + with pytest.raises(TypeError, match=msg): + ops(pd.Timestamp("20180101")) + + # invalid array-likes + with pytest.raises(TypeError, match=msg): + ops(pd.Series("foo", index=s.index)) + + if op != "__rpow__": + # TODO(extension) + # rpow with a datetimelike coerces the integer array incorrectly + msg = ( + "can only perform ops with numeric values|" + "cannot perform .* with this index type: DatetimeArray|" + "Addition/subtraction of integers and integer-arrays " + "with DatetimeArray is no longer supported. 
*" + ) + with pytest.raises(TypeError, match=msg): + ops(pd.Series(pd.date_range("20180101", periods=len(s)))) + + # 2d + result = opa(pd.DataFrame({"A": s})) + assert result is NotImplemented + + msg = r"can only perform ops with 1-d structures" + with pytest.raises(NotImplementedError, match=msg): + opa(np.arange(len(s)).reshape(-1, len(s))) + + @pytest.mark.parametrize("zero, negative", [(0, False), (0.0, False), (-0.0, True)]) + def test_divide_by_zero(self, zero, negative): + # TODO pending NA/NaN discussion + # https://github.com/pandas-dev/pandas/issues/32265/ + a = pd.array([0, 1, -1, None], dtype="Float64") + result = a / zero + expected = FloatingArray( + np.array([np.nan, np.inf, -np.inf, np.nan]), + np.array([False, False, False, True]), + ) + if negative: + expected *= -1 + tm.assert_extension_array_equal(result, expected) + + def test_pow_scalar(self): + a = pd.array([-1, 0, 1, None, 2], dtype="Float64") + result = a ** 0 + expected = pd.array([1, 1, 1, 1, 1], dtype="Float64") + tm.assert_extension_array_equal(result, expected) + + result = a ** 1 + expected = pd.array([-1, 0, 1, None, 2], dtype="Float64") + tm.assert_extension_array_equal(result, expected) + + result = a ** pd.NA + expected = pd.array([None, None, 1, None, None], dtype="Float64") + tm.assert_extension_array_equal(result, expected) + + result = a ** np.nan + # TODO np.nan should be converted to pd.NA / missing before operation? + expected = FloatingArray( + np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype="float64"), mask=a._mask + ) + tm.assert_extension_array_equal(result, expected) + + # reversed + a = a[1:] # Can't raise integers to negative powers. 
+ + result = 0 ** a + expected = pd.array([1, 0, None, 0], dtype="Float64") + tm.assert_extension_array_equal(result, expected) + + result = 1 ** a + expected = pd.array([1, 1, 1, 1], dtype="Float64") + tm.assert_extension_array_equal(result, expected) + + result = pd.NA ** a + expected = pd.array([1, None, None, None], dtype="Float64") + tm.assert_extension_array_equal(result, expected) + + result = np.nan ** a + expected = FloatingArray( + np.array([1, np.nan, np.nan, np.nan], dtype="float64"), mask=a._mask + ) + tm.assert_extension_array_equal(result, expected) + + def test_pow_array(self): + a = pd.array([0, 0, 0, 1, 1, 1, None, None, None], dtype="Float64") + b = pd.array([0, 1, None, 0, 1, None, 0, 1, None], dtype="Float64") + result = a ** b + expected = pd.array([1, 0, None, 1, 1, 1, 1, None, None], dtype="Float64") + tm.assert_extension_array_equal(result, expected) + + def test_rpow_one_to_na(self): + # https://github.com/pandas-dev/pandas/issues/22022 + # https://github.com/pandas-dev/pandas/issues/29997 + arr = pd.array([np.nan, np.nan], dtype="Float64") + result = np.array([1.0, 2.0]) ** arr + expected = pd.array([1.0, np.nan], dtype="Float64") + tm.assert_extension_array_equal(result, expected) + + +def test_cross_type_arithmetic(): + + df = pd.DataFrame( + { + "A": pd.array([1, 2, np.nan], dtype="Float64"), + "B": pd.array([1, np.nan, 3], dtype="Float32"), + "C": np.array([1, 2, 3], dtype="float64"), + } + ) + + result = df.A + df.C + expected = pd.Series([2, 4, np.nan], dtype="Float64") + tm.assert_series_equal(result, expected) + + result = (df.A + df.C) * 3 == 12 + expected = pd.Series([False, True, None], dtype="boolean") + tm.assert_series_equal(result, expected) + + result = df.A + df.B + expected = pd.Series([2, np.nan, np.nan], dtype="Float64") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arrays/floating/test_astype.py b/pandas/tests/arrays/floating/test_astype.py new file mode 100644 index 0000000000000..f9e30556012aa 
--- /dev/null +++ b/pandas/tests/arrays/floating/test_astype.py @@ -0,0 +1,71 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +def test_astype(): + # with missing values + arr = pd.array([0.1, 0.2, None], dtype="Float64") + + with pytest.raises(ValueError, match="cannot convert to 'int64'-dtype NumPy"): + arr.astype("int64") + + with pytest.raises(ValueError, match="cannot convert to 'bool'-dtype NumPy"): + arr.astype("bool") + + result = arr.astype("float64") + expected = np.array([0.1, 0.2, np.nan], dtype="float64") + tm.assert_numpy_array_equal(result, expected) + + # no missing values + arr = pd.array([0.0, 1.0, 0.5], dtype="Float64") + result = arr.astype("int64") + expected = np.array([0, 1, 0], dtype="int64") + tm.assert_numpy_array_equal(result, expected) + + result = arr.astype("bool") + expected = np.array([False, True, True], dtype="bool") + tm.assert_numpy_array_equal(result, expected) + + +def test_astype_to_floating_array(): + # astype to FloatingArray + arr = pd.array([0.0, 1.0, None], dtype="Float64") + + result = arr.astype("Float64") + tm.assert_extension_array_equal(result, arr) + result = arr.astype(pd.Float64Dtype()) + tm.assert_extension_array_equal(result, arr) + result = arr.astype("Float32") + expected = pd.array([0.0, 1.0, None], dtype="Float32") + tm.assert_extension_array_equal(result, expected) + + +def test_astype_to_boolean_array(): + # astype to BooleanArray + arr = pd.array([0.0, 1.0, None], dtype="Float64") + + result = arr.astype("boolean") + expected = pd.array([False, True, None], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + result = arr.astype(pd.BooleanDtype()) + tm.assert_extension_array_equal(result, expected) + + +def test_astype_to_integer_array(): + # astype to IntegerArray + arr = pd.array([0.0, 1.5, None], dtype="Float64") + + result = arr.astype("Int64") + expected = pd.array([0, 1, None], dtype="Int64") + tm.assert_extension_array_equal(result, 
expected) + + +def test_astype_str(): + a = pd.array([0.1, 0.2, None], dtype="Float64") + expected = np.array(["0.1", "0.2", ""], dtype=object) + + tm.assert_numpy_array_equal(a.astype(str), expected) + tm.assert_numpy_array_equal(a.astype("str"), expected) diff --git a/pandas/tests/arrays/floating/test_comparison.py b/pandas/tests/arrays/floating/test_comparison.py new file mode 100644 index 0000000000000..5538367f49e5b --- /dev/null +++ b/pandas/tests/arrays/floating/test_comparison.py @@ -0,0 +1,117 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.tests.extension.base import BaseOpsUtil + + +class TestComparisonOps(BaseOpsUtil): + def _compare_other(self, data, op_name, other): + op = self.get_op_from_name(op_name) + + # array + result = pd.Series(op(data, other)) + expected = pd.Series(op(data._data, other), dtype="boolean") + + # fill the nan locations + expected[data._mask] = pd.NA + + tm.assert_series_equal(result, expected) + + # series + s = pd.Series(data) + result = op(s, other) + + expected = op(pd.Series(data._data), other) + + # fill the nan locations + expected[data._mask] = pd.NA + expected = expected.astype("boolean") + + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("other", [True, False, pd.NA, -1.0, 0.0, 1]) + def test_scalar(self, other, all_compare_operators): + op = self.get_op_from_name(all_compare_operators) + a = pd.array([1.0, 0.0, None], dtype="Float64") + + result = op(a, other) + + if other is pd.NA: + expected = pd.array([None, None, None], dtype="boolean") + else: + values = op(a._data, other) + expected = pd.arrays.BooleanArray(values, a._mask, copy=True) + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + result[0] = pd.NA + tm.assert_extension_array_equal(a, pd.array([1.0, 0.0, None], dtype="Float64")) + + def test_array(self, all_compare_operators): + op = self.get_op_from_name(all_compare_operators) + 
a = pd.array([0, 1, 2, None, None, None], dtype="Float64") + b = pd.array([0, 1, None, 0, 1, None], dtype="Float64") + + result = op(a, b) + values = op(a._data, b._data) + mask = a._mask | b._mask + + expected = pd.arrays.BooleanArray(values, mask) + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + result[0] = pd.NA + tm.assert_extension_array_equal( + a, pd.array([0, 1, 2, None, None, None], dtype="Float64") + ) + tm.assert_extension_array_equal( + b, pd.array([0, 1, None, 0, 1, None], dtype="Float64") + ) + + def test_compare_with_booleanarray(self, all_compare_operators): + op = self.get_op_from_name(all_compare_operators) + a = pd.array([True, False, None] * 3, dtype="boolean") + b = pd.array([0] * 3 + [1] * 3 + [None] * 3, dtype="Float64") + other = pd.array([False] * 3 + [True] * 3 + [None] * 3, dtype="boolean") + expected = op(a, other) + result = op(a, b) + tm.assert_extension_array_equal(result, expected) + expected = op(other, a) + result = op(b, a) + tm.assert_extension_array_equal(result, expected) + + def test_compare_with_integerarray(self, all_compare_operators): + op = self.get_op_from_name(all_compare_operators) + a = pd.array([0, 1, None] * 3, dtype="Int64") + b = pd.array([0] * 3 + [1] * 3 + [None] * 3, dtype="Float64") + other = b.astype("Int64") + expected = op(a, other) + result = op(a, b) + tm.assert_extension_array_equal(result, expected) + expected = op(other, a) + result = op(b, a) + tm.assert_extension_array_equal(result, expected) + + def test_no_shared_mask(self, data): + result = data + 1 + assert np.shares_memory(result._mask, data._mask) is False + + def test_compare_to_string(self, dtype): + # GH 28930 + s = pd.Series([1, None], dtype=dtype) + result = s == "a" + expected = pd.Series([False, pd.NA], dtype="boolean") + + self.assert_series_equal(result, expected) + + +def test_equals(): + # GH-30652 + # equals is generally tested in /tests/extension/base/methods, but this + # 
specifically tests that two arrays of the same class but different dtype + # do not evaluate equal + a1 = pd.array([1, 2, None], dtype="Float64") + a2 = pd.array([1, 2, None], dtype="Float32") + assert a1.equals(a2) is False diff --git a/pandas/tests/arrays/floating/test_concat.py b/pandas/tests/arrays/floating/test_concat.py new file mode 100644 index 0000000000000..dcb021045c6a7 --- /dev/null +++ b/pandas/tests/arrays/floating/test_concat.py @@ -0,0 +1,21 @@ +import pytest + +import pandas as pd +import pandas._testing as tm + + +@pytest.mark.parametrize( + "to_concat_dtypes, result_dtype", + [ + (["Float64", "Float64"], "Float64"), + (["Float32", "Float64"], "Float64"), + (["Float32", "Float32"], "Float32"), + ], +) +def test_concat_series(to_concat_dtypes, result_dtype): + + result = pd.concat([pd.Series([1, 2, pd.NA], dtype=t) for t in to_concat_dtypes]) + expected = pd.concat([pd.Series([1, 2, pd.NA], dtype=object)] * 2).astype( + result_dtype + ) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arrays/floating/test_construction.py b/pandas/tests/arrays/floating/test_construction.py new file mode 100644 index 0000000000000..8910d7f11df91 --- /dev/null +++ b/pandas/tests/arrays/floating/test_construction.py @@ -0,0 +1,211 @@ +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +import pandas._testing as tm +from pandas.core.arrays import FloatingArray +from pandas.core.arrays.floating import Float32Dtype, Float64Dtype + + +def test_uses_pandas_na(): + a = pd.array([1, None], dtype=pd.Float64Dtype()) + assert a[1] is pd.NA + + +def test_floating_array_constructor(): + values = np.array([1, 2, 3, 4], dtype="float64") + mask = np.array([False, False, False, True], dtype="bool") + + result = FloatingArray(values, mask) + expected = pd.array([1, 2, 3, np.nan], dtype="Float64") + tm.assert_extension_array_equal(result, expected) + tm.assert_numpy_array_equal(result._data, values) + 
tm.assert_numpy_array_equal(result._mask, mask) + + msg = r".* should be .* numpy array. Use the 'pd.array' function instead" + with pytest.raises(TypeError, match=msg): + FloatingArray(values.tolist(), mask) + + with pytest.raises(TypeError, match=msg): + FloatingArray(values, mask.tolist()) + + with pytest.raises(TypeError, match=msg): + FloatingArray(values.astype(int), mask) + + msg = r"__init__\(\) missing 1 required positional argument: 'mask'" + with pytest.raises(TypeError, match=msg): + FloatingArray(values) + + +def test_floating_array_constructor_copy(): + values = np.array([1, 2, 3, 4], dtype="float64") + mask = np.array([False, False, False, True], dtype="bool") + + result = FloatingArray(values, mask) + assert result._data is values + assert result._mask is mask + + result = FloatingArray(values, mask, copy=True) + assert result._data is not values + assert result._mask is not mask + + +def test_to_array(): + result = pd.array([0.1, 0.2, 0.3, 0.4]) + expected = pd.array([0.1, 0.2, 0.3, 0.4], dtype="Float64") + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize( + "a, b", + [ + ([1, None], [1, pd.NA]), + ([None], [pd.NA]), + ([None, np.nan], [pd.NA, pd.NA]), + ([1, np.nan], [1, pd.NA]), + ([np.nan], [pd.NA]), + ], +) +def test_to_array_none_is_nan(a, b): + result = pd.array(a, dtype="Float64") + expected = pd.array(b, dtype="Float64") + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize( + "values", + [ + ["foo", "bar"], + ["1", "2"], + "foo", + 1, + 1.0, + pd.date_range("20130101", periods=2), + np.array(["foo"]), + [[1, 2], [3, 4]], + [np.nan, {"a": 1}], + ], +) +def test_to_array_error(values): + # error in converting existing arrays to FloatingArray + msg = ( + r"(:?.* cannot be converted to a FloatingDtype)" + r"|(:?values must be a 1D list-like)" + r"|(:?Cannot pass scalar)" + ) + with pytest.raises((TypeError, ValueError), match=msg): + pd.array(values, dtype="Float64") + + +def 
test_to_array_inferred_dtype(): + # if values has dtype -> respect it + result = pd.array(np.array([1, 2], dtype="float32")) + assert result.dtype == Float32Dtype() + + # if values have no dtype -> always float64 + result = pd.array([1.0, 2.0]) + assert result.dtype == Float64Dtype() + + +def test_to_array_dtype_keyword(): + result = pd.array([1, 2], dtype="Float32") + assert result.dtype == Float32Dtype() + + # if values has dtype -> override it + result = pd.array(np.array([1, 2], dtype="float32"), dtype="Float64") + assert result.dtype == Float64Dtype() + + +def test_to_array_integer(): + result = pd.array([1, 2], dtype="Float64") + expected = pd.array([1.0, 2.0], dtype="Float64") + tm.assert_extension_array_equal(result, expected) + + # for integer dtypes, the itemsize is not preserved + # TODO can we specify "floating" in general? + result = pd.array(np.array([1, 2], dtype="int32"), dtype="Float64") + assert result.dtype == Float64Dtype() + + +@pytest.mark.parametrize( + "bool_values, values, target_dtype, expected_dtype", + [ + ([False, True], [0, 1], Float64Dtype(), Float64Dtype()), + ([False, True], [0, 1], "Float64", Float64Dtype()), + ([False, True, np.nan], [0, 1, np.nan], Float64Dtype(), Float64Dtype()), + ], +) +def test_to_array_bool(bool_values, values, target_dtype, expected_dtype): + result = pd.array(bool_values, dtype=target_dtype) + assert result.dtype == expected_dtype + expected = pd.array(values, dtype=target_dtype) + tm.assert_extension_array_equal(result, expected) + + +def test_series_from_float(data): + # construct from our dtype & string dtype + dtype = data.dtype + + # from float + expected = pd.Series(data) + result = pd.Series(data.to_numpy(na_value=np.nan, dtype="float"), dtype=str(dtype)) + tm.assert_series_equal(result, expected) + + # from list + expected = pd.Series(data) + result = pd.Series(np.array(data).tolist(), dtype=str(dtype)) + tm.assert_series_equal(result, expected) + + +# TODO belongs in different file + +# def 
test_conversions(data_missing): + +# # astype to object series +# df = pd.DataFrame({"A": data_missing}) +# result = df["A"].astype("object") +# expected = pd.Series(np.array([np.nan, 1], dtype=object), name="A") +# tm.assert_series_equal(result, expected) + +# # convert to object ndarray +# # we assert that we are exactly equal +# # including type conversions of scalars +# result = df["A"].astype("object").values +# expected = np.array([pd.NA, 1], dtype=object) +# tm.assert_numpy_array_equal(result, expected) + +# for r, e in zip(result, expected): +# if pd.isnull(r): +# assert pd.isnull(e) +# elif is_integer(r): +# assert r == e +# assert is_integer(e) +# else: +# assert r == e +# assert type(r) == type(e) + + +@td.skip_if_no("pyarrow", min_version="0.15.0") +def test_arrow_array(data): + # protocol added in 0.15.0 + import pyarrow as pa + + arr = pa.array(data) + expected = np.array(data, dtype=object) + expected[data.isna()] = None + expected = pa.array(expected, type=data.dtype.name.lower(), from_pandas=True) + assert arr.equals(expected) + + +@td.skip_if_no("pyarrow", min_version="0.16.0") +def test_arrow_roundtrip(data): + # roundtrip possible from arrow 0.16.0 + import pyarrow as pa + + df = pd.DataFrame({"a": data}) + table = pa.table(df) + assert table.field("a").type == str(data.dtype.numpy_dtype) + result = table.to_pandas() + tm.assert_frame_equal(result, df) diff --git a/pandas/tests/arrays/floating/test_function.py b/pandas/tests/arrays/floating/test_function.py new file mode 100644 index 0000000000000..84c650f880541 --- /dev/null +++ b/pandas/tests/arrays/floating/test_function.py @@ -0,0 +1,154 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +@pytest.mark.parametrize("ufunc", [np.abs, np.sign]) +# np.sign emits a warning with nans, +@pytest.mark.filterwarnings("ignore:invalid value encountered in sign") +def test_ufuncs_single(ufunc): + a = pd.array([1, 2, -3, np.nan], dtype="Float64") + result = 
ufunc(a) + expected = pd.array(ufunc(a.astype(float)), dtype="Float64") + tm.assert_extension_array_equal(result, expected) + + s = pd.Series(a) + result = ufunc(s) + expected = pd.Series(expected) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("ufunc", [np.log, np.exp, np.sin, np.cos, np.sqrt]) +def test_ufuncs_single_float(ufunc): + a = pd.array([1.0, 0.2, 3.0, np.nan], dtype="Float64") + with np.errstate(invalid="ignore"): + result = ufunc(a) + expected = pd.array(ufunc(a.astype(float)), dtype="Float64") + tm.assert_extension_array_equal(result, expected) + + s = pd.Series(a) + with np.errstate(invalid="ignore"): + result = ufunc(s) + expected = pd.Series(ufunc(s.astype(float)), dtype="Float64") + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("ufunc", [np.add, np.subtract]) +def test_ufuncs_binary_float(ufunc): + # two FloatingArrays + a = pd.array([1, 0.2, -3, np.nan], dtype="Float64") + result = ufunc(a, a) + expected = pd.array(ufunc(a.astype(float), a.astype(float)), dtype="Float64") + tm.assert_extension_array_equal(result, expected) + + # FloatingArray with numpy array + arr = np.array([1, 2, 3, 4]) + result = ufunc(a, arr) + expected = pd.array(ufunc(a.astype(float), arr), dtype="Float64") + tm.assert_extension_array_equal(result, expected) + + result = ufunc(arr, a) + expected = pd.array(ufunc(arr, a.astype(float)), dtype="Float64") + tm.assert_extension_array_equal(result, expected) + + # FloatingArray with scalar + result = ufunc(a, 1) + expected = pd.array(ufunc(a.astype(float), 1), dtype="Float64") + tm.assert_extension_array_equal(result, expected) + + result = ufunc(1, a) + expected = pd.array(ufunc(1, a.astype(float)), dtype="Float64") + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize("values", [[0, 1], [0, None]]) +def test_ufunc_reduce_raises(values): + a = pd.array(values, dtype="Float64") + msg = r"The 'reduce' method is not supported." 
+ with pytest.raises(NotImplementedError, match=msg): + np.add.reduce(a) + + +@pytest.mark.parametrize( + "pandasmethname, kwargs", + [ + ("var", {"ddof": 0}), + ("var", {"ddof": 1}), + ("kurtosis", {}), + ("skew", {}), + ("sem", {}), + ], +) +def test_stat_method(pandasmethname, kwargs): + s = pd.Series(data=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, np.nan, np.nan], dtype="Float64") + pandasmeth = getattr(s, pandasmethname) + result = pandasmeth(**kwargs) + s2 = pd.Series(data=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6], dtype="float64") + pandasmeth = getattr(s2, pandasmethname) + expected = pandasmeth(**kwargs) + assert expected == result + + +def test_value_counts_na(): + arr = pd.array([0.1, 0.2, 0.1, pd.NA], dtype="Float64") + result = arr.value_counts(dropna=False) + expected = pd.Series([2, 1, 1], index=[0.1, 0.2, pd.NA], dtype="Int64") + tm.assert_series_equal(result, expected) + + result = arr.value_counts(dropna=True) + expected = pd.Series([2, 1], index=[0.1, 0.2], dtype="Int64") + tm.assert_series_equal(result, expected) + + +def test_value_counts_empty(): + s = pd.Series([], dtype="Float64") + result = s.value_counts() + idx = pd.Index([], dtype="object") + expected = pd.Series([], index=idx, dtype="Int64") + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("skipna", [True, False]) +@pytest.mark.parametrize("min_count", [0, 4]) +def test_floating_array_sum(skipna, min_count): + arr = pd.array([1, 2, 3, None], dtype="Float64") + result = arr.sum(skipna=skipna, min_count=min_count) + if skipna and min_count == 0: + assert result == 6.0 + else: + assert result is pd.NA + + +@pytest.mark.parametrize( + "values, expected", [([1, 2, 3], 6.0), ([1, 2, 3, None], 6.0), ([None], 0.0)] +) +def test_floating_array_numpy_sum(values, expected): + arr = pd.array(values, dtype="Float64") + result = np.sum(arr) + assert result == expected + + +@pytest.mark.parametrize("op", ["sum", "min", "max", "prod"]) +def test_preserve_dtypes(op): + df = pd.DataFrame( + { + "A": 
["a", "b", "b"], + "B": [1, None, 3], + "C": pd.array([0.1, None, 3.0], dtype="Float64"), + } + ) + + # op + result = getattr(df.C, op)() + assert isinstance(result, np.float64) + + # groupby + result = getattr(df.groupby("A"), op)() + + expected = pd.DataFrame( + {"B": np.array([1.0, 3.0]), "C": pd.array([0.1, 3], dtype="Float64")}, + index=pd.Index(["a", "b"], name="A"), + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/arrays/floating/test_repr.py b/pandas/tests/arrays/floating/test_repr.py new file mode 100644 index 0000000000000..8767b79242c83 --- /dev/null +++ b/pandas/tests/arrays/floating/test_repr.py @@ -0,0 +1,45 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas.core.arrays.floating import Float32Dtype, Float64Dtype + + +def test_dtypes(dtype): + # smoke tests on auto dtype construction + + np.dtype(dtype.type).kind == "f" + assert dtype.name is not None + + +@pytest.mark.parametrize( + "dtype, expected", + [(Float32Dtype(), "Float32Dtype()"), (Float64Dtype(), "Float64Dtype()")], +) +def test_repr_dtype(dtype, expected): + assert repr(dtype) == expected + + +def test_repr_array(): + result = repr(pd.array([1.0, None, 3.0])) + expected = "\n[1.0, , 3.0]\nLength: 3, dtype: Float64" + assert result == expected + + +def test_repr_array_long(): + data = pd.array([1.0, 2.0, None] * 1000) + expected = """ +[ 1.0, 2.0, , 1.0, 2.0, , 1.0, 2.0, , 1.0, + ... 
+ , 1.0, 2.0, , 1.0, 2.0, , 1.0, 2.0, ] +Length: 3000, dtype: Float64""" + result = repr(data) + assert result == expected + + +def test_frame_repr(data_missing): + + df = pd.DataFrame({"A": data_missing}) + result = repr(df) + expected = " A\n0 \n1 0.1" + assert result == expected diff --git a/pandas/tests/arrays/floating/test_to_numpy.py b/pandas/tests/arrays/floating/test_to_numpy.py new file mode 100644 index 0000000000000..9d0cdefe6eed0 --- /dev/null +++ b/pandas/tests/arrays/floating/test_to_numpy.py @@ -0,0 +1,123 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +@pytest.mark.parametrize("box", [True, False], ids=["series", "array"]) +def test_to_numpy(box): + con = pd.Series if box else pd.array + + # default (with or without missing values) -> object dtype + arr = con([0.1, 0.2, 0.3], dtype="Float64") + result = arr.to_numpy() + expected = np.array([0.1, 0.2, 0.3], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + arr = con([0.1, 0.2, None], dtype="Float64") + result = arr.to_numpy() + expected = np.array([0.1, 0.2, pd.NA], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize("box", [True, False], ids=["series", "array"]) +def test_to_numpy_float(box): + con = pd.Series if box else pd.array + + # no missing values -> can convert to float, otherwise raises + arr = con([0.1, 0.2, 0.3], dtype="Float64") + result = arr.to_numpy(dtype="float64") + expected = np.array([0.1, 0.2, 0.3], dtype="float64") + tm.assert_numpy_array_equal(result, expected) + + arr = con([0.1, 0.2, None], dtype="Float64") + with pytest.raises(ValueError, match="cannot convert to 'float64'-dtype"): + result = arr.to_numpy(dtype="float64") + + # need to explicitly specify na_value + result = arr.to_numpy(dtype="float64", na_value=np.nan) + expected = np.array([0.1, 0.2, np.nan], dtype="float64") + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize("box", [True, 
False], ids=["series", "array"]) +def test_to_numpy_int(box): + con = pd.Series if box else pd.array + + # no missing values -> can convert to int, otherwise raises + arr = con([1.0, 2.0, 3.0], dtype="Float64") + result = arr.to_numpy(dtype="int64") + expected = np.array([1, 2, 3], dtype="int64") + tm.assert_numpy_array_equal(result, expected) + + arr = con([1.0, 2.0, None], dtype="Float64") + with pytest.raises(ValueError, match="cannot convert to 'int64'-dtype"): + result = arr.to_numpy(dtype="int64") + + # automatic casting (floors the values) + arr = con([0.1, 0.9, 1.1], dtype="Float64") + result = arr.to_numpy(dtype="int64") + expected = np.array([0, 0, 1], dtype="int64") + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize("box", [True, False], ids=["series", "array"]) +def test_to_numpy_na_value(box): + con = pd.Series if box else pd.array + + arr = con([0.0, 1.0, None], dtype="Float64") + result = arr.to_numpy(dtype=object, na_value=None) + expected = np.array([0.0, 1.0, None], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + result = arr.to_numpy(dtype=bool, na_value=False) + expected = np.array([False, True, False], dtype="bool") + tm.assert_numpy_array_equal(result, expected) + + result = arr.to_numpy(dtype="int64", na_value=-99) + expected = np.array([0, 1, -99], dtype="int64") + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize("dtype", ["float64", "float32", "int32", "int64", "bool"]) +@pytest.mark.parametrize("box", [True, False], ids=["series", "array"]) +def test_to_numpy_dtype(box, dtype): + con = pd.Series if box else pd.array + arr = con([0.0, 1.0], dtype="Float64") + + result = arr.to_numpy(dtype=dtype) + expected = np.array([0, 1], dtype=dtype) + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize("dtype", ["float64", "float32", "int32", "int64", "bool"]) +@pytest.mark.parametrize("box", [True, False], ids=["series", "array"]) +def 
test_to_numpy_na_raises(box, dtype): + con = pd.Series if box else pd.array + arr = con([0.0, 1.0, None], dtype="Float64") + with pytest.raises(ValueError, match=dtype): + arr.to_numpy(dtype=dtype) + + +@pytest.mark.parametrize("box", [True, False], ids=["series", "array"]) +def test_to_numpy_string(box, dtype): + con = pd.Series if box else pd.array + arr = con([0.0, 1.0, None], dtype="Float64") + + result = arr.to_numpy(dtype="str") + expected = np.array([0.0, 1.0, pd.NA], dtype=" Date: Fri, 22 May 2020 15:56:26 +0200 Subject: [PATCH 02/26] fix doctest --- pandas/core/construction.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 1204dfc144cd1..dc82089759a1b 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -210,6 +210,11 @@ def array( [1, 2, ] Length: 3, dtype: Int64 + >>> pd.array([1.1, 2.2]) + + [1.1, 2.2] + Length: 2, dtype: Float64 + >>> pd.array(["a", None, "c"]) ['a', , 'c'] @@ -236,10 +241,10 @@ def array( If pandas does not infer a dedicated extension type a :class:`arrays.PandasArray` is returned. 
- >>> pd.array([1.1, 2.2]) + >>> pd.array([1 + 1j, 3 + 2j]) - [1.1, 2.2] - Length: 2, dtype: float64 + [(1+1j), (3+2j)] + Length: 2, dtype: complex128 As mentioned in the "Notes" section, new extension types may be added in the future (by pandas or 3rd party libraries), causing the return From f43f02158f04499aa5f06cb0ea18bafe25c68d8d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 22 May 2020 18:13:50 +0200 Subject: [PATCH 03/26] add basic whatsnew note --- doc/source/whatsnew/v1.1.0.rst | 59 ++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 19db7dcb4b83e..1946b547a9339 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -194,6 +194,65 @@ If needed you can adjust the bins with the argument ``offset`` (a Timedelta) tha For a full example, see: :ref:`timeseries.adjust-the-start-of-the-bins`. +.. _whatsnew_110.floating: + +Experimental nullable data types for float data +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +We've added :class:`Float32Dtype` / :class:`Float64Dtype` and :class:`~arrays.FloatingArray`, +an extension data type dedicated to floating point data that can hold the +``pd.NA`` missing value indicator (:issue:`32265`, :issue:`34307`). + +While the default float data type already supports missing values using ``np.nan``, +this new data type uses ``pd.NA`` (and its corresponding behaviour) as missing +value indicator, in line with the already existing nullable :ref:`integer ` +and :ref:`boolean ` data types. + +One example where the behaviour of ``np.nan`` and ``pd.NA`` is diffferent is +comparison operations: + +.. 
code-block:: python + + # the default numpy float64 dtype + >>> s1 = pd.Series([1.5, None]) + >>> s1 + 0 1.5 + 1 NaN + dtype: float64 + + >>> s1 > 1 + 0 True + 1 False + dtype: bool + + # the new nullable float64 dtype + >>> s2 = pd.Series([1.5, None], dtype="Float64") + >>> s2 + 0 1.5 + 1 + dtype: Float64 + + >>> s2 > 1 + 0 True + 1 + dtype: boolean + +See the :ref:`missing_data.NA` doc section for more details on the behaviour +when using the ``pd.NA`` missing value indicator. + +As shown above, the dtype can be specified using the "Float64" or "Float32" +string (capitalized to distinguish it from the default "float64" data type). +Alternatively, you can also use the dtype object: + +.. ipython:: python + + pd.Series([1.5, None], dtype=pd.Float32Dtype()) + +.. warning:: + + Experimental: the new floating data types are currently experimental, and its + behaviour or API may still change without warning. Expecially the behaviour + regarding NaN (distinct from NA missing values) is subject to change. .. _whatsnew_110.enhancements.other: From ffdd65c9cd8f8627161301557652e25c7d550efa Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 22 May 2020 18:55:38 +0200 Subject: [PATCH 04/26] typo --- doc/source/whatsnew/v1.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 1946b547a9339..5d57e9ebcaecd 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -208,7 +208,7 @@ this new data type uses ``pd.NA`` (and its corresponding behaviour) as missing value indicator, in line with the already existing nullable :ref:`integer ` and :ref:`boolean ` data types. -One example where the behaviour of ``np.nan`` and ``pd.NA`` is diffferent is +One example where the behaviour of ``np.nan`` and ``pd.NA`` is different is comparison operations: .. 
code-block:: python From ff3b93759c801aaf86a1b6b76eaa6e8cf9b53ee9 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 29 May 2020 15:23:32 +0200 Subject: [PATCH 05/26] fix astype to string --- pandas/core/arrays/floating.py | 3 +++ pandas/core/arrays/integer.py | 2 +- pandas/core/arrays/string_.py | 21 ++++++++++++++------- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index 1e8e5feea0617..14ecc3ae065c2 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -384,6 +384,7 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike: """ from pandas.core.arrays.boolean import BooleanArray, BooleanDtype from pandas.core.arrays.integer import _IntegerDtype, IntegerArray + from pandas.core.arrays.string_ import StringDtype, StringArray dtype = pandas_dtype(dtype) @@ -400,6 +401,8 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike: elif isinstance(dtype, BooleanDtype): result = self._data.astype("bool", copy=False) return BooleanArray(result, mask=self._mask, copy=False) + elif isinstance(dtype, StringDtype): + return StringArray._from_sequence(self, copy=False) # coerce if is_float_dtype(dtype): diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index fd156aad69855..0624bce316581 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -447,7 +447,7 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike: casting """ from pandas.core.arrays.boolean import BooleanDtype - from pandas.core.arrays.floating import _FloatingDtype + from pandas.core.arrays.floating import _FloatingDtype from pandas.core.arrays.string_ import StringDtype dtype = pandas_dtype(dtype) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index ac501a8afbe09..9f9c27a56959a 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -201,14 +201,21 @@ def _from_sequence(cls, 
scalars, dtype=None, copy=False): if dtype: assert dtype == "string" - result = np.asarray(scalars, dtype="object") - if copy and result is scalars: - result = result.copy() + from pandas.core.arrays.masked import BaseMaskedArray + + if isinstance(scalars, BaseMaskedArray): + na_values = scalars._mask + result = scalars._data + else: + result = np.asarray(scalars) + if copy and result is scalars: + result = result.copy() + + # Standardize all missing-like values to NA + # TODO: it would be nice to do this in _validate / lib.is_string_array + # We are already doing a scan over the values there. + na_values = isna(result) - # Standardize all missing-like values to NA - # TODO: it would be nice to do this in _validate / lib.is_string_array - # We are already doing a scan over the values there. - na_values = isna(result) has_nans = na_values.any() if has_nans and result is scalars: # force a copy now, if we haven't already From ebbc64dc4743093b40daa655391aa3b9e3b5e0f8 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 19 Jun 2020 20:48:26 +0200 Subject: [PATCH 06/26] clean-up arithmetic tests to align with integer/boolean tests --- pandas/_testing.py | 1 + .../tests/arrays/floating/test_arithmetic.py | 346 ++++++++---------- pandas/tests/arrays/masked/test_arithmetic.py | 5 + 3 files changed, 153 insertions(+), 199 deletions(-) diff --git a/pandas/_testing.py b/pandas/_testing.py index ebb53dd81682c..fbdc2b5eca52f 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -82,6 +82,7 @@ ALL_EA_INT_DTYPES = UNSIGNED_EA_INT_DTYPES + SIGNED_EA_INT_DTYPES FLOAT_DTYPES: List[Dtype] = [float, "float32", "float64"] +FLOAT_EA_DTYPES: List[Dtype] = ["Float32", "Float64"] COMPLEX_DTYPES: List[Dtype] = [complex, "complex64", "complex128"] STRING_DTYPES: List[Dtype] = [str, "str", "U"] diff --git a/pandas/tests/arrays/floating/test_arithmetic.py b/pandas/tests/arrays/floating/test_arithmetic.py index 29813c2fbfa50..7ba4da8a5ede9 100644 --- 
a/pandas/tests/arrays/floating/test_arithmetic.py +++ b/pandas/tests/arrays/floating/test_arithmetic.py @@ -1,214 +1,162 @@ +import operator + import numpy as np import pytest import pandas as pd import pandas._testing as tm -from pandas.core.arrays import FloatingArray, IntegerArray -from pandas.tests.extension.base import BaseOpsUtil - - -class TestArithmeticOps(BaseOpsUtil): - def _check_divmod_op(self, s, op, other, exc=None): - super()._check_divmod_op(s, op, other, None) - - def _check_op(self, s, op_name, other, exc=None): - op = self.get_op_from_name(op_name) - result = op(s, other) - - # compute expected - mask = s.isna() - - # if s is a DataFrame, squeeze to a Series - # for comparison - if isinstance(s, pd.DataFrame): - result = result.squeeze() - s = s.squeeze() - mask = mask.squeeze() - - # other array is an Integer - if isinstance(other, (IntegerArray, FloatingArray)): - omask = getattr(other, "mask", None) - mask = getattr(other, "data", other) - if omask is not None: - mask |= omask - - # 1 ** na is na, so need to unmask those - if op_name == "__pow__": - mask = np.where(~s.isna() & (s == 1), False, mask) - - elif op_name == "__rpow__": - other_is_one = other == 1 - if isinstance(other_is_one, pd.Series): - other_is_one = other_is_one.fillna(False) - mask = np.where(other_is_one, False, mask) - - rs = s.astype(s.dtype.numpy_dtype) - expected = op(rs, other).astype(s.dtype) - expected[mask] = np.nan - if "floordiv" in op_name: - # Series op sets 1//0 to np.inf, which IntegerArray does not do (yet) - mask2 = np.isinf(expected) & np.isnan(result) - expected[mask2] = np.nan - tm.assert_series_equal(result, expected) - - def test_arith_floating_array(self, data, all_arithmetic_operators): - # we operate with a rhs of an floating array - - op = all_arithmetic_operators - - s = pd.Series(data) - rhs = pd.Series([1] * len(data), dtype=data.dtype) - rhs.iloc[-1] = np.nan - - self._check_op(s, op, rhs) - - def test_arith_series_with_scalar(self, data, 
all_arithmetic_operators): - # scalar - op = all_arithmetic_operators - s = pd.Series(data) - self._check_op(s, op, 1, exc=TypeError) - - def test_arith_frame_with_scalar(self, data, all_arithmetic_operators): - # frame & scalar - op = all_arithmetic_operators - df = pd.DataFrame({"A": data}) - self._check_op(df, op, 1, exc=TypeError) - - def test_arith_series_with_array(self, data, all_arithmetic_operators): - # ndarray & other series - op = all_arithmetic_operators - s = pd.Series(data) - other = np.ones(len(s), dtype=s.dtype.type) - self._check_op(s, op, other, exc=TypeError) - - def test_arith_len_mismatch(self, all_arithmetic_operators): - # operating with a list-like with non-matching length raises - op = self.get_op_from_name(all_arithmetic_operators) - other = np.array([1.0]) - - s = pd.Series([1, 2, 3], dtype="Float64") - with pytest.raises(ValueError, match="Lengths must match"): - op(s, other) - - @pytest.mark.parametrize("other", [0, 0.5]) - def test_arith_zero_dim_ndarray(self, other): - arr = pd.array([1, None, 2], dtype="Float64") - result = arr + np.array(other) - expected = arr + other - tm.assert_equal(result, expected) - - def test_error(self, data, all_arithmetic_operators): - # invalid ops - - op = all_arithmetic_operators - s = pd.Series(data) - ops = getattr(s, op) - opa = getattr(data, op) - - # invalid scalars - msg = ( - r"(:?can only perform ops with numeric values)" - r"|(:?FloatingArray cannot perform the operation mod)" - ) - with pytest.raises(TypeError, match=msg): - ops("foo") - with pytest.raises(TypeError, match=msg): - ops(pd.Timestamp("20180101")) +from pandas.core.arrays import FloatingArray + +# Basic test for the arithmetic array ops +# ----------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "opname, exp", + [ + ("add", [1.1, 2.2, None, None, 5.5]), + ("mul", [0.1, 0.4, None, None, 2.5]), + ("sub", [0.9, 1.8, None, None, 4.5]), + ("truediv", [10.0, 10.0, None, None, 
10.0]), + ("floordiv", [9.0, 9.0, None, None, 10.0]), + ("mod", [0.1, 0.2, None, None, 0.0]), + ], + ids=["add", "mul", "sub", "div", "floordiv", "mod"], +) +def test_array_op(dtype, opname, exp): + a = pd.array([1.0, 2.0, None, 4.0, 5.0], dtype=dtype) + b = pd.array([0.1, 0.2, 0.3, None, 0.5], dtype=dtype) + + op = getattr(operator, opname) + + result = op(a, b) + expected = pd.array(exp, dtype=dtype) + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize("zero, negative", [(0, False), (0.0, False), (-0.0, True)]) +def test_divide_by_zero(dtype, zero, negative): + # TODO pending NA/NaN discussion + # https://github.com/pandas-dev/pandas/issues/32265/ + a = pd.array([0, 1, -1, None], dtype=dtype) + result = a / zero + expected = FloatingArray( + np.array([np.nan, np.inf, -np.inf, np.nan], dtype=dtype.numpy_dtype), + np.array([False, False, False, True]), + ) + if negative: + expected *= -1 + tm.assert_extension_array_equal(result, expected) + + +def test_pow_scalar(dtype): + a = pd.array([-1, 0, 1, None, 2], dtype=dtype) + result = a ** 0 + expected = pd.array([1, 1, 1, 1, 1], dtype=dtype) + tm.assert_extension_array_equal(result, expected) + + result = a ** 1 + expected = pd.array([-1, 0, 1, None, 2], dtype=dtype) + tm.assert_extension_array_equal(result, expected) + + result = a ** pd.NA + expected = pd.array([None, None, 1, None, None], dtype=dtype) + tm.assert_extension_array_equal(result, expected) + + result = a ** np.nan + # TODO np.nan should be converted to pd.NA / missing before operation? 
+ expected = FloatingArray( + np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype=dtype.numpy_dtype), + mask=a._mask, + ) + tm.assert_extension_array_equal(result, expected) - # invalid array-likes - with pytest.raises(TypeError, match=msg): - ops(pd.Series("foo", index=s.index)) - - if op != "__rpow__": - # TODO(extension) - # rpow with a datetimelike coerces the integer array incorrectly - msg = ( - "can only perform ops with numeric values|" - "cannot perform .* with this index type: DatetimeArray|" - "Addition/subtraction of integers and integer-arrays " - "with DatetimeArray is no longer supported. *" - ) - with pytest.raises(TypeError, match=msg): - ops(pd.Series(pd.date_range("20180101", periods=len(s)))) - - # 2d - result = opa(pd.DataFrame({"A": s})) - assert result is NotImplemented - - msg = r"can only perform ops with 1-d structures" - with pytest.raises(NotImplementedError, match=msg): - opa(np.arange(len(s)).reshape(-1, len(s))) - - @pytest.mark.parametrize("zero, negative", [(0, False), (0.0, False), (-0.0, True)]) - def test_divide_by_zero(self, zero, negative): - # TODO pending NA/NaN discussion - # https://github.com/pandas-dev/pandas/issues/32265/ - a = pd.array([0, 1, -1, None], dtype="Float64") - result = a / zero - expected = FloatingArray( - np.array([np.nan, np.inf, -np.inf, np.nan]), - np.array([False, False, False, True]), - ) - if negative: - expected *= -1 - tm.assert_extension_array_equal(result, expected) - - def test_pow_scalar(self): - a = pd.array([-1, 0, 1, None, 2], dtype="Float64") - result = a ** 0 - expected = pd.array([1, 1, 1, 1, 1], dtype="Float64") - tm.assert_extension_array_equal(result, expected) - - result = a ** 1 - expected = pd.array([-1, 0, 1, None, 2], dtype="Float64") - tm.assert_extension_array_equal(result, expected) - - result = a ** pd.NA - expected = pd.array([None, None, 1, None, None], dtype="Float64") - tm.assert_extension_array_equal(result, expected) - - result = a ** np.nan - # TODO np.nan should be 
converted to pd.NA / missing before operation? - expected = FloatingArray( - np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype="float64"), mask=a._mask - ) - tm.assert_extension_array_equal(result, expected) + # reversed + a = a[1:] # Can't raise integers to negative powers. + + result = 0 ** a + expected = pd.array([1, 0, None, 0], dtype=dtype) + tm.assert_extension_array_equal(result, expected) - # reversed - a = a[1:] # Can't raise integers to negative powers. + result = 1 ** a + expected = pd.array([1, 1, 1, 1], dtype=dtype) + tm.assert_extension_array_equal(result, expected) + + result = pd.NA ** a + expected = pd.array([1, None, None, None], dtype=dtype) + tm.assert_extension_array_equal(result, expected) + + result = np.nan ** a + expected = FloatingArray( + np.array([1, np.nan, np.nan, np.nan], dtype=dtype.numpy_dtype), mask=a._mask + ) + tm.assert_extension_array_equal(result, expected) - result = 0 ** a - expected = pd.array([1, 0, None, 0], dtype="Float64") - tm.assert_extension_array_equal(result, expected) - result = 1 ** a - expected = pd.array([1, 1, 1, 1], dtype="Float64") - tm.assert_extension_array_equal(result, expected) +def test_pow_array(dtype): + a = pd.array([0, 0, 0, 1, 1, 1, None, None, None], dtype=dtype) + b = pd.array([0, 1, None, 0, 1, None, 0, 1, None], dtype=dtype) + result = a ** b + expected = pd.array([1, 0, None, 1, 1, 1, 1, None, None], dtype=dtype) + tm.assert_extension_array_equal(result, expected) - result = pd.NA ** a - expected = pd.array([1, None, None, None], dtype="Float64") - tm.assert_extension_array_equal(result, expected) - result = np.nan ** a - expected = FloatingArray( - np.array([1, np.nan, np.nan, np.nan], dtype="float64"), mask=a._mask +def test_rpow_one_to_na(): + # https://github.com/pandas-dev/pandas/issues/22022 + # https://github.com/pandas-dev/pandas/issues/29997 + arr = pd.array([np.nan, np.nan], dtype="Float64") + result = np.array([1.0, 2.0]) ** arr + expected = pd.array([1.0, np.nan], 
dtype="Float64") + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize("other", [0, 0.5]) +def test_arith_zero_dim_ndarray(other): + arr = pd.array([1, None, 2], dtype="Float64") + result = arr + np.array(other) + expected = arr + other + tm.assert_equal(result, expected) + + +# Test generic characteristics / errors +# ----------------------------------------------------------------------------- + + +def test_error_invalid_values(data, all_arithmetic_operators): + + op = all_arithmetic_operators + s = pd.Series(data) + ops = getattr(s, op) + + # invalid scalars + msg = ( + r"(:?can only perform ops with numeric values)" + r"|(:?FloatingArray cannot perform the operation mod)" + ) + with pytest.raises(TypeError, match=msg): + ops("foo") + with pytest.raises(TypeError, match=msg): + ops(pd.Timestamp("20180101")) + + # invalid array-likes + with pytest.raises(TypeError, match=msg): + ops(pd.Series("foo", index=s.index)) + + if op != "__rpow__": + # TODO(extension) + # rpow with a datetimelike coerces the integer array incorrectly + msg = ( + "can only perform ops with numeric values|" + "cannot perform .* with this index type: DatetimeArray|" + "Addition/subtraction of integers and integer-arrays " + "with DatetimeArray is no longer supported. 
*" ) - tm.assert_extension_array_equal(result, expected) - - def test_pow_array(self): - a = pd.array([0, 0, 0, 1, 1, 1, None, None, None], dtype="Float64") - b = pd.array([0, 1, None, 0, 1, None, 0, 1, None], dtype="Float64") - result = a ** b - expected = pd.array([1, 0, None, 1, 1, 1, 1, None, None], dtype="Float64") - tm.assert_extension_array_equal(result, expected) - - def test_rpow_one_to_na(self): - # https://github.com/pandas-dev/pandas/issues/22022 - # https://github.com/pandas-dev/pandas/issues/29997 - arr = pd.array([np.nan, np.nan], dtype="Float64") - result = np.array([1.0, 2.0]) ** arr - expected = pd.array([1.0, np.nan], dtype="Float64") - tm.assert_extension_array_equal(result, expected) + with pytest.raises(TypeError, match=msg): + ops(pd.Series(pd.date_range("20180101", periods=len(s)))) + + +# Various +# ----------------------------------------------------------------------------- def test_cross_type_arithmetic(): diff --git a/pandas/tests/arrays/masked/test_arithmetic.py b/pandas/tests/arrays/masked/test_arithmetic.py index db938c36fe7ae..4e75939e81878 100644 --- a/pandas/tests/arrays/masked/test_arithmetic.py +++ b/pandas/tests/arrays/masked/test_arithmetic.py @@ -5,8 +5,13 @@ import pandas._testing as tm from pandas.core.arrays import ExtensionArray +# integer dtypes arrays = [pd.array([1, 2, 3, None], dtype=dtype) for dtype in tm.ALL_EA_INT_DTYPES] scalars = [2] * len(arrays) +# floating dtypes +arrays += [pd.array([0.1, 0.2, 0.3, None], dtype=dtype) for dtype in tm.FLOAT_EA_DTYPES] +scalars += [0.2, 0.2] +# boolean arrays += [pd.array([True, False, True, None], dtype="boolean")] scalars += [False] From 8cf0d47a21e6cebe46ca048610002bf31012545a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 22 Jun 2020 10:21:11 +0200 Subject: [PATCH 07/26] updates for feedback --- pandas/core/arrays/__init__.py | 2 ++ pandas/core/arrays/floating.py | 5 +---- pandas/core/arrays/integer.py | 6 +++--- pandas/core/construction.py | 2 +- 
pandas/core/dtypes/common.py | 4 ++-- pandas/tests/arrays/floating/test_construction.py | 10 ++++++++++ pandas/tests/arrays/floating/test_to_numpy.py | 9 +++++++++ pandas/tests/arrays/integer/test_dtypes.py | 7 +++++++ pandas/tests/arrays/test_array.py | 3 +++ 9 files changed, 38 insertions(+), 10 deletions(-) diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py index 76077069cd2b1..e5258a6aecd30 100644 --- a/pandas/core/arrays/__init__.py +++ b/pandas/core/arrays/__init__.py @@ -9,6 +9,7 @@ from pandas.core.arrays.floating import FloatingArray from pandas.core.arrays.integer import IntegerArray, integer_array from pandas.core.arrays.interval import IntervalArray +from pandas.core.arrays.masked import BaseMaskedArray from pandas.core.arrays.numpy_ import PandasArray, PandasDtype from pandas.core.arrays.period import PeriodArray, period_array from pandas.core.arrays.sparse import SparseArray @@ -19,6 +20,7 @@ "ExtensionArray", "ExtensionOpsMixin", "ExtensionScalarOpsMixin", + "BaseMaskedArray", "BooleanArray", "Categorical", "DatetimeArray", diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index 14ecc3ae065c2..6ec67f17a27c0 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -604,10 +604,7 @@ def floating_arithmetic_method(self, other): _dtype_docstring = """ An ExtensionDtype for {dtype} data. -.. versionchanged:: 1.0.0 - - Now uses :attr:`pandas.NA` as its missing value, - rather than :attr:`numpy.nan`. +This dtype uses ``pd.NA`` as missing value indicator. 
Attributes ---------- diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 833676fa76d76..838640fb7026b 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -104,7 +104,7 @@ def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: ) if np.issubdtype(np_dtype, np.integer): return _dtypes[str(np_dtype)] - if np.issubdtype(np_dtype, np.floating): + elif np.issubdtype(np_dtype, np.floating): from pandas.core.arrays.floating import _dtypes as float_dtypes return float_dtypes[str(np_dtype)] @@ -504,11 +504,11 @@ def _create_comparison_method(cls, op): @unpack_zerodim_and_defer(op.__name__) def cmp_method(self, other): - from pandas.arrays import BooleanArray, FloatingArray + from pandas.core.arrays import BaseMaskedArray, BooleanArray mask = None - if isinstance(other, (BooleanArray, IntegerArray, FloatingArray)): + if isinstance(other, BaseMaskedArray): other, mask = other._data, other._mask elif is_list_like(other): diff --git a/pandas/core/construction.py b/pandas/core/construction.py index dc82089759a1b..53e891569e808 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -331,7 +331,7 @@ def array( elif inferred_dtype == "integer": return IntegerArray._from_sequence(data, copy=copy) - elif inferred_dtype == "floating": + elif inferred_dtype in ("floating", "mixed-integer-float"): return FloatingArray._from_sequence(data, copy=copy) elif inferred_dtype == "boolean": diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 76d120a7619e9..0d9f0b09449b6 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -84,8 +84,8 @@ def ensure_float(arr): possible. Otherwise, the original array is returned. 
""" if is_extension_array_dtype(arr.dtype): - return arr.to_numpy(dtype="float64", na_value=np.nan) - if issubclass(arr.dtype.type, (np.integer, np.bool_)): + arr = arr.to_numpy(dtype="float64", na_value=np.nan) + elif issubclass(arr.dtype.type, (np.integer, np.bool_)): arr = arr.astype(float) return arr diff --git a/pandas/tests/arrays/floating/test_construction.py b/pandas/tests/arrays/floating/test_construction.py index 8910d7f11df91..7b67e46e5fe4c 100644 --- a/pandas/tests/arrays/floating/test_construction.py +++ b/pandas/tests/arrays/floating/test_construction.py @@ -74,6 +74,16 @@ def test_to_array_none_is_nan(a, b): tm.assert_extension_array_equal(result, expected) +def test_to_array_mixed_integer_float(): + result = pd.array([1, 2.0]) + expected = pd.array([1.0, 2.0], dtype="Float64") + tm.assert_extension_array_equal(result, expected) + + result = pd.array([1, None, 2.0]) + expected = pd.array([1.0, None, 2.0], dtype="Float64") + tm.assert_extension_array_equal(result, expected) + + @pytest.mark.parametrize( "values", [ diff --git a/pandas/tests/arrays/floating/test_to_numpy.py b/pandas/tests/arrays/floating/test_to_numpy.py index 9d0cdefe6eed0..26e5687b1b4a0 100644 --- a/pandas/tests/arrays/floating/test_to_numpy.py +++ b/pandas/tests/arrays/floating/test_to_numpy.py @@ -3,6 +3,7 @@ import pandas as pd import pandas._testing as tm +from pandas.core.arrays import FloatingArray @pytest.mark.parametrize("box", [True, False], ids=["series", "array"]) @@ -80,6 +81,14 @@ def test_to_numpy_na_value(box): tm.assert_numpy_array_equal(result, expected) +def test_to_numpy_na_value_with_nan(): + # array with both NaN and NA -> only fill NA with `na_value` + arr = FloatingArray(np.array([0.0, np.nan, 0.0]), np.array([False, False, True])) + result = arr.to_numpy(dtype="float64", na_value=-1) + expected = np.array([0.0, np.nan, -1.0], dtype="float64") + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize("dtype", ["float64", "float32", "int32", 
"int64", "bool"]) @pytest.mark.parametrize("box", [True, False], ids=["series", "array"]) def test_to_numpy_dtype(box, dtype): diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py index cafe9e47a18f4..31425d51a6c99 100644 --- a/pandas/tests/arrays/integer/test_dtypes.py +++ b/pandas/tests/arrays/integer/test_dtypes.py @@ -169,6 +169,13 @@ def test_astype_specific_casting(dtype): tm.assert_series_equal(result, expected) +def test_astype_floating(): + arr = pd.array([1, 2, None], dtype="Int64") + result = arr.astype("Float64") + expected = pd.array([1.0, 2.0, None], dtype="Float64") + tm.assert_extension_array_equal(result, expected) + + def test_astype_dt64(): # GH#32435 arr = pd.array([1, 2, 3, pd.NA]) * 10 ** 9 diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index 1041eb15e4b77..a2a43c9e407ed 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -240,6 +240,9 @@ def test_array_copy(): ([1.0, None], FloatingArray._from_sequence([1.0, pd.NA])), ([1.0, np.nan], FloatingArray._from_sequence([1.0, pd.NA])), ([1.0, pd.NA], FloatingArray._from_sequence([1.0, pd.NA])), + # mixed-integer-float + ([1, 2.0], FloatingArray._from_sequence([1.0, 2.0])), + ([1, np.nan, 2.0], FloatingArray._from_sequence([1.0, None, 2.0])), # string (["a", "b"], StringArray._from_sequence(["a", "b"])), (["a", None], StringArray._from_sequence(["a", None])), From f7cc1be9624e120badad0b40349cddc5b5b64365 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 22 Jun 2020 12:47:20 +0200 Subject: [PATCH 08/26] fix string array construction --- pandas/core/arrays/string_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 9f9c27a56959a..529314973f845 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -207,7 +207,7 @@ def _from_sequence(cls, scalars, dtype=None, 
copy=False): na_values = scalars._mask result = scalars._data else: - result = np.asarray(scalars) + result = np.asarray(scalars, dtype=object) if copy and result is scalars: result = result.copy() From 107b083c51ee491236d0f5e05a872798190c91b5 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 22 Jun 2020 12:59:09 +0200 Subject: [PATCH 09/26] fix mypy --- pandas/tests/arrays/masked/test_arithmetic.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/arrays/masked/test_arithmetic.py b/pandas/tests/arrays/masked/test_arithmetic.py index 4e75939e81878..6de10fd896878 100644 --- a/pandas/tests/arrays/masked/test_arithmetic.py +++ b/pandas/tests/arrays/masked/test_arithmetic.py @@ -1,3 +1,5 @@ +from typing import Any, List + import numpy as np import pytest @@ -7,7 +9,7 @@ # integer dtypes arrays = [pd.array([1, 2, 3, None], dtype=dtype) for dtype in tm.ALL_EA_INT_DTYPES] -scalars = [2] * len(arrays) +scalars: List[Any] = [2] * len(arrays) # floating dtypes arrays += [pd.array([0.1, 0.2, 0.3, None], dtype=dtype) for dtype in tm.FLOAT_EA_DTYPES] scalars += [0.2, 0.2] From ed9a14b76288bfb52f0037601c5052ba4167af0b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 11 Jul 2020 21:25:43 +0200 Subject: [PATCH 10/26] rename _FloatingDtype -> FloatingDtype --- pandas/core/arrays/floating.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index 6ec67f17a27c0..726f50a8fb78d 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -37,12 +37,12 @@ import pyarrow # noqa: F401 -class _FloatingDtype(BaseMaskedDtype): +class FloatingDtype(BaseMaskedDtype): """ An ExtensionDtype to hold a single size of floating dtype. These specific implementations are subclasses of the non-public - _FloatingDtype. For example we have Float32Dtype to represent float32. + FloatingDtype. 
For example we have Float32Dtype to represent float32. The attributes name & type are set when these subclasses are created. """ @@ -85,7 +85,7 @@ def construct_array_type(cls) -> Type["FloatingArray"]: def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: # for now only handle other floating types - if not all(isinstance(t, _FloatingDtype) for t in dtypes): + if not all(isinstance(t, FloatingDtype) for t in dtypes): return None np_dtype = np.find_common_type( [t.numpy_dtype for t in dtypes], [] # type: ignore @@ -151,7 +151,7 @@ def coerce_to_array( # https://github.com/numpy/numpy/pull/7476 dtype = dtype.lower() - if not issubclass(type(dtype), _FloatingDtype): + if not issubclass(type(dtype), FloatingDtype): try: dtype = _dtypes[str(np.dtype(dtype))] except KeyError as err: @@ -283,7 +283,7 @@ class FloatingArray(BaseMaskedArray): _internal_fill_value = 0.0 @cache_readonly - def dtype(self) -> _FloatingDtype: + def dtype(self) -> FloatingDtype: return _dtypes[str(self._data.dtype)] def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): @@ -389,7 +389,7 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike: dtype = pandas_dtype(dtype) # if we are astyping to an existing FloatingDtype we can fastpath - if isinstance(dtype, _FloatingDtype): + if isinstance(dtype, FloatingDtype): result = self._data.astype(dtype.numpy_dtype, copy=False) return type(self)(result, mask=self._mask, copy=False) # astyping to other known masked dtypes @@ -619,14 +619,14 @@ def floating_arithmetic_method(self, other): @register_extension_dtype -class Float32Dtype(_FloatingDtype): +class Float32Dtype(FloatingDtype): type = np.float32 name = "Float32" __doc__ = _dtype_docstring.format(dtype="float32") @register_extension_dtype -class Float64Dtype(_FloatingDtype): +class Float64Dtype(FloatingDtype): type = np.float64 name = "Float64" __doc__ = _dtype_docstring.format(dtype="float64") From c16ca4c1279b9ff55f358f055a3e86425075ca68 Mon Sep 17 
00:00:00 2001 From: Joris Van den Bossche Date: Sat, 11 Jul 2020 21:34:51 +0200 Subject: [PATCH 11/26] update astype implementation to follow IntegerArray changes --- pandas/core/arrays/floating.py | 25 ++++++-------- pandas/tests/arrays/floating/test_astype.py | 38 +++++++++++++++++++++ 2 files changed, 49 insertions(+), 14 deletions(-) diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index 726f50a8fb78d..baff17ca7409e 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -382,25 +382,22 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike: if incompatible type with an FloatingDtype, equivalent of same_kind casting """ - from pandas.core.arrays.boolean import BooleanArray, BooleanDtype - from pandas.core.arrays.integer import _IntegerDtype, IntegerArray from pandas.core.arrays.string_ import StringDtype, StringArray dtype = pandas_dtype(dtype) - # if we are astyping to an existing FloatingDtype we can fastpath - if isinstance(dtype, FloatingDtype): - result = self._data.astype(dtype.numpy_dtype, copy=False) - return type(self)(result, mask=self._mask, copy=False) - # astyping to other known masked dtypes - elif isinstance(dtype, _IntegerDtype): + # if the dtype is exactly the same, we can fastpath + if self.dtype == dtype: + # return the same object for copy=False + return self.copy() if copy else self + # if we are astyping to another nullable masked dtype, we can fastpath + if isinstance(dtype, BaseMaskedDtype): # TODO deal with NaNs - result = self._data.astype(dtype.numpy_dtype, copy=False) - # TODO should mask be copied here? 
- return IntegerArray(result, mask=self._mask, copy=False) - elif isinstance(dtype, BooleanDtype): - result = self._data.astype("bool", copy=False) - return BooleanArray(result, mask=self._mask, copy=False) + data = self._data.astype(dtype.numpy_dtype, copy=copy) + # mask is copied depending on whether the data was copied, and + # not directly depending on the `copy` keyword + mask = self._mask if data is self._data else self._mask.copy() + return dtype.construct_array_type()(data, mask, copy=False) elif isinstance(dtype, StringDtype): return StringArray._from_sequence(self, copy=False) diff --git a/pandas/tests/arrays/floating/test_astype.py b/pandas/tests/arrays/floating/test_astype.py index f9e30556012aa..9eff1dd2a3bf6 100644 --- a/pandas/tests/arrays/floating/test_astype.py +++ b/pandas/tests/arrays/floating/test_astype.py @@ -69,3 +69,41 @@ def test_astype_str(): tm.assert_numpy_array_equal(a.astype(str), expected) tm.assert_numpy_array_equal(a.astype("str"), expected) + + +def test_astype_copy(): + arr = pd.array([0.1, 0.2, None], dtype="Float64") + orig = pd.array([0.1, 0.2, None], dtype="Float64") + + # copy=True -> ensure both data and mask are actual copies + result = arr.astype("Float64", copy=True) + assert result is not arr + assert not np.shares_memory(result._data, arr._data) + assert not np.shares_memory(result._mask, arr._mask) + result[0] = 10 + tm.assert_extension_array_equal(arr, orig) + result[0] = pd.NA + tm.assert_extension_array_equal(arr, orig) + + # copy=False + result = arr.astype("Float64", copy=False) + assert result is arr + assert np.shares_memory(result._data, arr._data) + assert np.shares_memory(result._mask, arr._mask) + result[0] = 10 + assert arr[0] == 10 + result[0] = pd.NA + assert arr[0] is pd.NA + + # astype to different dtype -> always needs a copy -> even with copy=False + # we need to ensure that also the mask is actually copied + arr = pd.array([0.1, 0.2, None], dtype="Float64") + orig = pd.array([0.1, 0.2, None], 
dtype="Float64") + + result = arr.astype("Float32", copy=False) + assert not np.shares_memory(result._data, arr._data) + assert not np.shares_memory(result._mask, arr._mask) + result[0] = 10 + tm.assert_extension_array_equal(arr, orig) + result[0] = pd.NA + tm.assert_extension_array_equal(arr, orig) From aa45aac1bb16dbfab6bd612c7dab868065fb9eb0 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 11 Jul 2020 21:43:11 +0200 Subject: [PATCH 12/26] clean-up tests --- pandas/tests/arrays/floating/test_astype.py | 11 ++++ .../arrays/floating/test_construction.py | 54 ------------------- .../tests/arrays/masked/test_arrow_compat.py | 1 + 3 files changed, 12 insertions(+), 54 deletions(-) diff --git a/pandas/tests/arrays/floating/test_astype.py b/pandas/tests/arrays/floating/test_astype.py index 9eff1dd2a3bf6..828d80d2f9a51 100644 --- a/pandas/tests/arrays/floating/test_astype.py +++ b/pandas/tests/arrays/floating/test_astype.py @@ -107,3 +107,14 @@ def test_astype_copy(): tm.assert_extension_array_equal(arr, orig) result[0] = pd.NA tm.assert_extension_array_equal(arr, orig) + + +def test_astype_object(dtype): + arr = pd.array([1.0, pd.NA], dtype=dtype) + + result = arr.astype(object) + expected = np.array([1.0, pd.NA], dtype=object) + tm.assert_numpy_array_equal(result, expected) + # check exact element types + assert isinstance(result[0], float) + assert result[1] is pd.NA diff --git a/pandas/tests/arrays/floating/test_construction.py b/pandas/tests/arrays/floating/test_construction.py index 7b67e46e5fe4c..69147f8f3a54a 100644 --- a/pandas/tests/arrays/floating/test_construction.py +++ b/pandas/tests/arrays/floating/test_construction.py @@ -1,8 +1,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - import pandas as pd import pandas._testing as tm from pandas.core.arrays import FloatingArray @@ -167,55 +165,3 @@ def test_series_from_float(data): expected = pd.Series(data) result = pd.Series(np.array(data).tolist(), 
dtype=str(dtype)) tm.assert_series_equal(result, expected) - - -# TODO belongs in different file - -# def test_conversions(data_missing): - -# # astype to object series -# df = pd.DataFrame({"A": data_missing}) -# result = df["A"].astype("object") -# expected = pd.Series(np.array([np.nan, 1], dtype=object), name="A") -# tm.assert_series_equal(result, expected) - -# # convert to object ndarray -# # we assert that we are exactly equal -# # including type conversions of scalars -# result = df["A"].astype("object").values -# expected = np.array([pd.NA, 1], dtype=object) -# tm.assert_numpy_array_equal(result, expected) - -# for r, e in zip(result, expected): -# if pd.isnull(r): -# assert pd.isnull(e) -# elif is_integer(r): -# assert r == e -# assert is_integer(e) -# else: -# assert r == e -# assert type(r) == type(e) - - -@td.skip_if_no("pyarrow", min_version="0.15.0") -def test_arrow_array(data): - # protocol added in 0.15.0 - import pyarrow as pa - - arr = pa.array(data) - expected = np.array(data, dtype=object) - expected[data.isna()] = None - expected = pa.array(expected, type=data.dtype.name.lower(), from_pandas=True) - assert arr.equals(expected) - - -@td.skip_if_no("pyarrow", min_version="0.16.0") -def test_arrow_roundtrip(data): - # roundtrip possible from arrow 0.16.0 - import pyarrow as pa - - df = pd.DataFrame({"a": data}) - table = pa.table(df) - assert table.field("a").type == str(data.dtype.numpy_dtype) - result = table.to_pandas() - tm.assert_frame_equal(result, df) diff --git a/pandas/tests/arrays/masked/test_arrow_compat.py b/pandas/tests/arrays/masked/test_arrow_compat.py index b63bb0fbd9a3b..ca6fb1cf9dca0 100644 --- a/pandas/tests/arrays/masked/test_arrow_compat.py +++ b/pandas/tests/arrays/masked/test_arrow_compat.py @@ -6,6 +6,7 @@ import pandas._testing as tm arrays = [pd.array([1, 2, 3, None], dtype=dtype) for dtype in tm.ALL_EA_INT_DTYPES] +arrays += [pd.array([0.1, 0.2, 0.3, None], dtype=dtype) for dtype in tm.FLOAT_EA_DTYPES] arrays += 
[pd.array([True, False, True, None], dtype="boolean")] From 25eb1ba3d34247caa00ed9baac729fd76c5de216 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 12 Jul 2020 20:13:21 +0200 Subject: [PATCH 13/26] remove usage of deprecated check_less_precise --- pandas/tests/extension/test_floating.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/extension/test_floating.py b/pandas/tests/extension/test_floating.py index 04760b69a04f2..00881178de1b4 100644 --- a/pandas/tests/extension/test_floating.py +++ b/pandas/tests/extension/test_floating.py @@ -204,8 +204,7 @@ def check_reduce(self, s, op_name, skipna): expected = getattr(s.dropna().astype(s.dtype.numpy_dtype), op_name)( skipna=skipna ) - check_less_precise = True if op_name == "skew" else False - tm.assert_almost_equal(result, expected, check_less_precise=check_less_precise) + tm.assert_almost_equal(result, expected) class TestBooleanReduce(base.BaseBooleanReduceTests): From 45b98f21a683f12c68091e6eaf8c4416ebe344cf Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 18 Sep 2020 16:59:54 +0200 Subject: [PATCH 14/26] fixup merge + skip astype(string) for float32 --- pandas/core/arrays/boolean.py | 2 +- pandas/tests/extension/test_floating.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 16093f0797d03..155e419160d4e 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -605,7 +605,7 @@ def _create_comparison_method(cls, op): def cmp_method(self, other): from pandas.arrays import FloatingArray, IntegerArray - if isinstance(other, IntegerArray, FloatingArray): + if isinstance(other, (IntegerArray, FloatingArray)): return NotImplemented mask = None diff --git a/pandas/tests/extension/test_floating.py b/pandas/tests/extension/test_floating.py index 00881178de1b4..54b50670c640e 100644 --- a/pandas/tests/extension/test_floating.py +++ 
b/pandas/tests/extension/test_floating.py @@ -186,7 +186,11 @@ def test_value_counts(self, all_data, dropna): class TestCasting(base.BaseCastingTests): - pass + def test_astype_string(self, data): + if data.dtype == "Float32": + # https://github.com/pandas-dev/pandas/issues/36451 + pytest.skip() + super().test_astype_string(data) class TestGroupby(base.BaseGroupbyTests): From 314b6a95995527eb4445e42740997f827e082dbc Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 18 Sep 2020 17:45:06 +0200 Subject: [PATCH 15/26] linting --- pandas/core/arrays/floating.py | 15 +++++++-------- pandas/core/arrays/integer.py | 12 ++++++------ pandas/core/dtypes/cast.py | 4 ++-- 3 files changed, 15 insertions(+), 16 deletions(-) diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index baff17ca7409e..af7a842e260a3 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -87,11 +87,9 @@ def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: # for now only handle other floating types if not all(isinstance(t, FloatingDtype) for t in dtypes): return None - np_dtype = np.find_common_type( - [t.numpy_dtype for t in dtypes], [] # type: ignore - ) + np_dtype = np.find_common_type([t.numpy_dtype for t in dtypes], []) if np.issubdtype(np_dtype, np.floating): - return _dtypes[str(np_dtype)] + return FLOAT_STR_TO_DTYPE[str(np_dtype)] return None def __from_arrow__( @@ -101,6 +99,7 @@ def __from_arrow__( Construct FloatingArray from pyarrow Array/ChunkedArray. 
""" import pyarrow # noqa: F811 + from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask pyarrow_type = pyarrow.from_numpy_dtype(self.type) @@ -153,7 +152,7 @@ def coerce_to_array( if not issubclass(type(dtype), FloatingDtype): try: - dtype = _dtypes[str(np.dtype(dtype))] + dtype = FLOAT_STR_TO_DTYPE[str(np.dtype(dtype))] except KeyError as err: raise ValueError(f"invalid dtype specified {dtype}") from err @@ -284,7 +283,7 @@ class FloatingArray(BaseMaskedArray): @cache_readonly def dtype(self) -> FloatingDtype: - return _dtypes[str(self._data.dtype)] + return FLOAT_STR_TO_DTYPE[str(self._data.dtype)] def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): if not (isinstance(values, np.ndarray) and values.dtype.kind == "f"): @@ -382,7 +381,7 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike: if incompatible type with an FloatingDtype, equivalent of same_kind casting """ - from pandas.core.arrays.string_ import StringDtype, StringArray + from pandas.core.arrays.string_ import StringArray, StringDtype dtype = pandas_dtype(dtype) @@ -629,7 +628,7 @@ class Float64Dtype(FloatingDtype): __doc__ = _dtype_docstring.format(dtype="float64") -_dtypes = { +FLOAT_STR_TO_DTYPE = { "float32": Float32Dtype(), "float64": Float64Dtype(), } diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index b70dea534626f..f94747085afb8 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -106,11 +106,11 @@ def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: [t.numpy_dtype if isinstance(t, BaseMaskedDtype) else t for t in dtypes], [] ) if np.issubdtype(np_dtype, np.integer): - return STR_TO_DTYPE[str(np_dtype)] + return INT_STR_TO_DTYPE[str(np_dtype)] elif np.issubdtype(np_dtype, np.floating): - from pandas.core.arrays.floating import _dtypes as float_dtypes + from pandas.core.arrays.floating import FLOAT_STR_TO_DTYPE - return float_dtypes[str(np_dtype)] + return 
FLOAT_STR_TO_DTYPE[str(np_dtype)] return None def __from_arrow__( @@ -218,7 +218,7 @@ def coerce_to_array( if not issubclass(type(dtype), _IntegerDtype): try: - dtype = STR_TO_DTYPE[str(np.dtype(dtype))] + dtype = INT_STR_TO_DTYPE[str(np.dtype(dtype))] except KeyError as err: raise ValueError(f"invalid dtype specified {dtype}") from err @@ -358,7 +358,7 @@ class IntegerArray(BaseMaskedArray): @cache_readonly def dtype(self) -> _IntegerDtype: - return STR_TO_DTYPE[str(self._data.dtype)] + return INT_STR_TO_DTYPE[str(self._data.dtype)] def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): if not (isinstance(values, np.ndarray) and values.dtype.kind in ["i", "u"]): @@ -748,7 +748,7 @@ class UInt64Dtype(_IntegerDtype): __doc__ = _dtype_docstring.format(dtype="uint64") -STR_TO_DTYPE: Dict[str, _IntegerDtype] = { +INT_STR_TO_DTYPE: Dict[str, _IntegerDtype] = { "int8": Int8Dtype(), "int16": Int16Dtype(), "int32": Int32Dtype(), diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 05759ffb43dde..b3f743e8566e9 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1156,9 +1156,9 @@ def convert_dtypes( target_int_dtype = "Int64" if is_integer_dtype(input_array.dtype): - from pandas.core.arrays.integer import STR_TO_DTYPE + from pandas.core.arrays.integer import INT_STR_TO_DTYPE - inferred_dtype = STR_TO_DTYPE.get( + inferred_dtype = INT_STR_TO_DTYPE.get( input_array.dtype.name, target_int_dtype ) if not is_integer_dtype(input_array.dtype) and is_numeric_dtype( From a157806252df4ee2118db7fd81ae173d394d0080 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 18 Sep 2020 18:05:36 +0200 Subject: [PATCH 16/26] add back type ignore --- pandas/core/arrays/floating.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index af7a842e260a3..182c6870a5b0b 100644 --- a/pandas/core/arrays/floating.py +++ 
b/pandas/core/arrays/floating.py @@ -87,7 +87,9 @@ def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: # for now only handle other floating types if not all(isinstance(t, FloatingDtype) for t in dtypes): return None - np_dtype = np.find_common_type([t.numpy_dtype for t in dtypes], []) + np_dtype = np.find_common_type( + [t.numpy_dtype for t in dtypes], [] # type: ignore[union-attr] + ) if np.issubdtype(np_dtype, np.floating): return FLOAT_STR_TO_DTYPE[str(np_dtype)] return None From 81456f977530cf10f4d47458f173a903d65b4d62 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 19 Sep 2020 09:01:02 +0200 Subject: [PATCH 17/26] whatsnew 1.2 --- doc/source/whatsnew/v1.1.0.rst | 60 ---------------------------------- doc/source/whatsnew/v1.2.0.rst | 60 ++++++++++++++++++++++++++++++++++ pandas/core/arrays/floating.py | 2 +- 3 files changed, 61 insertions(+), 61 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index fca389adc6d1a..a49b29d691692 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -252,66 +252,6 @@ If needed you can adjust the bins with the argument ``offset`` (a :class:`Timede For a full example, see: :ref:`timeseries.adjust-the-start-of-the-bins`. -.. _whatsnew_110.floating: - -Experimental nullable data types for float data -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -We've added :class:`Float32Dtype` / :class:`Float64Dtype` and :class:`~arrays.FloatingArray`, -an extension data type dedicated to floating point data that can hold the -``pd.NA`` missing value indicator (:issue:`32265`, :issue:`34307`). - -While the default float data type already supports missing values using ``np.nan``, -this new data type uses ``pd.NA`` (and its corresponding behaviour) as missing -value indicator, in line with the already existing nullable :ref:`integer ` -and :ref:`boolean ` data types. 
- -One example where the behaviour of ``np.nan`` and ``pd.NA`` is different is -comparison operations: - -.. code-block:: python - - # the default numpy float64 dtype - >>> s1 = pd.Series([1.5, None]) - >>> s1 - 0 1.5 - 1 NaN - dtype: float64 - - >>> s1 > 1 - 0 True - 1 False - dtype: bool - - # the new nullable float64 dtype - >>> s2 = pd.Series([1.5, None], dtype="Float64") - >>> s2 - 0 1.5 - 1 - dtype: Float64 - - >>> s2 > 1 - 0 True - 1 - dtype: boolean - -See the :ref:`missing_data.NA` doc section for more details on the behaviour -when using the ``pd.NA`` missing value indicator. - -As shown above, the dtype can be specified using the "Float64" or "Float32" -string (capitalized to distinguish it from the default "float64" data type). -Alternatively, you can also use the dtype object: - -.. ipython:: python - - pd.Series([1.5, None], dtype=pd.Float32Dtype()) - -.. warning:: - - Experimental: the new floating data types are currently experimental, and its - behaviour or API may still change without warning. Expecially the behaviour - regarding NaN (distinct from NA missing values) is subject to change. - fsspec now used for filesystem handling ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 6923b42d3340b..a0198c839e992 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -109,6 +109,66 @@ Beginning with this version, the default is now to use the more accurate parser ``floating_precision="legacy"`` to use the legacy parser. The change to using the higher precision parser by default should have no impact on performance. (:issue:`17154`) +.. 
_whatsnew_110.floating: + +Experimental nullable data types for float data +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +We've added :class:`Float32Dtype` / :class:`Float64Dtype` and :class:`~arrays.FloatingArray`, +an extension data type dedicated to floating point data that can hold the +``pd.NA`` missing value indicator (:issue:`32265`, :issue:`34307`). + +While the default float data type already supports missing values using ``np.nan``, +this new data type uses ``pd.NA`` (and its corresponding behaviour) as missing +value indicator, in line with the already existing nullable :ref:`integer ` +and :ref:`boolean ` data types. + +One example where the behaviour of ``np.nan`` and ``pd.NA`` is different is +comparison operations: + +.. code-block:: python + + # the default numpy float64 dtype + >>> s1 = pd.Series([1.5, None]) + >>> s1 + 0 1.5 + 1 NaN + dtype: float64 + + >>> s1 > 1 + 0 True + 1 False + dtype: bool + + # the new nullable float64 dtype + >>> s2 = pd.Series([1.5, None], dtype="Float64") + >>> s2 + 0 1.5 + 1 + dtype: Float64 + + >>> s2 > 1 + 0 True + 1 + dtype: boolean + +See the :ref:`missing_data.NA` doc section for more details on the behaviour +when using the ``pd.NA`` missing value indicator. + +As shown above, the dtype can be specified using the "Float64" or "Float32" +string (capitalized to distinguish it from the default "float64" data type). +Alternatively, you can also use the dtype object: + +.. ipython:: python + + pd.Series([1.5, None], dtype=pd.Float32Dtype()) + +.. warning:: + + Experimental: the new floating data types are currently experimental, and their + behaviour or API may still change without warning. Especially the behaviour + regarding NaN (distinct from NA missing values) is subject to change. + .. 
_whatsnew_120.enhancements.other: Other enhancements diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index 182c6870a5b0b..e38e5fa4ac25d 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -224,7 +224,7 @@ class FloatingArray(BaseMaskedArray): """ Array of floating (optional missing) values. - .. versionadded:: 1.1.0 + .. versionadded:: 1.2.0 .. warning:: From 71009c37fd021a8b62589f9aeb7a97b390a37d78 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 19 Sep 2020 09:03:20 +0200 Subject: [PATCH 18/26] update whatsnew --- doc/source/whatsnew/v1.2.0.rst | 28 ++++++++-------------------- 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index a0198c839e992..a582806b65513 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -126,31 +126,19 @@ and :ref:`boolean ` data types. One example where the behaviour of ``np.nan`` and ``pd.NA`` is different is comparison operations: -.. code-block:: python +.. ipython:: python # the default numpy float64 dtype - >>> s1 = pd.Series([1.5, None]) - >>> s1 - 0 1.5 - 1 NaN - dtype: float64 + s1 = pd.Series([1.5, None]) + s1 + s1 > 1 - >>> s1 > 1 - 0 True - 1 False - dtype: bool +.. ipython:: python # the new nullable float64 dtype - >>> s2 = pd.Series([1.5, None], dtype="Float64") - >>> s2 - 0 1.5 - 1 - dtype: Float64 - - >>> s2 > 1 - 0 True - 1 - dtype: boolean + s2 = pd.Series([1.5, None], dtype="Float64") + s2 + s2 > 1 See the :ref:`missing_data.NA` doc section for more details on the behaviour when using the ``pd.NA`` missing value indicator. 
From 56d2311e36a297bb6b39c321d15fe17aa9b1a435 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 19 Sep 2020 09:05:35 +0200 Subject: [PATCH 19/26] add fixture --- pandas/conftest.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pandas/conftest.py b/pandas/conftest.py index 604815d496f80..79e6eb42dfebf 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -984,6 +984,17 @@ def float_dtype(request): return request.param +@pytest.fixture(params=tm.FLOAT_EA_DTYPES) +def float_ea_dtype(request): + """ + Parameterized fixture for float dtypes. + + * 'Float32' + * 'Float64' + """ + return request.param + + @pytest.fixture(params=tm.COMPLEX_DTYPES) def complex_dtype(request): """ From 65a2060eb42fee7c5c4e9050c621473ae4ee7236 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 19 Sep 2020 09:25:47 +0200 Subject: [PATCH 20/26] share some dtype properties --- pandas/core/arrays/boolean.py | 2 +- pandas/core/arrays/floating.py | 18 ------------------ pandas/core/arrays/integer.py | 18 ------------------ pandas/core/arrays/masked.py | 20 +++++++++++++++++--- 4 files changed, 18 insertions(+), 40 deletions(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 155e419160d4e..f773e9cfc8790 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -59,7 +59,7 @@ class BooleanDtype(BaseMaskedDtype): name = "boolean" @property - def type(self) -> Type[np.bool_]: + def type(self) -> Type: return np.bool_ @property diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index e38e5fa4ac25d..6bf4e94c3894c 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -47,10 +47,6 @@ class FloatingDtype(BaseMaskedDtype): The attributes name & type are set when these subclasses are created. 
""" - name: str - base = None - type: Type - def __repr__(self) -> str: return f"{self.name}Dtype()" @@ -58,20 +54,6 @@ def __repr__(self) -> str: def _is_numeric(self) -> bool: return True - @cache_readonly - def numpy_dtype(self) -> np.dtype: - """ Return an instance of our numpy dtype """ - return np.dtype(self.type) - - @cache_readonly - def kind(self) -> str: - return self.numpy_dtype.kind - - @cache_readonly - def itemsize(self) -> int: - """ Return the number of bytes in this dtype """ - return self.numpy_dtype.itemsize - @classmethod def construct_array_type(cls) -> Type["FloatingArray"]: """ diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index f94747085afb8..ffaffdd23969e 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -46,10 +46,6 @@ class _IntegerDtype(BaseMaskedDtype): The attributes name & type are set when these subclasses are created. """ - name: str - base = None - type: Type - def __repr__(self) -> str: sign = "U" if self.is_unsigned_integer else "" return f"{sign}Int{8 * self.itemsize}Dtype()" @@ -66,20 +62,6 @@ def is_unsigned_integer(self) -> bool: def _is_numeric(self) -> bool: return True - @cache_readonly - def numpy_dtype(self) -> np.dtype: - """ Return an instance of our numpy dtype """ - return np.dtype(self.type) - - @cache_readonly - def kind(self) -> str: - return self.numpy_dtype.kind - - @cache_readonly - def itemsize(self) -> int: - """ Return the number of bytes in this dtype """ - return self.numpy_dtype.itemsize - @classmethod def construct_array_type(cls) -> Type["IntegerArray"]: """ diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 31274232e2525..97ade0dc70843 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -5,7 +5,7 @@ from pandas._libs import lib, missing as libmissing from pandas._typing import Scalar from pandas.errors import AbstractMethodError -from pandas.util._decorators import doc +from 
pandas.util._decorators import cache_readonly, doc from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.common import ( @@ -34,11 +34,25 @@ class BaseMaskedDtype(ExtensionDtype): Base class for dtypes for BasedMaskedArray subclasses. """ + name: str + base = None + type: Type + na_value = libmissing.NA - @property + @cache_readonly def numpy_dtype(self) -> np.dtype: - raise AbstractMethodError + """ Return an instance of our numpy dtype """ + return np.dtype(self.type) + + @cache_readonly + def kind(self) -> str: + return self.numpy_dtype.kind + + @cache_readonly + def itemsize(self) -> int: + """ Return the number of bytes in this dtype """ + return self.numpy_dtype.itemsize @classmethod def construct_array_type(cls) -> Type["BaseMaskedArray"]: From 8b3609834bf9fe7f7749ccba536c2b2774d770ec Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 19 Sep 2020 09:36:13 +0200 Subject: [PATCH 21/26] update ensure_float --- pandas/core/construction.py | 2 +- pandas/core/dtypes/common.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 2e84179826963..4751f6076f869 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -115,7 +115,7 @@ def array( string dtype for string data, and nullable-boolean dtype for boolean data. - .. versionchanged:: 1.1.0 + .. versionchanged:: 1.2.0 Pandas now also infers nullable-floating dtype for float-like input data diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 5a2ef48099798..bf45ce0107842 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -84,7 +84,10 @@ def ensure_float(arr): possible. Otherwise, the original array is returned. 
""" if is_extension_array_dtype(arr.dtype): - arr = arr.to_numpy(dtype="float64", na_value=np.nan) + if is_float_dtype(arr.dtype): + arr = arr.to_numpy(dtype=arr.dtype.numpy_dtype, na_value=np.nan) + else: + arr = arr.to_numpy(dtype="float64", na_value=np.nan) elif issubclass(arr.dtype.type, (np.integer, np.bool_)): arr = arr.astype(float) return arr From 66d69394b99b4fa89dc31cbef558c3cca231765e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 23 Sep 2020 13:30:29 +0200 Subject: [PATCH 22/26] remove skip for float32 conversion to string --- pandas/tests/extension/test_floating.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pandas/tests/extension/test_floating.py b/pandas/tests/extension/test_floating.py index 54b50670c640e..00881178de1b4 100644 --- a/pandas/tests/extension/test_floating.py +++ b/pandas/tests/extension/test_floating.py @@ -186,11 +186,7 @@ def test_value_counts(self, all_data, dropna): class TestCasting(base.BaseCastingTests): - def test_astype_string(self, data): - if data.dtype == "Float32": - # https://github.com/pandas-dev/pandas/issues/36451 - pytest.skip() - super().test_astype_string(data) + pass class TestGroupby(base.BaseGroupbyTests): From d37b8153bb4afb0d08e48c6291b150730f4149ef Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 29 Sep 2020 09:18:04 +0200 Subject: [PATCH 23/26] code formatting --- pandas/core/arrays/floating.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index 6bf4e94c3894c..deef408137f3a 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -106,7 +106,7 @@ def __from_arrow__( def coerce_to_array( - values, dtype=None, mask=None, copy: bool = False, + values, dtype=None, mask=None, copy: bool = False ) -> Tuple[np.ndarray, np.ndarray]: """ Coerce the input values array to numpy arrays with a mask From 44e699a6771837aecb0d8ed0870142dee7f2a83a Mon Sep 17 
00:00:00 2001 From: Joris Van den Bossche Date: Wed, 30 Sep 2020 08:48:39 +0200 Subject: [PATCH 24/26] Update doc/source/whatsnew/v1.2.0.rst Co-authored-by: Simon Hawkins --- doc/source/whatsnew/v1.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 3dd641753b899..ca6c38e59597b 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -109,7 +109,7 @@ Beginning with this version, the default is now to use the more accurate parser ``floating_precision="legacy"`` to use the legacy parser. The change to using the higher precision parser by default should have no impact on performance. (:issue:`17154`) -.. _whatsnew_110.floating: +.. _whatsnew_120.floating: Experimental nullable data types for float data ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ From b42b61d7925cf88704c4557959f6cd814575d75b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 30 Sep 2020 08:49:04 +0200 Subject: [PATCH 25/26] Update pandas/core/arrays/floating.py Co-authored-by: Simon Hawkins --- pandas/core/arrays/floating.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index deef408137f3a..c3710196a8611 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -109,7 +109,7 @@ def coerce_to_array( values, dtype=None, mask=None, copy: bool = False ) -> Tuple[np.ndarray, np.ndarray]: """ - Coerce the input values array to numpy arrays with a mask + Coerce the input values array to numpy arrays with a mask. 
Parameters ---------- From edf9618d49de3f04d8f18e099c60310ae40296f7 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 30 Sep 2020 10:17:58 +0200 Subject: [PATCH 26/26] ignore mypy bug --- pandas/core/arrays/boolean.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 0a6f47820de3a..dd750bce7842e 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -58,8 +58,9 @@ class BooleanDtype(BaseMaskedDtype): name = "boolean" + # mypy: https://github.com/python/mypy/issues/4125 @property - def type(self) -> Type: + def type(self) -> Type: # type: ignore[override] return np.bool_ @property