From a75ebda2b34754089d548f76982703e488d6d9d6 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 5 May 2023 13:44:57 -0700 Subject: [PATCH 1/2] REF: move SparseDtype to dtypes.dtypes --- pandas/__init__.py | 2 +- pandas/compat/pickle_compat.py | 4 + pandas/core/arrays/sparse/__init__.py | 2 - pandas/core/arrays/sparse/accessor.py | 2 +- pandas/core/arrays/sparse/array.py | 6 +- pandas/core/arrays/sparse/dtype.py | 425 ------------------ pandas/core/dtypes/common.py | 2 +- pandas/core/dtypes/dtypes.py | 409 ++++++++++++++++- pandas/core/internals/array_manager.py | 2 +- pandas/core/internals/blocks.py | 2 +- pandas/core/internals/concat.py | 6 +- pandas/core/internals/managers.py | 2 +- pandas/core/sparse/api.py | 7 +- pandas/tests/arrays/sparse/test_accessor.py | 6 +- .../tests/arrays/sparse/test_arithmetics.py | 6 +- pandas/tests/arrays/sparse/test_array.py | 8 +- pandas/tests/arrays/sparse/test_astype.py | 6 +- .../tests/arrays/sparse/test_constructors.py | 8 +- pandas/tests/arrays/sparse/test_dtype.py | 2 +- pandas/tests/arrays/sparse/test_indexing.py | 6 +- pandas/tests/arrays/sparse/test_reductions.py | 6 +- pandas/tests/dtypes/test_dtypes.py | 6 +- pandas/tests/reshape/test_get_dummies.py | 6 +- 23 files changed, 450 insertions(+), 481 deletions(-) delete mode 100644 pandas/core/arrays/sparse/dtype.py diff --git a/pandas/__init__.py b/pandas/__init__.py index 6ddfbadcf91d1..cf0e16b46d018 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -108,7 +108,7 @@ DataFrame, ) -from pandas.core.arrays.sparse import SparseDtype +from pandas.core.dtypes.dtypes import SparseDtype from pandas.tseries.api import infer_freq from pandas.tseries import offsets diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py index 9800c960f031b..c15dd7b37be93 100644 --- a/pandas/compat/pickle_compat.py +++ b/pandas/compat/pickle_compat.py @@ -134,6 +134,10 @@ def load_reduce(self): "pandas.core.indexes.base", "Index", ), + ("pandas.core.arrays.sparse.dtype", 
"SparseDtype"): ( + "pandas.core.dtypes.dtypes", + "SparseDtype", + ), } diff --git a/pandas/core/arrays/sparse/__init__.py b/pandas/core/arrays/sparse/__init__.py index 56dbc6df54fc9..adf83963aca39 100644 --- a/pandas/core/arrays/sparse/__init__.py +++ b/pandas/core/arrays/sparse/__init__.py @@ -8,7 +8,6 @@ SparseArray, make_sparse_index, ) -from pandas.core.arrays.sparse.dtype import SparseDtype __all__ = [ "BlockIndex", @@ -16,6 +15,5 @@ "make_sparse_index", "SparseAccessor", "SparseArray", - "SparseDtype", "SparseFrameAccessor", ] diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index 4c1a6e6e219c6..eeff44cfa3c9c 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -8,13 +8,13 @@ from pandas.compat._optional import import_optional_dependency from pandas.core.dtypes.cast import find_common_type +from pandas.core.dtypes.dtypes import SparseDtype from pandas.core.accessor import ( PandasDelegate, delegate_names, ) from pandas.core.arrays.sparse.array import SparseArray -from pandas.core.arrays.sparse.dtype import SparseDtype if TYPE_CHECKING: from pandas import ( diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index d03e60131fd74..4f5505015ef76 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -51,7 +51,10 @@ is_string_dtype, pandas_dtype, ) -from pandas.core.dtypes.dtypes import DatetimeTZDtype +from pandas.core.dtypes.dtypes import ( + DatetimeTZDtype, + SparseDtype, +) from pandas.core.dtypes.generic import ( ABCIndex, ABCSeries, @@ -66,7 +69,6 @@ import pandas.core.algorithms as algos from pandas.core.arraylike import OpsMixin from pandas.core.arrays import ExtensionArray -from pandas.core.arrays.sparse.dtype import SparseDtype from pandas.core.base import PandasObject import pandas.core.common as com from pandas.core.construction import ( diff --git a/pandas/core/arrays/sparse/dtype.py 
b/pandas/core/arrays/sparse/dtype.py deleted file mode 100644 index dadd161ceeb38..0000000000000 --- a/pandas/core/arrays/sparse/dtype.py +++ /dev/null @@ -1,425 +0,0 @@ -"""Sparse Dtype""" -from __future__ import annotations - -import re -from typing import ( - TYPE_CHECKING, - Any, -) -import warnings - -import numpy as np - -from pandas.errors import PerformanceWarning -from pandas.util._exceptions import find_stack_level - -from pandas.core.dtypes.astype import astype_array -from pandas.core.dtypes.base import ( - ExtensionDtype, - register_extension_dtype, -) -from pandas.core.dtypes.common import ( - is_bool_dtype, - is_object_dtype, - is_scalar, - is_string_dtype, - pandas_dtype, -) -from pandas.core.dtypes.missing import ( - isna, - na_value_for_dtype, -) - -if TYPE_CHECKING: - from pandas._typing import ( - Dtype, - DtypeObj, - type_t, - ) - - from pandas.core.arrays.sparse.array import SparseArray - - -@register_extension_dtype -class SparseDtype(ExtensionDtype): - """ - Dtype for data stored in :class:`SparseArray`. - - This dtype implements the pandas ExtensionDtype interface. - - Parameters - ---------- - dtype : str, ExtensionDtype, numpy.dtype, type, default numpy.float64 - The dtype of the underlying array storing the non-fill value values. - fill_value : scalar, optional - The scalar value not stored in the SparseArray. By default, this - depends on `dtype`. - - =========== ========== - dtype na_value - =========== ========== - float ``np.nan`` - int ``0`` - bool ``False`` - datetime64 ``pd.NaT`` - timedelta64 ``pd.NaT`` - =========== ========== - - The default value may be overridden by specifying a `fill_value`. - - Attributes - ---------- - None - - Methods - ------- - None - """ - - # We include `_is_na_fill_value` in the metadata to avoid hash collisions - # between SparseDtype(float, 0.0) and SparseDtype(float, nan). - # Without is_na_fill_value in the comparison, those would be equal since - # hash(nan) is (sometimes?) 0. 
- _metadata = ("_dtype", "_fill_value", "_is_na_fill_value") - - def __init__(self, dtype: Dtype = np.float64, fill_value: Any = None) -> None: - if isinstance(dtype, type(self)): - if fill_value is None: - fill_value = dtype.fill_value - dtype = dtype.subtype - - dtype = pandas_dtype(dtype) - if is_string_dtype(dtype): - dtype = np.dtype("object") - - if fill_value is None: - fill_value = na_value_for_dtype(dtype) - - self._dtype = dtype - self._fill_value = fill_value - self._check_fill_value() - - def __hash__(self) -> int: - # Python3 doesn't inherit __hash__ when a base class overrides - # __eq__, so we explicitly do it here. - return super().__hash__() - - def __eq__(self, other: Any) -> bool: - # We have to override __eq__ to handle NA values in _metadata. - # The base class does simple == checks, which fail for NA. - if isinstance(other, str): - try: - other = self.construct_from_string(other) - except TypeError: - return False - - if isinstance(other, type(self)): - subtype = self.subtype == other.subtype - if self._is_na_fill_value: - # this case is complicated by two things: - # SparseDtype(float, float(nan)) == SparseDtype(float, np.nan) - # SparseDtype(float, np.nan) != SparseDtype(float, pd.NaT) - # i.e. we want to treat any floating-point NaN as equal, but - # not a floating-point NaN and a datetime NaT. - fill_value = ( - other._is_na_fill_value - and isinstance(self.fill_value, type(other.fill_value)) - or isinstance(other.fill_value, type(self.fill_value)) - ) - else: - with warnings.catch_warnings(): - # Ignore spurious numpy warning - warnings.filterwarnings( - "ignore", - "elementwise comparison failed", - category=DeprecationWarning, - ) - - fill_value = self.fill_value == other.fill_value - - return subtype and fill_value - return False - - @property - def fill_value(self): - """ - The fill value of the array. - - Converting the SparseArray to a dense ndarray will fill the - array with this value. - - .. 
warning:: - - It's possible to end up with a SparseArray that has ``fill_value`` - values in ``sp_values``. This can occur, for example, when setting - ``SparseArray.fill_value`` directly. - """ - return self._fill_value - - def _check_fill_value(self): - if not is_scalar(self._fill_value): - raise ValueError( - f"fill_value must be a scalar. Got {self._fill_value} instead" - ) - # TODO: Right now we can use Sparse boolean array - # with any fill_value. Here was an attempt - # to allow only 3 value: True, False or nan - # but plenty test has failed. - # see pull 44955 - # if self._is_boolean and not ( - # is_bool(self._fill_value) or isna(self._fill_value) - # ): - # raise ValueError( - # "fill_value must be True, False or nan " - # f"for boolean type. Got {self._fill_value} instead" - # ) - - @property - def _is_na_fill_value(self) -> bool: - return isna(self.fill_value) - - @property - def _is_numeric(self) -> bool: - return not is_object_dtype(self.subtype) - - @property - def _is_boolean(self) -> bool: - return is_bool_dtype(self.subtype) - - @property - def kind(self) -> str: - """ - The sparse kind. Either 'integer', or 'block'. - """ - return self.subtype.kind - - @property - def type(self): - return self.subtype.type - - @property - def subtype(self): - return self._dtype - - @property - def name(self) -> str: - return f"Sparse[{self.subtype.name}, {repr(self.fill_value)}]" - - def __repr__(self) -> str: - return self.name - - @classmethod - def construct_array_type(cls) -> type_t[SparseArray]: - """ - Return the array type associated with this dtype. - - Returns - ------- - type - """ - from pandas.core.arrays.sparse.array import SparseArray - - return SparseArray - - @classmethod - def construct_from_string(cls, string: str) -> SparseDtype: - """ - Construct a SparseDtype from a string form. - - Parameters - ---------- - string : str - Can take the following forms. 
- - string dtype - ================ ============================ - 'int' SparseDtype[np.int64, 0] - 'Sparse' SparseDtype[np.float64, nan] - 'Sparse[int]' SparseDtype[np.int64, 0] - 'Sparse[int, 0]' SparseDtype[np.int64, 0] - ================ ============================ - - It is not possible to specify non-default fill values - with a string. An argument like ``'Sparse[int, 1]'`` - will raise a ``TypeError`` because the default fill value - for integers is 0. - - Returns - ------- - SparseDtype - """ - if not isinstance(string, str): - raise TypeError( - f"'construct_from_string' expects a string, got {type(string)}" - ) - msg = f"Cannot construct a 'SparseDtype' from '{string}'" - if string.startswith("Sparse"): - try: - sub_type, has_fill_value = cls._parse_subtype(string) - except ValueError as err: - raise TypeError(msg) from err - else: - result = SparseDtype(sub_type) - msg = ( - f"Cannot construct a 'SparseDtype' from '{string}'.\n\nIt " - "looks like the fill_value in the string is not " - "the default for the dtype. Non-default fill_values " - "are not supported. Use the 'SparseDtype()' " - "constructor instead." - ) - if has_fill_value and str(result) != string: - raise TypeError(msg) - return result - else: - raise TypeError(msg) - - @staticmethod - def _parse_subtype(dtype: str) -> tuple[str, bool]: - """ - Parse a string to get the subtype - - Parameters - ---------- - dtype : str - A string like - - * Sparse[subtype] - * Sparse[subtype, fill_value] - - Returns - ------- - subtype : str - - Raises - ------ - ValueError - When the subtype cannot be extracted. 
- """ - xpr = re.compile(r"Sparse\[(?P<subtype>[^,]*)(, )?(?P<fill_value>.*?)?\]$") - m = xpr.match(dtype) - has_fill_value = False - if m: - subtype = m.groupdict()["subtype"] - has_fill_value = bool(m.groupdict()["fill_value"]) - elif dtype == "Sparse": - subtype = "float64" - else: - raise ValueError(f"Cannot parse {dtype}") - return subtype, has_fill_value - - @classmethod - def is_dtype(cls, dtype: object) -> bool: - dtype = getattr(dtype, "dtype", dtype) - if isinstance(dtype, str) and dtype.startswith("Sparse"): - sub_type, _ = cls._parse_subtype(dtype) - dtype = np.dtype(sub_type) - elif isinstance(dtype, cls): - return True - return isinstance(dtype, np.dtype) or dtype == "Sparse" - - def update_dtype(self, dtype) -> SparseDtype: - """ - Convert the SparseDtype to a new dtype. - - This takes care of converting the ``fill_value``. - - Parameters - ---------- - dtype : Union[str, numpy.dtype, SparseDtype] - The new dtype to use. - - * For a SparseDtype, it is simply returned - * For a NumPy dtype (or str), the current fill value - is converted to the new dtype, and a SparseDtype - with `dtype` and the new fill value is returned. - - Returns - ------- - SparseDtype - A new SparseDtype with the correct `dtype` and fill value - for that `dtype`. - - Raises - ------ - ValueError - When the current fill value cannot be converted to the - new `dtype` (e.g. trying to convert ``np.nan`` to an - integer dtype). 
- - - Examples - -------- - >>> SparseDtype(int, 0).update_dtype(float) - Sparse[float64, 0.0] - - >>> SparseDtype(int, 1).update_dtype(SparseDtype(float, np.nan)) - Sparse[float64, nan] - """ - cls = type(self) - dtype = pandas_dtype(dtype) - - if not isinstance(dtype, cls): - if not isinstance(dtype, np.dtype): - raise TypeError("sparse arrays of extension dtypes not supported") - - fv_asarray = np.atleast_1d(np.array(self.fill_value)) - fvarr = astype_array(fv_asarray, dtype) - # NB: not fv_0d.item(), as that casts dt64->int - fill_value = fvarr[0] - dtype = cls(dtype, fill_value=fill_value) - - return dtype - - @property - def _subtype_with_str(self): - """ - Whether the SparseDtype's subtype should be considered ``str``. - - Typically, pandas will store string data in an object-dtype array. - When converting values to a dtype, e.g. in ``.astype``, we need to - be more specific, we need the actual underlying type. - - Returns - ------- - >>> SparseDtype(int, 1)._subtype_with_str - dtype('int64') - - >>> SparseDtype(object, 1)._subtype_with_str - dtype('O') - - >>> dtype = SparseDtype(str, '') - >>> dtype.subtype - dtype('O') - - >>> dtype._subtype_with_str - <class 'str'> - """ - if isinstance(self.fill_value, str): - return type(self.fill_value) - return self.subtype - - def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: - # TODO for now only handle SparseDtypes and numpy dtypes => extend - # with other compatible extension dtypes - if any( - isinstance(x, ExtensionDtype) and not isinstance(x, SparseDtype) - for x in dtypes - ): - return None - - fill_values = [x.fill_value for x in dtypes if isinstance(x, SparseDtype)] - fill_value = fill_values[0] - - # np.nan isn't a singleton, so we may end up with multiple - # NaNs here, so we ignore the all NA case too. - if not (len(set(fill_values)) == 1 or isna(fill_values).all()): - warnings.warn( - "Concatenating sparse arrays with multiple fill " - f"values: '{fill_values}'. 
Picking the first and " - "converting the rest.", - PerformanceWarning, - stacklevel=find_stack_level(), - ) - - np_dtypes = [x.subtype if isinstance(x, SparseDtype) else x for x in dtypes] - return SparseDtype(np.find_common_type(np_dtypes, []), fill_value=fill_value) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 2c426187c83e8..3931b12e06f9b 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -28,6 +28,7 @@ ExtensionDtype, IntervalDtype, PeriodDtype, + SparseDtype, ) from pandas.core.dtypes.generic import ABCIndex from pandas.core.dtypes.inference import ( @@ -213,7 +214,6 @@ def is_sparse(arr) -> bool: FutureWarning, stacklevel=find_stack_level(), ) - from pandas.core.arrays.sparse import SparseDtype dtype = getattr(arr, "dtype", arr) return isinstance(dtype, SparseDtype) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 093101e2ae5a4..a2ce29d848bee 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -10,11 +10,15 @@ MutableMapping, cast, ) +import warnings import numpy as np import pytz -from pandas._libs import missing as libmissing +from pandas._libs import ( + lib, + missing as libmissing, +) from pandas._libs.interval import Interval from pandas._libs.properties import cache_readonly from pandas._libs.tslibs import ( @@ -31,6 +35,8 @@ PeriodDtypeBase, abbrev_to_npy_unit, ) +from pandas.errors import PerformanceWarning +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.base import ( ExtensionDtype, @@ -69,6 +75,7 @@ IntervalArray, PandasArray, PeriodArray, + SparseArray, ) str_type = str @@ -606,8 +613,6 @@ def _is_boolean(self) -> bool: return is_bool_dtype(self.categories) def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: - from pandas.core.arrays.sparse import SparseDtype - # check if we have all categorical dtype with identical categories if all(isinstance(x, CategoricalDtype) for x in 
dtypes): first = dtypes[0] @@ -1500,3 +1505,401 @@ def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: return type(self).from_numpy_dtype(new_dtype) except (KeyError, NotImplementedError): return None + + +@register_extension_dtype +class SparseDtype(ExtensionDtype): + """ + Dtype for data stored in :class:`SparseArray`. + + This dtype implements the pandas ExtensionDtype interface. + + Parameters + ---------- + dtype : str, ExtensionDtype, numpy.dtype, type, default numpy.float64 + The dtype of the underlying array storing the non-fill value values. + fill_value : scalar, optional + The scalar value not stored in the SparseArray. By default, this + depends on `dtype`. + + =========== ========== + dtype na_value + =========== ========== + float ``np.nan`` + int ``0`` + bool ``False`` + datetime64 ``pd.NaT`` + timedelta64 ``pd.NaT`` + =========== ========== + + The default value may be overridden by specifying a `fill_value`. + + Attributes + ---------- + None + + Methods + ------- + None + """ + + # We include `_is_na_fill_value` in the metadata to avoid hash collisions + # between SparseDtype(float, 0.0) and SparseDtype(float, nan). + # Without is_na_fill_value in the comparison, those would be equal since + # hash(nan) is (sometimes?) 0. 
+ _metadata = ("_dtype", "_fill_value", "_is_na_fill_value") + + def __init__(self, dtype: Dtype = np.float64, fill_value: Any = None) -> None: + if isinstance(dtype, type(self)): + if fill_value is None: + fill_value = dtype.fill_value + dtype = dtype.subtype + + from pandas.core.dtypes.common import ( + is_string_dtype, + pandas_dtype, + ) + from pandas.core.dtypes.missing import na_value_for_dtype + + dtype = pandas_dtype(dtype) + if is_string_dtype(dtype): + dtype = np.dtype("object") + + if fill_value is None: + fill_value = na_value_for_dtype(dtype) + + self._dtype = dtype + self._fill_value = fill_value + self._check_fill_value() + + def __hash__(self) -> int: + # Python3 doesn't inherit __hash__ when a base class overrides + # __eq__, so we explicitly do it here. + return super().__hash__() + + def __eq__(self, other: Any) -> bool: + # We have to override __eq__ to handle NA values in _metadata. + # The base class does simple == checks, which fail for NA. + if isinstance(other, str): + try: + other = self.construct_from_string(other) + except TypeError: + return False + + if isinstance(other, type(self)): + subtype = self.subtype == other.subtype + if self._is_na_fill_value: + # this case is complicated by two things: + # SparseDtype(float, float(nan)) == SparseDtype(float, np.nan) + # SparseDtype(float, np.nan) != SparseDtype(float, pd.NaT) + # i.e. we want to treat any floating-point NaN as equal, but + # not a floating-point NaN and a datetime NaT. 
+ fill_value = ( + other._is_na_fill_value + and isinstance(self.fill_value, type(other.fill_value)) + or isinstance(other.fill_value, type(self.fill_value)) + ) + else: + with warnings.catch_warnings(): + # Ignore spurious numpy warning + warnings.filterwarnings( + "ignore", + "elementwise comparison failed", + category=DeprecationWarning, + ) + + fill_value = self.fill_value == other.fill_value + + return subtype and fill_value + return False + + @property + def fill_value(self): + """ + The fill value of the array. + + Converting the SparseArray to a dense ndarray will fill the + array with this value. + + .. warning:: + + It's possible to end up with a SparseArray that has ``fill_value`` + values in ``sp_values``. This can occur, for example, when setting + ``SparseArray.fill_value`` directly. + """ + return self._fill_value + + def _check_fill_value(self): + if not lib.is_scalar(self._fill_value): + raise ValueError( + f"fill_value must be a scalar. Got {self._fill_value} instead" + ) + # TODO: Right now we can use Sparse boolean array + # with any fill_value. Here was an attempt + # to allow only 3 value: True, False or nan + # but plenty test has failed. + # see pull 44955 + # if self._is_boolean and not ( + # is_bool(self._fill_value) or isna(self._fill_value) + # ): + # raise ValueError( + # "fill_value must be True, False or nan " + # f"for boolean type. Got {self._fill_value} instead" + # ) + + @property + def _is_na_fill_value(self) -> bool: + from pandas import isna + + return isna(self.fill_value) + + @property + def _is_numeric(self) -> bool: + return not self.subtype == object + + @property + def _is_boolean(self) -> bool: + return self.subtype.kind == "b" + + @property + def kind(self) -> str: + """ + The sparse kind. Either 'integer', or 'block'. 
+ """ + return self.subtype.kind + + @property + def type(self): + return self.subtype.type + + @property + def subtype(self): + return self._dtype + + @property + def name(self) -> str: + return f"Sparse[{self.subtype.name}, {repr(self.fill_value)}]" + + def __repr__(self) -> str: + return self.name + + @classmethod + def construct_array_type(cls) -> type_t[SparseArray]: + """ + Return the array type associated with this dtype. + + Returns + ------- + type + """ + from pandas.core.arrays.sparse.array import SparseArray + + return SparseArray + + @classmethod + def construct_from_string(cls, string: str) -> SparseDtype: + """ + Construct a SparseDtype from a string form. + + Parameters + ---------- + string : str + Can take the following forms. + + string dtype + ================ ============================ + 'int' SparseDtype[np.int64, 0] + 'Sparse' SparseDtype[np.float64, nan] + 'Sparse[int]' SparseDtype[np.int64, 0] + 'Sparse[int, 0]' SparseDtype[np.int64, 0] + ================ ============================ + + It is not possible to specify non-default fill values + with a string. An argument like ``'Sparse[int, 1]'`` + will raise a ``TypeError`` because the default fill value + for integers is 0. + + Returns + ------- + SparseDtype + """ + if not isinstance(string, str): + raise TypeError( + f"'construct_from_string' expects a string, got {type(string)}" + ) + msg = f"Cannot construct a 'SparseDtype' from '{string}'" + if string.startswith("Sparse"): + try: + sub_type, has_fill_value = cls._parse_subtype(string) + except ValueError as err: + raise TypeError(msg) from err + else: + result = SparseDtype(sub_type) + msg = ( + f"Cannot construct a 'SparseDtype' from '{string}'.\n\nIt " + "looks like the fill_value in the string is not " + "the default for the dtype. Non-default fill_values " + "are not supported. Use the 'SparseDtype()' " + "constructor instead." 
+ ) + if has_fill_value and str(result) != string: + raise TypeError(msg) + return result + else: + raise TypeError(msg) + + @staticmethod + def _parse_subtype(dtype: str) -> tuple[str, bool]: + """ + Parse a string to get the subtype + + Parameters + ---------- + dtype : str + A string like + + * Sparse[subtype] + * Sparse[subtype, fill_value] + + Returns + ------- + subtype : str + + Raises + ------ + ValueError + When the subtype cannot be extracted. + """ + xpr = re.compile(r"Sparse\[(?P<subtype>[^,]*)(, )?(?P<fill_value>.*?)?\]$") + m = xpr.match(dtype) + has_fill_value = False + if m: + subtype = m.groupdict()["subtype"] + has_fill_value = bool(m.groupdict()["fill_value"]) + elif dtype == "Sparse": + subtype = "float64" + else: + raise ValueError(f"Cannot parse {dtype}") + return subtype, has_fill_value + + @classmethod + def is_dtype(cls, dtype: object) -> bool: + dtype = getattr(dtype, "dtype", dtype) + if isinstance(dtype, str) and dtype.startswith("Sparse"): + sub_type, _ = cls._parse_subtype(dtype) + dtype = np.dtype(sub_type) + elif isinstance(dtype, cls): + return True + return isinstance(dtype, np.dtype) or dtype == "Sparse" + + def update_dtype(self, dtype) -> SparseDtype: + """ + Convert the SparseDtype to a new dtype. + + This takes care of converting the ``fill_value``. + + Parameters + ---------- + dtype : Union[str, numpy.dtype, SparseDtype] + The new dtype to use. + + * For a SparseDtype, it is simply returned + * For a NumPy dtype (or str), the current fill value + is converted to the new dtype, and a SparseDtype + with `dtype` and the new fill value is returned. + + Returns + ------- + SparseDtype + A new SparseDtype with the correct `dtype` and fill value + for that `dtype`. + + Raises + ------ + ValueError + When the current fill value cannot be converted to the + new `dtype` (e.g. trying to convert ``np.nan`` to an + integer dtype). 
+ + + Examples + -------- + >>> SparseDtype(int, 0).update_dtype(float) + Sparse[float64, 0.0] + + >>> SparseDtype(int, 1).update_dtype(SparseDtype(float, np.nan)) + Sparse[float64, nan] + """ + from pandas.core.dtypes.astype import astype_array + from pandas.core.dtypes.common import pandas_dtype + + cls = type(self) + dtype = pandas_dtype(dtype) + + if not isinstance(dtype, cls): + if not isinstance(dtype, np.dtype): + raise TypeError("sparse arrays of extension dtypes not supported") + + fv_asarray = np.atleast_1d(np.array(self.fill_value)) + fvarr = astype_array(fv_asarray, dtype) + # NB: not fv_0d.item(), as that casts dt64->int + fill_value = fvarr[0] + dtype = cls(dtype, fill_value=fill_value) + + return dtype + + @property + def _subtype_with_str(self): + """ + Whether the SparseDtype's subtype should be considered ``str``. + + Typically, pandas will store string data in an object-dtype array. + When converting values to a dtype, e.g. in ``.astype``, we need to + be more specific, we need the actual underlying type. + + Returns + ------- + >>> SparseDtype(int, 1)._subtype_with_str + dtype('int64') + + >>> SparseDtype(object, 1)._subtype_with_str + dtype('O') + + >>> dtype = SparseDtype(str, '') + >>> dtype.subtype + dtype('O') + + >>> dtype._subtype_with_str + <class 'str'> + """ + if isinstance(self.fill_value, str): + return type(self.fill_value) + return self.subtype + + def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: + # TODO for now only handle SparseDtypes and numpy dtypes => extend + # with other compatible extension dtypes + if any( + isinstance(x, ExtensionDtype) and not isinstance(x, SparseDtype) + for x in dtypes + ): + return None + + fill_values = [x.fill_value for x in dtypes if isinstance(x, SparseDtype)] + fill_value = fill_values[0] + + from pandas import isna + + # np.nan isn't a singleton, so we may end up with multiple + # NaNs here, so we ignore the all NA case too. 
+ if not (len(set(fill_values)) == 1 or isna(fill_values).all()): + warnings.warn( + "Concatenating sparse arrays with multiple fill " + f"values: '{fill_values}'. Picking the first and " + "converting the rest.", + PerformanceWarning, + stacklevel=find_stack_level(), + ) + + np_dtypes = [x.subtype if isinstance(x, SparseDtype) else x for x in dtypes] + return SparseDtype(np.find_common_type(np_dtypes, []), fill_value=fill_value) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 0d2058e66cfab..15f3630b2d735 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -41,6 +41,7 @@ from pandas.core.dtypes.dtypes import ( ExtensionDtype, PandasDtype, + SparseDtype, ) from pandas.core.dtypes.generic import ( ABCDataFrame, @@ -61,7 +62,6 @@ PandasArray, TimedeltaArray, ) -from pandas.core.arrays.sparse import SparseDtype from pandas.core.construction import ( ensure_wrapped_if_datetimelike, extract_array, diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index ff523862f8770..cd9d166ab6ba2 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -65,6 +65,7 @@ IntervalDtype, PandasDtype, PeriodDtype, + SparseDtype, ) from pandas.core.dtypes.generic import ( ABCDataFrame, @@ -103,7 +104,6 @@ PeriodArray, TimedeltaArray, ) -from pandas.core.arrays.sparse.dtype import SparseDtype from pandas.core.base import PandasObject import pandas.core.common as com from pandas.core.computation import expressions diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index e34e7fbf1035e..cc46977cee6dc 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -29,14 +29,16 @@ needs_i8_conversion, ) from pandas.core.dtypes.concat import concat_compat -from pandas.core.dtypes.dtypes import ExtensionDtype +from pandas.core.dtypes.dtypes import ( + ExtensionDtype, + SparseDtype, +) from 
pandas.core.dtypes.missing import ( is_valid_na_for_dtype, isna, isna_all, ) -from pandas.core.arrays.sparse import SparseDtype from pandas.core.construction import ensure_wrapped_if_datetimelike from pandas.core.internals.array_manager import ArrayManager from pandas.core.internals.blocks import ( diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 397f9d5b1bbe6..31827a6bcf6d0 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -40,6 +40,7 @@ from pandas.core.dtypes.dtypes import ( DatetimeTZDtype, ExtensionDtype, + SparseDtype, ) from pandas.core.dtypes.generic import ( ABCDataFrame, @@ -53,7 +54,6 @@ import pandas.core.algorithms as algos from pandas.core.arrays import DatetimeArray from pandas.core.arrays._mixins import NDArrayBackedExtensionArray -from pandas.core.arrays.sparse import SparseDtype import pandas.core.common as com from pandas.core.construction import ( ensure_wrapped_if_datetimelike, diff --git a/pandas/core/sparse/api.py b/pandas/core/sparse/api.py index 2a324ebf77d9d..6650a5c4e90a0 100644 --- a/pandas/core/sparse/api.py +++ b/pandas/core/sparse/api.py @@ -1,6 +1,5 @@ -from pandas.core.arrays.sparse import ( - SparseArray, - SparseDtype, -) +from pandas.core.dtypes.dtypes import SparseDtype + +from pandas.core.arrays.sparse import SparseArray __all__ = ["SparseArray", "SparseDtype"] diff --git a/pandas/tests/arrays/sparse/test_accessor.py b/pandas/tests/arrays/sparse/test_accessor.py index ff5fc63318c38..e782b148803d7 100644 --- a/pandas/tests/arrays/sparse/test_accessor.py +++ b/pandas/tests/arrays/sparse/test_accessor.py @@ -6,11 +6,9 @@ import pandas.util._test_decorators as td import pandas as pd +from pandas import SparseDtype import pandas._testing as tm -from pandas.core.arrays.sparse import ( - SparseArray, - SparseDtype, -) +from pandas.core.arrays.sparse import SparseArray class TestSeriesAccessor: diff --git a/pandas/tests/arrays/sparse/test_arithmetics.py 
b/pandas/tests/arrays/sparse/test_arithmetics.py index a2b8c071b9d3c..6c11979506b58 100644 --- a/pandas/tests/arrays/sparse/test_arithmetics.py +++ b/pandas/tests/arrays/sparse/test_arithmetics.py @@ -4,11 +4,9 @@ import pytest import pandas as pd +from pandas import SparseDtype import pandas._testing as tm -from pandas.core.arrays.sparse import ( - SparseArray, - SparseDtype, -) +from pandas.core.arrays.sparse import SparseArray @pytest.fixture(params=["integer", "block"]) diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index 4a0795137f80b..d13a388df9d74 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -6,12 +6,12 @@ from pandas._libs.sparse import IntIndex import pandas as pd -from pandas import isna -import pandas._testing as tm -from pandas.core.arrays.sparse import ( - SparseArray, +from pandas import ( SparseDtype, + isna, ) +import pandas._testing as tm +from pandas.core.arrays.sparse import SparseArray @pytest.fixture diff --git a/pandas/tests/arrays/sparse/test_astype.py b/pandas/tests/arrays/sparse/test_astype.py index 86d69610059b3..143a4dba7e09e 100644 --- a/pandas/tests/arrays/sparse/test_astype.py +++ b/pandas/tests/arrays/sparse/test_astype.py @@ -6,13 +6,11 @@ from pandas import ( DataFrame, Series, + SparseDtype, Timestamp, ) import pandas._testing as tm -from pandas.core.arrays.sparse import ( - SparseArray, - SparseDtype, -) +from pandas.core.arrays.sparse import SparseArray class TestAstype: diff --git a/pandas/tests/arrays/sparse/test_constructors.py b/pandas/tests/arrays/sparse/test_constructors.py index 29438b692fa72..efe60fe3c7a62 100644 --- a/pandas/tests/arrays/sparse/test_constructors.py +++ b/pandas/tests/arrays/sparse/test_constructors.py @@ -5,12 +5,12 @@ import pandas.util._test_decorators as td import pandas as pd -from pandas import isna -import pandas._testing as tm -from pandas.core.arrays.sparse import ( - SparseArray, +from 
pandas import ( SparseDtype, + isna, ) +import pandas._testing as tm +from pandas.core.arrays.sparse import SparseArray class TestConstructors: diff --git a/pandas/tests/arrays/sparse/test_dtype.py b/pandas/tests/arrays/sparse/test_dtype.py index 58fedbd3e4231..24f70af0cad09 100644 --- a/pandas/tests/arrays/sparse/test_dtype.py +++ b/pandas/tests/arrays/sparse/test_dtype.py @@ -4,7 +4,7 @@ import pytest import pandas as pd -from pandas.core.arrays.sparse import SparseDtype +from pandas import SparseDtype @pytest.mark.parametrize( diff --git a/pandas/tests/arrays/sparse/test_indexing.py b/pandas/tests/arrays/sparse/test_indexing.py index 5acb2167915d2..f639e9b18596c 100644 --- a/pandas/tests/arrays/sparse/test_indexing.py +++ b/pandas/tests/arrays/sparse/test_indexing.py @@ -2,11 +2,9 @@ import pytest import pandas as pd +from pandas import SparseDtype import pandas._testing as tm -from pandas.core.arrays.sparse import ( - SparseArray, - SparseDtype, -) +from pandas.core.arrays.sparse import SparseArray arr_data = np.array([np.nan, np.nan, 1, 2, 3, np.nan, 4, 5, np.nan, 6]) arr = SparseArray(arr_data) diff --git a/pandas/tests/arrays/sparse/test_reductions.py b/pandas/tests/arrays/sparse/test_reductions.py index 5d6d65dde69ad..f44423d5e635c 100644 --- a/pandas/tests/arrays/sparse/test_reductions.py +++ b/pandas/tests/arrays/sparse/test_reductions.py @@ -3,13 +3,11 @@ from pandas import ( NaT, + SparseDtype, Timestamp, isna, ) -from pandas.core.arrays.sparse import ( - SparseArray, - SparseDtype, -) +from pandas.core.arrays.sparse import SparseArray class TestReductions: diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 1cc10d14b904f..155c61508b706 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -33,13 +33,11 @@ DatetimeIndex, IntervalIndex, Series, + SparseDtype, date_range, ) import pandas._testing as tm -from pandas.core.arrays.sparse import ( - SparseArray, - SparseDtype, -) 
+from pandas.core.arrays.sparse import SparseArray class Base: diff --git a/pandas/tests/reshape/test_get_dummies.py b/pandas/tests/reshape/test_get_dummies.py index fab9b0a5d1846..e396076435cd3 100644 --- a/pandas/tests/reshape/test_get_dummies.py +++ b/pandas/tests/reshape/test_get_dummies.py @@ -13,13 +13,11 @@ DataFrame, RangeIndex, Series, + SparseDtype, get_dummies, ) import pandas._testing as tm -from pandas.core.arrays.sparse import ( - SparseArray, - SparseDtype, -) +from pandas.core.arrays.sparse import SparseArray class TestGetDummies: From 96a906eff159fac04b01a51fecc00c14749402cc Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 5 May 2023 15:53:19 -0700 Subject: [PATCH 2/2] REF: move ArrowDtype to dtypes.dtypes --- pandas/core/api.py | 2 +- pandas/core/arrays/arrow/__init__.py | 3 +- pandas/core/arrays/arrow/array.py | 3 +- pandas/core/arrays/arrow/dtype.py | 323 --------------------------- pandas/core/dtypes/cast.py | 2 +- pandas/core/dtypes/dtypes.py | 314 +++++++++++++++++++++++++- pandas/core/frame.py | 6 +- pandas/core/indexes/accessors.py | 2 +- pandas/core/interchange/column.py | 3 +- pandas/core/interchange/utils.py | 7 +- pandas/core/methods/describe.py | 2 +- pandas/core/strings/accessor.py | 6 +- pandas/core/tools/numeric.py | 2 +- pandas/tests/extension/test_arrow.py | 7 +- pandas/util/__init__.py | 36 ++- 15 files changed, 355 insertions(+), 363 deletions(-) delete mode 100644 pandas/core/arrays/arrow/dtype.py diff --git a/pandas/core/api.py b/pandas/core/api.py index c0b828d9330b4..2cfe5ffc0170d 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -7,6 +7,7 @@ from pandas._libs.missing import NA from pandas.core.dtypes.dtypes import ( + ArrowDtype, CategoricalDtype, DatetimeTZDtype, IntervalDtype, @@ -25,7 +26,6 @@ value_counts, ) from pandas.core.arrays import Categorical -from pandas.core.arrays.arrow import ArrowDtype from pandas.core.arrays.boolean import BooleanDtype from pandas.core.arrays.floating import ( Float32Dtype, diff 
--git a/pandas/core/arrays/arrow/__init__.py b/pandas/core/arrays/arrow/__init__.py index e7fa6fae0a5a1..58b268cbdd221 100644 --- a/pandas/core/arrays/arrow/__init__.py +++ b/pandas/core/arrays/arrow/__init__.py @@ -1,4 +1,3 @@ from pandas.core.arrays.arrow.array import ArrowExtensionArray -from pandas.core.arrays.arrow.dtype import ArrowDtype -__all__ = ["ArrowDtype", "ArrowExtensionArray"] +__all__ = ["ArrowExtensionArray"] diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 4d62f680811ae..4b77d17c643a1 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -60,8 +60,9 @@ import pyarrow as pa import pyarrow.compute as pc + from pandas.core.dtypes.dtypes import ArrowDtype + from pandas.core.arrays.arrow._arrow_utils import fallback_performancewarning - from pandas.core.arrays.arrow.dtype import ArrowDtype ARROW_CMP_FUNCS = { "eq": pc.equal, diff --git a/pandas/core/arrays/arrow/dtype.py b/pandas/core/arrays/arrow/dtype.py deleted file mode 100644 index c416fbd03417a..0000000000000 --- a/pandas/core/arrays/arrow/dtype.py +++ /dev/null @@ -1,323 +0,0 @@ -from __future__ import annotations - -from datetime import ( - date, - datetime, - time, - timedelta, -) -from decimal import Decimal -import re -from typing import TYPE_CHECKING - -import numpy as np - -from pandas._libs.tslibs import ( - Timedelta, - Timestamp, -) -from pandas.compat import pa_version_under7p0 -from pandas.util._decorators import cache_readonly - -from pandas.core.dtypes.base import ( - StorageExtensionDtype, - register_extension_dtype, -) -from pandas.core.dtypes.dtypes import CategoricalDtypeType - -if not pa_version_under7p0: - import pyarrow as pa - -if TYPE_CHECKING: - from pandas._typing import ( - DtypeObj, - type_t, - ) - - from pandas.core.arrays.arrow import ArrowExtensionArray - - -@register_extension_dtype -class ArrowDtype(StorageExtensionDtype): - """ - An ExtensionDtype for PyArrow data types. - - .. 
warning:: - - ArrowDtype is considered experimental. The implementation and - parts of the API may change without warning. - - While most ``dtype`` arguments can accept the "string" - constructor, e.g. ``"int64[pyarrow]"``, ArrowDtype is useful - if the data type contains parameters like ``pyarrow.timestamp``. - - Parameters - ---------- - pyarrow_dtype : pa.DataType - An instance of a `pyarrow.DataType <https://arrow.apache.org/docs/python/api/datatypes.html#factory-functions>`__. - - Attributes - ---------- - pyarrow_dtype - - Methods - ------- - None - - Returns - ------- - ArrowDtype - - Examples - -------- - >>> import pyarrow as pa - >>> pd.ArrowDtype(pa.int64()) - int64[pyarrow] - - Types with parameters must be constructed with ArrowDtype. - - >>> pd.ArrowDtype(pa.timestamp("s", tz="America/New_York")) - timestamp[s, tz=America/New_York][pyarrow] - >>> pd.ArrowDtype(pa.list_(pa.int64())) - list<item: int64>[pyarrow] - """ # noqa: E501 - - _metadata = ("storage", "pyarrow_dtype") # type: ignore[assignment] - - def __init__(self, pyarrow_dtype: pa.DataType) -> None: - super().__init__("pyarrow") - if pa_version_under7p0: - raise ImportError("pyarrow>=7.0.0 is required for ArrowDtype") - if not isinstance(pyarrow_dtype, pa.DataType): - raise ValueError( - f"pyarrow_dtype ({pyarrow_dtype}) must be an instance " - f"of a pyarrow.DataType. Got {type(pyarrow_dtype)} instead." - ) - self.pyarrow_dtype = pyarrow_dtype - - def __repr__(self) -> str: - return self.name - - @property - def type(self): - """ - Returns associated scalar type. 
- """ - pa_type = self.pyarrow_dtype - if pa.types.is_integer(pa_type): - return int - elif pa.types.is_floating(pa_type): - return float - elif pa.types.is_string(pa_type) or pa.types.is_large_string(pa_type): - return str - elif ( - pa.types.is_binary(pa_type) - or pa.types.is_fixed_size_binary(pa_type) - or pa.types.is_large_binary(pa_type) - ): - return bytes - elif pa.types.is_boolean(pa_type): - return bool - elif pa.types.is_duration(pa_type): - if pa_type.unit == "ns": - return Timedelta - else: - return timedelta - elif pa.types.is_timestamp(pa_type): - if pa_type.unit == "ns": - return Timestamp - else: - return datetime - elif pa.types.is_date(pa_type): - return date - elif pa.types.is_time(pa_type): - return time - elif pa.types.is_decimal(pa_type): - return Decimal - elif pa.types.is_dictionary(pa_type): - # TODO: Potentially change this & CategoricalDtype.type to - # something more representative of the scalar - return CategoricalDtypeType - elif pa.types.is_list(pa_type) or pa.types.is_large_list(pa_type): - return list - elif pa.types.is_map(pa_type): - return dict - elif pa.types.is_null(pa_type): - # TODO: None? pd.NA? pa.null? - return type(pa_type) - else: - raise NotImplementedError(pa_type) - - @property - def name(self) -> str: # type: ignore[override] - """ - A string identifying the data type. - """ - return f"{str(self.pyarrow_dtype)}[{self.storage}]" - - @cache_readonly - def numpy_dtype(self) -> np.dtype: - """Return an instance of the related numpy dtype""" - if pa.types.is_timestamp(self.pyarrow_dtype): - # pa.timestamp(unit).to_pandas_dtype() returns ns units - # regardless of the pyarrow timestamp units. 
- # This can be removed if/when pyarrow addresses it: - # https://github.com/apache/arrow/issues/34462 - return np.dtype(f"datetime64[{self.pyarrow_dtype.unit}]") - if pa.types.is_duration(self.pyarrow_dtype): - # pa.duration(unit).to_pandas_dtype() returns ns units - # regardless of the pyarrow duration units - # This can be removed if/when pyarrow addresses it: - # https://github.com/apache/arrow/issues/34462 - return np.dtype(f"timedelta64[{self.pyarrow_dtype.unit}]") - if pa.types.is_string(self.pyarrow_dtype): - # pa.string().to_pandas_dtype() = object which we don't want - return np.dtype(str) - try: - return np.dtype(self.pyarrow_dtype.to_pandas_dtype()) - except (NotImplementedError, TypeError): - return np.dtype(object) - - @cache_readonly - def kind(self) -> str: - if pa.types.is_timestamp(self.pyarrow_dtype): - # To mirror DatetimeTZDtype - return "M" - return self.numpy_dtype.kind - - @cache_readonly - def itemsize(self) -> int: - """Return the number of bytes in this dtype""" - return self.numpy_dtype.itemsize - - @classmethod - def construct_array_type(cls) -> type_t[ArrowExtensionArray]: - """ - Return the array type associated with this dtype. - - Returns - ------- - type - """ - from pandas.core.arrays.arrow import ArrowExtensionArray - - return ArrowExtensionArray - - @classmethod - def construct_from_string(cls, string: str) -> ArrowDtype: - """ - Construct this type from a string. - - Parameters - ---------- - string : str - string should follow the format f"{pyarrow_type}[pyarrow]" - e.g. 
int64[pyarrow] - """ - if not isinstance(string, str): - raise TypeError( - f"'construct_from_string' expects a string, got {type(string)}" - ) - if not string.endswith("[pyarrow]"): - raise TypeError(f"'{string}' must end with '[pyarrow]'") - if string == "string[pyarrow]": - # Ensure Registry.find skips ArrowDtype to use StringDtype instead - raise TypeError("string[pyarrow] should be constructed by StringDtype") - - base_type = string[:-9] # get rid of "[pyarrow]" - try: - pa_dtype = pa.type_for_alias(base_type) - except ValueError as err: - has_parameters = re.search(r"[\[\(].*[\]\)]", base_type) - if has_parameters: - # Fallback to try common temporal types - try: - return cls._parse_temporal_dtype_string(base_type) - except (NotImplementedError, ValueError): - # Fall through to raise with nice exception message below - pass - - raise NotImplementedError( - "Passing pyarrow type specific parameters " - f"({has_parameters.group()}) in the string is not supported. " - "Please construct an ArrowDtype object with a pyarrow_dtype " - "instance with specific parameters." - ) from err - raise TypeError(f"'{base_type}' is not a valid pyarrow data type.") from err - return cls(pa_dtype) - - # TODO(arrow#33642): This can be removed once supported by pyarrow - @classmethod - def _parse_temporal_dtype_string(cls, string: str) -> ArrowDtype: - """ - Construct a temporal ArrowDtype from string. - """ - # we assume - # 1) "[pyarrow]" has already been stripped from the end of our string. 
- # 2) we know "[" is present - head, tail = string.split("[", 1) - - if not tail.endswith("]"): - raise ValueError - tail = tail[:-1] - - if head == "timestamp": - assert "," in tail # otherwise type_for_alias should work - unit, tz = tail.split(",", 1) - unit = unit.strip() - tz = tz.strip() - if tz.startswith("tz="): - tz = tz[3:] - - pa_type = pa.timestamp(unit, tz=tz) - dtype = cls(pa_type) - return dtype - - raise NotImplementedError(string) - - @property - def _is_numeric(self) -> bool: - """ - Whether columns with this dtype should be considered numeric. - """ - # TODO: pa.types.is_boolean? - return ( - pa.types.is_integer(self.pyarrow_dtype) - or pa.types.is_floating(self.pyarrow_dtype) - or pa.types.is_decimal(self.pyarrow_dtype) - ) - - @property - def _is_boolean(self) -> bool: - """ - Whether this dtype should be considered boolean. - """ - return pa.types.is_boolean(self.pyarrow_dtype) - - def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: - # We unwrap any masked dtypes, find the common dtype we would use - # for that, then re-mask the result. - # Mirrors BaseMaskedDtype - from pandas.core.dtypes.cast import find_common_type - - new_dtype = find_common_type( - [ - dtype.numpy_dtype if isinstance(dtype, ArrowDtype) else dtype - for dtype in dtypes - ] - ) - if not isinstance(new_dtype, np.dtype): - return None - try: - pa_dtype = pa.from_numpy_dtype(new_dtype) - return type(self)(pa_dtype) - except NotImplementedError: - return None - - def __from_arrow__(self, array: pa.Array | pa.ChunkedArray): - """ - Construct IntegerArray/FloatingArray from pyarrow Array/ChunkedArray. 
- """ - array_class = self.construct_array_type() - arr = array.cast(self.pyarrow_dtype, safe=True) - return array_class(arr) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 6dabb866b8f5c..4f79de25fb576 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -58,6 +58,7 @@ pandas_dtype as pandas_dtype_func, ) from pandas.core.dtypes.dtypes import ( + ArrowDtype, BaseMaskedDtype, CategoricalDtype, DatetimeTZDtype, @@ -1070,7 +1071,6 @@ def convert_dtypes( if dtype_backend == "pyarrow": from pandas.core.arrays.arrow.array import to_pyarrow_type - from pandas.core.arrays.arrow.dtype import ArrowDtype from pandas.core.arrays.string_ import StringDtype assert not isinstance(inferred_dtype, str) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index a2ce29d848bee..f86d880b25668 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -3,6 +3,13 @@ """ from __future__ import annotations +from datetime import ( + date, + datetime, + time, + timedelta, +) +from decimal import Decimal import re from typing import ( TYPE_CHECKING, @@ -26,6 +33,7 @@ NaT, NaTType, Period, + Timedelta, Timestamp, timezones, to_offset, @@ -35,11 +43,13 @@ PeriodDtypeBase, abbrev_to_npy_unit, ) +from pandas.compat import pa_version_under7p0 from pandas.errors import PerformanceWarning from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.base import ( ExtensionDtype, + StorageExtensionDtype, register_extension_dtype, ) from pandas.core.dtypes.generic import ( @@ -51,10 +61,13 @@ is_list_like, ) +if not pa_version_under7p0: + import pyarrow as pa + if TYPE_CHECKING: from datetime import tzinfo - import pyarrow + import pyarrow as pa # noqa: F811, TCH004 from pandas._typing import ( Dtype, @@ -77,6 +90,7 @@ PeriodArray, SparseArray, ) + from pandas.core.arrays.arrow import ArrowExtensionArray str_type = str @@ -826,9 +840,7 @@ def __eq__(self, other: Any) -> bool: and 
tz_compare(self.tz, other.tz) ) - def __from_arrow__( - self, array: pyarrow.Array | pyarrow.ChunkedArray - ) -> DatetimeArray: + def __from_arrow__(self, array: pa.Array | pa.ChunkedArray) -> DatetimeArray: """ Construct DatetimeArray from pyarrow Array/ChunkedArray. @@ -1033,9 +1045,7 @@ def construct_array_type(cls) -> type_t[PeriodArray]: return PeriodArray - def __from_arrow__( - self, array: pyarrow.Array | pyarrow.ChunkedArray - ) -> PeriodArray: + def __from_arrow__(self, array: pa.Array | pa.ChunkedArray) -> PeriodArray: """ Construct PeriodArray from pyarrow Array/ChunkedArray. """ @@ -1283,9 +1293,7 @@ def is_dtype(cls, dtype: object) -> bool: return False return super().is_dtype(dtype) - def __from_arrow__( - self, array: pyarrow.Array | pyarrow.ChunkedArray - ) -> IntervalArray: + def __from_arrow__(self, array: pa.Array | pa.ChunkedArray) -> IntervalArray: """ Construct IntervalArray from pyarrow Array/ChunkedArray. """ @@ -1903,3 +1911,289 @@ def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: np_dtypes = [x.subtype if isinstance(x, SparseDtype) else x for x in dtypes] return SparseDtype(np.find_common_type(np_dtypes, []), fill_value=fill_value) + + +@register_extension_dtype +class ArrowDtype(StorageExtensionDtype): + """ + An ExtensionDtype for PyArrow data types. + + .. warning:: + + ArrowDtype is considered experimental. The implementation and + parts of the API may change without warning. + + While most ``dtype`` arguments can accept the "string" + constructor, e.g. ``"int64[pyarrow]"``, ArrowDtype is useful + if the data type contains parameters like ``pyarrow.timestamp``. + + Parameters + ---------- + pyarrow_dtype : pa.DataType + An instance of a `pyarrow.DataType <https://arrow.apache.org/docs/python/api/datatypes.html#factory-functions>`__. 
+ + Attributes + ---------- + pyarrow_dtype + + Methods + ------- + None + + Returns + ------- + ArrowDtype + + Examples + -------- + >>> import pyarrow as pa + >>> pd.ArrowDtype(pa.int64()) + int64[pyarrow] + + Types with parameters must be constructed with ArrowDtype. + + >>> pd.ArrowDtype(pa.timestamp("s", tz="America/New_York")) + timestamp[s, tz=America/New_York][pyarrow] + >>> pd.ArrowDtype(pa.list_(pa.int64())) + list<item: int64>[pyarrow] + """ # noqa: E501 + + _metadata = ("storage", "pyarrow_dtype") # type: ignore[assignment] + + def __init__(self, pyarrow_dtype: pa.DataType) -> None: + super().__init__("pyarrow") + if pa_version_under7p0: + raise ImportError("pyarrow>=7.0.0 is required for ArrowDtype") + if not isinstance(pyarrow_dtype, pa.DataType): + raise ValueError( + f"pyarrow_dtype ({pyarrow_dtype}) must be an instance " + f"of a pyarrow.DataType. Got {type(pyarrow_dtype)} instead." + ) + self.pyarrow_dtype = pyarrow_dtype + + def __repr__(self) -> str: + return self.name + + @property + def type(self): + """ + Returns associated scalar type. 
+ """ + pa_type = self.pyarrow_dtype + if pa.types.is_integer(pa_type): + return int + elif pa.types.is_floating(pa_type): + return float + elif pa.types.is_string(pa_type) or pa.types.is_large_string(pa_type): + return str + elif ( + pa.types.is_binary(pa_type) + or pa.types.is_fixed_size_binary(pa_type) + or pa.types.is_large_binary(pa_type) + ): + return bytes + elif pa.types.is_boolean(pa_type): + return bool + elif pa.types.is_duration(pa_type): + if pa_type.unit == "ns": + return Timedelta + else: + return timedelta + elif pa.types.is_timestamp(pa_type): + if pa_type.unit == "ns": + return Timestamp + else: + return datetime + elif pa.types.is_date(pa_type): + return date + elif pa.types.is_time(pa_type): + return time + elif pa.types.is_decimal(pa_type): + return Decimal + elif pa.types.is_dictionary(pa_type): + # TODO: Potentially change this & CategoricalDtype.type to + # something more representative of the scalar + return CategoricalDtypeType + elif pa.types.is_list(pa_type) or pa.types.is_large_list(pa_type): + return list + elif pa.types.is_map(pa_type): + return dict + elif pa.types.is_null(pa_type): + # TODO: None? pd.NA? pa.null? + return type(pa_type) + else: + raise NotImplementedError(pa_type) + + @property + def name(self) -> str: # type: ignore[override] + """ + A string identifying the data type. + """ + return f"{str(self.pyarrow_dtype)}[{self.storage}]" + + @cache_readonly + def numpy_dtype(self) -> np.dtype: + """Return an instance of the related numpy dtype""" + if pa.types.is_timestamp(self.pyarrow_dtype): + # pa.timestamp(unit).to_pandas_dtype() returns ns units + # regardless of the pyarrow timestamp units. 
+ # This can be removed if/when pyarrow addresses it: + # https://github.com/apache/arrow/issues/34462 + return np.dtype(f"datetime64[{self.pyarrow_dtype.unit}]") + if pa.types.is_duration(self.pyarrow_dtype): + # pa.duration(unit).to_pandas_dtype() returns ns units + # regardless of the pyarrow duration units + # This can be removed if/when pyarrow addresses it: + # https://github.com/apache/arrow/issues/34462 + return np.dtype(f"timedelta64[{self.pyarrow_dtype.unit}]") + if pa.types.is_string(self.pyarrow_dtype): + # pa.string().to_pandas_dtype() = object which we don't want + return np.dtype(str) + try: + return np.dtype(self.pyarrow_dtype.to_pandas_dtype()) + except (NotImplementedError, TypeError): + return np.dtype(object) + + @cache_readonly + def kind(self) -> str: + if pa.types.is_timestamp(self.pyarrow_dtype): + # To mirror DatetimeTZDtype + return "M" + return self.numpy_dtype.kind + + @cache_readonly + def itemsize(self) -> int: + """Return the number of bytes in this dtype""" + return self.numpy_dtype.itemsize + + @classmethod + def construct_array_type(cls) -> type_t[ArrowExtensionArray]: + """ + Return the array type associated with this dtype. + + Returns + ------- + type + """ + from pandas.core.arrays.arrow import ArrowExtensionArray + + return ArrowExtensionArray + + @classmethod + def construct_from_string(cls, string: str) -> ArrowDtype: + """ + Construct this type from a string. + + Parameters + ---------- + string : str + string should follow the format f"{pyarrow_type}[pyarrow]" + e.g. 
int64[pyarrow] + """ + if not isinstance(string, str): + raise TypeError( + f"'construct_from_string' expects a string, got {type(string)}" + ) + if not string.endswith("[pyarrow]"): + raise TypeError(f"'{string}' must end with '[pyarrow]'") + if string == "string[pyarrow]": + # Ensure Registry.find skips ArrowDtype to use StringDtype instead + raise TypeError("string[pyarrow] should be constructed by StringDtype") + + base_type = string[:-9] # get rid of "[pyarrow]" + try: + pa_dtype = pa.type_for_alias(base_type) + except ValueError as err: + has_parameters = re.search(r"[\[\(].*[\]\)]", base_type) + if has_parameters: + # Fallback to try common temporal types + try: + return cls._parse_temporal_dtype_string(base_type) + except (NotImplementedError, ValueError): + # Fall through to raise with nice exception message below + pass + + raise NotImplementedError( + "Passing pyarrow type specific parameters " + f"({has_parameters.group()}) in the string is not supported. " + "Please construct an ArrowDtype object with a pyarrow_dtype " + "instance with specific parameters." + ) from err + raise TypeError(f"'{base_type}' is not a valid pyarrow data type.") from err + return cls(pa_dtype) + + # TODO(arrow#33642): This can be removed once supported by pyarrow + @classmethod + def _parse_temporal_dtype_string(cls, string: str) -> ArrowDtype: + """ + Construct a temporal ArrowDtype from string. + """ + # we assume + # 1) "[pyarrow]" has already been stripped from the end of our string. 
+ # 2) we know "[" is present + head, tail = string.split("[", 1) + + if not tail.endswith("]"): + raise ValueError + tail = tail[:-1] + + if head == "timestamp": + assert "," in tail # otherwise type_for_alias should work + unit, tz = tail.split(",", 1) + unit = unit.strip() + tz = tz.strip() + if tz.startswith("tz="): + tz = tz[3:] + + pa_type = pa.timestamp(unit, tz=tz) + dtype = cls(pa_type) + return dtype + + raise NotImplementedError(string) + + @property + def _is_numeric(self) -> bool: + """ + Whether columns with this dtype should be considered numeric. + """ + # TODO: pa.types.is_boolean? + return ( + pa.types.is_integer(self.pyarrow_dtype) + or pa.types.is_floating(self.pyarrow_dtype) + or pa.types.is_decimal(self.pyarrow_dtype) + ) + + @property + def _is_boolean(self) -> bool: + """ + Whether this dtype should be considered boolean. + """ + return pa.types.is_boolean(self.pyarrow_dtype) + + def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: + # We unwrap any masked dtypes, find the common dtype we would use + # for that, then re-mask the result. + # Mirrors BaseMaskedDtype + from pandas.core.dtypes.cast import find_common_type + + new_dtype = find_common_type( + [ + dtype.numpy_dtype if isinstance(dtype, ArrowDtype) else dtype + for dtype in dtypes + ] + ) + if not isinstance(new_dtype, np.dtype): + return None + try: + pa_dtype = pa.from_numpy_dtype(new_dtype) + return type(self)(pa_dtype) + except NotImplementedError: + return None + + def __from_arrow__(self, array: pa.Array | pa.ChunkedArray): + """ + Construct IntegerArray/FloatingArray from pyarrow Array/ChunkedArray. 
+ """ + array_class = self.construct_array_type() + arr = array.cast(self.pyarrow_dtype, safe=True) + return array_class(arr) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 674746e00c84b..0710617ad5f2b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -105,7 +105,10 @@ needs_i8_conversion, pandas_dtype, ) -from pandas.core.dtypes.dtypes import ExtensionDtype +from pandas.core.dtypes.dtypes import ( + ArrowDtype, + ExtensionDtype, +) from pandas.core.dtypes.missing import ( isna, notna, @@ -131,7 +134,6 @@ PeriodArray, TimedeltaArray, ) -from pandas.core.arrays.arrow import ArrowDtype from pandas.core.arrays.sparse import SparseFrameAccessor from pandas.core.construction import ( ensure_wrapped_if_datetimelike, diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index 3ddc8aaf02d97..c6da7d847c363 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -19,6 +19,7 @@ is_list_like, ) from pandas.core.dtypes.dtypes import ( + ArrowDtype, CategoricalDtype, DatetimeTZDtype, PeriodDtype, @@ -35,7 +36,6 @@ TimedeltaArray, ) from pandas.core.arrays.arrow.array import ArrowExtensionArray -from pandas.core.arrays.arrow.dtype import ArrowDtype from pandas.core.base import ( NoNewAttributesMixin, PandasObject, diff --git a/pandas/core/interchange/column.py b/pandas/core/interchange/column.py index fea96d861f12c..ff4ff487e23ea 100644 --- a/pandas/core/interchange/column.py +++ b/pandas/core/interchange/column.py @@ -9,9 +9,10 @@ from pandas.errors import NoBufferPresent from pandas.util._decorators import cache_readonly +from pandas.core.dtypes.dtypes import ArrowDtype + import pandas as pd from pandas.api.types import is_string_dtype -from pandas.core.arrays.arrow.dtype import ArrowDtype from pandas.core.interchange.buffer import PandasBuffer from pandas.core.interchange.dataframe_protocol import ( Column, diff --git a/pandas/core/interchange/utils.py b/pandas/core/interchange/utils.py 
index e92899583176f..6b690ee031471 100644 --- a/pandas/core/interchange/utils.py +++ b/pandas/core/interchange/utils.py @@ -11,9 +11,10 @@ from pandas._libs import lib -from pandas.core.dtypes.dtypes import CategoricalDtype - -from pandas.core.arrays.arrow.dtype import ArrowDtype +from pandas.core.dtypes.dtypes import ( + ArrowDtype, + CategoricalDtype, +) if typing.TYPE_CHECKING: from pandas._typing import DtypeObj diff --git a/pandas/core/methods/describe.py b/pandas/core/methods/describe.py index c8f8a2127083e..bc64707ab238f 100644 --- a/pandas/core/methods/describe.py +++ b/pandas/core/methods/describe.py @@ -32,11 +32,11 @@ is_numeric_dtype, ) from pandas.core.dtypes.dtypes import ( + ArrowDtype, DatetimeTZDtype, ExtensionDtype, ) -from pandas.core.arrays.arrow.dtype import ArrowDtype from pandas.core.arrays.floating import Float64Dtype from pandas.core.reshape.concat import concat diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 9ffb0444f1516..c8430b832f782 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -32,7 +32,10 @@ is_object_dtype, is_re, ) -from pandas.core.dtypes.dtypes import CategoricalDtype +from pandas.core.dtypes.dtypes import ( + ArrowDtype, + CategoricalDtype, +) from pandas.core.dtypes.generic import ( ABCDataFrame, ABCIndex, @@ -41,7 +44,6 @@ ) from pandas.core.dtypes.missing import isna -from pandas.core.arrays.arrow.dtype import ArrowDtype from pandas.core.base import NoNewAttributesMixin from pandas.core.construction import extract_array diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index b9a5c431d8387..32620d02c64e8 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -22,13 +22,13 @@ is_string_dtype, needs_i8_conversion, ) +from pandas.core.dtypes.dtypes import ArrowDtype from pandas.core.dtypes.generic import ( ABCIndex, ABCSeries, ) from pandas.core.arrays import BaseMaskedArray -from pandas.core.arrays.arrow 
import ArrowDtype from pandas.core.arrays.string_ import StringDtype if TYPE_CHECKING: diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 5078a4e8078f8..f18ff45c011e2 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -40,7 +40,10 @@ ) from pandas.errors import PerformanceWarning -from pandas.core.dtypes.dtypes import CategoricalDtypeType +from pandas.core.dtypes.dtypes import ( + ArrowDtype, + CategoricalDtypeType, +) import pandas as pd import pandas._testing as tm @@ -59,8 +62,6 @@ from pandas.core.arrays.arrow.array import ArrowExtensionArray -from pandas.core.arrays.arrow.dtype import ArrowDtype # isort:skip - @pytest.fixture(params=tm.ALL_PYARROW_DTYPES, ids=str) def dtype(request): diff --git a/pandas/util/__init__.py b/pandas/util/__init__.py index aa31c024fe338..8fe928ed6c5cf 100644 --- a/pandas/util/__init__.py +++ b/pandas/util/__init__.py @@ -1,11 +1,25 @@ -# pyright: reportUnusedImport = false -from pandas.util._decorators import ( # noqa:F401 - Appender, - Substitution, - cache_readonly, -) - -from pandas.core.util.hashing import ( # noqa:F401 - hash_array, - hash_pandas_object, -) +def __getattr__(key: str): + # These imports need to be lazy to avoid circular import errors + if key == "hash_array": + from pandas.core.util.hashing import hash_array + + return hash_array + if key == "hash_pandas_object": + from pandas.core.util.hashing import hash_pandas_object + + return hash_pandas_object + if key == "Appender": + from pandas.util._decorators import Appender + + return Appender + if key == "Substitution": + from pandas.util._decorators import Substitution + + return Substitution + + if key == "cache_readonly": + from pandas.util._decorators import cache_readonly + + return cache_readonly + + raise AttributeError(f"module 'pandas.util' has no attribute '{key}'")