From 85ea4faf301016c16b7ce336588d0950d803ba86 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 19 Apr 2020 13:56:23 -0700 Subject: [PATCH 1/3] REF: Implement NDArrayBackedExtensionArray --- pandas/core/arrays/_mixins.py | 62 ++++++++++++ pandas/core/arrays/categorical.py | 96 ++++--------------- pandas/core/arrays/datetimelike.py | 31 +++--- pandas/tests/arrays/categorical/test_algos.py | 2 +- pandas/tests/frame/test_reshape.py | 4 +- 5 files changed, 100 insertions(+), 95 deletions(-) create mode 100644 pandas/core/arrays/_mixins.py diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py new file mode 100644 index 0000000000000..0ed9de804c55e --- /dev/null +++ b/pandas/core/arrays/_mixins.py @@ -0,0 +1,62 @@ +from typing import Any, Sequence, TypeVar + +import numpy as np + +from pandas.errors import AbstractMethodError + +from pandas.core.algorithms import take +from pandas.core.arrays.base import ExtensionArray + +_T = TypeVar("_T", bound="NDArrayBackedExtensionArray") + + +class NDArrayBackedExtensionArray(ExtensionArray): + """ + ExtensionArray that is backed by a single NumPy ndarray. + """ + + _ndarray: np.ndarray + + def _from_backing_data(self: _T, arr: np.ndarray) -> _T: + """ + Construct a new ExtensionArray `new_array` with `arr` as its _ndarray. + + This should round-trip: + self == self._from_backing_data(self._ndarray) + """ + raise AbstractMethodError(self) + + # ------------------------------------------------------------------------ + + def take( + self: _T, + indices: Sequence[int], + allow_fill: bool = False, + fill_value: Any = None, + ) -> _T: + if allow_fill: + fill_value = self._validate_fill_value(fill_value) + + new_data = take( + self._ndarray, indices, allow_fill=allow_fill, fill_value=fill_value, + ) + return self._from_backing_data(new_data) + + def _validate_fill_value(self, fill_value): + """ + If a fill_value is passed to `take` convert it to a representation + suitable for self._ndarray, raising ValueError if this is not possible. + + Parameters + ---------- + fill_value : object + + Returns + ------- + fill_value : native representation + + Raises + ------ + ValueError + """ + raise AbstractMethodError(self) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index af07dee3b6838..2b8b71d096737 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -50,8 +50,9 @@ from pandas.core import ops from pandas.core.accessor import PandasDelegate, delegate_names import pandas.core.algorithms as algorithms -from pandas.core.algorithms import _get_data_algo, factorize, take, take_1d, unique1d -from pandas.core.arrays.base import ExtensionArray, _extension_array_shared_docs +from pandas.core.algorithms import _get_data_algo, factorize, take_1d, unique1d +from pandas.core.arrays._mixins import NDArrayBackedExtensionArray +from pandas.core.arrays.base import _extension_array_shared_docs from pandas.core.base import NoNewAttributesMixin, PandasObject, _shared_docs import pandas.core.common as com from pandas.core.construction import array, extract_array, sanitize_array @@ -210,7 +211,7 @@ def contains(cat, key, container): """ -class Categorical(ExtensionArray, PandasObject): +class Categorical(NDArrayBackedExtensionArray, PandasObject): """ Represent a categorical variable in classic R / S-plus fashion. @@ -1762,84 +1763,20 @@ def fillna(self, value=None, method=None, limit=None): return self._constructor(codes, dtype=self.dtype, fastpath=True) - def take(self, indexer, allow_fill: bool = False, fill_value=None): - """ - Take elements from the Categorical. - - Parameters - ---------- - indexer : sequence of int - The indices in `self` to take. The meaning of negative values in - `indexer` depends on the value of `allow_fill`. - allow_fill : bool, default False - How to handle negative values in `indexer`. - - * False: negative values in `indices` indicate positional indices - from the right. This is similar to - :func:`numpy.take`. - - * True: negative values in `indices` indicate missing values - (the default). These values are set to `fill_value`. Any other - other negative values raise a ``ValueError``. - - .. versionchanged:: 1.0.0 - - Default value changed from ``True`` to ``False``. - - fill_value : object - The value to use for `indices` that are missing (-1), when - ``allow_fill=True``. This should be the category, i.e. a value - in ``self.categories``, not a code. - - Returns - ------- - Categorical - This Categorical will have the same categories and ordered as - `self`. - - See Also - -------- - Series.take : Similar method for Series. - numpy.ndarray.take : Similar method for NumPy arrays. - - Examples - -------- - >>> cat = pd.Categorical(['a', 'a', 'b']) - >>> cat - [a, a, b] - Categories (2, object): [a, b] - - Specify ``allow_fill==False`` to have negative indices mean indexing - from the right. - - >>> cat.take([0, -1, -2], allow_fill=False) - [a, b, a] - Categories (2, object): [a, b] - - With ``allow_fill=True``, indices equal to ``-1`` mean "missing" - values that should be filled with the `fill_value`, which is - ``np.nan`` by default. - - >>> cat.take([0, -1, -1], allow_fill=True) - [a, NaN, NaN] - Categories (2, object): [a, b] + # ------------------------------------------------------------------ + # NDArrayBackedExtensionArray compat - The fill value can be specified. - - >>> cat.take([0, -1, -1], allow_fill=True, fill_value='a') - [a, a, a] - Categories (2, object): [a, b] - - Specifying a fill value that's not in ``self.categories`` - will raise a ``TypeError``. - """ - indexer = np.asarray(indexer, dtype=np.intp) + @property + def _ndarray(self) -> np.ndarray: + return self._codes - dtype = self.dtype + def _from_backing_data(self, arr: np.ndarray): + return self._constructor(arr, dtype=self.dtype, fastpath=True) + def _validate_fill_value(self, fill_value): if isna(fill_value): fill_value = -1 - elif allow_fill: + else: # convert user-provided `fill_value` to codes if fill_value in self.categories: fill_value = self.categories.get_loc(fill_value) @@ -1848,11 +1785,10 @@ def take(self, indexer, allow_fill: bool = False, fill_value=None): f"'fill_value' ('{fill_value}') is not in this " "Categorical's categories." ) - raise TypeError(msg) + raise ValueError(msg) + return fill_value - codes = take(self._codes, indexer, allow_fill=allow_fill, fill_value=fill_value) - result = type(self).from_codes(codes, dtype=dtype) - return result + # ------------------------------------------------------------------ def take_nd(self, indexer, allow_fill: bool = False, fill_value=None): # GH#27745 deprecate alias that other EAs dont have diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index e3cdc898a88bf..925417639a734 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -39,8 +39,9 @@ from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna from pandas.core import missing, nanops, ops -from pandas.core.algorithms import checked_add_with_arr, take, unique1d, value_counts +from pandas.core.algorithms import checked_add_with_arr, unique1d, value_counts from pandas.core.array_algos.transforms import shift +from pandas.core.arrays._mixins import NDArrayBackedExtensionArray from pandas.core.arrays.base import ExtensionArray, ExtensionOpsMixin import pandas.core.common as com from pandas.core.construction import array, extract_array @@ -425,7 +426,9 @@ def _with_freq(self, freq): return self -class DatetimeLikeArrayMixin(ExtensionOpsMixin, AttributesMixin, ExtensionArray): +class DatetimeLikeArrayMixin( + ExtensionOpsMixin, AttributesMixin, NDArrayBackedExtensionArray +): """ Shared Base/Mixin class for DatetimeArray, TimedeltaArray, PeriodArray @@ -437,6 +440,20 @@ class DatetimeLikeArrayMixin(ExtensionOpsMixin, AttributesMixin, ExtensionArray) _generate_range """ + # ------------------------------------------------------------------ + # NDArrayBackedExtensionArray compat + + @property + def _ndarray(self) -> np.ndarray: + # NB: A bunch of Interval tests fail if we use ._data + return self.asi8 + + def _from_backing_data(self, arr: np.ndarray): + # Note: we do not retain `freq` + return type(self)(arr, dtype=self.dtype) # type: ignore + + # ------------------------------------------------------------------ + @property def ndim(self) -> int: return self._data.ndim @@ -711,16 +728,6 @@ def _validate_fill_value(self, fill_value): ) return fill_value - def take(self, indices, allow_fill=False, fill_value=None): - if allow_fill: - fill_value = self._validate_fill_value(fill_value) - - new_values = take( - self.asi8, indices, allow_fill=allow_fill, fill_value=fill_value - ) - - return type(self)(new_values, dtype=self.dtype) - @classmethod def _concat_same_type(cls, to_concat, axis: int = 0): diff --git a/pandas/tests/arrays/categorical/test_algos.py b/pandas/tests/arrays/categorical/test_algos.py index 325fa476d70e6..02143a833c2ed 100644 --- a/pandas/tests/arrays/categorical/test_algos.py +++ b/pandas/tests/arrays/categorical/test_algos.py @@ -185,7 +185,7 @@ def test_take_fill_value_new_raises(self): # https://github.com/pandas-dev/pandas/issues/23296 cat = pd.Categorical(["a", "b", "c"]) xpr = r"'fill_value' \('d'\) is not in this Categorical's categories." - with pytest.raises(TypeError, match=xpr): + with pytest.raises(ValueError, match=xpr): cat.take([0, 1, -1], fill_value="d", allow_fill=True) def test_take_nd_deprecated(self): diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index 9d3c40ce926d7..c504037c740f2 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -320,9 +320,9 @@ def test_unstack_fill_frame_categorical(self): ) tm.assert_frame_equal(result, expected) - # Fill with non-category results in a TypeError + # Fill with non-category results in a ValueError msg = r"'fill_value' \('d'\) is not in" - with pytest.raises(TypeError, match=msg): + with pytest.raises(ValueError, match=msg): data.unstack(fill_value="d") # Fill with category value replaces missing values as expected From aad99709e8ce658743639de58fe2bb61f588103c Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 20 Apr 2020 08:25:42 -0700 Subject: [PATCH 2/3] restore docstring, whatsnew --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/arrays/categorical.py | 75 +++++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 9d40f9b6ffa2c..e0ddda40c68f8 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -190,6 +190,7 @@ Backwards incompatible API changes Previously a ``UnsupportedFunctionCall`` was raised (``AssertionError`` if ``min_count`` passed into :meth:`~DataFrameGroupby.median`) (:issue:`31485`) - :meth:`DataFrame.at` and :meth:`Series.at` will raise a ``TypeError`` instead of a ``ValueError`` if an incompatible key is passed, and ``KeyError`` if a missing key is passed, matching the behavior of ``.loc[]`` (:issue:`31722`) - Passing an integer dtype other than ``int64`` to ``np.array(period_index, dtype=...)`` will now raise ``TypeError`` instead of incorrectly using ``int64`` (:issue:`32255`) +- Passing an invalid ``fill_value`` to :meth:`Categorical.take` raises a ``ValueError`` instead of ``TypeError`` (:issue:`33660`) ``MultiIndex.get_indexer`` interprets `method` argument differently ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index c044cf490ad58..34f717f2335f4 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1781,6 +1781,81 @@ def fillna(self, value=None, method=None, limit=None): return self._constructor(codes, dtype=self.dtype, fastpath=True) + def take(self, indexer, allow_fill: bool = False, fill_value=None): + """ + Take elements from the Categorical. + + Parameters + ---------- + indexer : sequence of int + The indices in `self` to take. The meaning of negative values in + `indexer` depends on the value of `allow_fill`. + allow_fill : bool, default False + How to handle negative values in `indexer`. + + * False: negative values in `indices` indicate positional indices + from the right. This is similar to + :func:`numpy.take`. + + * True: negative values in `indices` indicate missing values + (the default). These values are set to `fill_value`. Any other + other negative values raise a ``ValueError``. + + .. versionchanged:: 1.0.0 + + Default value changed from ``True`` to ``False``. + + fill_value : object + The value to use for `indices` that are missing (-1), when + ``allow_fill=True``. This should be the category, i.e. a value + in ``self.categories``, not a code. + + Returns + ------- + Categorical + This Categorical will have the same categories and ordered as + `self`. + + See Also + -------- + Series.take : Similar method for Series. + numpy.ndarray.take : Similar method for NumPy arrays. + + Examples + -------- + >>> cat = pd.Categorical(['a', 'a', 'b']) + >>> cat + [a, a, b] + Categories (2, object): [a, b] + + Specify ``allow_fill==False`` to have negative indices mean indexing + from the right. + + >>> cat.take([0, -1, -2], allow_fill=False) + [a, b, a] + Categories (2, object): [a, b] + + With ``allow_fill=True``, indices equal to ``-1`` mean "missing" + values that should be filled with the `fill_value`, which is + ``np.nan`` by default. + + >>> cat.take([0, -1, -1], allow_fill=True) + [a, NaN, NaN] + Categories (2, object): [a, b] + + The fill value can be specified. + + >>> cat.take([0, -1, -1], allow_fill=True, fill_value='a') + [a, a, a] + Categories (2, object): [a, b] + + Specifying a fill value that's not in ``self.categories`` + will raise a ``ValueError``. + """ + return NDArrayBackedExtensionArray.take( + self, indexer, allow_fill=allow_fill, fill_value=fill_value + ) + # ------------------------------------------------------------------ # NDArrayBackedExtensionArray compat From fc304a09df1ab9c2cf593b2e8fd8aadc37e2b37c Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 21 Apr 2020 09:16:34 -0700 Subject: [PATCH 3/3] annotate --- pandas/core/arrays/categorical.py | 6 +++--- pandas/core/arrays/datetimelike.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 266acf476da12..0bb6ca8315a3a 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -51,7 +51,7 @@ import pandas.core.algorithms as algorithms from pandas.core.algorithms import _get_data_algo, factorize, take_1d, unique1d from pandas.core.array_algos.transforms import shift -from pandas.core.arrays._mixins import NDArrayBackedExtensionArray +from pandas.core.arrays._mixins import _T, NDArrayBackedExtensionArray from pandas.core.arrays.base import _extension_array_shared_docs from pandas.core.base import NoNewAttributesMixin, PandasObject, _shared_docs import pandas.core.common as com @@ -1769,7 +1769,7 @@ def fillna(self, value=None, method=None, limit=None): return self._constructor(codes, dtype=self.dtype, fastpath=True) - def take(self, indexer, allow_fill: bool = False, fill_value=None): + def take(self: _T, indexer, allow_fill: bool = False, fill_value=None) -> _T: """ Take elements from the Categorical. @@ -1851,7 +1851,7 @@ def take(self, indexer, allow_fill: bool = False, fill_value=None): def _ndarray(self) -> np.ndarray: return self._codes - def _from_backing_data(self, arr: np.ndarray): + def _from_backing_data(self, arr: np.ndarray) -> "Categorical": return self._constructor(arr, dtype=self.dtype, fastpath=True) # ------------------------------------------------------------------ diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index d59278354ff8b..f41964cc7b0c8 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -41,7 +41,7 @@ from pandas.core import missing, nanops, ops from pandas.core.algorithms import checked_add_with_arr, unique1d, value_counts from pandas.core.array_algos.transforms import shift -from pandas.core.arrays._mixins import NDArrayBackedExtensionArray +from pandas.core.arrays._mixins import _T, NDArrayBackedExtensionArray from pandas.core.arrays.base import ExtensionArray, ExtensionOpsMixin import pandas.core.common as com from pandas.core.construction import array, extract_array @@ -448,7 +448,7 @@ def _ndarray(self) -> np.ndarray: # NB: A bunch of Interval tests fail if we use ._data return self.asi8 - def _from_backing_data(self, arr: np.ndarray): + def _from_backing_data(self: _T, arr: np.ndarray) -> _T: # Note: we do not retain `freq` return type(self)(arr, dtype=self.dtype) # type: ignore