diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index a47406cded7b4..969add2d3efef 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -316,50 +316,19 @@ class Categorical(ExtensionArray, PandasObject): def __init__(self, values, categories=None, ordered=None, dtype=None, fastpath=False): - # Ways of specifying the dtype (prioritized ordered) - # 1. dtype is a CategoricalDtype - # a.) with known categories, use dtype.categories - # b.) else with Categorical values, use values.dtype - # c.) else, infer from values - # d.) specifying dtype=CategoricalDtype and categories is an error - # 2. dtype is a string 'category' - # a.) use categories, ordered - # b.) use values.dtype - # c.) infer from values - # 3. dtype is None - # a.) use categories, ordered - # b.) use values.dtype - # c.) infer from values - if dtype is not None: - # The dtype argument takes precedence over values.dtype (if any) - if isinstance(dtype, compat.string_types): - if dtype == 'category': - dtype = CategoricalDtype(categories, ordered) - else: - msg = "Unknown `dtype` {dtype}" - raise ValueError(msg.format(dtype=dtype)) - elif categories is not None or ordered is not None: - raise ValueError("Cannot specify both `dtype` and `categories`" - " or `ordered`.") - elif is_categorical(values): - # If no "dtype" was passed, use the one from "values", but honor - # the "ordered" and "categories" arguments - dtype = values.dtype._from_categorical_dtype(values.dtype, - categories, ordered) + dtype = CategoricalDtype._from_values_or_dtype(values, categories, + ordered, dtype) + # At this point, dtype is always a CategoricalDtype, but + # we may have dtype.categories be None, and we need to + # infer categories in a factorization step futher below + if is_categorical(values): # GH23814, for perf, if values._values already an instance of # Categorical, set values to codes, and run fastpath if (isinstance(values, (ABCSeries, ABCIndexClass)) and isinstance(values._values, type(self))): values = values._values.codes.copy() fastpath = True - else: - # If dtype=None and values is not categorical, create a new dtype - dtype = CategoricalDtype(categories, ordered) - - # At this point, dtype is always a CategoricalDtype and you should not - # use categories and ordered seperately. - # if dtype.categories is None, we are inferring if fastpath: self._codes = coerce_indexer_dtype(values, dtype.categories) @@ -656,6 +625,9 @@ def from_codes(cls, codes, categories, ordered=False): categorical. If not given, the resulting categorical will be unordered. """ + dtype = CategoricalDtype._from_values_or_dtype(codes, categories, + ordered) + codes = np.asarray(codes) # #21767 if not is_integer_dtype(codes): msg = "codes need to be array-like integers" @@ -675,14 +647,12 @@ def from_codes(cls, codes, categories, ordered=False): raise ValueError( "codes need to be convertible to an arrays of integers") - categories = CategoricalDtype.validate_categories(categories) - - if len(codes) and (codes.max() >= len(categories) or codes.min() < -1): + if len(codes) and ( + codes.max() >= len(dtype.categories) or codes.min() < -1): raise ValueError("codes need to be between -1 and " "len(categories)-1") - return cls(codes, categories=categories, ordered=ordered, - fastpath=True) + return cls(codes, dtype=dtype, fastpath=True) _codes = None diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index ac69927d4adf1..e35ee32657509 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -13,6 +13,7 @@ from pandas import compat from .base import ExtensionDtype, _DtypeOpsMixin +from .inference import is_list_like def register_extension_dtype(cls): @@ -240,6 +241,90 @@ def _from_categorical_dtype(cls, dtype, categories=None, ordered=None): ordered = dtype.ordered return cls(categories, ordered) + @classmethod + def _from_values_or_dtype(cls, values=None, categories=None, ordered=None, + dtype=None): + """ + Construct dtype from the input parameters used in :class:`Categorical`. + + This constructor method specifically does not do the factorization + step, if that is needed to find the categories. This constructor may + therefore return ``CategoricalDtype(categories=None, ordered=None)``, + which may not be useful. Additional steps may therefore have to be + taken to create the final dtype. + + The return dtype is specified from the inputs in this prioritized + order: + 1. if dtype is a CategoricalDtype, return dtype + 2. if dtype is the string 'category', create a CategoricalDtype from + the supplied categories and ordered parameters, and return that. + 3. if values is a categorical, use value.dtype, but override it with + categories and ordered if either/both of those are not None. + 4. if dtype is None and values is not a categorical, construct the + dtype from categories and ordered, even if either of those is None. + + Parameters + ---------- + values : list-like, optional + The list-like must be 1-dimensional. + categories : list-like, optional + Categories for the CategoricalDtype. + ordered : bool, optional + Designating if the categories are ordered. + dtype : CategoricalDtype or the string "category", optional + If ``CategoricalDtype``, cannot be used together with + `categories` or `ordered`. + + Returns + ------- + CategoricalDtype + + Examples + -------- + >>> CategoricalDtype._from_values_or_dtype() + CategoricalDtype(categories=None, ordered=None) + >>> CategoricalDtype._from_values_or_dtype(categories=['a', 'b'], + ... ordered=True) + CategoricalDtype(categories=['a', 'b'], ordered=True) + >>> dtype1 = CategoricalDtype(['a', 'b'], ordered=True) + >>> dtype2 = CategoricalDtype(['x', 'y'], ordered=False) + >>> c = Categorical([0, 1], dtype=dtype1, fastpath=True) + >>> CategoricalDtype._from_values_or_dtype(c, ['x', 'y'], ordered=True, + ... dtype=dtype2) + ValueError: Cannot specify `categories` or `ordered` together with + `dtype`. + + The supplied dtype takes precedence over values' dtype: + + >>> CategoricalDtype._from_values_or_dtype(c, dtype=dtype2) + CategoricalDtype(['x', 'y'], ordered=False) + """ + from pandas.core.dtypes.common import is_categorical + + if dtype is not None: + # The dtype argument takes precedence over values.dtype (if any) + if isinstance(dtype, compat.string_types): + if dtype == 'category': + dtype = CategoricalDtype(categories, ordered) + else: + msg = "Unknown dtype {dtype!r}" + raise ValueError(msg.format(dtype=dtype)) + elif categories is not None or ordered is not None: + raise ValueError("Cannot specify `categories` or `ordered` " + "together with `dtype`.") + elif is_categorical(values): + # If no "dtype" was passed, use the one from "values", but honor + # the "ordered" and "categories" arguments + dtype = values.dtype._from_categorical_dtype(values.dtype, + categories, ordered) + else: + # If dtype=None and values is not categorical, create a new dtype. + # Note: This could potentially have categories=None and + # ordered=None. + dtype = CategoricalDtype(categories, ordered) + + return dtype + def _finalize(self, categories, ordered, fastpath=False): if ordered is not None: @@ -408,7 +493,10 @@ def validate_categories(categories, fastpath=False): """ from pandas import Index - if not isinstance(categories, ABCIndexClass): + if not fastpath and not is_list_like(categories): + msg = "Parameter 'categories' must be list-like, was {!r}" + raise TypeError(msg.format(categories)) + elif not isinstance(categories, ABCIndexClass): categories = Index(categories, tupleize_cols=False) if not fastpath: diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 0ef7422555fe6..f76085f9889dd 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -107,29 +107,23 @@ def __new__(cls, data=None, categories=None, ordered=None, dtype=None, if fastpath: return cls._simple_new(data, name=name, dtype=dtype) + dtype = CategoricalDtype._from_values_or_dtype(data, categories, + ordered, dtype) + if name is None and hasattr(data, 'name'): name = data.name - if isinstance(data, ABCCategorical): - data = cls._create_categorical(data, categories, ordered, - dtype) - elif isinstance(data, CategoricalIndex): - data = data._data - data = cls._create_categorical(data, categories, ordered, - dtype) - else: - + if not is_categorical_dtype(data): # don't allow scalars # if data is None, then categories must be provided if is_scalar(data): if data is not None or categories is None: cls._scalar_data_error(data) data = [] - data = cls._create_categorical(data, categories, ordered, - dtype) - if copy: - data = data.copy() + data = cls._create_categorical(data, dtype=dtype) + + data = data.copy() if copy else data return cls._simple_new(data, name=name) @@ -159,8 +153,7 @@ def _create_from_codes(self, codes, dtype=None, name=None): return CategoricalIndex(cat, name=name) @classmethod - def _create_categorical(cls, data, categories=None, ordered=None, - dtype=None): + def _create_categorical(cls, data, dtype=None): """ *this is an internal non-public method* @@ -169,8 +162,6 @@ def _create_categorical(cls, data, categories=None, ordered=None, Parameters ---------- data : data for new Categorical - categories : optional categories, defaults to existing - ordered : optional ordered attribute, defaults to existing dtype : CategoricalDtype, defaults to existing Returns @@ -182,18 +173,11 @@ def _create_categorical(cls, data, categories=None, ordered=None, data = data.values if not isinstance(data, ABCCategorical): - if ordered is None and dtype is None: - ordered = False - data = Categorical(data, categories=categories, ordered=ordered, - dtype=dtype) - else: - if categories is not None: - data = data.set_categories(categories, ordered=ordered) - elif ordered is not None and ordered != data.ordered: - data = data.set_ordered(ordered) - if isinstance(dtype, CategoricalDtype) and dtype != data.dtype: - # we want to silently ignore dtype='category' - data = data._set_dtype(dtype) + return Categorical(data, dtype=dtype) + + if isinstance(dtype, CategoricalDtype) and dtype != data.dtype: + # we want to silently ignore dtype='category' + data = data._set_dtype(dtype) return data @classmethod diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index 79e10de2b8aaf..f8e9e393091e5 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -368,7 +368,7 @@ def test_constructor_str_category(self, categories, ordered): tm.assert_categorical_equal(result, expected) def test_constructor_str_unknown(self): - with pytest.raises(ValueError, match="Unknown `dtype`"): + with pytest.raises(ValueError, match="Unknown dtype"): Categorical([1, 2], dtype="foo") def test_constructor_from_categorical_with_dtype(self): diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index ab52a8a81385c..40b8f7afa3598 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -90,10 +90,40 @@ def test_construction_from_string(self): TypeError, lambda: CategoricalDtype.construct_from_string('foo')) def test_constructor_invalid(self): - msg = "CategoricalIndex.* must be called" + msg = "Parameter 'categories' must be list-like" with pytest.raises(TypeError, match=msg): CategoricalDtype("category") + dtype1 = CategoricalDtype(['a', 'b'], ordered=True) + dtype2 = CategoricalDtype(['x', 'y'], ordered=False) + c = Categorical([0, 1], dtype=dtype1, fastpath=True) + + @pytest.mark.parametrize('values, categories, ordered, dtype, expected', + [ + [None, None, None, None, + CategoricalDtype()], + [None, ['a', 'b'], True, None, dtype1], + [c, None, None, dtype2, dtype2], + [c, ['x', 'y'], False, None, dtype2], + ]) + def test_from_values_or_dtype( + self, values, categories, ordered, dtype, expected): + result = CategoricalDtype._from_values_or_dtype(values, categories, + ordered, dtype) + assert result == expected + + @pytest.mark.parametrize('values, categories, ordered, dtype', [ + [None, ['a', 'b'], True, dtype2], + [None, ['a', 'b'], None, dtype2], + [None, None, True, dtype2], + ]) + def test_from_values_or_dtype_raises(self, values, categories, + ordered, dtype): + msg = "Cannot specify `categories` or `ordered` together with `dtype`." + with pytest.raises(ValueError, match=msg): + CategoricalDtype._from_values_or_dtype(values, categories, + ordered, dtype) + def test_is_dtype(self): assert CategoricalDtype.is_dtype(self.dtype) assert CategoricalDtype.is_dtype('category') @@ -706,7 +736,7 @@ def test_invalid_raises(self): with pytest.raises(TypeError, match='ordered'): CategoricalDtype(['a', 'b'], ordered='foo') - with pytest.raises(TypeError, match='collection'): + with pytest.raises(TypeError, match="'categories' must be list-like"): CategoricalDtype('category') def test_mixed(self): diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 8552e65a0dd24..8518c1fa369c2 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -158,7 +158,7 @@ def test_construction_with_categorical_dtype(self): tm.assert_index_equal(result, expected, exact=True) # error when combining categories/ordered and dtype kwargs - msg = 'Cannot specify both `dtype` and `categories` or `ordered`.' + msg = 'Cannot specify `categories` or `ordered` together with `dtype`.' with pytest.raises(ValueError, match=msg): CategoricalIndex(data, categories=cats, dtype=dtype)