Skip to content

REF: clearer Categorical/CategoricalIndex construction #24419

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Jan 4, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 12 additions & 42 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,50 +316,19 @@ class Categorical(ExtensionArray, PandasObject):
def __init__(self, values, categories=None, ordered=None, dtype=None,
fastpath=False):

# Ways of specifying the dtype (prioritized ordered)
# 1. dtype is a CategoricalDtype
# a.) with known categories, use dtype.categories
# b.) else with Categorical values, use values.dtype
# c.) else, infer from values
# d.) specifying dtype=CategoricalDtype and categories is an error
# 2. dtype is a string 'category'
# a.) use categories, ordered
# b.) use values.dtype
# c.) infer from values
# 3. dtype is None
# a.) use categories, ordered
# b.) use values.dtype
# c.) infer from values
if dtype is not None:
# The dtype argument takes precedence over values.dtype (if any)
if isinstance(dtype, compat.string_types):
if dtype == 'category':
dtype = CategoricalDtype(categories, ordered)
else:
msg = "Unknown `dtype` {dtype}"
raise ValueError(msg.format(dtype=dtype))
elif categories is not None or ordered is not None:
raise ValueError("Cannot specify both `dtype` and `categories`"
" or `ordered`.")
elif is_categorical(values):
# If no "dtype" was passed, use the one from "values", but honor
# the "ordered" and "categories" arguments
dtype = values.dtype._from_categorical_dtype(values.dtype,
categories, ordered)
dtype = CategoricalDtype._from_values_or_dtype(values, categories,
ordered, dtype)
# At this point, dtype is always a CategoricalDtype, but
# we may have dtype.categories be None, and we need to
# infer categories in a factorization step futher below

if is_categorical(values):
# GH23814, for perf, if values._values already an instance of
# Categorical, set values to codes, and run fastpath
if (isinstance(values, (ABCSeries, ABCIndexClass)) and
isinstance(values._values, type(self))):
values = values._values.codes.copy()
fastpath = True
else:
# If dtype=None and values is not categorical, create a new dtype
dtype = CategoricalDtype(categories, ordered)

# At this point, dtype is always a CategoricalDtype and you should not
# use categories and ordered seperately.
# if dtype.categories is None, we are inferring

if fastpath:
self._codes = coerce_indexer_dtype(values, dtype.categories)
Expand Down Expand Up @@ -656,6 +625,9 @@ def from_codes(cls, codes, categories, ordered=False):
categorical. If not given, the resulting categorical will be
unordered.
"""
dtype = CategoricalDtype._from_values_or_dtype(codes, categories,
ordered)

codes = np.asarray(codes) # #21767
if not is_integer_dtype(codes):
msg = "codes need to be array-like integers"
Expand All @@ -675,14 +647,12 @@ def from_codes(cls, codes, categories, ordered=False):
raise ValueError(
"codes need to be convertible to an arrays of integers")

categories = CategoricalDtype.validate_categories(categories)

if len(codes) and (codes.max() >= len(categories) or codes.min() < -1):
if len(codes) and (
codes.max() >= len(dtype.categories) or codes.min() < -1):
raise ValueError("codes need to be between -1 and "
"len(categories)-1")

return cls(codes, categories=categories, ordered=ordered,
fastpath=True)
return cls(codes, dtype=dtype, fastpath=True)

_codes = None

Expand Down
90 changes: 89 additions & 1 deletion pandas/core/dtypes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from pandas import compat

from .base import ExtensionDtype, _DtypeOpsMixin
from .inference import is_list_like


def register_extension_dtype(cls):
Expand Down Expand Up @@ -240,6 +241,90 @@ def _from_categorical_dtype(cls, dtype, categories=None, ordered=None):
ordered = dtype.ordered
return cls(categories, ordered)

@classmethod
def _from_values_or_dtype(cls, values=None, categories=None, ordered=None,
dtype=None):
"""
Construct dtype from the input parameters used in :class:`Categorical`.

This constructor method specifically does not do the factorization
step, if that is needed to find the categories. This constructor may
therefore return ``CategoricalDtype(categories=None, ordered=None)``,
which may not be useful. Additional steps may therefore have to be
taken to create the final dtype.

The return dtype is specified from the inputs in this prioritized
order:
1. if dtype is a CategoricalDtype, return dtype
2. if dtype is the string 'category', create a CategoricalDtype from
the supplied categories and ordered parameters, and return that.
3. if values is a categorical, use value.dtype, but override it with
categories and ordered if either/both of those are not None.
4. if dtype is None and values is not a categorical, construct the
dtype from categories and ordered, even if either of those is None.

Parameters
----------
values : list-like, optional
The list-like must be 1-dimensional.
categories : list-like, optional
Categories for the CategoricalDtype.
ordered : bool, optional
Designating if the categories are ordered.
dtype : CategoricalDtype or the string "category", optional
If ``CategoricalDtype``, cannot be used together with
`categories` or `ordered`.

Returns
-------
CategoricalDtype

Examples
--------
>>> CategoricalDtype._from_values_or_dtype()
CategoricalDtype(categories=None, ordered=None)
>>> CategoricalDtype._from_values_or_dtype(categories=['a', 'b'],
... ordered=True)
CategoricalDtype(categories=['a', 'b'], ordered=True)
>>> dtype1 = CategoricalDtype(['a', 'b'], ordered=True)
>>> dtype2 = CategoricalDtype(['x', 'y'], ordered=False)
>>> c = Categorical([0, 1], dtype=dtype1, fastpath=True)
>>> CategoricalDtype._from_values_or_dtype(c, ['x', 'y'], ordered=True,
... dtype=dtype2)
ValueError: Cannot specify `categories` or `ordered` together with
`dtype`.

The supplied dtype takes precedence over values' dtype:

>>> CategoricalDtype._from_values_or_dtype(c, dtype=dtype2)
CategoricalDtype(['x', 'y'], ordered=False)
"""
from pandas.core.dtypes.common import is_categorical

if dtype is not None:
# The dtype argument takes precedence over values.dtype (if any)
if isinstance(dtype, compat.string_types):
if dtype == 'category':
dtype = CategoricalDtype(categories, ordered)
else:
msg = "Unknown dtype {dtype!r}"
raise ValueError(msg.format(dtype=dtype))
elif categories is not None or ordered is not None:
raise ValueError("Cannot specify `categories` or `ordered` "
"together with `dtype`.")
elif is_categorical(values):
# If no "dtype" was passed, use the one from "values", but honor
# the "ordered" and "categories" arguments
dtype = values.dtype._from_categorical_dtype(values.dtype,
categories, ordered)
else:
# If dtype=None and values is not categorical, create a new dtype.
# Note: This could potentially have categories=None and
# ordered=None.
dtype = CategoricalDtype(categories, ordered)

return dtype

def _finalize(self, categories, ordered, fastpath=False):

if ordered is not None:
Expand Down Expand Up @@ -408,7 +493,10 @@ def validate_categories(categories, fastpath=False):
"""
from pandas import Index

if not isinstance(categories, ABCIndexClass):
if not fastpath and not is_list_like(categories):
msg = "Parameter 'categories' must be list-like, was {!r}"
raise TypeError(msg.format(categories))
elif not isinstance(categories, ABCIndexClass):
categories = Index(categories, tupleize_cols=False)

if not fastpath:
Expand Down
42 changes: 13 additions & 29 deletions pandas/core/indexes/category.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,29 +107,23 @@ def __new__(cls, data=None, categories=None, ordered=None, dtype=None,
if fastpath:
return cls._simple_new(data, name=name, dtype=dtype)

dtype = CategoricalDtype._from_values_or_dtype(data, categories,
ordered, dtype)

if name is None and hasattr(data, 'name'):
name = data.name

if isinstance(data, ABCCategorical):
data = cls._create_categorical(data, categories, ordered,
dtype)
elif isinstance(data, CategoricalIndex):
data = data._data
data = cls._create_categorical(data, categories, ordered,
dtype)
else:

if not is_categorical_dtype(data):
# don't allow scalars
# if data is None, then categories must be provided
if is_scalar(data):
if data is not None or categories is None:
cls._scalar_data_error(data)
data = []
data = cls._create_categorical(data, categories, ordered,
dtype)

if copy:
data = data.copy()
data = cls._create_categorical(data, dtype=dtype)

data = data.copy() if copy else data

return cls._simple_new(data, name=name)

Expand Down Expand Up @@ -159,8 +153,7 @@ def _create_from_codes(self, codes, dtype=None, name=None):
return CategoricalIndex(cat, name=name)

@classmethod
def _create_categorical(cls, data, categories=None, ordered=None,
dtype=None):
def _create_categorical(cls, data, dtype=None):
"""
*this is an internal non-public method*

Expand All @@ -169,8 +162,6 @@ def _create_categorical(cls, data, categories=None, ordered=None,
Parameters
----------
data : data for new Categorical
categories : optional categories, defaults to existing
ordered : optional ordered attribute, defaults to existing
dtype : CategoricalDtype, defaults to existing

Returns
Expand All @@ -182,18 +173,11 @@ def _create_categorical(cls, data, categories=None, ordered=None,
data = data.values

if not isinstance(data, ABCCategorical):
if ordered is None and dtype is None:
ordered = False
data = Categorical(data, categories=categories, ordered=ordered,
dtype=dtype)
else:
if categories is not None:
data = data.set_categories(categories, ordered=ordered)
elif ordered is not None and ordered != data.ordered:
data = data.set_ordered(ordered)
if isinstance(dtype, CategoricalDtype) and dtype != data.dtype:
# we want to silently ignore dtype='category'
data = data._set_dtype(dtype)
return Categorical(data, dtype=dtype)

if isinstance(dtype, CategoricalDtype) and dtype != data.dtype:
# we want to silently ignore dtype='category'
data = data._set_dtype(dtype)
return data

@classmethod
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/arrays/categorical/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -368,7 +368,7 @@ def test_constructor_str_category(self, categories, ordered):
tm.assert_categorical_equal(result, expected)

def test_constructor_str_unknown(self):
with pytest.raises(ValueError, match="Unknown `dtype`"):
with pytest.raises(ValueError, match="Unknown dtype"):
Categorical([1, 2], dtype="foo")

def test_constructor_from_categorical_with_dtype(self):
Expand Down
34 changes: 32 additions & 2 deletions pandas/tests/dtypes/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,10 +90,40 @@ def test_construction_from_string(self):
TypeError, lambda: CategoricalDtype.construct_from_string('foo'))

def test_constructor_invalid(self):
msg = "CategoricalIndex.* must be called"
msg = "Parameter 'categories' must be list-like"
with pytest.raises(TypeError, match=msg):
CategoricalDtype("category")

dtype1 = CategoricalDtype(['a', 'b'], ordered=True)
dtype2 = CategoricalDtype(['x', 'y'], ordered=False)
c = Categorical([0, 1], dtype=dtype1, fastpath=True)

@pytest.mark.parametrize('values, categories, ordered, dtype, expected',
[
[None, None, None, None,
CategoricalDtype()],
[None, ['a', 'b'], True, None, dtype1],
[c, None, None, dtype2, dtype2],
[c, ['x', 'y'], False, None, dtype2],
])
def test_from_values_or_dtype(
self, values, categories, ordered, dtype, expected):
result = CategoricalDtype._from_values_or_dtype(values, categories,
ordered, dtype)
assert result == expected

@pytest.mark.parametrize('values, categories, ordered, dtype', [
[None, ['a', 'b'], True, dtype2],
[None, ['a', 'b'], None, dtype2],
[None, None, True, dtype2],
])
def test_from_values_or_dtype_raises(self, values, categories,
ordered, dtype):
msg = "Cannot specify `categories` or `ordered` together with `dtype`."
with pytest.raises(ValueError, match=msg):
CategoricalDtype._from_values_or_dtype(values, categories,
ordered, dtype)

def test_is_dtype(self):
assert CategoricalDtype.is_dtype(self.dtype)
assert CategoricalDtype.is_dtype('category')
Expand Down Expand Up @@ -706,7 +736,7 @@ def test_invalid_raises(self):
with pytest.raises(TypeError, match='ordered'):
CategoricalDtype(['a', 'b'], ordered='foo')

with pytest.raises(TypeError, match='collection'):
with pytest.raises(TypeError, match="'categories' must be list-like"):
CategoricalDtype('category')

def test_mixed(self):
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/indexes/test_category.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ def test_construction_with_categorical_dtype(self):
tm.assert_index_equal(result, expected, exact=True)

# error when combining categories/ordered and dtype kwargs
msg = 'Cannot specify both `dtype` and `categories` or `ordered`.'
msg = 'Cannot specify `categories` or `ordered` together with `dtype`.'
with pytest.raises(ValueError, match=msg):
CategoricalIndex(data, categories=cats, dtype=dtype)

Expand Down