diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 27110fe1f8439..413309b3d01ad 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -298,7 +298,13 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi _can_hold_na = True def __init__( - self, values, categories=None, ordered=None, dtype=None, fastpath=False + self, + values, + categories=None, + ordered=None, + dtype=None, + fastpath=False, + copy: bool = True, ): dtype = CategoricalDtype._from_values_or_dtype( @@ -359,9 +365,9 @@ def __init__( dtype = CategoricalDtype(categories, dtype.ordered) elif is_categorical_dtype(values.dtype): - old_codes = extract_array(values).codes + old_codes = extract_array(values)._codes codes = recode_for_categories( - old_codes, values.dtype.categories, dtype.categories + old_codes, values.dtype.categories, dtype.categories, copy=copy ) else: @@ -389,7 +395,7 @@ def _constructor(self) -> Type["Categorical"]: @classmethod def _from_sequence(cls, scalars, *, dtype=None, copy=False): - return Categorical(scalars, dtype=dtype) + return Categorical(scalars, dtype=dtype, copy=copy) def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: """ diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 76b1c061cc827..ecf77a9987bee 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -15,7 +15,6 @@ is_categorical_dtype, is_scalar, ) -from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna, notna from pandas.core import accessor @@ -184,28 +183,18 @@ def __new__( cls, data=None, categories=None, ordered=None, dtype=None, copy=False, name=None ): - dtype = CategoricalDtype._from_values_or_dtype(data, categories, ordered, dtype) - name = maybe_extract_name(name, data, cls) - if not is_categorical_dtype(data): + if is_scalar(data): # don't allow scalars # if data is None, then categories must be provided - if is_scalar(data): - if data is not None or categories is None: - raise cls._scalar_data_error(data) - data = [] - - assert isinstance(dtype, CategoricalDtype), dtype - data = extract_array(data, extract_numpy=True) + if data is not None or categories is None: + raise cls._scalar_data_error(data) + data = [] - if not isinstance(data, Categorical): - data = Categorical(data, dtype=dtype) - elif isinstance(dtype, CategoricalDtype) and dtype != data.dtype: - # we want to silently ignore dtype='category' - data = data._set_dtype(dtype) - - data = data.copy() if copy else data + data = Categorical( + data, categories=categories, ordered=ordered, dtype=dtype, copy=copy + ) return cls._simple_new(data, name=name) diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index 753c15bde6bba..25b5be2ccc918 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -699,3 +699,14 @@ def test_categorical_extension_array_nullable(self, nulls_fixture): result = Categorical(arr) expected = Categorical(Series([pd.NA, pd.NA], dtype="object")) tm.assert_categorical_equal(result, expected) + + def test_from_sequence_copy(self): + cat = Categorical(np.arange(5).repeat(2)) + result = Categorical._from_sequence(cat, dtype=None, copy=False) + + # more generally, we'd be OK with a view + assert result._codes is cat._codes + + result = Categorical._from_sequence(cat, dtype=None, copy=True) + + assert not np.shares_memory(result._codes, cat._codes) diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py index 3bab57e1d265e..00fca00c5b4a0 100644 --- a/pandas/tests/indexes/categorical/test_category.py +++ b/pandas/tests/indexes/categorical/test_category.py @@ -304,7 +304,7 @@ def test_ensure_copied_data(self, index): assert _base(index.values) is not _base(result.values) result = CategoricalIndex(index.values, copy=False) - assert _base(index.values) is _base(result.values) + assert result._data._codes is index._data._codes def test_frame_repr(self): df = pd.DataFrame({"A": [1, 2, 3]}, index=CategoricalIndex(["a", "b", "c"]))