Description
Pandas version checks
- I have checked that this issue has not already been reported.
- I have confirmed this bug exists on the latest version of pandas.
- I have confirmed this bug exists on the main branch of pandas.
Reproducible Example
import pandas as pd
df = pd.DataFrame([0, 1, 2], dtype="Int64")
df._mgr.arrays[0]._mask.flags["WRITEABLE"] = False
df.astype('category')
Issue Description
`df.astype('category')` works when the masked array's underlying mask is writeable (the default), but when the mask is marked non-writeable, as in the example above, I get this error:
Details
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[1], line 5
3 df = pd.DataFrame([0, 1, 2], dtype="Int64")
4 df._mgr.arrays[0]._mask.flags["WRITEABLE"] = False
----> 5 df.astype('category')
File ~/anaconda3/envs/pandas-dev-py39/lib/python3.9/site-packages/pandas/core/generic.py:6427, in NDFrame.astype(self, dtype, copy, errors)
6421 results.append(res_col)
6423 elif is_extension_array_dtype(dtype) and self.ndim > 1:
6424 # GH 18099/22869: columnwise conversion to extension dtype
6425 # GH 24704: use iloc to handle duplicate column names
6426 # TODO(EA2D): special case not needed with 2D EAs
-> 6427 results = [
6428 self.iloc[:, i].astype(dtype, copy=copy)
6429 for i in range(len(self.columns))
6430 ]
6432 else:
6433 # else, only a single dtype is given
6434 new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)
File ~/anaconda3/envs/pandas-dev-py39/lib/python3.9/site-packages/pandas/core/generic.py:6428, in <listcomp>(.0)
6421 results.append(res_col)
6423 elif is_extension_array_dtype(dtype) and self.ndim > 1:
6424 # GH 18099/22869: columnwise conversion to extension dtype
6425 # GH 24704: use iloc to handle duplicate column names
6426 # TODO(EA2D): special case not needed with 2D EAs
6427 results = [
-> 6428 self.iloc[:, i].astype(dtype, copy=copy)
6429 for i in range(len(self.columns))
6430 ]
6432 else:
6433 # else, only a single dtype is given
6434 new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)
File ~/anaconda3/envs/pandas-dev-py39/lib/python3.9/site-packages/pandas/core/generic.py:6434, in NDFrame.astype(self, dtype, copy, errors)
6427 results = [
6428 self.iloc[:, i].astype(dtype, copy=copy)
6429 for i in range(len(self.columns))
6430 ]
6432 else:
6433 # else, only a single dtype is given
-> 6434 new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)
6435 return self._constructor(new_data).__finalize__(self, method="astype")
6437 # GH 33113: handle empty frame or series
File ~/anaconda3/envs/pandas-dev-py39/lib/python3.9/site-packages/pandas/core/internals/managers.py:454, in BaseBlockManager.astype(self, dtype, copy, errors)
451 elif using_copy_on_write():
452 copy = False
--> 454 return self.apply(
455 "astype",
456 dtype=dtype,
457 copy=copy,
458 errors=errors,
459 using_cow=using_copy_on_write(),
460 )
File ~/anaconda3/envs/pandas-dev-py39/lib/python3.9/site-packages/pandas/core/internals/managers.py:355, in BaseBlockManager.apply(self, f, align_keys, **kwargs)
353 applied = b.apply(f, **kwargs)
354 else:
--> 355 applied = getattr(b, f)(**kwargs)
356 result_blocks = extend_blocks(applied, result_blocks)
358 out = type(self).from_blocks(result_blocks, self.axes)
File ~/anaconda3/envs/pandas-dev-py39/lib/python3.9/site-packages/pandas/core/internals/blocks.py:538, in Block.astype(self, dtype, copy, errors, using_cow)
518 """
519 Coerce to the new dtype.
520
(...)
534 Block
535 """
536 values = self.values
--> 538 new_values = astype_array_safe(values, dtype, copy=copy, errors=errors)
540 new_values = maybe_coerce_values(new_values)
542 refs = None
File ~/anaconda3/envs/pandas-dev-py39/lib/python3.9/site-packages/pandas/core/dtypes/astype.py:238, in astype_array_safe(values, dtype, copy, errors)
235 dtype = dtype.numpy_dtype
237 try:
--> 238 new_values = astype_array(values, dtype, copy=copy)
239 except (ValueError, TypeError):
240 # e.g. _astype_nansafe can fail on object-dtype of strings
241 # trying to convert to float
242 if errors == "ignore":
File ~/anaconda3/envs/pandas-dev-py39/lib/python3.9/site-packages/pandas/core/dtypes/astype.py:180, in astype_array(values, dtype, copy)
176 return values
178 if not isinstance(values, np.ndarray):
179 # i.e. ExtensionArray
--> 180 values = values.astype(dtype, copy=copy)
182 else:
183 values = _astype_nansafe(values, dtype, copy=copy)
File ~/anaconda3/envs/pandas-dev-py39/lib/python3.9/site-packages/pandas/core/arrays/masked.py:497, in BaseMaskedArray.astype(self, dtype, copy)
495 if isinstance(dtype, ExtensionDtype):
496 eacls = dtype.construct_array_type()
--> 497 return eacls._from_sequence(self, dtype=dtype, copy=copy)
499 na_value: float | np.datetime64 | lib.NoDefault
501 # coerce
File ~/anaconda3/envs/pandas-dev-py39/lib/python3.9/site-packages/pandas/core/arrays/categorical.py:498, in Categorical._from_sequence(cls, scalars, dtype, copy)
494 @classmethod
495 def _from_sequence(
496 cls, scalars, *, dtype: Dtype | None = None, copy: bool = False
497 ) -> Self:
--> 498 return cls(scalars, dtype=dtype, copy=copy)
File ~/anaconda3/envs/pandas-dev-py39/lib/python3.9/site-packages/pandas/core/arrays/categorical.py:446, in Categorical.__init__(self, values, categories, ordered, dtype, fastpath, copy)
444 values = sanitize_array(values, None)
445 try:
--> 446 codes, categories = factorize(values, sort=True)
447 except TypeError as err:
448 codes, categories = factorize(values, sort=False)
File ~/anaconda3/envs/pandas-dev-py39/lib/python3.9/site-packages/pandas/core/algorithms.py:775, in factorize(values, sort, use_na_sentinel, size_hint)
771 return codes, uniques
773 elif not isinstance(values, np.ndarray):
774 # i.e. ExtensionArray
--> 775 codes, uniques = values.factorize(use_na_sentinel=use_na_sentinel)
777 else:
778 values = np.asarray(values) # convert DTA/TDA/MultiIndex
File ~/anaconda3/envs/pandas-dev-py39/lib/python3.9/site-packages/pandas/core/arrays/masked.py:944, in BaseMaskedArray.factorize(self, use_na_sentinel)
941 mask = self._mask
943 # Use a sentinel for na; recode and add NA to uniques if necessary below
--> 944 codes, uniques = factorize_array(arr, use_na_sentinel=True, mask=mask)
946 # check that factorize_array correctly preserves dtype.
947 assert uniques.dtype == self.dtype.numpy_dtype, (uniques.dtype, self.dtype)
File ~/anaconda3/envs/pandas-dev-py39/lib/python3.9/site-packages/pandas/core/algorithms.py:591, in factorize_array(values, use_na_sentinel, size_hint, na_value, mask)
588 hash_klass, values = _get_hashtable_algo(values)
590 table = hash_klass(size_hint or len(values))
--> 591 uniques, codes = table.factorize(
592 values,
593 na_sentinel=-1,
594 na_value=na_value,
595 mask=mask,
596 ignore_na=use_na_sentinel,
597 )
599 # re-cast e.g. i8->dt64/td64, uint8->bool
600 uniques = _reconstruct_data(uniques, original.dtype, original)
File pandas/_libs/hashtable_class_helper.pxi:2976, in pandas._libs.hashtable.Int64HashTable.factorize()
File pandas/_libs/hashtable_class_helper.pxi:2825, in pandas._libs.hashtable.Int64HashTable._unique()
File stringsource:660, in View.MemoryView.memoryview_cwrapper()
File stringsource:350, in View.MemoryView.memoryview.__cinit__()
ValueError: buffer source array is read-only
This was one cause of modin-project/modin#6259
Expected Behavior
`df.astype('category')` should convert the column to category dtype without error, even when the underlying mask array is read-only.
Installed Versions
INSTALLED VERSIONS
commit : c65c8dd
python : 3.9.16.final.0
python-bits : 64
OS : Darwin
OS-release : 22.5.0
Version : Darwin Kernel Version 22.5.0: Mon Apr 24 20:51:50 PDT 2023; root:xnu-8796.121.2~5/RELEASE_X86_64
machine : x86_64
processor : i386
byteorder : little
LC_ALL : None
LANG : en_US.UTF-8
LOCALE : en_US.UTF-8
pandas : 2.1.0.dev0+942.gc65c8ddd73
numpy : 2.0.0.dev0+84.g828fba29e
pytz : 2023.3
dateutil : 2.8.2
setuptools : 67.8.0
pip : 23.1.2
Cython : None
pytest : None
hypothesis : None
sphinx : None
blosc : None
feather : None
xlsxwriter : None
lxml.etree : None
html5lib : None
pymysql : None
psycopg2 : None
jinja2 : None
IPython : 8.14.0
pandas_datareader: None
bs4 : None
bottleneck : None
brotli : None
fastparquet : None
fsspec : None
gcsfs : None
matplotlib : None
numba : None
numexpr : None
odfpy : None
openpyxl : None
pandas_gbq : None
pyarrow : None
pyreadstat : None
pyxlsb : None
s3fs : None
scipy : None
snappy : None
sqlalchemy : None
tables : None
tabulate : None
xarray : None
xlrd : None
zstandard : None
tzdata : 2023.3
qtpy : None
pyqt5 : None