-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
PERF/ENH: add fast astyping for Categorical #37355
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 46 commits
5d82b02
c18ae4e
d7c0575
856995f
57817a4
1050d9e
c8c05cc
b8141c4
3d3bcf1
3714d09
f8f501f
f4b5952
2ec7ded
cd110bc
113a569
341ceb6
c5f3fd4
9943bb9
c720536
f96a20d
37e3264
190c015
f9a3040
568aa7f
6860e48
f2aa2ef
d226d84
9a9e24a
a323544
07b2a65
229bfc7
da12be0
e5ede6d
93f3e1a
9cb5fe3
f55964e
b342135
73e0442
19e22e2
d195d91
3351cb1
38696d9
071deec
13fa086
dda6804
1016894
a9544b3
527b15a
9c29946
7e9fc32
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -402,20 +402,40 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: | |
If copy is set to False and dtype is categorical, the original | ||
object is returned. | ||
""" | ||
if is_categorical_dtype(dtype): | ||
if self.dtype is dtype: | ||
result = self.copy() if copy else self | ||
|
||
elif is_categorical_dtype(dtype): | ||
dtype = cast(Union[str, CategoricalDtype], dtype) | ||
|
||
# GH 10696/18593/18630 | ||
dtype = self.dtype.update_dtype(dtype) | ||
result = self.copy() if copy else self | ||
if dtype == self.dtype: | ||
return result | ||
return result._set_dtype(dtype) | ||
if is_extension_array_dtype(dtype): | ||
return array(self, dtype=dtype, copy=copy) | ||
if is_integer_dtype(dtype) and self.isna().any(): | ||
self = self.copy() if copy else self | ||
result = self._set_dtype(dtype) | ||
|
||
# TODO: consolidate with ndarray case? | ||
elif is_extension_array_dtype(dtype): | ||
result = array(self, dtype=dtype, copy=copy) | ||
|
||
elif is_integer_dtype(dtype) and self.isna().any(): | ||
raise ValueError("Cannot convert float NaN to integer") | ||
return np.array(self, dtype=dtype, copy=copy) | ||
|
||
elif len(self.codes) == 0 or len(self.categories) == 0: | ||
result = np.array(self, dtype=dtype, copy=copy) | ||
|
||
else: | ||
# GH8628 (PERF): astype category codes instead of astyping array | ||
try: | ||
astyped_cats = self.categories.astype(dtype=dtype, copy=copy) | ||
except (TypeError, ValueError): | ||
raise ValueError( | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Why change TypeError?
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
It's to fix the error message for:
In [2]: idx = pd.CategoricalIndex(["a", "b", "c", "a", "b", "c"])
In [3]: idx.astype('float')
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/workspaces/pandas-arw2019/pandas/core/indexes/base.py in astype(self, dtype, copy)
700 try:
--> 701 casted = self._values.astype(dtype, copy=copy)
702 except (TypeError, ValueError) as err:
ValueError: could not convert string to float: 'a'
The above exception was the direct cause of the following exception:
TypeError Traceback (most recent call last)
<ipython-input-4-38d56ec15c36> in <module>
----> 1 idx.astype('float')
/workspaces/pandas-arw2019/pandas/core/indexes/category.py in astype(self, dtype, copy)
369 @doc(Index.astype)
370 def astype(self, dtype, copy=True):
--> 371 res_data = self._data.astype(dtype, copy=copy)
372 return Index(res_data, name=self.name)
373
/workspaces/pandas-arw2019/pandas/core/arrays/categorical.py in astype(self, dtype, copy)
427 # GH8628 (PERF): astype category codes instead of astyping array
428 try:
--> 429 astyped_cats = self.categories.astype(dtype=dtype, copy=copy)
430 except (ValueError):
431 raise ValueError(
/workspaces/pandas-arw2019/pandas/core/indexes/base.py in astype(self, dtype, copy)
701 casted = self._values.astype(dtype, copy=copy)
702 except (TypeError, ValueError) as err:
--> 703 raise TypeError(
704 f"Cannot cast {type(self).__name__} to dtype {dtype}"
705 ) from err
TypeError: Cannot cast Index to dtype float64
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
OK, this is fine, but can you add a comment for this right here (so future readers understand)? |
||
f"Cannot cast {self.categories.dtype} dtype to {dtype}" | ||
) | ||
|
||
astyped_cats = extract_array(astyped_cats, extract_numpy=True) | ||
result = take_1d(astyped_cats, libalgos.ensure_platform_int(self._codes)) | ||
|
||
return result | ||
|
||
@cache_readonly | ||
def itemsize(self) -> int: | ||
|
Uh oh!
There was an error while loading. Please reload this page.