|
1 | 1 | from __future__ import annotations
|
2 | 2 |
|
3 | 3 | from distutils.version import LooseVersion
|
4 |
| -from typing import TYPE_CHECKING, Any, Sequence, Type, Union |
| 4 | +from typing import TYPE_CHECKING, Any, Sequence, Tuple, Type, Union |
5 | 5 |
|
6 | 6 | import numpy as np
|
7 | 7 |
|
|
15 | 15 | from pandas.api.types import (
|
16 | 16 | is_array_like,
|
17 | 17 | is_bool_dtype,
|
| 18 | + is_int64_dtype, |
18 | 19 | is_integer,
|
19 | 20 | is_integer_dtype,
|
20 | 21 | is_scalar,
|
21 | 22 | )
|
| 23 | +from pandas.core.algorithms import factorize |
22 | 24 | from pandas.core.arraylike import OpsMixin
|
23 | 25 | from pandas.core.arrays.base import ExtensionArray
|
24 | 26 | from pandas.core.indexers import check_array_indexer, validate_indices
|
@@ -252,9 +254,45 @@ def __len__(self) -> int:
|
252 | 254 | """
|
253 | 255 | return len(self._data)
|
254 | 256 |
|
255 |
@classmethod
def _from_factorized(cls, values, original):
    """
    Build an array of this class from factorized values.

    Parameters
    ----------
    values : sequence
        Values handed unchanged to ``cls._from_sequence``.
    original : object
        Accepted to satisfy the call signature; not used here.

    Returns
    -------
    object
        Whatever ``cls._from_sequence(values)`` returns.
    """
    rebuilt = cls._from_sequence(values)
    return rebuilt
def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ExtensionArray]:
    """
    Encode the extension array as an enumerated type.

    Parameters
    ----------
    na_sentinel : int, default -1
        Value to use in the `labels` array to indicate missing values.

    Returns
    -------
    labels : ndarray
        An integer NumPy array that's an indexer into the original
        ExtensionArray.
    uniques : ExtensionArray
        An ExtensionArray containing the unique values of `self`.

        .. note::

           uniques will *not* contain an entry for the NA value of
           the ExtensionArray if there are any missing values present
           in `self`.

    Raises
    ------
    NotImplementedError
        If the underlying Arrow data is already dictionary-typed.

    See Also
    --------
    pandas.factorize : Top-level factorize method that dispatches here.

    Notes
    -----
    :meth:`pandas.factorize` offers a `sort` keyword as well.
    """
    if pa.types.is_dictionary(self._data.type):
        raise NotImplementedError(
            "factorize is not implemented for dictionary-typed Arrow data"
        )

    if self._data.num_chunks == 1:
        # A single chunk can be dictionary-encoded by Arrow directly: the
        # dictionary holds the uniques and the indices are the labels.
        encoded = self._data.chunk(0).dictionary_encode()
        indices = encoded.indices.to_pandas()
        if indices.dtype.kind == "f":
            # Float indices mean the conversion produced NaN for missing
            # entries; replace those with the requested sentinel.
            indices[np.isnan(indices)] = na_sentinel
        # Always hand back int64 labels, regardless of the index width
        # Arrow chose or the platform's default integer size.
        indices = indices.astype(np.int64, copy=False)
        return indices.values, type(self)(encoded.dictionary)

    # Zero or several chunks: fall back to factorizing the materialized
    # NumPy values, then wrap the uniques so that this branch returns the
    # same array type as the single-chunk branch instead of a bare ndarray.
    np_array = self._data.to_pandas().values
    labels, uniques = factorize(np_array, na_sentinel=na_sentinel)
    return labels, type(self)(pa.array(uniques, type=self._data.type))
258 | 296 |
|
259 | 297 | @classmethod
|
260 | 298 | def _concat_same_type(cls, to_concat) -> ArrowStringArray:
|
|
0 commit comments