Skip to content

Commit c53a3c2

Browse files
More or less copy/paste from fletcher
1 parent fb379d8 commit c53a3c2

File tree

1 file changed

+42
-4
lines changed

1 file changed

+42
-4
lines changed

pandas/core/arrays/string_arrow.py

Lines changed: 42 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from __future__ import annotations
22

33
from distutils.version import LooseVersion
4-
from typing import TYPE_CHECKING, Any, Sequence, Type, Union
4+
from typing import TYPE_CHECKING, Any, Sequence, Tuple, Type, Union
55

66
import numpy as np
77

@@ -15,10 +15,12 @@
1515
from pandas.api.types import (
1616
is_array_like,
1717
is_bool_dtype,
18+
is_int64_dtype,
1819
is_integer,
1920
is_integer_dtype,
2021
is_scalar,
2122
)
23+
from pandas.core.algorithms import factorize
2224
from pandas.core.arraylike import OpsMixin
2325
from pandas.core.arrays.base import ExtensionArray
2426
from pandas.core.indexers import check_array_indexer, validate_indices
@@ -252,9 +254,45 @@ def __len__(self) -> int:
252254
"""
253255
return len(self._data)
254256

255-
@classmethod
256-
def _from_factorized(cls, values, original):
257-
return cls._from_sequence(values)
257+
def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ExtensionArray]:
    """
    Encode the extension array as an enumerated type.

    Parameters
    ----------
    na_sentinel : int, default -1
        Value to use in the `labels` array to indicate missing values.

    Returns
    -------
    labels : ndarray
        An integer NumPy array that's an indexer into the original
        ExtensionArray.
    uniques : ExtensionArray
        An ExtensionArray containing the unique values of `self`.

        .. note::

           uniques will *not* contain an entry for the NA value of
           the ExtensionArray if there are any missing values present
           in `self`.

    Raises
    ------
    NotImplementedError
        If the underlying Arrow data is already dictionary-typed.

    See Also
    --------
    pandas.factorize : Top-level factorize method that dispatches here.

    Notes
    -----
    :meth:`pandas.factorize` offers a `sort` keyword as well.
    """
    if pa.types.is_dictionary(self._data.type):
        raise NotImplementedError(
            "factorize is not implemented for dictionary-encoded Arrow data"
        )

    if self._data.num_chunks == 1:
        # Single chunk: let Arrow dictionary-encode it; the encoded indices
        # are the labels and the dictionary holds the unique values.
        encoded = self._data.chunk(0).dictionary_encode()
        indices = encoded.indices.to_pandas()
        if indices.dtype.kind == "f":
            # A float result means nulls came back as NaN; replace them
            # with the sentinel before casting to an integer dtype.
            indices[np.isnan(indices)] = na_sentinel
        if not is_int64_dtype(indices):
            indices = indices.astype(np.int64)
        return indices.values, type(self)(encoded.dictionary)

    # Zero or several chunks: fall back to the generic NumPy-based
    # factorize, then wrap the uniques so that every branch returns an
    # array of this type, as the signature promises.
    np_array = self._data.to_pandas().values
    codes, uniques = factorize(np_array, na_sentinel=na_sentinel)
    return codes, type(self)._from_sequence(uniques)
258296

259297
@classmethod
260298
def _concat_same_type(cls, to_concat) -> ArrowStringArray:

0 commit comments

Comments
 (0)