moreless copy/paste from fletcher

simonjayhawkins · simonjayhawkins · commit c53a3c2dd7b1 · 2020-11-21T19:54:02.000Z
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 from distutils.version import LooseVersion
-from typing import TYPE_CHECKING, Any, Sequence, Type, Union
+from typing import TYPE_CHECKING, Any, Sequence, Tuple, Type, Union
 
 import numpy as np
 
@@ -15,10 +15,12 @@
 from pandas.api.types import (
     is_array_like,
     is_bool_dtype,
+    is_int64_dtype,
     is_integer,
     is_integer_dtype,
     is_scalar,
 )
+from pandas.core.algorithms import factorize
 from pandas.core.arraylike import OpsMixin
 from pandas.core.arrays.base import ExtensionArray
 from pandas.core.indexers import check_array_indexer, validate_indices
@@ -252,9 +254,45 @@ def __len__(self) -> int:
         """
         return len(self._data)
 
-    @classmethod
-    def _from_factorized(cls, values, original):
-        return cls._from_sequence(values)
+    def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ExtensionArray]:
+        """Encode the extension array as an enumerated type.
+        Parameters
+        ----------
+        na_sentinel : int, default -1
+            Value to use in the `labels` array to indicate missing values.
+        Returns
+        -------
+        labels : ndarray
+            An integer NumPy array that's an indexer into the original
+            ExtensionArray.
+        uniques : ExtensionArray
+            An ExtensionArray containing the unique values of `self`.
+            .. note::
+               uniques will *not* contain an entry for the NA value of
+               the ExtensionArray if there are any missing values present
+               in `self`.
+        See Also
+        --------
+        pandas.factorize : Top-level factorize method that dispatches here.
+        Notes
+        -----
+        :meth:`pandas.factorize` offers a `sort` keyword as well.
+        """
+        if pa.types.is_dictionary(self._data.type):
+            raise NotImplementedError()
+        elif self._data.num_chunks == 1:
+            # Dictionaryencode and do the same as above
+            encoded = self._data.chunk(0).dictionary_encode()
+            indices = encoded.indices.to_pandas()
+            if indices.dtype.kind == "f":
+                indices[np.isnan(indices)] = na_sentinel
+                indices = indices.astype(int)
+            if not is_int64_dtype(indices):
+                indices = indices.astype(np.int64)
+            return indices.values, type(self)(encoded.dictionary)
+        else:
+            np_array = self._data.to_pandas().values
+            return factorize(np_array, na_sentinel=na_sentinel)
 
     @classmethod
     def _concat_same_type(cls, to_concat) -> ArrowStringArray: