
Commit c1627dd

WIP
1 parent 0e0e41b commit c1627dd

5 files changed: 16 additions, 12 deletions


pandas/_libs/hashtable_class_helper.pxi.in

Lines changed: 4 additions & 5 deletions
@@ -623,14 +623,13 @@ cdef class {{name}}HashTable(HashTable):
                     labels[i] = count
                     na_code = count
                     count += 1
+                    seen_na = True
                 if return_inverse:
                     idx = na_code
                     labels[i] = idx
                 continue
 
-
             k = kh_get_{{dtype}}(self.table, val)
-
             if k == self.table.n_buckets:
                 # k hasn't been seen yet
                 k = kh_put_{{dtype}}(self.table, val, &ret)
@@ -720,7 +719,7 @@ cdef class {{name}}HashTable(HashTable):
                    Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
                    object na_value=None):
         # -> np.ndarray[np.intp]
-        _, labels = self._unique(values, uniques, count_prior=count_prior,
+        _, _, labels = self._unique(values, uniques, count_prior=count_prior,
                                  na_sentinel=na_sentinel, na_value=na_value,
                                  ignore_na=True, return_inverse=True)
         return labels
@@ -1097,7 +1096,7 @@ cdef class StringHashTable(HashTable):
                    Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
                    object na_value=None):
         # -> np.ndarray[np.intp]
-        _, labels = self._unique(values, uniques, count_prior=count_prior,
+        _, _, labels = self._unique(values, uniques, count_prior=count_prior,
                                  na_sentinel=na_sentinel, na_value=na_value,
                                  ignore_na=True, return_inverse=True)
         return labels
@@ -1363,7 +1362,7 @@ cdef class PyObjectHashTable(HashTable):
                    Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
                    object na_value=None):
         # -> np.ndarray[np.intp]
-        _, labels = self._unique(values, uniques, count_prior=count_prior,
+        _, _, labels = self._unique(values, uniques, count_prior=count_prior,
                                  na_sentinel=na_sentinel, na_value=na_value,
                                  ignore_na=True, return_inverse=True)
         return labels
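
These hunks have `_unique` return a third value (hence `_, _, labels = ...`) and track missing values with the `seen_na` / `na_code` bookkeeping, so an NA can receive its own code instead of always falling back to `na_sentinel`. A minimal pure-Python sketch of that labeling idea, not the Cython template above; names and structure are illustrative only:

# Illustrative only: a plain-Python analogue of the seen_na / na_code
# bookkeeping, not the pandas hash table implementation.
import math


def factorize_with_na(values, ignore_na=True, na_sentinel=-1):
    """Return (uniques, labels). With ignore_na, NaN maps to na_sentinel;
    otherwise the first NaN is appended to uniques and gets its own code."""
    table = {}        # value -> code
    uniques = []
    labels = []
    seen_na = False
    na_code = -1
    count = 0
    for val in values:
        if isinstance(val, float) and math.isnan(val):
            if ignore_na:
                labels.append(na_sentinel)
                continue
            if not seen_na:
                uniques.append(val)
                na_code = count
                count += 1
                seen_na = True
            labels.append(na_code)
            continue
        if val not in table:
            table[val] = count
            uniques.append(val)
            count += 1
        labels.append(table[val])
    return uniques, labels


print(factorize_with_na([3.0, float("nan"), 3.0, 5.0], ignore_na=False))
# ([3.0, nan, 5.0], [0, 1, 0, 2])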

pandas/core/algorithms.py

Lines changed: 7 additions & 2 deletions
@@ -497,13 +497,13 @@ def f(c, v):
 
 
 def factorize_array(
-    values: np.ndarray,
+    arr: ArrayLike,
     na_sentinel: int = -1,
     size_hint: int | None = None,
     na_value=None,
     mask: npt.NDArray[np.bool_] | None = None,
     ignore_na: bool = True,
-) -> tuple[npt.NDArray[np.intp], np.ndarray]:
+) -> tuple[npt.NDArray[np.intp], ArrayLike]:
     """
     Factorize a numpy array to codes and uniques.
 
@@ -530,6 +530,11 @@ def factorize_array(
     codes : ndarray[np.intp]
     uniques : ndarray
     """
+    if is_extension_array_dtype(arr):
+        values, na_value = arr._values_for_factorize()
+    else:
+        values = arr
+
     original = values
     if values.dtype.kind in ["m", "M"]:
         # _get_hashtable_algo will cast dt64/td64 to i8 via _ensure_data, so we
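
This change widens `factorize_array` from `np.ndarray` to `ArrayLike` and, for extension arrays, pulls the values and NA marker out of `_values_for_factorize()`. A hedged illustration using the public entry point; the exact contents returned by `_values_for_factorize()` vary by array type:

import pandas as pd

arr = pd.array([1, None, 1, 3], dtype="Int64")   # masked extension array

# What _values_for_factorize() supplies: an array of values plus the
# sentinel used to mark missing entries (exact contents vary by EA type).
values, na_value = arr._values_for_factorize()

# Public factorize path for comparison.
codes, uniques = pd.factorize(arr)
print(codes)    # [ 0 -1  0  1]
print(uniques)  # <IntegerArray> [1, 3]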

pandas/core/arrays/arrow/array.py

Lines changed: 3 additions & 1 deletion
@@ -105,7 +105,9 @@ def copy(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
         return type(self)(self._data)
 
     @doc(ExtensionArray.factorize)
-    def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:
+    def factorize(
+        self, na_sentinel: int = -1, ignore_na=True
+    ) -> tuple[np.ndarray, ExtensionArray]:
         encoded = self._data.dictionary_encode()
         indices = pa.chunked_array(
             [c.indices for c in encoded.chunks], type=encoded.type.index_type
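
For context on the `dictionary_encode()` route used here: the dictionary indices play the role of factorize codes and the dictionary itself plays the role of uniques. A small stand-alone pyarrow sketch (plain `pyarrow.Array`, not the chunked handling above):

import pyarrow as pa

arr = pa.array(["a", "b", "a", "c", "a"])
encoded = arr.dictionary_encode()           # DictionaryArray

print(encoded.indices.to_pylist())          # [0, 1, 0, 2, 0]  -> codes
print(encoded.dictionary.to_pylist())       # ['a', 'b', 'c']  -> uniques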

pandas/core/arrays/sparse/array.py

Lines changed: 1 addition & 3 deletions
@@ -855,9 +855,7 @@ def factorize(
         # ExtensionArray.factorize -> Tuple[EA, EA]
         # Given that we have to return a dense array of codes, why bother
         # implementing an efficient factorize?
-        codes, uniques = algos.factorize(
-            np.asarray(self), na_sentinel=na_sentinel, ignore_na=ignore_na
-        )
+        codes, uniques = algos.factorize(np.asarray(self), na_sentinel=na_sentinel)
         uniques_sp = SparseArray(uniques, dtype=self.dtype)
         return codes, uniques_sp
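
With `ignore_na` dropped from the call, the sparse path simply factorizes the densified values and re-wraps the uniques in a `SparseArray`. Roughly what that looks like through the public API; the commented outputs are indicative:

import numpy as np
import pandas as pd

sp = pd.arrays.SparseArray([1.0, np.nan, 1.0, 2.0])
codes, uniques = sp.factorize()

print(codes)    # [ 0 -1  0  1]   dense ndarray of codes
print(uniques)  # uniques re-wrapped as SparseArray([1.0, 2.0])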

pandas/core/reshape/merge.py

Lines changed: 1 addition & 1 deletion
@@ -2243,7 +2243,7 @@ def _factorize_keys(
     # Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type
     # "Union[ndarray[Any, dtype[signedinteger[_64Bit]]],
     # ndarray[Any, dtype[object_]]]"; expected "ndarray[Any, dtype[object_]]"
-    llab, _, _ = rizer.factorize(lk)  # type: ignore[arg-type]
+    llab = rizer.factorize(lk)  # type: ignore[arg-type]
     # Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type
     # "Union[ndarray[Any, dtype[signedinteger[_64Bit]]],
     # ndarray[Any, dtype[object_]]]"; expected "ndarray[Any, dtype[object_]]"
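
The merge change reflects `Factorizer.factorize` returning just the label array on this branch. For reference, the shared-factorizer pattern behind `_factorize_keys` looks roughly like the sketch below; this uses an internal pandas API whose signature can differ across versions:

import numpy as np
from pandas._libs.hashtable import ObjectFactorizer

lk = np.array(["a", "b", "c"], dtype=object)
rk = np.array(["b", "d"], dtype=object)

# One factorizer for both sides, so equal keys receive equal codes.
rizer = ObjectFactorizer(max(len(lk), len(rk)))
llab = rizer.factorize(lk)          # e.g. [0, 1, 2]
rlab = rizer.factorize(rk)          # e.g. [1, 3] -- "b" reuses the left code
print(llab, rlab, rizer.get_count())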
