Skip to content

Commit 6e28b43

Browse files
committed
WIP
1 parent 8e66efb commit 6e28b43

File tree

9 files changed

+104
-49
lines changed

9 files changed

+104
-49
lines changed

pandas/_libs/hashtable_class_helper.pxi.in

Lines changed: 65 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -560,13 +560,14 @@ cdef class {{name}}HashTable(HashTable):
560560
The labels from values to uniques
561561
"""
562562
cdef:
563-
Py_ssize_t i, idx, count = count_prior, n = len(values)
563+
Py_ssize_t i, idx, count = count_prior, n = len(values), na_index = -1, na_code = -1
564564
intp_t[::1] labels
565565
int ret = 0
566566
{{c_type}} val, na_value2
567567
khiter_t k
568568
{{name}}VectorData *ud
569-
bint use_na_value, use_mask
569+
bint use_na_value, use_mask, seen_na = False
570+
uint8_t is_na
570571
uint8_t[:] mask_values
571572

572573
if return_inverse:
@@ -592,20 +593,42 @@ cdef class {{name}}HashTable(HashTable):
592593
for i in range(n):
593594
val = {{to_c_type}}(values[i])
594595

595-
if ignore_na and use_mask:
596-
if mask_values[i]:
596+
if use_mask:
597+
is_na = mask_values[i]
598+
else:
599+
is_na = (
600+
is_nan_{{c_type}}(val) or
601+
(use_na_value and are_equivalent_{{c_type}}(val, na_value2))
602+
)
603+
604+
if is_na:
605+
if ignore_na:
606+
# if missing values do not count as unique values (i.e. if
607+
# ignore_na is True), skip the hashtable entry for them,
608+
# and replace the corresponding label with na_sentinel
597609
labels[i] = na_sentinel
598610
continue
599-
elif ignore_na and (
600-
is_nan_{{c_type}}(val) or
601-
(use_na_value and are_equivalent_{{c_type}}(val, na_value2))
602-
):
603-
# if missing values do not count as unique values (i.e. if
604-
# ignore_na is True), skip the hashtable entry for them,
605-
# and replace the corresponding label with na_sentinel
606-
labels[i] = na_sentinel
611+
612+
if not seen_na:
613+
if needs_resize(ud):
614+
with gil:
615+
if uniques.external_view_exists:
616+
raise ValueError("external reference to "
617+
"uniques held, but "
618+
"Vector.resize() needed")
619+
uniques.resize()
620+
append_data_{{dtype}}(ud, val)
621+
na_index = i
622+
if return_inverse:
623+
labels[i] = count
624+
na_code = count
625+
count += 1
626+
if return_inverse:
627+
idx = na_code
628+
labels[i] = idx
607629
continue
608630

631+
609632
k = kh_get_{{dtype}}(self.table, val)
610633

611634
if k == self.table.n_buckets:
@@ -620,6 +643,7 @@ cdef class {{name}}HashTable(HashTable):
620643
"Vector.resize() needed")
621644
uniques.resize()
622645
append_data_{{dtype}}(ud, val)
646+
623647
if return_inverse:
624648
self.table.vals[k] = count
625649
labels[i] = count
@@ -631,8 +655,8 @@ cdef class {{name}}HashTable(HashTable):
631655
labels[i] = idx
632656

633657
if return_inverse:
634-
return uniques.to_array(), labels.base # .base -> underlying ndarray
635-
return uniques.to_array()
658+
return uniques.to_array(), na_index, labels.base # .base -> underlying ndarray
659+
return uniques.to_array(), na_index
636660

637661
def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False):
638662
"""
@@ -1010,8 +1034,8 @@ cdef class StringHashTable(HashTable):
10101034
uniques.append(values[uindexer[i]])
10111035

10121036
if return_inverse:
1013-
return uniques.to_array(), labels.base # .base -> underlying ndarray
1014-
return uniques.to_array()
1037+
return uniques.to_array(), -1, labels.base # .base -> underlying ndarray
1038+
return uniques.to_array(), -1
10151039

10161040
def unique(self, ndarray[object] values, bint return_inverse=False):
10171041
"""
@@ -1222,12 +1246,13 @@ cdef class PyObjectHashTable(HashTable):
12221246
The labels from values to uniques
12231247
"""
12241248
cdef:
1225-
Py_ssize_t i, idx, count = count_prior, n = len(values)
1249+
Py_ssize_t i, idx, count = count_prior, n = len(values), na_index = -1
12261250
intp_t[::1] labels
12271251
int ret = 0
12281252
object val
12291253
khiter_t k
1230-
bint use_na_value
1254+
bint use_na_value, seen_na, is_na
1255+
intp_t na_code
12311256

12321257
if return_inverse:
12331258
labels = np.empty(n, dtype=np.intp)
@@ -1237,14 +1262,26 @@ cdef class PyObjectHashTable(HashTable):
12371262
val = values[i]
12381263
hash(val)
12391264

1240-
if ignore_na and (
1241-
checknull(val)
1242-
or (use_na_value and val == na_value)
1243-
):
1244-
# if missing values do not count as unique values (i.e. if
1245-
# ignore_na is True), skip the hashtable entry for them, and
1246-
# replace the corresponding label with na_sentinel
1247-
labels[i] = na_sentinel
1265+
is_na = checknull(val) or (use_na_value and val == na_value)
1266+
1267+
if is_na:
1268+
if ignore_na:
1269+
# if missing values do not count as unique values (i.e. if
1270+
# ignore_na is True), skip the hashtable entry for them, and
1271+
# replace the corresponding label with na_sentinel
1272+
labels[i] = na_sentinel
1273+
continue
1274+
1275+
if not seen_na:
1276+
seen_na = True
1277+
uniques.append(val)
1278+
na_index = i
1279+
if return_inverse:
1280+
labels[i] = count
1281+
na_code = count
1282+
count += 1
1283+
elif return_inverse:
1284+
labels[i] = na_code
12481285
continue
12491286

12501287
k = kh_get_pymap(self.table, <PyObject*>val)
@@ -1263,8 +1300,8 @@ cdef class PyObjectHashTable(HashTable):
12631300
labels[i] = idx
12641301

12651302
if return_inverse:
1266-
return uniques.to_array(), labels.base # .base -> underlying ndarray
1267-
return uniques.to_array()
1303+
return uniques.to_array(), na_index, labels.base # .base -> underlying ndarray
1304+
return uniques.to_array(), na_index
12681305

12691306
def unique(self, ndarray[object] values, bint return_inverse=False):
12701307
"""

pandas/core/algorithms.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -412,7 +412,7 @@ def unique(values):
412412
htable, values = _get_hashtable_algo(values)
413413

414414
table = htable(len(values))
415-
uniques = table.unique(values)
415+
uniques, _ = table.unique(values)
416416
uniques = _reconstruct_data(uniques, original.dtype, original)
417417
return uniques
418418

@@ -541,7 +541,7 @@ def factorize_array(
541541
hash_klass, values = _get_hashtable_algo(values)
542542

543543
table = hash_klass(size_hint or len(values))
544-
uniques, codes = table.factorize(
544+
uniques, na_index, codes = table.factorize(
545545
values,
546546
na_sentinel=na_sentinel,
547547
na_value=na_value,
@@ -553,7 +553,7 @@ def factorize_array(
553553
uniques = _reconstruct_data(uniques, original.dtype, original)
554554

555555
codes = ensure_platform_int(codes)
556-
return codes, uniques
556+
return codes, uniques, na_index
557557

558558

559559
@doc(
@@ -733,11 +733,13 @@ def factorize(
733733

734734
if not isinstance(values.dtype, np.dtype):
735735
# i.e. ExtensionDtype
736-
assert dropna or sort
737-
codes, uniques = values.factorize(na_sentinel=na_sentinel)
736+
# assert dropna or sort
737+
codes, uniques = values.factorize(
738+
na_sentinel=na_sentinel, ignore_na=dropna or sort
739+
)
738740
else:
739741
values = np.asarray(values) # convert DTA/TDA/MultiIndex
740-
codes, uniques = factorize_array(
742+
codes, uniques, _ = factorize_array(
741743
values,
742744
na_sentinel=na_sentinel,
743745
size_hint=size_hint,
@@ -749,8 +751,7 @@ def factorize(
749751
uniques, codes, na_sentinel=na_sentinel, assume_unique=True, verify=False
750752
)
751753

752-
# TODO: Fix
753-
if not dropna and (sort or not isinstance(values.dtype, np.dtype)):
754+
if not dropna and sort:
754755
code_is_na = codes == na_sentinel
755756
if code_is_na.any():
756757
# na_value is set based on the dtype of uniques, and compat set to False is

pandas/core/arrays/_mixins.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -605,7 +605,10 @@ def copy(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
605605
return type(self)(self._data)
606606

607607
@doc(ExtensionArray.factorize)
608-
def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:
608+
# TODO: ignore_na?
609+
def factorize(
610+
self, na_sentinel: int = -1, ignore_na: bool = True
611+
) -> tuple[np.ndarray, ExtensionArray]:
609612
encoded = self._data.dictionary_encode()
610613
indices = pa.chunked_array(
611614
[c.indices for c in encoded.chunks], type=encoded.type.index_type

pandas/core/arrays/base.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1002,7 +1002,9 @@ def _values_for_factorize(self) -> tuple[np.ndarray, Any]:
10021002
"""
10031003
return self.astype(object), np.nan
10041004

1005-
def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:
1005+
def factorize(
1006+
self, na_sentinel: int = -1, ignore_na: bool = True
1007+
) -> tuple[np.ndarray, ExtensionArray]:
10061008
"""
10071009
Encode the extension array as an enumerated type.
10081010
@@ -1043,8 +1045,8 @@ def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:
10431045
# Complete control over factorization.
10441046
arr, na_value = self._values_for_factorize()
10451047

1046-
codes, uniques = factorize_array(
1047-
arr, na_sentinel=na_sentinel, na_value=na_value, ignore_na=True
1048+
codes, uniques, _ = factorize_array(
1049+
arr, na_sentinel=na_sentinel, na_value=na_value, ignore_na=ignore_na
10481050
)
10491051

10501052
uniques_ea = self._from_factorized(uniques, self)

pandas/core/arrays/datetimelike.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1878,7 +1878,8 @@ def _with_freq(self, freq):
18781878

18791879
# --------------------------------------------------------------
18801880

1881-
def factorize(self, na_sentinel=-1, sort: bool = False):
1881+
# TODO: Fix?
1882+
def factorize(self, na_sentinel=-1, sort: bool = False, ignore_na: bool = True):
18821883
if self.freq is not None:
18831884
# We must be unique, so can short-circuit (and retain freq)
18841885
codes = np.arange(len(self), dtype=np.intp)
@@ -1888,7 +1889,7 @@ def factorize(self, na_sentinel=-1, sort: bool = False):
18881889
uniques = uniques[::-1]
18891890
return codes, uniques
18901891
# FIXME: shouldn't get here; we are ignoring sort
1891-
return super().factorize(na_sentinel=na_sentinel)
1892+
return super().factorize(na_sentinel=na_sentinel, ignore_na=ignore_na)[0]
18921893

18931894

18941895
# -------------------------------------------------------------------

pandas/core/arrays/masked.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -868,16 +868,23 @@ def searchsorted(
868868
return self._data.searchsorted(value, side=side, sorter=sorter)
869869

870870
@doc(ExtensionArray.factorize)
871-
def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:
871+
def factorize(
872+
self, na_sentinel: int = -1, ignore_na: bool = True
873+
) -> tuple[np.ndarray, ExtensionArray]:
872874
arr = self._data
873875
mask = self._mask
874876

875-
codes, uniques = factorize_array(arr, na_sentinel=na_sentinel, mask=mask)
877+
codes, uniques, na_index = factorize_array(
878+
arr, na_sentinel=na_sentinel, mask=mask, ignore_na=ignore_na
879+
)
876880

877881
# check that factorize_array correctly preserves dtype.
878882
assert uniques.dtype == self.dtype.numpy_dtype, (uniques.dtype, self.dtype)
879883

880-
uniques_ea = type(self)(uniques, np.zeros(len(uniques), dtype=bool))
884+
mask = np.zeros(len(uniques), dtype=bool)
885+
if na_index >= 0:
886+
mask[na_index] = True
887+
uniques_ea = type(self)(uniques, mask)
881888
return codes, uniques_ea
882889

883890
@doc(ExtensionArray._values_for_argsort)

pandas/core/arrays/sparse/array.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -847,13 +847,17 @@ def _values_for_factorize(self):
847847
# Still override this for hash_pandas_object
848848
return np.asarray(self), self.fill_value
849849

850-
def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, SparseArray]:
850+
def factorize(
851+
self, na_sentinel: int = -1, ignore_na: bool = True
852+
) -> tuple[np.ndarray, SparseArray]:
851853
# Currently, ExtensionArray.factorize -> Tuple[ndarray, EA]
852854
# The sparsity on this is backwards from what Sparse would want. Want
853855
# ExtensionArray.factorize -> Tuple[EA, EA]
854856
# Given that we have to return a dense array of codes, why bother
855857
# implementing an efficient factorize?
856-
codes, uniques = algos.factorize(np.asarray(self), na_sentinel=na_sentinel)
858+
codes, uniques = algos.factorize(
859+
np.asarray(self), na_sentinel=na_sentinel, ignore_na=ignore_na
860+
)
857861
uniques_sp = SparseArray(uniques, dtype=self.dtype)
858862
return codes, uniques_sp
859863

pandas/core/groupby/grouper.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -687,7 +687,7 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
687687
na_sentinel = None
688688
else:
689689
na_sentinel = -1
690-
codes, uniques = algorithms.factorize(
690+
codes, uniques, _ = algorithms.factorize(
691691
self.grouping_vector, sort=self._sort, na_sentinel=na_sentinel
692692
)
693693
return codes, uniques

pandas/core/reshape/merge.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2243,7 +2243,7 @@ def _factorize_keys(
22432243
# Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type
22442244
# "Union[ndarray[Any, dtype[signedinteger[_64Bit]]],
22452245
# ndarray[Any, dtype[object_]]]"; expected "ndarray[Any, dtype[object_]]"
2246-
llab = rizer.factorize(lk) # type: ignore[arg-type]
2246+
llab, _ = rizer.factorize(lk) # type: ignore[arg-type]
22472247
# Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type
22482248
# "Union[ndarray[Any, dtype[signedinteger[_64Bit]]],
22492249
# ndarray[Any, dtype[object_]]]"; expected "ndarray[Any, dtype[object_]]"

0 commit comments

Comments (0)