pandas-dev · jreback · May 5, 2021 · Apr 14, 2021 · Apr 30, 2021 · May 4, 2021
diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi
@@ -0,0 +1,242 @@
+from typing import (
+    Any,
+    Hashable,
+    Literal,
+)
+
+import numpy as np
+
+def unique_label_indices(
+    labels: np.ndarray,  # const int64_t[:]
+) -> np.ndarray: ...
+
+
+class Factorizer:
+    table: PyObjectHashTable
+    uniques: ObjectVector
+    count: int
+
+    def __init__(self, size_hint: int): ...
+    def get_count(self) -> int: ...
+
+    def factorize(
+        self,
+        values: np.ndarray,  # np.ndarray[object]
+        sort: bool = ...,
+        na_sentinel=...,
+        na_value=...,
+    ) -> np.ndarray: ...  # np.ndarray[np.intp]
+
+    def unique(
+        self,
+        values: np.ndarray,  # np.ndarray[object]
+    ) -> np.ndarray: ...  # np.ndarray[object]
+
+
+class Int64Factorizer:
+    table: Int64HashTable
+    uniques: Int64Vector
+    count: int
+
+    def __init__(self, size_hint: int): ...
+    def get_count(self) -> int: ...
+
+    def factorize(
+        self,
+        values: np.ndarray,  # const int64_t[:]
+        sort: bool = ...,
+        na_sentinel=...,
+        na_value=...,
+    ) -> np.ndarray: ...  # np.ndarray[np.intp]
+
+
+class Int64Vector:
+    def __init__(self): ...
+    def __len__(self) -> int: ...
+    def to_array(self) -> np.ndarray: ...  # np.ndarray[np.int64]
+
+class Int32Vector:
+    def __init__(self): ...
+    def __len__(self) -> int: ...
+    def to_array(self) -> np.ndarray: ...  # np.ndarray[np.int32]
+
+class Int16Vector:
+    def __init__(self): ...
+    def __len__(self) -> int: ...
+    def to_array(self) -> np.ndarray: ...  # np.ndarray[np.int16]
+
+class Int8Vector:
+    def __init__(self): ...
+    def __len__(self) -> int: ...
+    def to_array(self) -> np.ndarray: ...  # np.ndarray[np.int8]
+
+class UInt64Vector:
+    def __init__(self): ...
+    def __len__(self) -> int: ...
+    def to_array(self) -> np.ndarray: ...  # np.ndarray[np.uint64]
+
+class UInt32Vector:
+    def __init__(self): ...
+    def __len__(self) -> int: ...
+    def to_array(self) -> np.ndarray: ...  # np.ndarray[np.uint32]
+
+class UInt16Vector:
+    def __init__(self): ...
+    def __len__(self) -> int: ...
+    def to_array(self) -> np.ndarray: ...  # np.ndarray[np.uint16]
+
+class UInt8Vector:
+    def __init__(self): ...
+    def __len__(self) -> int: ...
+    def to_array(self) -> np.ndarray: ...  # np.ndarray[np.uint8]
+
+class Float64Vector:
+    def __init__(self): ...
+    def __len__(self) -> int: ...
+    def to_array(self) -> np.ndarray: ...  # np.ndarray[np.float64]
+
+class Float32Vector:
+    def __init__(self): ...
+    def __len__(self) -> int: ...
+    def to_array(self) -> np.ndarray: ...  # np.ndarray[np.float32]
+
+class Complex128Vector:
+    def __init__(self): ...
+    def __len__(self) -> int: ...
+    def to_array(self) -> np.ndarray: ...  # np.ndarray[np.complex128]
+
+class Complex64Vector:
+    def __init__(self): ...
+    def __len__(self) -> int: ...
+    def to_array(self) -> np.ndarray: ...  # np.ndarray[np.complex64]
+
+class StringVector:
+    def __init__(self): ...
+    def __len__(self) -> int: ...
+    def to_array(self) -> np.ndarray: ...  # np.ndarray[object]
+
+class ObjectVector:
+    def __init__(self): ...
+    def __len__(self) -> int: ...
+    def to_array(self) -> np.ndarray: ...  # np.ndarray[object]
+
+
+class HashTable:
+    # NB: The base HashTable class does _not_ actually have these methods;
+    #  we are putting them here for the sake of mypy to avoid
+    #  reproducing them in each subclass below.
+    def __init__(self, size_hint: int = ...): ...
+    def __len__(self) -> int: ...
+    def __contains__(self, key: Hashable) -> bool: ...
+    def sizeof(self, deep: bool = ...) -> int: ...
+    def get_state(self) -> dict[str, int]: ...
+
+    # TODO: `item` type is subclass-specific
+    def get_item(self, item): ...  # TODO: return type?
+    def set_item(self, item, val: int) -> None: ...  # stores `val` under `item`
+
+    # FIXME: we don't actually have this for StringHashTable or ObjectHashTable?
+    def map(
+        self,
+        keys: np.ndarray,     # np.ndarray[subclass-specific]
+        values: np.ndarray,   # const int64_t[:] values
+    ) -> None: ...
+
+    def map_locations(
+        self,
+        values: np.ndarray,  # np.ndarray[subclass-specific]
+    ) -> None: ...
+
+    def lookup(
+        self,
+        values: np.ndarray,  # np.ndarray[subclass-specific]
+    ) -> np.ndarray: ...     # np.ndarray[np.intp]
+
+    def get_labels(
+        self,
+        values: np.ndarray,  # np.ndarray[subclass-specific]
+        uniques,             # SubclassTypeVector
+        count_prior: int = ...,
+        na_sentinel: int = ...,
+        na_value: object = ...,
+    ) -> np.ndarray: ...  # np.ndarray[np.intp]
+
+    def unique(
+        self,
+        values: np.ndarray,  # np.ndarray[subclass-specific]
+        return_inverse: bool = ...,
+    ) -> tuple[
+        np.ndarray,  # np.ndarray[subclass-specific]
+        np.ndarray,  # np.ndarray[np.intp]
+    ] | np.ndarray: ...  # np.ndarray[subclass-specific]
+
+    def _unique(
+        self,
+        values: np.ndarray,  # np.ndarray[subclass-specific]
+        uniques,   # FooVector
+        count_prior: int = ...,
+        na_sentinel: int = ...,
+        na_value: object = ...,
+        ignore_na: bool = ...,
+        return_inverse: bool = ...,
+    ) -> tuple[
+        np.ndarray,  # np.ndarray[subclass-specific]
+        np.ndarray,  # np.ndarray[np.intp]
+    ] | np.ndarray: ...  # np.ndarray[subclass-specific]
+
+    def factorize(
+        self,
+        values: np.ndarray,  # np.ndarray[subclass-specific]
+        na_sentinel: int = ...,
+        na_value: object = ...,
+        mask=...,
+    ) -> tuple[
+        np.ndarray,  # np.ndarray[subclass-specific]
+        np.ndarray,  # np.ndarray[np.intp]
+    ]: ...
+
+class Complex128HashTable(HashTable): ...
+class Complex64HashTable(HashTable): ...
+class Float64HashTable(HashTable): ...
+class Float32HashTable(HashTable): ...
+
+class Int64HashTable(HashTable):
+    # Only Int64HashTable has get_labels_groupby
+    def get_labels_groupby(
+        self,
+        values: np.ndarray,  # const int64_t[:]
+    ) -> tuple[
+        np.ndarray,  # np.ndarray[np.intp]
+        np.ndarray,  # np.ndarray[np.int64]
+    ]: ...
+
+class Int32HashTable(HashTable): ...
+class Int16HashTable(HashTable): ...
+class Int8HashTable(HashTable): ...
+class UInt64HashTable(HashTable): ...
+class UInt32HashTable(HashTable): ...
+class UInt16HashTable(HashTable): ...
+class UInt8HashTable(HashTable): ...
+
+class StringHashTable(HashTable): ...
+class PyObjectHashTable(HashTable): ...
+
+
+def duplicated_int64(
+    values: np.ndarray,  # const int64_t[:] values
+    keep: Literal["last", "first", False] = ...,
+) -> np.ndarray: ...  # np.ndarray[bool]
+# TODO: Is it actually bool or is it uint8?
+
+def mode_int64(
+    values: np.ndarray,  # const int64_t[:] values
+    dropna: bool,
+) -> np.ndarray: ...  # np.ndarray[np.int64]
+
+def value_count_int64(
+    values: np.ndarray,  # const int64_t[:]
+    dropna: bool,
+) -> tuple[
+    np.ndarray,  # np.ndarray[np.int64]
+    np.ndarray,  # np.ndarray[np.int64]
+]: ...
diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -680,6 +680,7 @@ cdef class {{name}}HashTable(HashTable):
     def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
                    Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
                    object na_value=None):
+        # -> np.ndarray[np.intp]
         _, labels = self._unique(values, uniques, count_prior=count_prior,
                                  na_sentinel=na_sentinel, na_value=na_value,
                                  ignore_na=True, return_inverse=True)
@@ -1012,7 +1013,7 @@ cdef class StringHashTable(HashTable):
         -------
         uniques : ndarray[object]
             Unique values of input, not sorted
-        labels : ndarray[int64] (if return_inverse)
+        labels : ndarray[intp] (if return_inverse)
             The labels from values to uniques
         """
         uniques = ObjectVector()
@@ -1045,7 +1046,7 @@ cdef class StringHashTable(HashTable):
         -------
         uniques : ndarray[object]
             Unique values of input, not sorted
-        labels : ndarray[int64]
+        labels : ndarray[intp]
             The labels from values to uniques
         """
         uniques_vector = ObjectVector()
@@ -1056,6 +1057,7 @@ cdef class StringHashTable(HashTable):
     def get_labels(self, ndarray[object] values, ObjectVector uniques,
                    Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
                    object na_value=None):
+        # -> np.ndarray[np.intp]
         _, labels = self._unique(values, uniques, count_prior=count_prior,
                                  na_sentinel=na_sentinel, na_value=na_value,
                                  ignore_na=True, return_inverse=True)
@@ -1310,6 +1312,7 @@ cdef class PyObjectHashTable(HashTable):
     def get_labels(self, ndarray[object] values, ObjectVector uniques,
                    Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
                    object na_value=None):
+        # -> np.ndarray[np.intp]
         _, labels = self._unique(values, uniques, count_prior=count_prior,
                                  na_sentinel=na_sentinel, na_value=na_value,
                                  ignore_na=True, return_inverse=True)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -555,7 +555,7 @@ def factorize_array(
 
     Returns
     -------
-    codes : ndarray
+    codes : ndarray[np.intp]
     uniques : ndarray
     """
     hash_klass, values = get_data_algo(values)
@@ -907,9 +907,9 @@ def value_counts_arraylike(values, dropna: bool):
         f = getattr(htable, f"value_count_{ndtype}")
         keys, counts = f(values, dropna)
 
-    keys = _reconstruct_data(keys, original.dtype, original)
+    res_keys = _reconstruct_data(keys, original.dtype, original)
 
-    return keys, counts
+    return res_keys, counts
 
 
 def duplicated(values: ArrayLike, keep: str | bool = "first") -> np.ndarray:

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -5954,7 +5954,7 @@ def dropna(
     def drop_duplicates(
         self,
         subset: Hashable | Sequence[Hashable] | None = None,
-        keep: str | bool = "first",
+        keep: Literal["first"] | Literal["last"] | Literal[False] = "first",
         inplace: bool = False,
         ignore_index: bool = False,
     ) -> DataFrame | None:
@@ -6051,7 +6051,7 @@ def drop_duplicates(
     def duplicated(
         self,
         subset: Hashable | Sequence[Hashable] | None = None,
-        keep: str | bool = "first",
+        keep: Literal["first"] | Literal["last"] | Literal[False] = "first",
     ) -> Series:
         """
         Return boolean Series denoting duplicate rows.

diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
@@ -2138,6 +2138,7 @@ def _factorize_keys(
         # "_values_for_factorize"
         rk, _ = rk._values_for_factorize()  # type: ignore[union-attr,assignment]
 
+    klass: type[libhashtable.Factorizer] | type[libhashtable.Int64Factorizer]
     if is_integer_dtype(lk.dtype) and is_integer_dtype(rk.dtype):
         # GH#23917 TODO: needs tests for case where lk is integer-dtype
         #  and rk is datetime-dtype