diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi new file mode 100644 index 0000000000000..b6278b3956a1d --- /dev/null +++ b/pandas/_libs/hashtable.pyi @@ -0,0 +1,242 @@ +from typing import ( + Any, + Hashable, + Literal, +) + +import numpy as np + +def unique_label_indices( + labels: np.ndarray, # const int64_t[:] +) -> np.ndarray: ... + + +class Factorizer: + table: PyObjectHashTable + uniques: ObjectVector + count: int + + def __init__(self, size_hint: int): ... + def get_count(self) -> int: ... + + def factorize( + self, + values: np.ndarray, # np.ndarray[object] + sort: bool = ..., + na_sentinel=..., + na_value=..., + ) -> np.ndarray: ... # np.ndarray[np.intp] + + def unique( + self, + values: np.ndarray, # np.ndarray[object] + ) -> np.ndarray: ... # np.ndarray[object] + + +class Int64Factorizer: + table: Int64HashTable + uniques: Int64Vector + count: int + + def __init__(self, size_hint: int): ... + def get_count(self) -> int: ... + + def factorize( + self, + values: np.ndarray, # const int64_t[:] + sort: bool = ..., + na_sentinel=..., + na_value=..., + ) -> np.ndarray: ... # np.ndarray[np.intp] + + +class Int64Vector: + def __init__(self): ... + def __len__(self) -> int: ... + def to_array(self) -> np.ndarray: ... # np.ndarray[np.int64] + +class Int32Vector: + def __init__(self): ... + def __len__(self) -> int: ... + def to_array(self) -> np.ndarray: ... # np.ndarray[np.int32] + +class Int16Vector: + def __init__(self): ... + def __len__(self) -> int: ... + def to_array(self) -> np.ndarray: ... # np.ndarray[np.int16] + +class Int8Vector: + def __init__(self): ... + def __len__(self) -> int: ... + def to_array(self) -> np.ndarray: ... # np.ndarray[np.int8] + +class UInt64Vector: + def __init__(self): ... + def __len__(self) -> int: ... + def to_array(self) -> np.ndarray: ... # np.ndarray[np.uint64] + +class UInt32Vector: + def __init__(self): ... + def __len__(self) -> int: ... + def to_array(self) -> np.ndarray: ... 
# np.ndarray[np.uint32] + +class UInt16Vector: + def __init__(self): ... + def __len__(self) -> int: ... + def to_array(self) -> np.ndarray: ... # np.ndarray[np.uint16] + +class UInt8Vector: + def __init__(self): ... + def __len__(self) -> int: ... + def to_array(self) -> np.ndarray: ... # np.ndarray[np.uint8] + +class Float64Vector: + def __init__(self): ... + def __len__(self) -> int: ... + def to_array(self) -> np.ndarray: ... # np.ndarray[np.float64] + +class Float32Vector: + def __init__(self): ... + def __len__(self) -> int: ... + def to_array(self) -> np.ndarray: ... # np.ndarray[np.float32] + +class Complex128Vector: + def __init__(self): ... + def __len__(self) -> int: ... + def to_array(self) -> np.ndarray: ... # np.ndarray[np.complex128] + +class Complex64Vector: + def __init__(self): ... + def __len__(self) -> int: ... + def to_array(self) -> np.ndarray: ... # np.ndarray[np.complex64] + +class StringVector: + def __init__(self): ... + def __len__(self) -> int: ... + def to_array(self) -> np.ndarray: ... # np.ndarray[object] + +class ObjectVector: + def __init__(self): ... + def __len__(self) -> int: ... + def to_array(self) -> np.ndarray: ... # np.ndarray[object] + + +class HashTable: + # NB: The base HashTable class does _not_ actually have these methods; + # we are putting them here for the sake of mypy to avoid + # reproducing them in each subclass below. + def __init__(self, size_hint: int = ...): ... + def __len__(self) -> int: ... + def __contains__(self, key: Hashable) -> bool: ... + def sizeof(self, deep: bool = ...) -> int: ... + def get_state(self) -> dict[str, int]: ... + + # TODO: `item` type is subclass-specific + def get_item(self, item): ... # TODO: return type? + def set_item(self, item) -> None: ... + + # FIXME: we don't actually have this for StringHashTable or PyObjectHashTable? + def map( + self, + keys: np.ndarray, # np.ndarray[subclass-specific] + values: np.ndarray, # const int64_t[:] + ) -> None: ... 
+ + def map_locations( + self, + values: np.ndarray, # np.ndarray[subclass-specific] + ) -> None: ... + + def lookup( + self, + values: np.ndarray, # np.ndarray[subclass-specific] + ) -> np.ndarray: ... # np.ndarray[np.intp] + + def get_labels( + self, + values: np.ndarray, # np.ndarray[subclass-specific] + uniques, # SubclassTypeVector + count_prior: int = ..., + na_sentinel: int = ..., + na_value: object = ..., + ) -> np.ndarray: ... # np.ndarray[np.intp] + + def unique( + self, + values: np.ndarray, # np.ndarray[subclass-specific] + return_inverse: bool = ..., + ) -> tuple[ + np.ndarray, # np.ndarray[subclass-specific] + np.ndarray, # np.ndarray[np.intp] + ] | np.ndarray: ... # np.ndarray[subclass-specific] + + def _unique( + self, + values: np.ndarray, # np.ndarray[subclass-specific] + uniques, # SubclassTypeVector + count_prior: int = ..., + na_sentinel: int = ..., + na_value: object = ..., + ignore_na: bool = ..., + return_inverse: bool = ..., + ) -> tuple[ + np.ndarray, # np.ndarray[subclass-specific] + np.ndarray, # np.ndarray[np.intp] + ] | np.ndarray: ... # np.ndarray[subclass-specific] + + def factorize( + self, + values: np.ndarray, # np.ndarray[subclass-specific] + na_sentinel: int = ..., + na_value: object = ..., + mask=..., + ) -> tuple[ + np.ndarray, # np.ndarray[subclass-specific] + np.ndarray, # np.ndarray[np.intp] + ]: ... + +class Complex128HashTable(HashTable): ... +class Complex64HashTable(HashTable): ... +class Float64HashTable(HashTable): ... +class Float32HashTable(HashTable): ... + +class Int64HashTable(HashTable): + # Only Int64HashTable has get_labels_groupby + def get_labels_groupby( + self, + values: np.ndarray, # const int64_t[:] + ) -> tuple[ + np.ndarray, # np.ndarray[np.intp] + np.ndarray, # np.ndarray[np.int64] + ]: ... + +class Int32HashTable(HashTable): ... +class Int16HashTable(HashTable): ... +class Int8HashTable(HashTable): ... +class UInt64HashTable(HashTable): ... +class UInt32HashTable(HashTable): ... 
+class UInt16HashTable(HashTable): ... +class UInt8HashTable(HashTable): ... + +class StringHashTable(HashTable): ... +class PyObjectHashTable(HashTable): ... + + +def duplicated_int64( + values: np.ndarray, # const int64_t[:] + keep: Literal["last", "first", False] = ..., +) -> np.ndarray: ... # np.ndarray[bool] +# TODO: Is it actually bool or is it uint8? + +def mode_int64( + values: np.ndarray, # const int64_t[:] + dropna: bool, +) -> np.ndarray: ... # np.ndarray[np.int64] + +def value_count_int64( + values: np.ndarray, # const int64_t[:] + dropna: bool, +) -> tuple[ + np.ndarray, # np.ndarray[np.int64] + np.ndarray, # np.ndarray[np.int64] +]: ... diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index a25867c4a3b0c..4cacd3245f9d8 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -680,6 +680,7 @@ cdef class {{name}}HashTable(HashTable): def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None): + # -> np.ndarray[np.intp] _, labels = self._unique(values, uniques, count_prior=count_prior, na_sentinel=na_sentinel, na_value=na_value, ignore_na=True, return_inverse=True) @@ -1012,7 +1013,7 @@ cdef class StringHashTable(HashTable): ------- uniques : ndarray[object] Unique values of input, not sorted - labels : ndarray[int64] (if return_inverse) + labels : ndarray[intp] (if return_inverse) The labels from values to uniques """ uniques = ObjectVector() @@ -1045,7 +1046,7 @@ cdef class StringHashTable(HashTable): ------- uniques : ndarray[object] Unique values of input, not sorted - labels : ndarray[int64] + labels : ndarray[intp] The labels from values to uniques """ uniques_vector = ObjectVector() @@ -1056,6 +1057,7 @@ cdef class StringHashTable(HashTable): def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior=0, 
Py_ssize_t na_sentinel=-1, object na_value=None): + # -> np.ndarray[np.intp] _, labels = self._unique(values, uniques, count_prior=count_prior, na_sentinel=na_sentinel, na_value=na_value, ignore_na=True, return_inverse=True) @@ -1310,6 +1312,7 @@ cdef class PyObjectHashTable(HashTable): def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None): + # -> np.ndarray[np.intp] _, labels = self._unique(values, uniques, count_prior=count_prior, na_sentinel=na_sentinel, na_value=na_value, ignore_na=True, return_inverse=True) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 2c4477056a112..95aed9cff123b 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -555,7 +555,7 @@ def factorize_array( Returns ------- - codes : ndarray + codes : ndarray[np.intp] uniques : ndarray """ hash_klass, values = get_data_algo(values) @@ -907,9 +907,9 @@ def value_counts_arraylike(values, dropna: bool): f = getattr(htable, f"value_count_{ndtype}") keys, counts = f(values, dropna) - keys = _reconstruct_data(keys, original.dtype, original) + res_keys = _reconstruct_data(keys, original.dtype, original) - return keys, counts + return res_keys, counts def duplicated(values: ArrayLike, keep: str | bool = "first") -> np.ndarray: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8e12a8cb18b68..0f14dcf0df0bb 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5954,7 +5954,7 @@ def dropna( def drop_duplicates( self, subset: Hashable | Sequence[Hashable] | None = None, - keep: str | bool = "first", + keep: Literal["first"] | Literal["last"] | Literal[False] = "first", inplace: bool = False, ignore_index: bool = False, ) -> DataFrame | None: @@ -6051,7 +6051,7 @@ def drop_duplicates( def duplicated( self, subset: Hashable | Sequence[Hashable] | None = None, - keep: str | bool = "first", + keep: Literal["first"] | Literal["last"] | Literal[False] = 
"first", ) -> Series: """ Return boolean Series denoting duplicate rows. diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index cdb2702e7f867..8478e2a17efa5 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2138,6 +2138,7 @@ def _factorize_keys( # "_values_for_factorize" rk, _ = rk._values_for_factorize() # type: ignore[union-attr,assignment] + klass: type[libhashtable.Factorizer] | type[libhashtable.Int64Factorizer] if is_integer_dtype(lk.dtype) and is_integer_dtype(rk.dtype): # GH#23917 TODO: needs tests for case where lk is integer-dtype # and rk is datetime-dtype