Skip to content

TYP: _libs.hashtable #41246

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
May 5, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
242 changes: 242 additions & 0 deletions pandas/_libs/hashtable.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,242 @@
from typing import (
Any,
Hashable,
Literal,
)

import numpy as np

def unique_label_indices(labels: np.ndarray) -> np.ndarray:
    # Typing stub only: no implementation lives here.
    # ``labels`` corresponds to a ``const int64_t[:]`` buffer.
    ...


class Factorizer:
    # Typing stub for the factorizer whose ``values`` inputs are documented
    # below as object-dtype ndarrays.
    table: PyObjectHashTable
    uniques: ObjectVector
    count: int

    def __init__(self, size_hint: int) -> None: ...
    def get_count(self) -> int: ...

    # NOTE(review): ``na_sentinel`` / ``na_value`` are left unannotated, as in
    # the original stub; HashTable.get_labels declares them as ``int`` and
    # ``object`` -- confirm whether the same types apply here.
    def factorize(
        self,
        values: np.ndarray,  # np.ndarray[object]
        sort: bool = ...,
        na_sentinel=...,
        na_value=...,
    ) -> np.ndarray: ...  # np.ndarray[intp]

    def unique(
        self,
        values: np.ndarray,  # np.ndarray[object]
    ) -> np.ndarray: ...  # np.ndarray[object]


class Int64Factorizer:
    # Typing stub for the factorizer whose ``values`` input is documented
    # below as a ``const int64_t[:]`` buffer.
    table: Int64HashTable
    uniques: Int64Vector
    count: int

    def __init__(self, size_hint: int) -> None: ...
    def get_count(self) -> int: ...

    # NOTE(review): ``na_sentinel`` / ``na_value`` are left unannotated, as in
    # the original stub -- confirm the intended types before tightening them.
    def factorize(
        self,
        values: np.ndarray,  # const int64_t[:]
        sort: bool = ...,
        na_sentinel=...,
        na_value=...,
    ) -> np.ndarray: ...  # np.ndarray[intp]


class Int64Vector:
    # Stub for a sized container (it declares ``__len__``) whose contents
    # are exported through ``to_array``.
    def __init__(self) -> None: ...
    def __len__(self) -> int: ...
    def to_array(self) -> np.ndarray: ...  # np.ndarray[np.int64]

class Int32Vector:
    # Stub for a sized container (it declares ``__len__``) whose contents
    # are exported through ``to_array``.
    def __init__(self) -> None: ...
    def __len__(self) -> int: ...
    def to_array(self) -> np.ndarray: ...  # np.ndarray[np.int32]

class Int16Vector:
    # Stub for a sized container (it declares ``__len__``) whose contents
    # are exported through ``to_array``.
    def __init__(self) -> None: ...
    def __len__(self) -> int: ...
    def to_array(self) -> np.ndarray: ...  # np.ndarray[np.int16]

class Int8Vector:
    # Stub for a sized container (it declares ``__len__``) whose contents
    # are exported through ``to_array``.
    def __init__(self) -> None: ...
    def __len__(self) -> int: ...
    def to_array(self) -> np.ndarray: ...  # np.ndarray[np.int8]

class UInt64Vector:
    # Stub for a sized container (it declares ``__len__``) whose contents
    # are exported through ``to_array``.
    def __init__(self) -> None: ...
    def __len__(self) -> int: ...
    def to_array(self) -> np.ndarray: ...  # np.ndarray[np.uint64]

class UInt32Vector:
    # Stub for a sized container (it declares ``__len__``) whose contents
    # are exported through ``to_array``.
    def __init__(self) -> None: ...
    def __len__(self) -> int: ...
    def to_array(self) -> np.ndarray: ...  # np.ndarray[np.uint32]

class UInt16Vector:
    # Stub for a sized container (it declares ``__len__``) whose contents
    # are exported through ``to_array``.
    def __init__(self) -> None: ...
    def __len__(self) -> int: ...
    def to_array(self) -> np.ndarray: ...  # np.ndarray[np.uint16]

class UInt8Vector:
    # Stub for a sized container (it declares ``__len__``) whose contents
    # are exported through ``to_array``.
    def __init__(self) -> None: ...
    def __len__(self) -> int: ...
    def to_array(self) -> np.ndarray: ...  # np.ndarray[np.uint8]

class Float64Vector:
    # Stub for a sized container (it declares ``__len__``) whose contents
    # are exported through ``to_array``.
    def __init__(self) -> None: ...
    def __len__(self) -> int: ...
    def to_array(self) -> np.ndarray: ...  # np.ndarray[np.float64]

class Float32Vector:
    # Stub for a sized container (it declares ``__len__``) whose contents
    # are exported through ``to_array``.
    def __init__(self) -> None: ...
    def __len__(self) -> int: ...
    def to_array(self) -> np.ndarray: ...  # np.ndarray[np.float32]

class Complex128Vector:
    # Stub for a sized container (it declares ``__len__``) whose contents
    # are exported through ``to_array``.
    def __init__(self) -> None: ...
    def __len__(self) -> int: ...
    def to_array(self) -> np.ndarray: ...  # np.ndarray[np.complex128]

class Complex64Vector:
    # Stub for a sized container (it declares ``__len__``) whose contents
    # are exported through ``to_array``.
    def __init__(self) -> None: ...
    def __len__(self) -> int: ...
    def to_array(self) -> np.ndarray: ...  # np.ndarray[np.complex64]

class StringVector:
    # Stub for a sized container (it declares ``__len__``) whose contents
    # are exported through ``to_array``.
    def __init__(self) -> None: ...
    def __len__(self) -> int: ...
    def to_array(self) -> np.ndarray: ...  # np.ndarray[object]

class ObjectVector:
    # Stub for a sized container (it declares ``__len__``) whose contents
    # are exported through ``to_array``.
    def __init__(self) -> None: ...
    def __len__(self) -> int: ...
    def to_array(self) -> np.ndarray: ...  # np.ndarray[object]


class HashTable:
# NB: The base HashTable class does _not_ actually have these methods;
# we are putting the here for the sake of mypy to avoid
# reproducing them in each subclass below.
def __init__(self, size_hint: int = ...): ...
def __len__(self) -> int: ...
def __contains__(self, key: Hashable) -> bool: ...
def sizeof(self, deep: bool = ...) -> int: ...
def get_state(self) -> dict[str, int]: ...

# TODO: `item` type is subclass-specific
def get_item(self, item): ... # TODO: return type?
def set_item(self, item) -> None: ...

# FIXME: we don't actually have this for StringHashTable or ObjectHashTable?
def map(
self,
keys: np.ndarray, # np.ndarray[subclass-specific]
values: np.ndarray, # const int64_t[:] values
) -> None: ...

def map_locations(
self,
values: np.ndarray, # np.ndarray[subclass-specific]
) -> None: ...

def lookup(
self,
values: np.ndarray, # np.ndarray[subclass-specific]
) -> np.ndarray: ... # np.ndarray[np.intp]

def get_labels(
self,
values: np.ndarray, # np.ndarray[subclass-specific]
uniques, # SubclassTypeVector
count_prior: int = ...,
na_sentinel: int = ...,
na_value: object = ...,
) -> np.ndarray: ... # np.ndarray[intp_t]

def unique(
self,
values: np.ndarray, # np.ndarray[subclass-specific]
return_inverse: bool = ...,
) -> tuple[
np.ndarray, # np.ndarray[subclass-specific]
np.ndarray, # np.ndarray[np.intp],
] | np.ndarray: ... # np.ndarray[subclass-specific]

def _unique(
self,
values: np.ndarray, # np.ndarray[subclass-specific]
uniques, # FooVector
count_prior: int = ...,
na_sentinel: int = ...,
na_value: object = ...,
ignore_na: bool = ...,
return_inverse: bool = ...,
) -> tuple[
np.ndarray, # np.ndarray[subclass-specific]
np.ndarray, # np.ndarray[np.intp],
] | np.ndarray: ... # np.ndarray[subclass-specific]

def factorize(
self,
values: np.ndarray, # np.ndarray[subclass-specific]
na_sentinel: int = ...,
na_value: object = ...,
mask=...,
) -> tuple[
np.ndarray, # np.ndarray[subclass-specific]
np.ndarray, # np.ndarray[np.intp],
]: ...

# Complex- and float-keyed tables: each declares no members of its own and
# takes its whole typed interface from the HashTable stub.
class Complex128HashTable(HashTable): ...
class Complex64HashTable(HashTable): ...
class Float64HashTable(HashTable): ...
class Float32HashTable(HashTable): ...

class Int64HashTable(HashTable):
    # Adds one method on top of the interface declared on the HashTable stub.
    # Only Int64HashTable has get_labels_groupby
    def get_labels_groupby(
        self,
        values: np.ndarray,  # const int64_t[:]
    ) -> tuple[
        np.ndarray,  # np.ndarray[np.intp]
        np.ndarray,  # np.ndarray[np.int64]
    ]: ...

# Remaining signed/unsigned integer-keyed tables: each declares no members of
# its own and takes its whole typed interface from the HashTable stub.
class Int32HashTable(HashTable): ...
class Int16HashTable(HashTable): ...
class Int8HashTable(HashTable): ...
class UInt64HashTable(HashTable): ...
class UInt32HashTable(HashTable): ...
class UInt16HashTable(HashTable): ...
class UInt8HashTable(HashTable): ...

# String- and object-keyed tables: each declares no members of its own and
# takes its whole typed interface from the HashTable stub.
class StringHashTable(HashTable): ...
class PyObjectHashTable(HashTable): ...


def duplicated_int64(
    values: np.ndarray,  # const int64_t[:] values
    keep: Literal["last", "first", False] = ...,
) -> np.ndarray:
    # Typing stub only. Documented result: np.ndarray[bool]
    # TODO: Is it actually bool or is it uint8?
    ...

def mode_int64(values: np.ndarray, dropna: bool) -> np.ndarray:
    # Typing stub only.
    # ``values``: const int64_t[:] values
    # result: np.ndarray[np.int64]
    ...

def value_count_int64(
    values: np.ndarray,  # const int64_t[:]
    dropna: bool,
) -> tuple[np.ndarray, np.ndarray]:
    # Typing stub only. Both elements of the declared pair are documented
    # as np.ndarray[np.int64].
    ...
7 changes: 5 additions & 2 deletions pandas/_libs/hashtable_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -680,6 +680,7 @@ cdef class {{name}}HashTable(HashTable):
def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
object na_value=None):
# -> np.ndarray[np.intp]
_, labels = self._unique(values, uniques, count_prior=count_prior,
na_sentinel=na_sentinel, na_value=na_value,
ignore_na=True, return_inverse=True)
Expand Down Expand Up @@ -1012,7 +1013,7 @@ cdef class StringHashTable(HashTable):
-------
uniques : ndarray[object]
Unique values of input, not sorted
labels : ndarray[int64] (if return_inverse)
labels : ndarray[intp_t] (if return_inverse)
The labels from values to uniques
"""
uniques = ObjectVector()
Expand Down Expand Up @@ -1045,7 +1046,7 @@ cdef class StringHashTable(HashTable):
-------
uniques : ndarray[object]
Unique values of input, not sorted
labels : ndarray[int64]
labels : ndarray[intp]
The labels from values to uniques
"""
uniques_vector = ObjectVector()
Expand All @@ -1056,6 +1057,7 @@ cdef class StringHashTable(HashTable):
def get_labels(self, ndarray[object] values, ObjectVector uniques,
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
object na_value=None):
# -> np.ndarray[np.intp]
_, labels = self._unique(values, uniques, count_prior=count_prior,
na_sentinel=na_sentinel, na_value=na_value,
ignore_na=True, return_inverse=True)
Expand Down Expand Up @@ -1310,6 +1312,7 @@ cdef class PyObjectHashTable(HashTable):
def get_labels(self, ndarray[object] values, ObjectVector uniques,
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
object na_value=None):
# -> np.ndarray[np.intp]
_, labels = self._unique(values, uniques, count_prior=count_prior,
na_sentinel=na_sentinel, na_value=na_value,
ignore_na=True, return_inverse=True)
Expand Down
6 changes: 3 additions & 3 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -555,7 +555,7 @@ def factorize_array(

Returns
-------
codes : ndarray
codes : ndarray[np.intp]
uniques : ndarray
"""
hash_klass, values = get_data_algo(values)
Expand Down Expand Up @@ -907,9 +907,9 @@ def value_counts_arraylike(values, dropna: bool):
f = getattr(htable, f"value_count_{ndtype}")
keys, counts = f(values, dropna)

keys = _reconstruct_data(keys, original.dtype, original)
res_keys = _reconstruct_data(keys, original.dtype, original)

return keys, counts
return res_keys, counts


def duplicated(values: ArrayLike, keep: str | bool = "first") -> np.ndarray:
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -5954,7 +5954,7 @@ def dropna(
def drop_duplicates(
self,
subset: Hashable | Sequence[Hashable] | None = None,
keep: str | bool = "first",
keep: Literal["first"] | Literal["last"] | Literal[False] = "first",
inplace: bool = False,
ignore_index: bool = False,
) -> DataFrame | None:
Expand Down Expand Up @@ -6051,7 +6051,7 @@ def drop_duplicates(
def duplicated(
self,
subset: Hashable | Sequence[Hashable] | None = None,
keep: str | bool = "first",
keep: Literal["first"] | Literal["last"] | Literal[False] = "first",
) -> Series:
"""
Return boolean Series denoting duplicate rows.
Expand Down
1 change: 1 addition & 0 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -2138,6 +2138,7 @@ def _factorize_keys(
# "_values_for_factorize"
rk, _ = rk._values_for_factorize() # type: ignore[union-attr,assignment]

klass: type[libhashtable.Factorizer] | type[libhashtable.Int64Factorizer]
if is_integer_dtype(lk.dtype) and is_integer_dtype(rk.dtype):
# GH#23917 TODO: needs tests for case where lk is integer-dtype
# and rk is datetime-dtype
Expand Down