Skip to content

[ENH] introducing IntpHashMap and making unique_label_indices use intp #40653

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Oct 7, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pandas/_libs/hashtable.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,7 @@ class UInt16HashTable(HashTable): ...
class UInt8HashTable(HashTable): ...
class StringHashTable(HashTable): ...
class PyObjectHashTable(HashTable): ...
class IntpHashTable(HashTable): ...  # runtime alias of Int64HashTable or Int32HashTable, matching np.intp width

def duplicated_int64(
values: np.ndarray, # const int64_t[:] values
Expand Down
47 changes: 12 additions & 35 deletions pandas/_libs/hashtable.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,18 @@ cdef Py_ssize_t _INIT_VEC_CAP = 128
include "hashtable_class_helper.pxi"
include "hashtable_func_helper.pxi"


# Resolve the platform-dependent ``np.intp`` hash-map onto one of the
# concrete fixed-width implementations generated above.
_intp_dtype = np.dtype(np.intp)
if _intp_dtype == np.dtype(np.int64):
    # 64-bit platforms: intp is an alias of int64.
    IntpHashTable = Int64HashTable
    unique_label_indices = _unique_label_indices_int64
elif _intp_dtype == np.dtype(np.int32):
    # 32-bit platforms: intp is an alias of int32.
    IntpHashTable = Int32HashTable
    unique_label_indices = _unique_label_indices_int32
else:
    # Unsupported pointer width; fail loudly at import time.
    raise ValueError(_intp_dtype)


cdef class Factorizer:
cdef readonly:
Py_ssize_t count
Expand Down Expand Up @@ -168,38 +180,3 @@ cdef class Int64Factorizer(Factorizer):

self.count = len(self.uniques)
return labels


@cython.wraparound(False)
@cython.boundscheck(False)
def unique_label_indices(const int64_t[:] labels) -> ndarray:
    """
    Indices of the first occurrences of the unique labels
    *excluding* -1. equivalent to:
    np.unique(labels, return_index=True)[1]
    """
    cdef:
        int ret = 0                          # khash put status: nonzero iff key newly inserted
        Py_ssize_t i, n = len(labels)
        kh_int64_t *table = kh_init_int64()  # hash set of labels seen so far
        Int64Vector idx = Int64Vector()      # growable vector of first-occurrence positions
        ndarray[int64_t, ndim=1] arr
        Int64VectorData *ud = idx.data

    # Pre-size the table for n keys, capped at SIZE_HINT_LIMIT to bound
    # the up-front allocation for very large inputs.
    kh_resize_int64(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))

    with nogil:
        for i in range(n):
            kh_put_int64(table, labels[i], &ret)
            # ret != 0 means the key was not already present, i.e. this is
            # the first occurrence of labels[i].
            if ret != 0:
                if needs_resize(ud):
                    # Vector growth allocates Python objects, so the GIL
                    # must be reacquired just for the resize.
                    with gil:
                        idx.resize()
                append_data_int64(ud, i)

    kh_destroy_int64(table)

    arr = idx.to_array()
    # Order the first-occurrence indices by their label value, matching the
    # sorted-unique ordering of np.unique(..., return_index=True)[1].
    arr = arr[np.asarray(labels)[arr].argsort()]

    # -1 (the NA sentinel) sorts first when present; drop its slot.
    return arr[1:] if arr.size != 0 and labels[arr[0]] == -1 else arr
48 changes: 48 additions & 0 deletions pandas/_libs/hashtable_func_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -470,3 +470,51 @@ cpdef mode(ndarray[htfunc_t] values, bint dropna):

else:
raise TypeError(values.dtype)


{{py:

# Template instantiations: name, dtype, ttype, c_type
# One variant is generated per row; hashtable.pyx picks the one whose
# width matches np.intp at import time.
dtypes = [('Int64', 'int64', 'int64', 'int64_t'),
          ('Int32', 'int32', 'int32', 'int32_t'), ]

}}

{{for name, dtype, ttype, c_type in dtypes}}


@cython.wraparound(False)
@cython.boundscheck(False)
def _unique_label_indices_{{dtype}}(const {{c_type}}[:] labels) -> ndarray:
    """
    Indices of the first occurrences of the unique labels
    *excluding* -1. equivalent to:
    np.unique(labels, return_index=True)[1]
    """
    cdef:
        int ret = 0                              # khash put status: nonzero iff key newly inserted
        Py_ssize_t i, n = len(labels)
        kh_{{ttype}}_t *table = kh_init_{{ttype}}()  # hash set of labels seen so far
        {{name}}Vector idx = {{name}}Vector()    # growable vector of first-occurrence positions
        ndarray[{{c_type}}, ndim=1] arr
        {{name}}VectorData *ud = idx.data

    # Pre-size the table for n keys, capped at SIZE_HINT_LIMIT to bound
    # the up-front allocation for very large inputs.
    kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))

    with nogil:
        for i in range(n):
            kh_put_{{ttype}}(table, labels[i], &ret)
            # ret != 0 means the key was not already present, i.e. this is
            # the first occurrence of labels[i].
            if ret != 0:
                if needs_resize(ud):
                    # Vector growth allocates Python objects, so the GIL
                    # must be reacquired just for the resize.
                    with gil:
                        idx.resize()
                append_data_{{ttype}}(ud, i)

    kh_destroy_{{ttype}}(table)

    arr = idx.to_array()
    # Order the first-occurrence indices by their label value, matching the
    # sorted-unique ordering of np.unique(..., return_index=True)[1].
    arr = arr[np.asarray(labels)[arr].argsort()]

    # -1 (the NA sentinel) sorts first when present; drop its slot.
    return arr[1:] if arr.size != 0 and labels[arr[0]] == -1 else arr

{{endfor}}
3 changes: 1 addition & 2 deletions pandas/core/sorting.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,8 +261,7 @@ def decons_obs_group_ids(
out = decons_group_index(obs_ids, shape)
return out if xnull or not lift.any() else [x - y for x, y in zip(out, lift)]

# TODO: unique_label_indices only used here, should take ndarray[np.intp]
indexer = unique_label_indices(ensure_int64(comp_ids))
indexer = unique_label_indices(comp_ids)
return [lab[indexer].astype(np.intp, subok=False, copy=True) for lab in labels]


Expand Down
10 changes: 10 additions & 0 deletions pandas/tests/libs/test_hashtable.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ def get_allocated_khash_memory():
(ht.UInt16HashTable, np.uint16),
(ht.Int8HashTable, np.int8),
(ht.UInt8HashTable, np.uint8),
(ht.IntpHashTable, np.intp),
],
)
class TestHashTable:
Expand Down Expand Up @@ -389,6 +390,7 @@ def get_ht_function(fun_name, type_suffix):
(np.uint16, "uint16"),
(np.int8, "int8"),
(np.uint8, "uint8"),
(np.intp, "intp"),
],
)
class TestHelpFunctions:
Expand Down Expand Up @@ -471,6 +473,14 @@ def test_modes_with_nans():
assert np.isnan(modes[0])


def test_unique_label_indices_intp(writable):
    """unique_label_indices accepts intp labels, writable or read-only."""
    labels = np.array([1, 2, 2, 2, 1, 3], dtype=np.intp)
    labels.flags.writeable = writable
    result = ht.unique_label_indices(labels)
    tm.assert_numpy_array_equal(
        result, np.array([0, 1, 5], dtype=np.intp)
    )


@pytest.mark.parametrize(
"dtype, type_suffix",
[
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -1741,7 +1741,7 @@ def test_quantile():

def test_unique_label_indices():

a = np.random.randint(1, 1 << 10, 1 << 15).astype("int64")
a = np.random.randint(1, 1 << 10, 1 << 15).astype(np.intp)

left = ht.unique_label_indices(a)
right = np.unique(a, return_index=True)[1]
Expand Down