From 4ac7a5afd1d325acf20231906c29370a287a7e01 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Sat, 27 Mar 2021 07:03:55 +0100 Subject: [PATCH 1/4] introducing IntpHashMap --- pandas/_libs/hashtable.pyx | 18 ++++++++++++++++++ pandas/tests/libs/test_hashtable.py | 2 ++ 2 files changed, 20 insertions(+) diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 3eb7bcc673cd4..bc285ca0ffd88 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -65,6 +65,24 @@ cdef Py_ssize_t _INIT_VEC_CAP = 128 include "hashtable_class_helper.pxi" include "hashtable_func_helper.pxi" + +# map derived hash-map types onto basic hash-map types: +if np.dtype(np.intp) == np.dtype(np.int64): + IntpHashTable = Int64HashTable + value_count_intp = value_count_int64 + duplicated_intp = duplicated_int64 + ismember_intp = ismember_int64 + mode_intp = mode_int64 +elif np.dtype(np.intp) == np.dtype(np.int32): + IntpHashTable = Int32HashTable + value_count_intp = value_count_int32 + duplicated_intp = duplicated_int32 + ismember_intp = ismember_int32 + mode_intp = mode_int32 +else: + raise ValueError(np.dtype(np.intp)) + + cdef class Factorizer: cdef readonly: Py_ssize_t count diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index 5ff20051da8c0..3290e1a1d9316 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -44,6 +44,7 @@ def get_allocated_khash_memory(): (ht.UInt16HashTable, np.uint16), (ht.Int8HashTable, np.int8), (ht.UInt8HashTable, np.uint8), + (ht.IntpHashTable, np.intp), ], ) class TestHashTable: @@ -389,6 +390,7 @@ def get_ht_function(fun_name, type_suffix): (np.uint16, "uint16"), (np.int8, "int8"), (np.uint8, "uint8"), + (np.intp, "intp"), ], ) class TestHelpFunctions: From ca197248b4e7e1f8e5209b4326bc8b6a684d437f Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Sun, 28 Mar 2021 22:11:40 +0200 Subject: [PATCH 2/4] introduce unique_label_indices_intp --- pandas/_libs/hashtable.pyx | 47 ++-------------------- pandas/_libs/hashtable_func_helper.pxi.in | 48 +++++++++++++++++++++++ pandas/tests/libs/test_hashtable.py | 8 ++++ 3 files changed, 60 insertions(+), 43 deletions(-) diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index bc285ca0ffd88..f0096baec9bc1 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -69,16 +69,12 @@ include "hashtable_func_helper.pxi" # map derived hash-map types onto basic hash-map types: if np.dtype(np.intp) == np.dtype(np.int64): IntpHashTable = Int64HashTable - value_count_intp = value_count_int64 - duplicated_intp = duplicated_int64 - ismember_intp = ismember_int64 - mode_intp = mode_int64 + unique_label_indices = _unique_label_indices_int64 + unique_label_indices_intp = _unique_label_indices_int64 elif np.dtype(np.intp) == np.dtype(np.int32): IntpHashTable = Int32HashTable - value_count_intp = value_count_int32 - duplicated_intp = duplicated_int32 - ismember_intp = ismember_int32 - mode_intp = mode_int32 + unique_label_indices = _unique_label_indices_int64 + unique_label_indices_intp = _unique_label_indices_int32 else: raise ValueError(np.dtype(np.intp)) @@ -186,38 +182,3 @@ cdef class Int64Factorizer(Factorizer): self.count = len(self.uniques) return labels - - -@cython.wraparound(False) -@cython.boundscheck(False) -def unique_label_indices(const int64_t[:] labels) -> ndarray: - """ - Indices of the first occurrences of the unique labels - *excluding* -1. equivalent to: - np.unique(labels, return_index=True)[1] - """ - cdef: - int ret = 0 - Py_ssize_t i, n = len(labels) - kh_int64_t *table = kh_init_int64() - Int64Vector idx = Int64Vector() - ndarray[int64_t, ndim=1] arr - Int64VectorData *ud = idx.data - - kh_resize_int64(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT)) - - with nogil: - for i in range(n): - kh_put_int64(table, labels[i], &ret) - if ret != 0: - if needs_resize(ud): - with gil: - idx.resize() - append_data_int64(ud, i) - - kh_destroy_int64(table) - - arr = idx.to_array() - arr = arr[np.asarray(labels)[arr].argsort()] - - return arr[1:] if arr.size != 0 and labels[arr[0]] == -1 else arr diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index ceb473a0b06af..fb8ce79a924a4 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -470,3 +470,51 @@ cpdef mode(ndarray[htfunc_t] values, bint dropna): else: raise TypeError(values.dtype) + + +{{py: + +# name, dtype, ttype, c_type +dtypes = [('Int64', 'int64', 'int64', 'int64_t'), + ('Int32', 'int32', 'int32', 'int32_t'), ] + +}} + +{{for name, dtype, ttype, c_type in dtypes}} + + +@cython.wraparound(False) +@cython.boundscheck(False) +def _unique_label_indices_{{dtype}}(const {{c_type}}[:] labels) -> ndarray: + """ + Indices of the first occurrences of the unique labels + *excluding* -1. equivalent to: + np.unique(labels, return_index=True)[1] + """ + cdef: + int ret = 0 + Py_ssize_t i, n = len(labels) + kh_{{ttype}}_t *table = kh_init_{{ttype}}() + {{name}}Vector idx = {{name}}Vector() + ndarray[{{c_type}}, ndim=1] arr + {{name}}VectorData *ud = idx.data + + kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT)) + + with nogil: + for i in range(n): + kh_put_{{ttype}}(table, labels[i], &ret) + if ret != 0: + if needs_resize(ud): + with gil: + idx.resize() + append_data_{{ttype}}(ud, i) + + kh_destroy_{{ttype}}(table) + + arr = idx.to_array() + arr = arr[np.asarray(labels)[arr].argsort()] + + return arr[1:] if arr.size != 0 and labels[arr[0]] == -1 else arr + +{{endfor}} diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index 3290e1a1d9316..8b7304a84c27b 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -473,6 +473,14 @@ def test_modes_with_nans(): assert np.isnan(modes[0]) +def test_unique_label_indices_intp(writable): + keys = np.array([1, 2, 2, 2, 1, 3], dtype=np.intp) + keys.flags.writeable = writable + result = ht.unique_label_indices(keys) + expected = np.array([0, 1, 5], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize( "dtype, type_suffix", [ From 9e9dda526397d4df7ae2b456c91da5d5a8777c8a Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Mon, 29 Mar 2021 20:51:13 +0200 Subject: [PATCH 3/4] make unique_label_indices use intp --- pandas/_libs/hashtable.pyx | 4 +--- pandas/core/sorting.py | 3 +-- pandas/tests/test_algos.py | 2 +- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index f0096baec9bc1..6e97c13c644cf 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -70,11 +70,9 @@ include "hashtable_func_helper.pxi" if np.dtype(np.intp) == np.dtype(np.int64): IntpHashTable = Int64HashTable unique_label_indices = _unique_label_indices_int64 - unique_label_indices_intp = _unique_label_indices_int64 elif np.dtype(np.intp) == np.dtype(np.int32): IntpHashTable = Int32HashTable - unique_label_indices = _unique_label_indices_int64 - unique_label_indices_intp = _unique_label_indices_int32 + unique_label_indices = _unique_label_indices_int32 else: raise ValueError(np.dtype(np.intp)) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index ccb51a0ea2132..a8348b0c5773f 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -261,8 +261,7 @@ def decons_obs_group_ids( out = decons_group_index(obs_ids, shape) return out if xnull or not lift.any() else [x - y for x, y in zip(out, lift)] - # TODO: unique_label_indices only used here, should take ndarray[np.intp] - indexer = unique_label_indices(ensure_int64(comp_ids)) + indexer = unique_label_indices(comp_ids) return [lab[indexer].astype(np.intp, subok=False, copy=True) for lab in labels] diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 5488c076554fd..c55f673e4f3e4 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1741,7 +1741,7 @@ def test_quantile(): def test_unique_label_indices(): - a = np.random.randint(1, 1 << 10, 1 << 15).astype("int64") + a = np.random.randint(1, 1 << 10, 1 << 15).astype(np.intp) left = ht.unique_label_indices(a) right = np.unique(a, return_index=True)[1] From 9a444011011302e0311958383e09104269e52fbb Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Fri, 11 Jun 2021 17:13:27 +0200 Subject: [PATCH 4/4] adding IntpHashTable to hashtable.pyi --- pandas/_libs/hashtable.pyi | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi index bf7df5776896b..9c1de67a7ba2a 100644 --- a/pandas/_libs/hashtable.pyi +++ b/pandas/_libs/hashtable.pyi @@ -192,6 +192,7 @@ class UInt16HashTable(HashTable): ... class UInt8HashTable(HashTable): ... class StringHashTable(HashTable): ... class PyObjectHashTable(HashTable): ... +class IntpHashTable(HashTable): ... def duplicated_int64( values: np.ndarray, # const int64_t[:] values