From 4ac7a5afd1d325acf20231906c29370a287a7e01 Mon Sep 17 00:00:00 2001
From: Egor Dranischnikow <egor.dranischnikow@googlemail.com>
Date: Sat, 27 Mar 2021 07:03:55 +0100
Subject: [PATCH 1/4] introducing IntpHashTable

---
 pandas/_libs/hashtable.pyx          | 18 ++++++++++++++++++
 pandas/tests/libs/test_hashtable.py |  2 ++
 2 files changed, 20 insertions(+)

diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx
index 3eb7bcc673cd4..bc285ca0ffd88 100644
--- a/pandas/_libs/hashtable.pyx
+++ b/pandas/_libs/hashtable.pyx
@@ -65,6 +65,24 @@ cdef Py_ssize_t _INIT_VEC_CAP = 128
 include "hashtable_class_helper.pxi"
 include "hashtable_func_helper.pxi"
 
+
+# map derived hash-map types onto basic hash-map types:
+if np.dtype(np.intp) == np.dtype(np.int64):
+    IntpHashTable = Int64HashTable
+    value_count_intp = value_count_int64
+    duplicated_intp = duplicated_int64
+    ismember_intp = ismember_int64
+    mode_intp = mode_int64
+elif np.dtype(np.intp) == np.dtype(np.int32):
+    IntpHashTable = Int32HashTable
+    value_count_intp = value_count_int32
+    duplicated_intp = duplicated_int32
+    ismember_intp = ismember_int32
+    mode_intp = mode_int32
+else:
+    raise ValueError(np.dtype(np.intp))
+
+
 cdef class Factorizer:
     cdef readonly:
         Py_ssize_t count
diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py
index 5ff20051da8c0..3290e1a1d9316 100644
--- a/pandas/tests/libs/test_hashtable.py
+++ b/pandas/tests/libs/test_hashtable.py
@@ -44,6 +44,7 @@ def get_allocated_khash_memory():
         (ht.UInt16HashTable, np.uint16),
         (ht.Int8HashTable, np.int8),
         (ht.UInt8HashTable, np.uint8),
+        (ht.IntpHashTable, np.intp),
     ],
 )
 class TestHashTable:
@@ -389,6 +390,7 @@ def get_ht_function(fun_name, type_suffix):
         (np.uint16, "uint16"),
         (np.int8, "int8"),
         (np.uint8, "uint8"),
+        (np.intp, "intp"),
     ],
 )
 class TestHelpFunctions:

From ca197248b4e7e1f8e5209b4326bc8b6a684d437f Mon Sep 17 00:00:00 2001
From: Egor Dranischnikow <egor.dranischnikow@googlemail.com>
Date: Sun, 28 Mar 2021 22:11:40 +0200
Subject: [PATCH 2/4] introduce unique_label_indices_intp

---
 pandas/_libs/hashtable.pyx                | 47 ++--------------------
 pandas/_libs/hashtable_func_helper.pxi.in | 48 +++++++++++++++++++++++
 pandas/tests/libs/test_hashtable.py       |  8 ++++
 3 files changed, 60 insertions(+), 43 deletions(-)

diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx
index bc285ca0ffd88..f0096baec9bc1 100644
--- a/pandas/_libs/hashtable.pyx
+++ b/pandas/_libs/hashtable.pyx
@@ -69,16 +69,12 @@ include "hashtable_func_helper.pxi"
 # map derived hash-map types onto basic hash-map types:
 if np.dtype(np.intp) == np.dtype(np.int64):
     IntpHashTable = Int64HashTable
-    value_count_intp = value_count_int64
-    duplicated_intp = duplicated_int64
-    ismember_intp = ismember_int64
-    mode_intp = mode_int64
+    unique_label_indices = _unique_label_indices_int64
+    unique_label_indices_intp = _unique_label_indices_int64
 elif np.dtype(np.intp) == np.dtype(np.int32):
     IntpHashTable = Int32HashTable
-    value_count_intp = value_count_int32
-    duplicated_intp = duplicated_int32
-    ismember_intp = ismember_int32
-    mode_intp = mode_int32
+    unique_label_indices = _unique_label_indices_int64
+    unique_label_indices_intp = _unique_label_indices_int32
 else:
     raise ValueError(np.dtype(np.intp))
 
@@ -186,38 +182,3 @@ cdef class Int64Factorizer(Factorizer):
 
         self.count = len(self.uniques)
         return labels
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def unique_label_indices(const int64_t[:] labels) -> ndarray:
-    """
-    Indices of the first occurrences of the unique labels
-    *excluding* -1. equivalent to:
-        np.unique(labels, return_index=True)[1]
-    """
-    cdef:
-        int ret = 0
-        Py_ssize_t i, n = len(labels)
-        kh_int64_t *table = kh_init_int64()
-        Int64Vector idx = Int64Vector()
-        ndarray[int64_t, ndim=1] arr
-        Int64VectorData *ud = idx.data
-
-    kh_resize_int64(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
-
-    with nogil:
-        for i in range(n):
-            kh_put_int64(table, labels[i], &ret)
-            if ret != 0:
-                if needs_resize(ud):
-                    with gil:
-                        idx.resize()
-                append_data_int64(ud, i)
-
-    kh_destroy_int64(table)
-
-    arr = idx.to_array()
-    arr = arr[np.asarray(labels)[arr].argsort()]
-
-    return arr[1:] if arr.size != 0 and labels[arr[0]] == -1 else arr
diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in
index ceb473a0b06af..fb8ce79a924a4 100644
--- a/pandas/_libs/hashtable_func_helper.pxi.in
+++ b/pandas/_libs/hashtable_func_helper.pxi.in
@@ -470,3 +470,51 @@ cpdef mode(ndarray[htfunc_t] values, bint dropna):
 
     else:
         raise TypeError(values.dtype)
+
+
+{{py:
+
+# name, dtype, ttype, c_type
+dtypes = [('Int64', 'int64', 'int64', 'int64_t'),
+          ('Int32', 'int32', 'int32', 'int32_t'), ]
+
+}}
+
+{{for name, dtype, ttype, c_type in dtypes}}
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def _unique_label_indices_{{dtype}}(const {{c_type}}[:] labels) -> ndarray:
+    """
+    Indices of the first occurrence of each unique label, ordered by label
+    value, *excluding* the label -1 when it is the smallest. Equivalent to:
+        np.unique(labels, return_index=True)[1]  # minus the entry for -1
+    """
+    cdef:
+        int ret = 0
+        Py_ssize_t i, n = len(labels)
+        kh_{{ttype}}_t *table = kh_init_{{ttype}}()
+        {{name}}Vector idx = {{name}}Vector()
+        ndarray[{{c_type}}, ndim=1] arr
+        {{name}}VectorData *ud = idx.data
+
+    kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
+
+    with nogil:
+        for i in range(n):
+            kh_put_{{ttype}}(table, labels[i], &ret)
+            if ret != 0:
+                if needs_resize(ud):
+                    with gil:
+                        idx.resize()
+                append_data_{{ttype}}(ud, i)
+
+    kh_destroy_{{ttype}}(table)
+
+    arr = idx.to_array()
+    arr = arr[np.asarray(labels)[arr].argsort()]
+
+    return arr[1:] if arr.size != 0 and labels[arr[0]] == -1 else arr
+
+{{endfor}}
diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py
index 3290e1a1d9316..8b7304a84c27b 100644
--- a/pandas/tests/libs/test_hashtable.py
+++ b/pandas/tests/libs/test_hashtable.py
@@ -473,6 +473,14 @@ def test_modes_with_nans():
     assert np.isnan(modes[0])
 
 
+def test_unique_label_indices_intp(writable):
+    keys = np.array([1, 2, 2, 2, 1, 3], dtype=np.intp)
+    keys.flags.writeable = writable
+    result = ht.unique_label_indices(keys)
+    expected = np.array([0, 1, 5], dtype=np.intp)
+    tm.assert_numpy_array_equal(result, expected)
+
+
 @pytest.mark.parametrize(
     "dtype, type_suffix",
     [

From 9e9dda526397d4df7ae2b456c91da5d5a8777c8a Mon Sep 17 00:00:00 2001
From: Egor Dranischnikow <egor.dranischnikow@googlemail.com>
Date: Mon, 29 Mar 2021 20:51:13 +0200
Subject: [PATCH 3/4] make unique_label_indices use intp

---
 pandas/_libs/hashtable.pyx | 4 +---
 pandas/core/sorting.py     | 3 +--
 pandas/tests/test_algos.py | 2 +-
 3 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx
index f0096baec9bc1..6e97c13c644cf 100644
--- a/pandas/_libs/hashtable.pyx
+++ b/pandas/_libs/hashtable.pyx
@@ -70,11 +70,9 @@ include "hashtable_func_helper.pxi"
 if np.dtype(np.intp) == np.dtype(np.int64):
     IntpHashTable = Int64HashTable
     unique_label_indices = _unique_label_indices_int64
-    unique_label_indices_intp = _unique_label_indices_int64
 elif np.dtype(np.intp) == np.dtype(np.int32):
     IntpHashTable = Int32HashTable
-    unique_label_indices = _unique_label_indices_int64
-    unique_label_indices_intp = _unique_label_indices_int32
+    unique_label_indices = _unique_label_indices_int32
 else:
     raise ValueError(np.dtype(np.intp))
 
diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py
index ccb51a0ea2132..a8348b0c5773f 100644
--- a/pandas/core/sorting.py
+++ b/pandas/core/sorting.py
@@ -261,8 +261,7 @@ def decons_obs_group_ids(
         out = decons_group_index(obs_ids, shape)
         return out if xnull or not lift.any() else [x - y for x, y in zip(out, lift)]
 
-    # TODO: unique_label_indices only used here, should take ndarray[np.intp]
-    indexer = unique_label_indices(ensure_int64(comp_ids))
+    indexer = unique_label_indices(comp_ids)
     return [lab[indexer].astype(np.intp, subok=False, copy=True) for lab in labels]
 
 
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index 5488c076554fd..c55f673e4f3e4 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -1741,7 +1741,7 @@ def test_quantile():
 
 def test_unique_label_indices():
 
-    a = np.random.randint(1, 1 << 10, 1 << 15).astype("int64")
+    a = np.random.randint(1, 1 << 10, 1 << 15).astype(np.intp)
 
     left = ht.unique_label_indices(a)
     right = np.unique(a, return_index=True)[1]

From 9a444011011302e0311958383e09104269e52fbb Mon Sep 17 00:00:00 2001
From: Egor Dranischnikow <egor.dranischnikow@googlemail.com>
Date: Fri, 11 Jun 2021 17:13:27 +0200
Subject: [PATCH 4/4] adding IntpHashTable to hashtable.pyi

---
 pandas/_libs/hashtable.pyi | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi
index bf7df5776896b..9c1de67a7ba2a 100644
--- a/pandas/_libs/hashtable.pyi
+++ b/pandas/_libs/hashtable.pyi
@@ -192,6 +192,7 @@ class UInt16HashTable(HashTable): ...
 class UInt8HashTable(HashTable): ...
 class StringHashTable(HashTable): ...
 class PyObjectHashTable(HashTable): ...
+class IntpHashTable(HashTable): ...
 
 def duplicated_int64(
     values: np.ndarray,  # const int64_t[:] values