From d9153d3551c3b4d3e5e9f876ae2c23ce36a43f2a Mon Sep 17 00:00:00 2001
From: Egor Dranischnikow <egor.dranischnikow@googlemail.com>
Date: Wed, 27 Jan 2021 20:44:01 +0100
Subject: [PATCH 1/4] introduce get_state - a tool to investigate the state  of
  a hashmap

---
 pandas/_libs/hashtable_class_helper.pxi.in | 30 ++++++++++++++++++++++
 pandas/tests/libs/test_hashtable.py        |  8 ++++++
 2 files changed, 38 insertions(+)

diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
index a3e72ed858392..5b2ab4a76c9fc 100644
--- a/pandas/_libs/hashtable_class_helper.pxi.in
+++ b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -420,6 +420,15 @@ cdef class {{name}}HashTable(HashTable):
                                              sizeof(Py_ssize_t))   # vals
         return overhead + for_flags + for_pairs
 
+    def get_state(self):
+        """ returns infos about the state of the hashtable"""
+        return {
+            'n_buckets' : self.table.n_buckets,
+            'size' : self.table.size,
+            'n_occupied' : self.table.n_occupied,
+            'upper_bound' : self.table.upper_bound,
+        }
+
     cpdef get_item(self, {{dtype}}_t val):
         cdef:
             khiter_t k
@@ -747,6 +756,15 @@ cdef class StringHashTable(HashTable):
                                              sizeof(Py_ssize_t))   # vals
         return overhead + for_flags + for_pairs
 
+    def get_state(self):
+        """ returns infos about the state of the hashtable"""
+        return {
+            'n_buckets' : self.table.n_buckets,
+            'size' : self.table.size,
+            'n_occupied' : self.table.n_occupied,
+            'upper_bound' : self.table.upper_bound,
+        }
+
     cpdef get_item(self, str val):
         cdef:
             khiter_t k
@@ -1072,6 +1090,18 @@ cdef class PyObjectHashTable(HashTable):
                                              sizeof(Py_ssize_t))   # vals
         return overhead + for_flags + for_pairs
 
+    def get_state(self):
+        """
+        returns infos about the current state of the hashtable like size,
+        number of buckets and so on.
+        """
+        return {
+            'n_buckets' : self.table.n_buckets,
+            'size' : self.table.size,
+            'n_occupied' : self.table.n_occupied,
+            'upper_bound' : self.table.upper_bound,
+        }
+
     cpdef get_item(self, object val):
         cdef:
             khiter_t k
diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py
index 8393312004241..da12d69e82720 100644
--- a/pandas/tests/libs/test_hashtable.py
+++ b/pandas/tests/libs/test_hashtable.py
@@ -155,6 +155,14 @@ def test_tracemalloc_for_empty(self, table_type, dtype):
             del table
             assert get_allocated_khash_memory() == 0
 
+    def test_get_state(self, table_type, dtype):
+        table = table_type(1000)
+        state = table.get_state()
+        assert state["size"] == 0
+        assert state["n_occupied"] == 0
+        assert "n_buckets" in state
+        assert "upper_bound" in state
+
 
 def test_get_labels_groupby_for_Int64(writable):
     table = ht.Int64HashTable()

From f13fefb0bb25ea0c8784e46a20b461687b862d28 Mon Sep 17 00:00:00 2001
From: Egor Dranischnikow <egor.dranischnikow@googlemail.com>
Date: Wed, 30 Sep 2020 22:49:38 +0200
Subject: [PATCH 2/4] BUG: translate number of elements into number of needed
 buckets, avoiding rehashing for some cases

---
 pandas/_libs/hashtable.pyx                 |  3 ++-
 pandas/_libs/hashtable_class_helper.pxi.in | 15 ++++++---------
 pandas/_libs/hashtable_func_helper.pxi.in  |  2 +-
 pandas/_libs/khash.pxd                     |  3 +++
 pandas/_libs/src/klib/khash_python.h       | 10 ++++++++++
 pandas/tests/libs/test_hashtable.py        | 21 +++++++++++++++++++++
 6 files changed, 43 insertions(+), 11 deletions(-)

diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx
index 2c7780e0d95fd..3527fe2d8cd8d 100644
--- a/pandas/_libs/hashtable.pyx
+++ b/pandas/_libs/hashtable.pyx
@@ -19,6 +19,7 @@ from pandas._libs.khash cimport (
     are_equivalent_float64_t,
     are_equivalent_khcomplex64_t,
     are_equivalent_khcomplex128_t,
+    kh_needed_n_buckets,
     kh_str_t,
     khcomplex64_t,
     khcomplex128_t,
@@ -152,7 +153,7 @@ def unique_label_indices(const int64_t[:] labels):
         ndarray[int64_t, ndim=1] arr
         Int64VectorData *ud = idx.data
 
-    kh_resize_int64(table, min(n, SIZE_HINT_LIMIT))
+    kh_resize_int64(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
 
     with nogil:
         for i in range(n):
diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
index 5b2ab4a76c9fc..0b6bb170cc531 100644
--- a/pandas/_libs/hashtable_class_helper.pxi.in
+++ b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -392,9 +392,8 @@ cdef class {{name}}HashTable(HashTable):
 
     def __cinit__(self, int64_t size_hint=1):
         self.table = kh_init_{{dtype}}()
-        if size_hint is not None:
-            size_hint = min(size_hint, SIZE_HINT_LIMIT)
-            kh_resize_{{dtype}}(self.table, size_hint)
+        size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
+        kh_resize_{{dtype}}(self.table, size_hint)
 
     def __len__(self) -> int:
         return self.table.size
@@ -740,9 +739,8 @@ cdef class StringHashTable(HashTable):
 
     def __init__(self, int64_t size_hint=1):
         self.table = kh_init_str()
-        if size_hint is not None:
-            size_hint = min(size_hint, SIZE_HINT_LIMIT)
-            kh_resize_str(self.table, size_hint)
+        size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
+        kh_resize_str(self.table, size_hint)
 
     def __dealloc__(self):
         if self.table is not NULL:
@@ -1062,9 +1060,8 @@ cdef class PyObjectHashTable(HashTable):
 
     def __init__(self, int64_t size_hint=1):
         self.table = kh_init_pymap()
-        if size_hint is not None:
-            size_hint = min(size_hint, SIZE_HINT_LIMIT)
-            kh_resize_pymap(self.table, size_hint)
+        size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
+        kh_resize_pymap(self.table, size_hint)
 
     def __dealloc__(self):
         if self.table is not NULL:
diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in
index 4684ecb8716c0..772d83e67394c 100644
--- a/pandas/_libs/hashtable_func_helper.pxi.in
+++ b/pandas/_libs/hashtable_func_helper.pxi.in
@@ -121,7 +121,7 @@ def duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'):
         kh_{{ttype}}_t *table = kh_init_{{ttype}}()
         ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
 
-    kh_resize_{{ttype}}(table, min(n, SIZE_HINT_LIMIT))
+    kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
 
     if keep not in ('last', 'first', False):
         raise ValueError('keep must be either "first", "last" or False')
diff --git a/pandas/_libs/khash.pxd b/pandas/_libs/khash.pxd
index 82b8ca4d443db..ba805e9ff1251 100644
--- a/pandas/_libs/khash.pxd
+++ b/pandas/_libs/khash.pxd
@@ -120,4 +120,7 @@ cdef extern from "khash_python.h":
 
     bint kh_exist_strbox(kh_strbox_t*, khiter_t) nogil
 
+    khuint_t kh_needed_n_buckets(khuint_t element_n) nogil
+
+
 include "khash_for_primitive_helper.pxi"
diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h
index fc7a650eebba4..0073aaf0195c7 100644
--- a/pandas/_libs/src/klib/khash_python.h
+++ b/pandas/_libs/src/klib/khash_python.h
@@ -244,3 +244,13 @@ void PANDAS_INLINE kh_destroy_str_starts(kh_str_starts_t* table) {
 void PANDAS_INLINE kh_resize_str_starts(kh_str_starts_t* table, khuint_t val) {
 	kh_resize_str(table->table, val);
 }
+
+// utility function: given the number of elements
+// returns number of necessary buckets
+khuint_t PANDAS_INLINE kh_needed_n_buckets(khuint_t n_elements){
+    khuint_t candidate = n_elements;
+    kroundup32(candidate);
+    khuint_t upper_bound = (khuint_t)(candidate * __ac_HASH_UPPER + 0.5);
+    return (upper_bound < n_elements) ? 2*candidate : candidate;
+
+}
diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py
index da12d69e82720..de5b753d9b602 100644
--- a/pandas/tests/libs/test_hashtable.py
+++ b/pandas/tests/libs/test_hashtable.py
@@ -163,6 +163,16 @@ def test_get_state(self, table_type, dtype):
         assert "n_buckets" in state
         assert "upper_bound" in state
 
+    def test_no_reallocation(self, table_type, dtype):
+        N = 110
+        keys = np.arange(N).astype(dtype)
+        table = table_type(N)
+        n_buckets_start = table.get_state()["n_buckets"]
+        table.map_locations(keys)
+        n_buckets_end = table.get_state()["n_buckets"]
+        # orgininal number of buckets was enough:
+        assert n_buckets_start == n_buckets_end
+
 
 def test_get_labels_groupby_for_Int64(writable):
     table = ht.Int64HashTable()
@@ -198,6 +208,17 @@ def test_tracemalloc_for_empty_StringHashTable():
         assert get_allocated_khash_memory() == 0
 
 
+def test_no_reallocation_StringHashTable():
+    N = 110
+    keys = np.arange(N).astype(np.compat.unicode).astype(np.object_)
+    table = ht.StringHashTable(N)
+    n_buckets_start = table.get_state()["n_buckets"]
+    table.map_locations(keys)
+    n_buckets_end = table.get_state()["n_buckets"]
+    # orgininal number of buckets was enough:
+    assert n_buckets_start == n_buckets_end
+
+
 @pytest.mark.parametrize(
     "table_type, dtype",
     [

From bfc517d962a5bfcc39f6fa45224bd68d78d28ea8 Mon Sep 17 00:00:00 2001
From: Egor Dranischnikow <egor.dranischnikow@googlemail.com>
Date: Wed, 30 Sep 2020 23:12:32 +0200
Subject: [PATCH 3/4] size_hint for hash tables does already the right thing,
 not need for additional checks

---
 pandas/core/frame.py   | 6 ++----
 pandas/core/sorting.py | 2 +-
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index c7585b21abe99..3d4d0d20f70bb 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -5451,15 +5451,13 @@ def duplicated(
         4     True
         dtype: bool
         """
-        from pandas._libs.hashtable import SIZE_HINT_LIMIT, duplicated_int64
+        from pandas._libs.hashtable import duplicated_int64
 
         if self.empty:
             return self._constructor_sliced(dtype=bool)
 
         def f(vals):
-            labels, shape = algorithms.factorize(
-                vals, size_hint=min(len(self), SIZE_HINT_LIMIT)
-            )
+            labels, shape = algorithms.factorize(vals, size_hint=len(self))
             return labels.astype("i8", copy=False), len(shape)
 
         if subset is None:
diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py
index 2c2e0c16a4482..cfbabab491ae4 100644
--- a/pandas/core/sorting.py
+++ b/pandas/core/sorting.py
@@ -593,7 +593,7 @@ def compress_group_index(group_index, sort: bool = True):
     space can be huge, so this function compresses it, by computing offsets
     (comp_ids) into the list of unique labels (obs_group_ids).
     """
-    size_hint = min(len(group_index), hashtable.SIZE_HINT_LIMIT)
+    size_hint = len(group_index)
     table = hashtable.Int64HashTable(size_hint)
 
     group_index = ensure_int64(group_index)

From c30e410fa390bb0302a24843f43e92b264fd838d Mon Sep 17 00:00:00 2001
From: Egor Dranischnikow <egor.dranischnikow@googlemail.com>
Date: Sat, 30 Jan 2021 16:51:35 +0100
Subject: [PATCH 4/4] improve tests for correct preallocation

---
 pandas/tests/libs/test_hashtable.py | 40 +++++++++++++++++------------
 1 file changed, 24 insertions(+), 16 deletions(-)

diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py
index de5b753d9b602..a28e2f22560eb 100644
--- a/pandas/tests/libs/test_hashtable.py
+++ b/pandas/tests/libs/test_hashtable.py
@@ -164,14 +164,18 @@ def test_get_state(self, table_type, dtype):
         assert "upper_bound" in state
 
     def test_no_reallocation(self, table_type, dtype):
-        N = 110
-        keys = np.arange(N).astype(dtype)
-        table = table_type(N)
-        n_buckets_start = table.get_state()["n_buckets"]
-        table.map_locations(keys)
-        n_buckets_end = table.get_state()["n_buckets"]
-        # orgininal number of buckets was enough:
-        assert n_buckets_start == n_buckets_end
+        for N in range(1, 110):
+            keys = np.arange(N).astype(dtype)
+            preallocated_table = table_type(N)
+            n_buckets_start = preallocated_table.get_state()["n_buckets"]
+            preallocated_table.map_locations(keys)
+            n_buckets_end = preallocated_table.get_state()["n_buckets"]
+            # orgininal number of buckets was enough:
+            assert n_buckets_start == n_buckets_end
+            # check with clean table (not too much preallocated)
+            clean_table = table_type()
+            clean_table.map_locations(keys)
+            assert n_buckets_start == clean_table.get_state()["n_buckets"]
 
 
 def test_get_labels_groupby_for_Int64(writable):
@@ -209,14 +213,18 @@ def test_tracemalloc_for_empty_StringHashTable():
 
 
 def test_no_reallocation_StringHashTable():
-    N = 110
-    keys = np.arange(N).astype(np.compat.unicode).astype(np.object_)
-    table = ht.StringHashTable(N)
-    n_buckets_start = table.get_state()["n_buckets"]
-    table.map_locations(keys)
-    n_buckets_end = table.get_state()["n_buckets"]
-    # orgininal number of buckets was enough:
-    assert n_buckets_start == n_buckets_end
+    for N in range(1, 110):
+        keys = np.arange(N).astype(np.compat.unicode).astype(np.object_)
+        preallocated_table = ht.StringHashTable(N)
+        n_buckets_start = preallocated_table.get_state()["n_buckets"]
+        preallocated_table.map_locations(keys)
+        n_buckets_end = preallocated_table.get_state()["n_buckets"]
+        # orgininal number of buckets was enough:
+        assert n_buckets_start == n_buckets_end
+        # check with clean table (not too much preallocated)
+        clean_table = ht.StringHashTable()
+        clean_table.map_locations(keys)
+        assert n_buckets_start == clean_table.get_state()["n_buckets"]
 
 
 @pytest.mark.parametrize(