From d9153d3551c3b4d3e5e9f876ae2c23ce36a43f2a Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Wed, 27 Jan 2021 20:44:01 +0100 Subject: [PATCH 1/4] introduce get_state - a tool to investigate the state of a hashmap --- pandas/_libs/hashtable_class_helper.pxi.in | 30 ++++++++++++++++++++++ pandas/tests/libs/test_hashtable.py | 8 ++++++ 2 files changed, 38 insertions(+) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index a3e72ed858392..5b2ab4a76c9fc 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -420,6 +420,15 @@ cdef class {{name}}HashTable(HashTable): sizeof(Py_ssize_t)) # vals return overhead + for_flags + for_pairs + def get_state(self): + """ returns infos about the state of the hashtable""" + return { + 'n_buckets' : self.table.n_buckets, + 'size' : self.table.size, + 'n_occupied' : self.table.n_occupied, + 'upper_bound' : self.table.upper_bound, + } + cpdef get_item(self, {{dtype}}_t val): cdef: khiter_t k @@ -747,6 +756,15 @@ cdef class StringHashTable(HashTable): sizeof(Py_ssize_t)) # vals return overhead + for_flags + for_pairs + def get_state(self): + """ returns infos about the state of the hashtable""" + return { + 'n_buckets' : self.table.n_buckets, + 'size' : self.table.size, + 'n_occupied' : self.table.n_occupied, + 'upper_bound' : self.table.upper_bound, + } + cpdef get_item(self, str val): cdef: khiter_t k @@ -1072,6 +1090,18 @@ cdef class PyObjectHashTable(HashTable): sizeof(Py_ssize_t)) # vals return overhead + for_flags + for_pairs + def get_state(self): + """ + returns infos about the current state of the hashtable like size, + number of buckets and so on. + """ + return { + 'n_buckets' : self.table.n_buckets, + 'size' : self.table.size, + 'n_occupied' : self.table.n_occupied, + 'upper_bound' : self.table.upper_bound, + } + cpdef get_item(self, object val): cdef: khiter_t k diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index 8393312004241..da12d69e82720 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -155,6 +155,14 @@ def test_tracemalloc_for_empty(self, table_type, dtype): del table assert get_allocated_khash_memory() == 0 + def test_get_state(self, table_type, dtype): + table = table_type(1000) + state = table.get_state() + assert state["size"] == 0 + assert state["n_occupied"] == 0 + assert "n_buckets" in state + assert "upper_bound" in state + def test_get_labels_groupby_for_Int64(writable): table = ht.Int64HashTable() From f13fefb0bb25ea0c8784e46a20b461687b862d28 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Wed, 30 Sep 2020 22:49:38 +0200 Subject: [PATCH 2/4] BUG: translate number of elements into number of needed buckets, avoiding rehashing for some cases --- pandas/_libs/hashtable.pyx | 3 ++- pandas/_libs/hashtable_class_helper.pxi.in | 15 ++++++--------- pandas/_libs/hashtable_func_helper.pxi.in | 2 +- pandas/_libs/khash.pxd | 3 +++ pandas/_libs/src/klib/khash_python.h | 10 ++++++++++ pandas/tests/libs/test_hashtable.py | 21 +++++++++++++++++++++ 6 files changed, 43 insertions(+), 11 deletions(-) diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 2c7780e0d95fd..3527fe2d8cd8d 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -19,6 +19,7 @@ from pandas._libs.khash cimport ( are_equivalent_float64_t, are_equivalent_khcomplex64_t, are_equivalent_khcomplex128_t, + kh_needed_n_buckets, kh_str_t, khcomplex64_t, khcomplex128_t, @@ -152,7 +153,7 @@ def unique_label_indices(const int64_t[:] labels): ndarray[int64_t, ndim=1] arr Int64VectorData *ud = idx.data - kh_resize_int64(table, min(n, SIZE_HINT_LIMIT)) + kh_resize_int64(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT)) with nogil: for i in range(n): diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 5b2ab4a76c9fc..0b6bb170cc531 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -392,9 +392,8 @@ cdef class {{name}}HashTable(HashTable): def __cinit__(self, int64_t size_hint=1): self.table = kh_init_{{dtype}}() - if size_hint is not None: - size_hint = min(size_hint, SIZE_HINT_LIMIT) - kh_resize_{{dtype}}(self.table, size_hint) + size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT) + kh_resize_{{dtype}}(self.table, size_hint) def __len__(self) -> int: return self.table.size @@ -740,9 +739,8 @@ cdef class StringHashTable(HashTable): def __init__(self, int64_t size_hint=1): self.table = kh_init_str() - if size_hint is not None: - size_hint = min(size_hint, SIZE_HINT_LIMIT) - kh_resize_str(self.table, size_hint) + size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT) + kh_resize_str(self.table, size_hint) def __dealloc__(self): if self.table is not NULL: @@ -1062,9 +1060,8 @@ cdef class PyObjectHashTable(HashTable): def __init__(self, int64_t size_hint=1): self.table = kh_init_pymap() - if size_hint is not None: - size_hint = min(size_hint, SIZE_HINT_LIMIT) - kh_resize_pymap(self.table, size_hint) + size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT) + kh_resize_pymap(self.table, size_hint) def __dealloc__(self): if self.table is not NULL: diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 4684ecb8716c0..772d83e67394c 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -121,7 +121,7 @@ def duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'): kh_{{ttype}}_t *table = kh_init_{{ttype}}() ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool') - kh_resize_{{ttype}}(table, min(n, SIZE_HINT_LIMIT)) + kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT)) if keep not in ('last', 'first', False): raise ValueError('keep must be either "first", "last" or False') diff --git a/pandas/_libs/khash.pxd b/pandas/_libs/khash.pxd index 82b8ca4d443db..ba805e9ff1251 100644 --- a/pandas/_libs/khash.pxd +++ b/pandas/_libs/khash.pxd @@ -120,4 +120,7 @@ cdef extern from "khash_python.h": bint kh_exist_strbox(kh_strbox_t*, khiter_t) nogil + khuint_t kh_needed_n_buckets(khuint_t element_n) nogil + + include "khash_for_primitive_helper.pxi" diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h index fc7a650eebba4..0073aaf0195c7 100644 --- a/pandas/_libs/src/klib/khash_python.h +++ b/pandas/_libs/src/klib/khash_python.h @@ -244,3 +244,13 @@ void PANDAS_INLINE kh_destroy_str_starts(kh_str_starts_t* table) { void PANDAS_INLINE kh_resize_str_starts(kh_str_starts_t* table, khuint_t val) { kh_resize_str(table->table, val); } + +// utility function: given the number of elements +// returns number of necessary buckets +khuint_t PANDAS_INLINE kh_needed_n_buckets(khuint_t n_elements){ + khuint_t candidate = n_elements; + kroundup32(candidate); + khuint_t upper_bound = (khuint_t)(candidate * __ac_HASH_UPPER + 0.5); + return (upper_bound < n_elements) ? 2*candidate : candidate; + +} diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index da12d69e82720..de5b753d9b602 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -163,6 +163,16 @@ def test_get_state(self, table_type, dtype): assert "n_buckets" in state assert "upper_bound" in state + def test_no_reallocation(self, table_type, dtype): + N = 110 + keys = np.arange(N).astype(dtype) + table = table_type(N) + n_buckets_start = table.get_state()["n_buckets"] + table.map_locations(keys) + n_buckets_end = table.get_state()["n_buckets"] + # orgininal number of buckets was enough: + assert n_buckets_start == n_buckets_end + def test_get_labels_groupby_for_Int64(writable): table = ht.Int64HashTable() @@ -198,6 +208,17 @@ def test_tracemalloc_for_empty_StringHashTable(): assert get_allocated_khash_memory() == 0 +def test_no_reallocation_StringHashTable(): + N = 110 + keys = np.arange(N).astype(np.compat.unicode).astype(np.object_) + table = ht.StringHashTable(N) + n_buckets_start = table.get_state()["n_buckets"] + table.map_locations(keys) + n_buckets_end = table.get_state()["n_buckets"] + # orgininal number of buckets was enough: + assert n_buckets_start == n_buckets_end + + @pytest.mark.parametrize( "table_type, dtype", [ From bfc517d962a5bfcc39f6fa45224bd68d78d28ea8 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Wed, 30 Sep 2020 23:12:32 +0200 Subject: [PATCH 3/4] size_hint for hash tables does already the right thing, not need for additional checks --- pandas/core/frame.py | 6 ++---- pandas/core/sorting.py | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c7585b21abe99..3d4d0d20f70bb 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5451,15 +5451,13 @@ def duplicated( 4 True dtype: bool """ - from pandas._libs.hashtable import SIZE_HINT_LIMIT, duplicated_int64 + from pandas._libs.hashtable import duplicated_int64 if self.empty: return self._constructor_sliced(dtype=bool) def f(vals): - labels, shape = algorithms.factorize( - vals, size_hint=min(len(self), SIZE_HINT_LIMIT) - ) + labels, shape = algorithms.factorize(vals, size_hint=len(self)) return labels.astype("i8", copy=False), len(shape) if subset is None: diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 2c2e0c16a4482..cfbabab491ae4 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -593,7 +593,7 @@ def compress_group_index(group_index, sort: bool = True): space can be huge, so this function compresses it, by computing offsets (comp_ids) into the list of unique labels (obs_group_ids). """ - size_hint = min(len(group_index), hashtable.SIZE_HINT_LIMIT) + size_hint = len(group_index) table = hashtable.Int64HashTable(size_hint) group_index = ensure_int64(group_index) From c30e410fa390bb0302a24843f43e92b264fd838d Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Sat, 30 Jan 2021 16:51:35 +0100 Subject: [PATCH 4/4] improve tests for correct preallocation --- pandas/tests/libs/test_hashtable.py | 40 +++++++++++++++++------------ 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index de5b753d9b602..a28e2f22560eb 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -164,14 +164,18 @@ def test_get_state(self, table_type, dtype): assert "upper_bound" in state def test_no_reallocation(self, table_type, dtype): - N = 110 - keys = np.arange(N).astype(dtype) - table = table_type(N) - n_buckets_start = table.get_state()["n_buckets"] - table.map_locations(keys) - n_buckets_end = table.get_state()["n_buckets"] - # orgininal number of buckets was enough: - assert n_buckets_start == n_buckets_end + for N in range(1, 110): + keys = np.arange(N).astype(dtype) + preallocated_table = table_type(N) + n_buckets_start = preallocated_table.get_state()["n_buckets"] + preallocated_table.map_locations(keys) + n_buckets_end = preallocated_table.get_state()["n_buckets"] + # orgininal number of buckets was enough: + assert n_buckets_start == n_buckets_end + # check with clean table (not too much preallocated) + clean_table = table_type() + clean_table.map_locations(keys) + assert n_buckets_start == clean_table.get_state()["n_buckets"] def test_get_labels_groupby_for_Int64(writable): @@ -209,14 +213,18 @@ def test_tracemalloc_for_empty_StringHashTable(): def test_no_reallocation_StringHashTable(): - N = 110 - keys = np.arange(N).astype(np.compat.unicode).astype(np.object_) - table = ht.StringHashTable(N) - n_buckets_start = table.get_state()["n_buckets"] - table.map_locations(keys) - n_buckets_end = table.get_state()["n_buckets"] - # orgininal number of buckets was enough: - assert n_buckets_start == n_buckets_end + for N in range(1, 110): + keys = np.arange(N).astype(np.compat.unicode).astype(np.object_) + preallocated_table = ht.StringHashTable(N) + n_buckets_start = preallocated_table.get_state()["n_buckets"] + preallocated_table.map_locations(keys) + n_buckets_end = preallocated_table.get_state()["n_buckets"] + # orgininal number of buckets was enough: + assert n_buckets_start == n_buckets_end + # check with clean table (not too much preallocated) + clean_table = ht.StringHashTable() + clean_table.map_locations(keys) + assert n_buckets_start == clean_table.get_state()["n_buckets"] @pytest.mark.parametrize(