Skip to content

BUG: fixing mixup of bucket and element counts in hash tables #39457

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Feb 2, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion pandas/_libs/hashtable.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ from pandas._libs.khash cimport (
are_equivalent_float64_t,
are_equivalent_khcomplex64_t,
are_equivalent_khcomplex128_t,
kh_needed_n_buckets,
kh_str_t,
khcomplex64_t,
khcomplex128_t,
Expand Down Expand Up @@ -152,7 +153,7 @@ def unique_label_indices(const int64_t[:] labels):
ndarray[int64_t, ndim=1] arr
Int64VectorData *ud = idx.data

kh_resize_int64(table, min(n, SIZE_HINT_LIMIT))
kh_resize_int64(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))

with nogil:
for i in range(n):
Expand Down
45 changes: 36 additions & 9 deletions pandas/_libs/hashtable_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -392,9 +392,8 @@ cdef class {{name}}HashTable(HashTable):

def __cinit__(self, int64_t size_hint=1):
self.table = kh_init_{{dtype}}()
if size_hint is not None:
size_hint = min(size_hint, SIZE_HINT_LIMIT)
kh_resize_{{dtype}}(self.table, size_hint)
size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
kh_resize_{{dtype}}(self.table, size_hint)

def __len__(self) -> int:
return self.table.size
Expand All @@ -420,6 +419,15 @@ cdef class {{name}}HashTable(HashTable):
sizeof(Py_ssize_t)) # vals
return overhead + for_flags + for_pairs

def get_state(self):
    """Return diagnostic information about the underlying khash table."""
    # Expose the raw khash counters so tests can observe growth/rehashing.
    table = self.table
    return dict(
        n_buckets=table.n_buckets,
        size=table.size,
        n_occupied=table.n_occupied,
        upper_bound=table.upper_bound,
    )

cpdef get_item(self, {{dtype}}_t val):
cdef:
khiter_t k
Expand Down Expand Up @@ -731,9 +739,8 @@ cdef class StringHashTable(HashTable):

def __init__(self, int64_t size_hint=1):
self.table = kh_init_str()
if size_hint is not None:
size_hint = min(size_hint, SIZE_HINT_LIMIT)
kh_resize_str(self.table, size_hint)
size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
kh_resize_str(self.table, size_hint)

def __dealloc__(self):
if self.table is not NULL:
Expand All @@ -747,6 +754,15 @@ cdef class StringHashTable(HashTable):
sizeof(Py_ssize_t)) # vals
return overhead + for_flags + for_pairs

def get_state(self):
    """Return diagnostic information about the underlying khash table."""
    # Expose the raw khash counters so tests can observe growth/rehashing.
    table = self.table
    return dict(
        n_buckets=table.n_buckets,
        size=table.size,
        n_occupied=table.n_occupied,
        upper_bound=table.upper_bound,
    )

cpdef get_item(self, str val):
cdef:
khiter_t k
Expand Down Expand Up @@ -1044,9 +1060,8 @@ cdef class PyObjectHashTable(HashTable):

def __init__(self, int64_t size_hint=1):
self.table = kh_init_pymap()
if size_hint is not None:
size_hint = min(size_hint, SIZE_HINT_LIMIT)
kh_resize_pymap(self.table, size_hint)
size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
kh_resize_pymap(self.table, size_hint)

def __dealloc__(self):
if self.table is not NULL:
Expand All @@ -1072,6 +1087,18 @@ cdef class PyObjectHashTable(HashTable):
sizeof(Py_ssize_t)) # vals
return overhead + for_flags + for_pairs

def get_state(self):
    """
    Return diagnostic information about the current state of the
    hashtable: its size, number of buckets, and so on.
    """
    # Expose the raw khash counters so tests can observe growth/rehashing.
    table = self.table
    return dict(
        n_buckets=table.n_buckets,
        size=table.size,
        n_occupied=table.n_occupied,
        upper_bound=table.upper_bound,
    )

cpdef get_item(self, object val):
cdef:
khiter_t k
Expand Down
2 changes: 1 addition & 1 deletion pandas/_libs/hashtable_func_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ def duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'):
kh_{{ttype}}_t *table = kh_init_{{ttype}}()
ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')

kh_resize_{{ttype}}(table, min(n, SIZE_HINT_LIMIT))
kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))

if keep not in ('last', 'first', False):
raise ValueError('keep must be either "first", "last" or False')
Expand Down
3 changes: 3 additions & 0 deletions pandas/_libs/khash.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -120,4 +120,7 @@ cdef extern from "khash_python.h":

bint kh_exist_strbox(kh_strbox_t*, khiter_t) nogil

khuint_t kh_needed_n_buckets(khuint_t element_n) nogil


include "khash_for_primitive_helper.pxi"
10 changes: 10 additions & 0 deletions pandas/_libs/src/klib/khash_python.h
Original file line number Diff line number Diff line change
Expand Up @@ -244,3 +244,13 @@ void PANDAS_INLINE kh_destroy_str_starts(kh_str_starts_t* table) {
// Resize the inner string table of a kh_str_starts_t wrapper.
// Thin forwarding wrapper: delegates directly to kh_resize_str on the
// embedded table; `val` is the requested bucket count, not an element count.
void PANDAS_INLINE kh_resize_str_starts(kh_str_starts_t* table, khuint_t val) {
    kh_resize_str(table->table, val);
}

// Utility function: given the number of elements that will be inserted,
// return the number of buckets the khash table needs so that inserting
// them does not trigger a rehash.
khuint_t PANDAS_INLINE kh_needed_n_buckets(khuint_t n_elements){
    khuint_t result = n_elements;
    kroundup32(result);  // round up to the next power of two (in place)
    // khash rehashes once size exceeds n_buckets * __ac_HASH_UPPER; if the
    // rounded bucket count is still too tight, double it once more.
    if ((khuint_t)(result * __ac_HASH_UPPER + 0.5) < n_elements) {
        result *= 2;
    }
    return result;
}
6 changes: 2 additions & 4 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -5451,15 +5451,13 @@ def duplicated(
4 True
dtype: bool
"""
from pandas._libs.hashtable import SIZE_HINT_LIMIT, duplicated_int64
from pandas._libs.hashtable import duplicated_int64

if self.empty:
return self._constructor_sliced(dtype=bool)

def f(vals):
labels, shape = algorithms.factorize(
vals, size_hint=min(len(self), SIZE_HINT_LIMIT)
)
labels, shape = algorithms.factorize(vals, size_hint=len(self))
return labels.astype("i8", copy=False), len(shape)

if subset is None:
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/sorting.py
Original file line number Diff line number Diff line change
Expand Up @@ -593,7 +593,7 @@ def compress_group_index(group_index, sort: bool = True):
space can be huge, so this function compresses it, by computing offsets
(comp_ids) into the list of unique labels (obs_group_ids).
"""
size_hint = min(len(group_index), hashtable.SIZE_HINT_LIMIT)
size_hint = len(group_index)
table = hashtable.Int64HashTable(size_hint)

group_index = ensure_int64(group_index)
Expand Down
37 changes: 37 additions & 0 deletions pandas/tests/libs/test_hashtable.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,28 @@ def test_tracemalloc_for_empty(self, table_type, dtype):
del table
assert get_allocated_khash_memory() == 0

def test_get_state(self, table_type, dtype):
    # A freshly preallocated table holds no elements yet.
    state = table_type(1000).get_state()
    assert state["size"] == 0
    assert state["n_occupied"] == 0
    # The remaining khash counters must at least be reported.
    for key in ("n_buckets", "upper_bound"):
        assert key in state

def test_no_reallocation(self, table_type, dtype):
    for n in range(1, 110):
        keys = np.arange(n).astype(dtype)
        preallocated = table_type(n)
        buckets_before = preallocated.get_state()["n_buckets"]
        preallocated.map_locations(keys)
        # The original number of buckets was enough: no rehash happened.
        assert buckets_before == preallocated.get_state()["n_buckets"]
        # A table grown on demand ends up with the same bucket count,
        # i.e. the preallocation was not wasteful either.
        grown = table_type()
        grown.map_locations(keys)
        assert buckets_before == grown.get_state()["n_buckets"]


def test_get_labels_groupby_for_Int64(writable):
table = ht.Int64HashTable()
Expand Down Expand Up @@ -190,6 +212,21 @@ def test_tracemalloc_for_empty_StringHashTable():
assert get_allocated_khash_memory() == 0


def test_no_reallocation_StringHashTable():
    for N in range(1, 110):
        # ``np.compat.unicode`` was an alias for ``str`` and ``np.compat``
        # was removed in NumPy 2.0 — use ``str`` directly.
        keys = np.arange(N).astype(str).astype(np.object_)
        preallocated_table = ht.StringHashTable(N)
        n_buckets_start = preallocated_table.get_state()["n_buckets"]
        preallocated_table.map_locations(keys)
        n_buckets_end = preallocated_table.get_state()["n_buckets"]
        # original number of buckets was enough:
        assert n_buckets_start == n_buckets_end
        # check with clean table (not too much preallocated)
        clean_table = ht.StringHashTable()
        clean_table.map_locations(keys)
        assert n_buckets_start == clean_table.get_state()["n_buckets"]


@pytest.mark.parametrize(
"table_type, dtype",
[
Expand Down