Skip to content

Commit 18a0aad

Browse files
committed
hash function for tuples
1 parent 8eece3e commit 18a0aad

File tree

1 file changed

+49
-5
lines changed

1 file changed

+49
-5
lines changed

pandas/_libs/src/klib/khash_python.h

Lines changed: 49 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -242,7 +242,7 @@ int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) {
242242
}
243243

244244

245-
Py_hash_t PANDAS_INLINE _Pandas_HashDouble(double val){
245+
Py_hash_t PANDAS_INLINE _Pandas_HashDouble(double val) {
246246
// Since Python 3.10, nan no longer has hash 0
247247
if (Py_IS_NAN(val)) {
248248
return 0;
@@ -255,13 +255,13 @@ Py_hash_t PANDAS_INLINE _Pandas_HashDouble(double val){
255255
}
256256

257257

258-
Py_hash_t PANDAS_INLINE floatobject_hash(PyFloatObject* key){
258+
// Hashes a Python float object: reads its double value with
// PyFloat_AS_DOUBLE and forwards it to _Pandas_HashDouble.
Py_hash_t PANDAS_INLINE floatobject_hash(PyFloatObject* key) {
    const double value = PyFloat_AS_DOUBLE(key);
    return _Pandas_HashDouble(value);
}
261261

262262

263263
// replaces _Py_HashDouble with _Pandas_HashDouble
264-
Py_hash_t PANDAS_INLINE complexobject_hash(PyComplexObject* key){
264+
Py_hash_t PANDAS_INLINE complexobject_hash(PyComplexObject* key) {
265265
Py_uhash_t realhash = (Py_uhash_t)_Pandas_HashDouble(key->cval.real);
266266
Py_uhash_t imaghash = (Py_uhash_t)_Pandas_HashDouble(key->cval.imag);
267267
if (realhash == (Py_uhash_t)-1 || imaghash == (Py_uhash_t)-1) {
@@ -275,11 +275,52 @@ Py_hash_t PANDAS_INLINE complexobject_hash(PyComplexObject* key){
275275
}
276276

277277

278-
khint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key){
278+
khint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key);
279+
280+
// We could use any hashing algorithm; this one is CPython's original algorithm for tuples.
281+
282+
// Constants and rotation helper used by tupleobject_hash. The variant is
// chosen by SIZEOF_PY_UHASH_T: wider than 4 bytes selects the 64-bit
// constants, otherwise the 32-bit ones.
//
// _PandasHASH_XXROTATE parenthesizes its argument so that an expression
// argument (e.g. `a | b`) is rotated as a whole. Note that the argument is
// evaluated twice, so it must not have side effects.
#if SIZEOF_PY_UHASH_T > 4
#define _PandasHASH_XXPRIME_1 ((Py_uhash_t)11400714785074694791ULL)
#define _PandasHASH_XXPRIME_2 ((Py_uhash_t)14029467366897019727ULL)
#define _PandasHASH_XXPRIME_5 ((Py_uhash_t)2870177450012600261ULL)
#define _PandasHASH_XXROTATE(x) (((x) << 31) | ((x) >> 33))  /* Rotate left 31 bits */
#else
#define _PandasHASH_XXPRIME_1 ((Py_uhash_t)2654435761UL)
#define _PandasHASH_XXPRIME_2 ((Py_uhash_t)2246822519UL)
#define _PandasHASH_XXPRIME_5 ((Py_uhash_t)374761393UL)
#define _PandasHASH_XXROTATE(x) (((x) << 13) | ((x) >> 19))  /* Rotate left 13 bits */
#endif
293+
294+
/*
 * Hash a tuple by folding the per-item hashes (obtained from
 * kh_python_hash_func) into a single accumulator using the
 * _PandasHASH_XX* constants and rotation.
 *
 * Returns -1 as soon as an item's hash, widened to Py_uhash_t, equals
 * (Py_uhash_t)-1. Otherwise returns the accumulator, with the value
 * (Py_uhash_t)-1 remapped to 1546275796 so that a completed hash is
 * never reported as -1.
 *
 * NOTE(review): kh_python_hash_func is declared to return khint32_t; if
 * that type is narrower than Py_uhash_t the -1 check inside the loop can
 * never fire -- confirm against the definition of khint32_t.
 */
Py_hash_t PANDAS_INLINE tupleobject_hash(PyTupleObject* key) {
    const Py_ssize_t n_items = Py_SIZE(key);
    PyObject **elements = key->ob_item;
    Py_uhash_t state = _PandasHASH_XXPRIME_5;
    Py_ssize_t pos = 0;

    while (pos < n_items) {
        const Py_uhash_t item_hash = kh_python_hash_func(elements[pos]);
        if (item_hash == (Py_uhash_t)-1) {
            return -1;
        }
        /* Mix this item's hash into the running state. */
        state += item_hash * _PandasHASH_XXPRIME_2;
        state = _PandasHASH_XXROTATE(state);
        state *= _PandasHASH_XXPRIME_1;
        pos++;
    }

    /* Add input length, mangled to keep the historical value of hash(()). */
    state += n_items ^ (_PandasHASH_XXPRIME_5 ^ 3527539UL);

    /* -1 is reserved, so substitute a fixed value for it. */
    return (state == (Py_uhash_t)-1) ? 1546275796 : (Py_hash_t)state;
}
317+
318+
319+
khint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key) {
279320
Py_hash_t hash;
280321
// For PyObject_Hash holds:
281322
// hash(0.0) == 0 == hash(-0.0)
282-
// yet for different nan-object different hash-values
323+
// yet for different nan-objects different hash-values
283324
// are possible
284325
if (PyFloat_CheckExact(key)) {
285326
// we cannot use kh_float64_hash_func
@@ -293,6 +334,9 @@ khint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key){
293334
// and kh_complex128_hash_func doesn't respect it
294335
hash = complexobject_hash((PyComplexObject*)key);
295336
}
337+
else if (PyTuple_CheckExact(key)) {
338+
hash = tupleobject_hash((PyTupleObject*)key);
339+
}
296340
else {
297341
hash = PyObject_Hash(key);
298342
}

0 commit comments

Comments
 (0)