Skip to content

Commit 7f12a1d

Browse files
committed
BUG: issues with hash-function for Float64HashTable (GH21866)
This commit fixes two issues: 1) hash(0.0) != hash(-0.0), and 2) hash(x) != hash(y) for different x, y that are both NaNs. They are solved by setting: 1) hash(-0.0) := hash(0.0), and 2) hash(x) := hash(np.nan) for every x that is NaN.
1 parent 322dbf4 commit 7f12a1d

File tree

3 files changed

+56
-1
lines changed

3 files changed

+56
-1
lines changed

doc/source/whatsnew/v0.24.0.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -471,6 +471,7 @@ Numeric
471471

472472
- Bug in :class:`Series` ``__rmatmul__`` doesn't support matrix vector multiplication (:issue:`21530`)
473473
- Bug in :func:`factorize` fails with read-only array (:issue:`12813`)
474+
- Fixed bug where :func:`unique` handled signed zeros inconsistently: for some inputs 0.0 and -0.0 were treated as equal and for others as different. Now they are treated as equal for all inputs (:issue:`21866`)
474475
-
475476
-
476477

pandas/_libs/src/klib/khash_python.h

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,20 @@ khint64_t PANDAS_INLINE asint64(double key) {
1919
memcpy(&val, &key, sizeof(double));
2020
return val;
2121
}
22-
#define kh_float64_hash_func(key) (khint32_t)((asint64(key))>>33^(asint64(key))^(asint64(key))<<11)
22+
23+
// correct for all inputs except -0.0 and NaNs
24+
#define kh_float64_hash_func_0_NAN(key) (khint32_t)((asint64(key))>>33^(asint64(key))^(asint64(key))<<11)
25+
26+
// correct for all inputs except NaNs
27+
#define kh_float64_hash_func_NAN(key) ((key) == 0.0 ? \
28+
kh_float64_hash_func_0_NAN(0.0) : \
29+
kh_float64_hash_func_0_NAN(key))
30+
31+
// correct for all inputs, including -0.0 and NaNs
32+
#define kh_float64_hash_func(key) ((key) != (key) ? \
33+
kh_float64_hash_func_NAN(Py_NAN) : \
34+
kh_float64_hash_func_NAN(key))
35+
2336
#define kh_float64_hash_equal(a, b) ((a) == (b) || ((b) != (b) && (a) != (a)))
2437

2538
#define KHASH_MAP_INIT_FLOAT64(name, khval_t) \

pandas/tests/test_algos.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from numpy import nan
88
from datetime import datetime
99
from itertools import permutations
10+
import struct
1011
from pandas import (Series, Categorical, CategoricalIndex,
1112
Timestamp, DatetimeIndex, Index, IntervalIndex)
1213
import pandas as pd
@@ -500,6 +501,23 @@ def test_obj_none_preservation(self):
500501

501502
tm.assert_numpy_array_equal(result, expected, strict_nan=True)
502503

504+
def test_signed_zero(self):
505+
a = np.array([-0.0, 0.0])
506+
result = pd.unique(a)
507+
expected = np.array([-0.0]) # 0.0 and -0.0 are equivalent
508+
tm.assert_numpy_array_equal(result, expected)
509+
510+
def test_different_nans(self):
511+
# create different nans from bit-patterns:
512+
NAN1 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000000))[0]
513+
NAN2 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000001))[0]
514+
assert NAN1 != NAN1
515+
assert NAN2 != NAN2
516+
a = np.array([NAN1, NAN2]) # NAN1 and NAN2 are equivalent
517+
result = pd.unique(a)
518+
expected = np.array([np.nan])
519+
tm.assert_numpy_array_equal(result, expected)
520+
503521

504522
class TestIsin(object):
505523

@@ -1087,6 +1105,29 @@ def test_lookup_nan(self, writable):
10871105
tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs),
10881106
dtype=np.int64))
10891107

1108+
def test_add_signed_zeros(self):
1109+
# default hash-function would lead to different hash-buckets
1110+
# for 0.0 and -0.0 if there are more than 2^30 hash-buckets
1111+
# but a hash table that large would mean 16GB
1112+
N = 4 # 12 * 10**8 would trigger the error, if you have enough memory
1113+
m = ht.Float64HashTable(N)
1114+
m.set_item(0.0, 0)
1115+
m.set_item(-0.0, 0)
1116+
assert len(m) == 1 # 0.0 and -0.0 are equivalent
1117+
1118+
def test_add_different_nans(self):
1119+
# create different nans from bit-patterns:
1120+
NAN1 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000000))[0]
1121+
NAN2 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000001))[0]
1122+
assert NAN1 != NAN1
1123+
assert NAN2 != NAN2
1124+
# default hash function would lead to different hash-buckets
1125+
# for NAN1 and NAN2 even if there are only 4 buckets:
1126+
m = ht.Float64HashTable()
1127+
m.set_item(NAN1, 0)
1128+
m.set_item(NAN2, 0)
1129+
assert len(m) == 1 # NAN1 and NAN2 are equivalent
1130+
10901131
def test_lookup_overflow(self, writable):
10911132
xs = np.array([1, 2, 2**63], dtype=np.uint64)
10921133
# GH 21688 ensure we can deal with readonly memory views

0 commit comments

Comments
 (0)