BUG: issues with hash-function for Float64HashTable (GH21866)

realead · realead · commit 7f12a1d4d6b5 · 2018-07-21T08:45:29.000+02:00
The following issues

   1)  hash(0.0) != hash(-0.0)
   2)  hash(x) != hash(y) for x, y that are NaNs with different bit patterns

are solved by setting:

   1) hash(-0.0):=hash(0.0)
   2) hash(x):=hash(np.nan) for every x which is nan
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -471,6 +471,7 @@ Numeric
 
 - Bug in :class:`Series` ``__rmatmul__`` doesn't support matrix vector multiplication (:issue:`21530`)
 - Bug in :func:`factorize` fails with read-only array (:issue:`12813`)
+- Fixed bug where :func:`unique` handled signed zeros inconsistently: for some inputs 0.0 and -0.0 were treated as equal and for some inputs as different. Now they are treated as equal for all inputs (:issue:`21866`)
 -
 -
 
diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h
@@ -19,7 +19,20 @@ khint64_t PANDAS_INLINE asint64(double key) {
   memcpy(&val, &key, sizeof(double));
   return val;
 }
-#define kh_float64_hash_func(key) (khint32_t)((asint64(key))>>33^(asint64(key))^(asint64(key))<<11)
+
+// correct for all inputs except -0.0 and NaNs (hashes the raw bit pattern of key)
+#define kh_float64_hash_func_0_NAN(key) (khint32_t)((asint64(key))>>33^(asint64(key))^(asint64(key))<<11)
+
+// correct for all inputs except NaNs: -0.0 == 0.0, so both are hashed as 0.0
+#define kh_float64_hash_func_NAN(key) ((key) == 0.0 ?                       \
+                                        kh_float64_hash_func_0_NAN(0.0) : \
+                                        kh_float64_hash_func_0_NAN(key))
+
+// correct for all inputs: every NaN is hashed as Py_NAN
+#define kh_float64_hash_func(key) ((key) != (key) ?                       \
+                                   kh_float64_hash_func_NAN(Py_NAN) :     \
+                                   kh_float64_hash_func_NAN(key))
+
 #define kh_float64_hash_equal(a, b) ((a) == (b) || ((b) != (b) && (a) != (a)))
 
 #define KHASH_MAP_INIT_FLOAT64(name, khval_t)								\
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
@@ -7,6 +7,7 @@
 from numpy import nan
 from datetime import datetime
 from itertools import permutations
+import struct
 from pandas import (Series, Categorical, CategoricalIndex,
                     Timestamp, DatetimeIndex, Index, IntervalIndex)
 import pandas as pd
@@ -500,6 +501,23 @@ def test_obj_none_preservation(self):
 
         tm.assert_numpy_array_equal(result, expected, strict_nan=True)
 
+    def test_signed_zero(self):
+        a = np.array([-0.0, 0.0])
+        result = pd.unique(a)
+        expected = np.array([-0.0])  # 0.0 and -0.0 are equivalent
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_different_nans(self):
+        # create different nans from bit-patterns:
+        NAN1 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000000))[0]
+        NAN2 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000001))[0]
+        assert NAN1 != NAN1
+        assert NAN2 != NAN2
+        a = np.array([NAN1, NAN2])  # NAN1 and NAN2 are equivalent
+        result = pd.unique(a)
+        expected = np.array([np.nan])
+        tm.assert_numpy_array_equal(result, expected)
+
 
 class TestIsin(object):
 
@@ -1087,6 +1105,29 @@ def test_lookup_nan(self, writable):
         tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs),
                                                             dtype=np.int64))
 
+    def test_add_signed_zeros(self):
+        # the default hash function would put 0.0 and -0.0 into different
+        # hash buckets only if there were more than 2^30 hash buckets,
+        # which would require about 16GB of memory
+        N = 4  # 12 * 10**8 would trigger the error, if you have enough memory
+        m = ht.Float64HashTable(N)
+        m.set_item(0.0, 0)
+        m.set_item(-0.0, 0)
+        assert len(m) == 1  # 0.0 and -0.0 are equivalent
+
+    def test_add_different_nans(self):
+        # create different nans from bit-patterns:
+        NAN1 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000000))[0]
+        NAN2 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000001))[0]
+        assert NAN1 != NAN1
+        assert NAN2 != NAN2
+        # default hash function would lead to different hash-buckets
+        # for NAN1 and NAN2 even if there are only 4 buckets:
+        m = ht.Float64HashTable()
+        m.set_item(NAN1, 0)
+        m.set_item(NAN2, 0)
+        assert len(m) == 1  # NAN1 and NAN2 are equivalent
+
     def test_lookup_overflow(self, writable):
         xs = np.array([1, 2, 2**63], dtype=np.uint64)
         # GH 21688 ensure we can deal with readonly memory views

Original file line number	Diff line number	Diff line change
`@@ -471,6 +471,7 @@ Numeric`
`471`	`471`
`472`	`472`	- Bug in :class:`Series` ``__rmatmul__`` doesn't support matrix vector multiplication (:issue:`21530`)
`473`	`473`	- Bug in :func:`factorize` fails with read-only array (:issue:`12813`)
	`474`	+- Fixed bug where :func:`unique` handled signed zeros inconsistently: for some inputs 0.0 and -0.0 were treated as equal and for some inputs as different. Now they are treated as equal for all inputs (:issue:`21866`)
`474`	`475`	`-`
`475`	`476`	`-`
`476`	`477`