From 3d3074b7b30c82c9fbdc5fd4a8c0412a6636f8e5 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Thu, 11 May 2017 22:06:35 +0200
Subject: [PATCH 1/8] PERF: improve hash collision check for single MI labels

---
 pandas/_libs/hashtable_class_helper.pxi.in | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
index b80a592669eca..049345bed0fae 100644
--- a/pandas/_libs/hashtable_class_helper.pxi.in
+++ b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -4,6 +4,8 @@ Template for each `dtype` helper function for hashtable
 WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
 """
 
+from pandas.core.dtypes.missing import array_equivalent
+
 #----------------------------------------------------------------------
 # VectorData
 #----------------------------------------------------------------------
@@ -921,6 +923,16 @@ cdef class MultiIndexHashTable(HashTable):
                     "hash collision\nlocs:\n{}\n"
                     "result:\n{}\nmi:\n{}".format(alocs, result, mi))
 
+    def _check_for_collision(self, Py_ssize_t loc, object label):
+        # validate that the loc maps to the actual value
+        # version of _check_for_collisions above for single label (tuple)
+
+        result = self.mi[loc]
+        if not array_equivalent(result, label):
+            raise AssertionError(
+                "hash collision\nloc:\n{}\n"
+                "result:\n{}\nmi:\n{}".format(loc, result, label))
+
     def __contains__(self, object key):
         try:
             self.get_item(key)
@@ -939,8 +951,7 @@ cdef class MultiIndexHashTable(HashTable):
         k = kh_get_uint64(self.table, value)
         if k != self.table.n_buckets:
             loc = self.table.vals[k]
-            locs = np.array([loc], dtype=np.int64)
-            self._check_for_collisions(locs, key)
+            self._check_for_collision(loc, key)
             return loc
         else:
             raise KeyError(key)

From d1d1513a162dcbc9a1d21160212b4768bdb6b000 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Fri, 12 May 2017 22:12:58 +0200
Subject: [PATCH 2/8] PERF: specialized hash function for single tuples

---
 pandas/core/indexes/multi.py      |  4 ++--
 pandas/core/util/hashing.py       | 25 ++++++++++++++++++++++++-
 pandas/tests/util/test_hashing.py |  9 ++++++++-
 3 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 3db5633ec30bd..569e16f2141ae 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -748,7 +748,7 @@ def _hashed_indexing_key(self, key):
         we need to stringify if we have mixed levels
 
         """
-        from pandas.core.util.hashing import hash_tuples
+        from pandas.core.util.hashing import hash_tuples, hash_tuple
 
         if not isinstance(key, tuple):
             return hash_tuples(key)
@@ -762,7 +762,7 @@ def f(k, stringify):
             return k
         key = tuple([f(k, stringify)
                      for k, stringify in zip(key, self._have_mixed_levels)])
-        return hash_tuples(key)
+        return hash_tuple(key)
 
     @Appender(base._shared_docs['duplicated'] % _index_doc_kwargs)
     def duplicated(self, keep='first'):
diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py
index f0829adc94500..771c181ac5a90 100644
--- a/pandas/core/util/hashing.py
+++ b/pandas/core/util/hashing.py
@@ -164,6 +164,29 @@ def hash_tuples(vals, encoding='utf8', hash_key=None):
     return h
 
 
+def hash_tuple(val, encoding='utf8', hash_key=None):
+    """
+    Hash a single tuple efficiently
+
+    Parameters
+    ----------
+    val : single tuple
+    encoding : string, default 'utf8'
+    hash_key : string key to encode, default to _default_hash_key
+
+    Returns
+    -------
+    hash
+
+    """
+    hashes = (hash_array(np.array([v]), encoding=encoding, hash_key=hash_key,
+                         categorize=False)
+              for v in val)
+    h = _combine_hash_arrays(hashes, len(val))[0]
+
+    return h
+
+
 def _hash_categorical(c, encoding, hash_key):
     """
     Hash a Categorical by hashing its categories, and then mapping the codes
@@ -264,7 +287,7 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
 
         try:
             vals = hashing.hash_object_array(vals, hash_key, encoding)
-        except TypeError:
+        except (TypeError, ValueError):
             # we have mixed types
             vals = hashing.hash_object_array(vals.astype(str).astype(object),
                                              hash_key, encoding)
diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py
index e1e6e43529a7d..d1a604e4acecd 100644
--- a/pandas/tests/util/test_hashing.py
+++ b/pandas/tests/util/test_hashing.py
@@ -6,7 +6,7 @@
 
 from pandas import DataFrame, Series, Index, MultiIndex
 from pandas.util import hash_array, hash_pandas_object
-from pandas.core.util.hashing import hash_tuples
+from pandas.core.util.hashing import hash_tuples, hash_tuple
 import pandas.util.testing as tm
 
 
@@ -79,6 +79,13 @@ def test_hash_tuples(self):
         result = hash_tuples(tups[0])
         assert result == expected[0]
 
+    def test_hash_tuple(self):
+        # test equivalence between hash_tuples and hash_tuple
+        tup = (1, 'one')
+        result = hash_tuple(tup)
+        expected = hash_tuples([tup])[0]
+        assert result == expected
+
     def test_hash_tuples_err(self):
 
         for val in [5, 'foo', pd.Timestamp('20130101')]:

From 664d2b35bd0ec240c4d638ccf85f21761dcf8fee Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Mon, 15 May 2017 12:12:16 +0200
Subject: [PATCH 3/8] Address review feedback: cdef collision check, add _hash_scalar

---
 pandas/_libs/hashtable.pxd                 |  2 +
 pandas/_libs/hashtable_class_helper.pxi.in | 10 ++-
 pandas/core/util/hashing.py                | 75 +++++++++++++++++++++-
 pandas/tests/util/test_hashing.py          | 20 ++++--
 4 files changed, 97 insertions(+), 10 deletions(-)

diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd
index 3366751af144d..014da22df3382 100644
--- a/pandas/_libs/hashtable.pxd
+++ b/pandas/_libs/hashtable.pxd
@@ -38,6 +38,8 @@ cdef class MultiIndexHashTable(HashTable):
 
     cpdef get_item(self, object val)
     cpdef set_item(self, object key, Py_ssize_t val)
+    cdef inline void _check_for_collision(self, Py_ssize_t loc, object label)
+
 
 cdef class StringHashTable(HashTable):
     cdef kh_str_t *table
diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
index 049345bed0fae..3ef52c5c59c9d 100644
--- a/pandas/_libs/hashtable_class_helper.pxi.in
+++ b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -4,7 +4,8 @@ Template for each `dtype` helper function for hashtable
 WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
 """
 
-from pandas.core.dtypes.missing import array_equivalent
+from lib cimport is_null_datetimelike
+
 
 #----------------------------------------------------------------------
 # VectorData
@@ -923,12 +924,15 @@ cdef class MultiIndexHashTable(HashTable):
                     "hash collision\nlocs:\n{}\n"
                     "result:\n{}\nmi:\n{}".format(alocs, result, mi))
 
-    def _check_for_collision(self, Py_ssize_t loc, object label):
+    cdef inline void _check_for_collision(self, Py_ssize_t loc, object label):
         # validate that the loc maps to the actual value
         # version of _check_for_collisions above for single label (tuple)
 
         result = self.mi[loc]
-        if not array_equivalent(result, label):
+
+        if not all(l == r or (is_null_datetimelike(l)
+                              and is_null_datetimelike(r))
+                   for l, r in zip(result, label)):
             raise AssertionError(
                 "hash collision\nloc:\n{}\n"
                 "result:\n{}\nmi:\n{}".format(loc, result, label))
diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py
index 771c181ac5a90..522b3b9d7bfb8 100644
--- a/pandas/core/util/hashing.py
+++ b/pandas/core/util/hashing.py
@@ -5,6 +5,7 @@
 
 import numpy as np
 from pandas._libs import hashing
+from pandas.compat import string_and_binary_types, text_type
 from pandas.core.dtypes.generic import (
     ABCMultiIndex,
     ABCIndexClass,
@@ -12,6 +13,8 @@
     ABCDataFrame)
 from pandas.core.dtypes.common import (
     is_categorical_dtype, is_list_like)
+from pandas.core.dtypes.missing import isnull
+
 
 # 16 byte long hashing key
 _default_hash_key = '0123456789123456'
@@ -179,9 +182,17 @@ def hash_tuple(val, encoding='utf8', hash_key=None):
     hash
 
     """
-    hashes = (hash_array(np.array([v]), encoding=encoding, hash_key=hash_key,
-                         categorize=False)
+    #def to_array(v):
+    #    dtype, arr = infer_dtype_from_array([v])
+    #    return np.asarray(arr, dtype=dtype)
+
+    #hashes = (hash_array(to_array(v), encoding=encoding, hash_key=hash_key,
+    #                     categorize=False)
+    #          for v in val)
+
+    hashes = (_hash_scalar(v, encoding=encoding, hash_key=hash_key)
               for v in val)
+
     h = _combine_hash_arrays(hashes, len(val))[0]
 
     return h
@@ -299,3 +310,63 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
     vals *= np.uint64(0x94d049bb133111eb)
     vals ^= vals >> 31
     return vals
+
+
+def _hash_scalar(val, encoding='utf8', hash_key=None):
+    """
+    Hash scalar value
+
+    Returns
+    -------
+    1d uint64 numpy array of hash value, of length 1
+    """
+
+    if hash_key is None:
+        hash_key = _default_hash_key
+
+    if isnull(val):
+        # this is to be consistent with the _hash_categorical implementation
+        return np.array([np.iinfo(np.uint64).max], dtype='u8')
+
+    if isinstance(val, string_and_binary_types + (text_type,)):
+        vals = np.array([val], dtype=object)
+        string_like = True
+    else:
+        vals = np.array([val])
+        string_like = False
+
+    dtype = vals.dtype
+
+    #dtype, vals = infer_dtype_from_array([vals])
+    #if dtype == np.object_:
+    #    vals = np.asarray(vals, dtype='object')
+    #    dtype = vals.dtype
+
+    # we'll be working with everything as 64-bit values, so handle this
+    # 128-bit value early
+    if np.issubdtype(dtype, np.complex128):
+        return hash_array(vals.real) + 23 * hash_array(vals.imag)
+
+    # First, turn whatever array this is into unsigned 64-bit ints, if we can
+    # manage it.
+    elif isinstance(dtype, np.bool):
+        vals = vals.astype('u8')
+    elif issubclass(dtype.type, (np.datetime64, np.timedelta64)):
+        vals = vals.view('i8').astype('u8', copy=False)
+    elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8:
+        vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
+    else:
+        if not string_like:
+            from pandas import Index
+            vals = Index(vals).values
+            return hash_array(vals, hash_key=hash_key, encoding=encoding,
+                              categorize=False)
+        vals = hashing.hash_object_array(vals, hash_key, encoding)
+
+    # Then, redistribute these 64-bit ints within the space of 64-bit ints
+    vals ^= vals >> 30
+    vals *= np.uint64(0xbf58476d1ce4e5b9)
+    vals ^= vals >> 27
+    vals *= np.uint64(0x94d049bb133111eb)
+    vals ^= vals >> 31
+    return vals
diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py
index d1a604e4acecd..f7de891e0e7d9 100644
--- a/pandas/tests/util/test_hashing.py
+++ b/pandas/tests/util/test_hashing.py
@@ -6,7 +6,7 @@
 
 from pandas import DataFrame, Series, Index, MultiIndex
 from pandas.util import hash_array, hash_pandas_object
-from pandas.core.util.hashing import hash_tuples, hash_tuple
+from pandas.core.util.hashing import hash_tuples, hash_tuple, _hash_scalar
 import pandas.util.testing as tm
 
 
@@ -81,10 +81,20 @@ def test_hash_tuples(self):
 
     def test_hash_tuple(self):
         # test equivalence between hash_tuples and hash_tuple
-        tup = (1, 'one')
-        result = hash_tuple(tup)
-        expected = hash_tuples([tup])[0]
-        assert result == expected
+        for tup in [(1, 'one'), (1, np.nan)]:
+            result = hash_tuple(tup)
+            expected = hash_tuples([tup])[0]
+            assert result == expected
+
+    def test_hash_scalar(self):
+        for val in [1, 1.4, 'A', b'A', u'A',  pd.Timestamp("2012-01-01"),
+                    pd.Timestamp("2012-01-01", tz='Europe/Brussels'),
+                    pd.Period('2012-01-01', freq='D'), pd.Timedelta('1 days'),
+                    pd.Interval(0, 1), np.nan, pd.NaT, None]:
+            result = _hash_scalar(val)
+            expected = hash_array(np.array([val], dtype=object),
+                                  categorize=True)
+            assert result[0] == expected[0]
 
     def test_hash_tuples_err(self):
 

From 7cd3cc16b75f4cf1dbf7a90c97f93e59cb703ecc Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Mon, 15 May 2017 19:33:22 +0200
Subject: [PATCH 4/8] Simplify to reduce code duplication

---
 pandas/core/util/hashing.py | 62 ++++++++-----------------------------
 1 file changed, 13 insertions(+), 49 deletions(-)

diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py
index 522b3b9d7bfb8..cec8ef35c3aaa 100644
--- a/pandas/core/util/hashing.py
+++ b/pandas/core/util/hashing.py
@@ -182,14 +182,6 @@ def hash_tuple(val, encoding='utf8', hash_key=None):
     hash
 
     """
-    #def to_array(v):
-    #    dtype, arr = infer_dtype_from_array([v])
-    #    return np.asarray(arr, dtype=dtype)
-
-    #hashes = (hash_array(to_array(v), encoding=encoding, hash_key=hash_key,
-    #                     categorize=False)
-    #          for v in val)
-
     hashes = (_hash_scalar(v, encoding=encoding, hash_key=hash_key)
               for v in val)
 
@@ -298,7 +290,7 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
 
         try:
             vals = hashing.hash_object_array(vals, hash_key, encoding)
-        except (TypeError, ValueError):
+        except TypeError:
             # we have mixed types
             vals = hashing.hash_object_array(vals.astype(str).astype(object),
                                              hash_key, encoding)
@@ -321,52 +313,24 @@ def _hash_scalar(val, encoding='utf8', hash_key=None):
     1d uint64 numpy array of hash value, of length 1
     """
 
-    if hash_key is None:
-        hash_key = _default_hash_key
-
     if isnull(val):
         # this is to be consistent with the _hash_categorical implementation
         return np.array([np.iinfo(np.uint64).max], dtype='u8')
 
     if isinstance(val, string_and_binary_types + (text_type,)):
         vals = np.array([val], dtype=object)
-        string_like = True
     else:
         vals = np.array([val])
-        string_like = False
-
-    dtype = vals.dtype
-
-    #dtype, vals = infer_dtype_from_array([vals])
-    #if dtype == np.object_:
-    #    vals = np.asarray(vals, dtype='object')
-    #    dtype = vals.dtype
 
-    # we'll be working with everything as 64-bit values, so handle this
-    # 128-bit value early
-    if np.issubdtype(dtype, np.complex128):
-        return hash_array(vals.real) + 23 * hash_array(vals.imag)
-
-    # First, turn whatever array this is into unsigned 64-bit ints, if we can
-    # manage it.
-    elif isinstance(dtype, np.bool):
-        vals = vals.astype('u8')
-    elif issubclass(dtype.type, (np.datetime64, np.timedelta64)):
-        vals = vals.view('i8').astype('u8', copy=False)
-    elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8:
-        vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
-    else:
-        if not string_like:
-            from pandas import Index
-            vals = Index(vals).values
-            return hash_array(vals, hash_key=hash_key, encoding=encoding,
-                              categorize=False)
-        vals = hashing.hash_object_array(vals, hash_key, encoding)
-
-    # Then, redistribute these 64-bit ints within the space of 64-bit ints
-    vals ^= vals >> 30
-    vals *= np.uint64(0xbf58476d1ce4e5b9)
-    vals ^= vals >> 27
-    vals *= np.uint64(0x94d049bb133111eb)
-    vals ^= vals >> 31
-    return vals
+        if vals.dtype == np.object_:
+            from pandas import Timestamp, Timedelta, Period, Interval
+            if isinstance(val, (Timestamp, Timedelta)):
+                vals = np.array([val.value])
+            elif isinstance(val, (Period, Interval)):
+                pass
+            else:
+                from pandas import Index
+                vals = Index(vals).values
+
+    return hash_array(vals, hash_key=hash_key, encoding=encoding,
+                      categorize=False)

From 3bd0404f2582403bc660facd5d1932b306b624e4 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Tue, 16 May 2017 10:10:50 +0200
Subject: [PATCH 5/8] use infer_dtype_from_scalar

---
 pandas/core/dtypes/cast.py        |  4 ++--
 pandas/core/util/hashing.py       | 18 +++---------------
 pandas/tests/util/test_hashing.py |  4 ++--
 3 files changed, 7 insertions(+), 19 deletions(-)

diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index 19d3792f73de7..0089cc94fe6f4 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -333,7 +333,7 @@ def maybe_promote(dtype, fill_value=np.nan):
     return dtype, fill_value
 
 
-def infer_dtype_from_scalar(val, pandas_dtype=False):
+def infer_dtype_from_scalar(val, pandas_dtype=False, use_datetimetz=True):
     """
     interpret the dtype from a scalar
 
@@ -368,7 +368,7 @@ def infer_dtype_from_scalar(val, pandas_dtype=False):
 
     elif isinstance(val, (np.datetime64, datetime)):
         val = tslib.Timestamp(val)
-        if val is tslib.NaT or val.tz is None:
+        if val is tslib.NaT or val.tz is None or not use_datetimetz:
             dtype = np.dtype('M8[ns]')
         else:
             if pandas_dtype:
diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py
index cec8ef35c3aaa..87231ab0b7754 100644
--- a/pandas/core/util/hashing.py
+++ b/pandas/core/util/hashing.py
@@ -5,7 +5,6 @@
 
 import numpy as np
 from pandas._libs import hashing
-from pandas.compat import string_and_binary_types, text_type
 from pandas.core.dtypes.generic import (
     ABCMultiIndex,
     ABCIndexClass,
@@ -14,6 +13,7 @@
 from pandas.core.dtypes.common import (
     is_categorical_dtype, is_list_like)
 from pandas.core.dtypes.missing import isnull
+from pandas.core.dtypes.cast import infer_dtype_from_scalar
 
 
 # 16 byte long hashing key
@@ -317,20 +317,8 @@ def _hash_scalar(val, encoding='utf8', hash_key=None):
         # this is to be consistent with the _hash_categorical implementation
         return np.array([np.iinfo(np.uint64).max], dtype='u8')
 
-    if isinstance(val, string_and_binary_types + (text_type,)):
-        vals = np.array([val], dtype=object)
-    else:
-        vals = np.array([val])
-
-        if vals.dtype == np.object_:
-            from pandas import Timestamp, Timedelta, Period, Interval
-            if isinstance(val, (Timestamp, Timedelta)):
-                vals = np.array([val.value])
-            elif isinstance(val, (Period, Interval)):
-                pass
-            else:
-                from pandas import Index
-                vals = Index(vals).values
+    dtype, val = infer_dtype_from_scalar(val, use_datetimetz=False)
+    vals = np.array([val], dtype=dtype)
 
     return hash_array(vals, hash_key=hash_key, encoding=encoding,
                       categorize=False)
diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py
index f7de891e0e7d9..c0efe65371651 100644
--- a/pandas/tests/util/test_hashing.py
+++ b/pandas/tests/util/test_hashing.py
@@ -81,13 +81,13 @@ def test_hash_tuples(self):
 
     def test_hash_tuple(self):
         # test equivalence between hash_tuples and hash_tuple
-        for tup in [(1, 'one'), (1, np.nan)]:
+        for tup in [(1, 'one'), (1, np.nan), (1.0, pd.NaT, 'A')]:
             result = hash_tuple(tup)
             expected = hash_tuples([tup])[0]
             assert result == expected
 
     def test_hash_scalar(self):
-        for val in [1, 1.4, 'A', b'A', u'A',  pd.Timestamp("2012-01-01"),
+        for val in [1, 1.4, 'A', b'A', u'A', pd.Timestamp("2012-01-01"),
                     pd.Timestamp("2012-01-01", tz='Europe/Brussels'),
                     pd.Period('2012-01-01', freq='D'), pd.Timedelta('1 days'),
                     pd.Interval(0, 1), np.nan, pd.NaT, None]:

From 287817a8b9c49eb3c15bc2e49f41b19e7d912084 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Tue, 16 May 2017 16:35:46 +0200
Subject: [PATCH 6/8] move check for datetime tz to hashing function

---
 pandas/core/dtypes/cast.py        |  4 ++--
 pandas/core/util/hashing.py       | 12 ++++++++++--
 pandas/tests/util/test_hashing.py | 11 ++++++++---
 3 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index 0089cc94fe6f4..19d3792f73de7 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -333,7 +333,7 @@ def maybe_promote(dtype, fill_value=np.nan):
     return dtype, fill_value
 
 
-def infer_dtype_from_scalar(val, pandas_dtype=False, use_datetimetz=True):
+def infer_dtype_from_scalar(val, pandas_dtype=False):
     """
     interpret the dtype from a scalar
 
@@ -368,7 +368,7 @@ def infer_dtype_from_scalar(val, pandas_dtype=False, use_datetimetz=True):
 
     elif isinstance(val, (np.datetime64, datetime)):
         val = tslib.Timestamp(val)
-        if val is tslib.NaT or val.tz is None or not use_datetimetz:
+        if val is tslib.NaT or val.tz is None:
             dtype = np.dtype('M8[ns]')
         else:
             if pandas_dtype:
diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py
index 87231ab0b7754..e41ffae9d03c2 100644
--- a/pandas/core/util/hashing.py
+++ b/pandas/core/util/hashing.py
@@ -4,7 +4,7 @@
 import itertools
 
 import numpy as np
-from pandas._libs import hashing
+from pandas._libs import hashing, tslib
 from pandas.core.dtypes.generic import (
     ABCMultiIndex,
     ABCIndexClass,
@@ -317,7 +317,15 @@ def _hash_scalar(val, encoding='utf8', hash_key=None):
         # this is to be consistent with the _hash_categorical implementation
         return np.array([np.iinfo(np.uint64).max], dtype='u8')
 
-    dtype, val = infer_dtype_from_scalar(val, use_datetimetz=False)
+    if getattr(val, 'tzinfo', None) is not None:
+        # for tz-aware datetimes, we need the underlying naive UTC value and
+        # not the tz aware object or pd extension type (as
+        # infer_dtype_from_scalar would do)
+        if not isinstance(val, tslib.Timestamp):
+            val = tslib.Timestamp(val)
+        val = val.tz_convert(None)
+
+    dtype, val = infer_dtype_from_scalar(val)
     vals = np.array([val], dtype=dtype)
 
     return hash_array(vals, hash_key=hash_key, encoding=encoding,
diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py
index c0efe65371651..289592939e3da 100644
--- a/pandas/tests/util/test_hashing.py
+++ b/pandas/tests/util/test_hashing.py
@@ -1,4 +1,5 @@
 import pytest
+import datetime
 
 from warnings import catch_warnings
 import numpy as np
@@ -81,7 +82,8 @@ def test_hash_tuples(self):
 
     def test_hash_tuple(self):
         # test equivalence between hash_tuples and hash_tuple
-        for tup in [(1, 'one'), (1, np.nan), (1.0, pd.NaT, 'A')]:
+        for tup in [(1, 'one'), (1, np.nan), (1.0, pd.NaT, 'A'),
+                    ('A', pd.Timestamp("2012-01-01"))]:
             result = hash_tuple(tup)
             expected = hash_tuples([tup])[0]
             assert result == expected
@@ -89,8 +91,11 @@ def test_hash_tuple(self):
     def test_hash_scalar(self):
         for val in [1, 1.4, 'A', b'A', u'A', pd.Timestamp("2012-01-01"),
                     pd.Timestamp("2012-01-01", tz='Europe/Brussels'),
-                    pd.Period('2012-01-01', freq='D'), pd.Timedelta('1 days'),
-                    pd.Interval(0, 1), np.nan, pd.NaT, None]:
+                    datetime.datetime(2012, 1, 1),
+                    pd.Timestamp("2012-01-01", tz='EST').to_pydatetime(),
+                    pd.Timedelta('1 days'), datetime.timedelta(1),
+                    pd.Period('2012-01-01', freq='D'), pd.Interval(0, 1),
+                    np.nan, pd.NaT, None]:
             result = _hash_scalar(val)
             expected = hash_array(np.array([val], dtype=object),
                                   categorize=True)

From 638f011c4deaf4d54fe09df3151f27dcd189a029 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Tue, 16 May 2017 16:38:35 +0200
Subject: [PATCH 7/8] update whatsnew

---
 doc/source/whatsnew/v0.20.2.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt
index 10a6b4354290d..7773f5abfb0ba 100644
--- a/doc/source/whatsnew/v0.20.2.txt
+++ b/doc/source/whatsnew/v0.20.2.txt
@@ -27,9 +27,10 @@ Performance Improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
 - Performance regression fix when indexing with a list-like (:issue:`16285`)
-- Performance regression fix for small MultiIndexes (:issuse:`16319`)
+- Performance regression fix for MultiIndexes (:issue:`16319`, :issue:`16346`)
 - Improved performance of ``.clip()`` with scalar arguments (:issue:`15400`)
 
+
 .. _whatsnew_0202.bug_fixes:
 
 Bug Fixes

From 8acc9e84c64009ddb3ec87126f0cd814e24260c3 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Tue, 16 May 2017 17:15:59 +0200
Subject: [PATCH 8/8] add benchmarks

---
 asv_bench/benchmarks/indexing.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py
index e1676715853a4..6a2c9d48c4a28 100644
--- a/asv_bench/benchmarks/indexing.py
+++ b/asv_bench/benchmarks/indexing.py
@@ -227,12 +227,24 @@ def time_multiindex_get_indexer(self):
     def time_multiindex_large_get_loc(self):
         self.mi_large.get_loc((999, 19, 'Z'))
 
+    def time_multiindex_large_get_loc_warm(self):
+        for _ in range(1000):
+            self.mi_large.get_loc((999, 19, 'Z'))
+
     def time_multiindex_med_get_loc(self):
         self.mi_med.get_loc((999, 9, 'A'))
 
+    def time_multiindex_med_get_loc_warm(self):
+        for _ in range(1000):
+            self.mi_med.get_loc((999, 9, 'A'))
+
     def time_multiindex_string_get_loc(self):
         self.mi_small.get_loc((99, 'A', 'A'))
 
+    def time_multiindex_small_get_loc_warm(self):
+        for _ in range(1000):
+            self.mi_small.get_loc((99, 'A', 'A'))
+
     def time_is_monotonic(self):
         self.miint.is_monotonic