From 3d3074b7b30c82c9fbdc5fd4a8c0412a6636f8e5 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 11 May 2017 22:06:35 +0200 Subject: [PATCH 1/8] PERF: improve hash collision check for single MI labels --- pandas/_libs/hashtable_class_helper.pxi.in | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index b80a592669eca..049345bed0fae 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -4,6 +4,8 @@ Template for each `dtype` helper function for hashtable WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in """ +from pandas.core.dtypes.missing import array_equivalent + #---------------------------------------------------------------------- # VectorData #---------------------------------------------------------------------- @@ -921,6 +923,16 @@ cdef class MultiIndexHashTable(HashTable): "hash collision\nlocs:\n{}\n" "result:\n{}\nmi:\n{}".format(alocs, result, mi)) + def _check_for_collision(self, Py_ssize_t loc, object label): + # validate that the loc maps to the actual value + # version of _check_for_collisions above for single label (tuple) + + result = self.mi[loc] + if not array_equivalent(result, label): + raise AssertionError( + "hash collision\nloc:\n{}\n" + "result:\n{}\nmi:\n{}".format(loc, result, label)) + def __contains__(self, object key): try: self.get_item(key) @@ -939,8 +951,7 @@ cdef class MultiIndexHashTable(HashTable): k = kh_get_uint64(self.table, value) if k != self.table.n_buckets: loc = self.table.vals[k] - locs = np.array([loc], dtype=np.int64) - self._check_for_collisions(locs, key) + self._check_for_collision(loc, key) return loc else: raise KeyError(key) From d1d1513a162dcbc9a1d21160212b4768bdb6b000 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 12 May 2017 22:12:58 +0200 Subject: [PATCH 2/8] PERF: specialized hash function for single tuples --- pandas/core/indexes/multi.py | 4 ++-- pandas/core/util/hashing.py | 25 ++++++++++++++++++++++++- pandas/tests/util/test_hashing.py | 9 ++++++++- 3 files changed, 34 insertions(+), 4 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 3db5633ec30bd..569e16f2141ae 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -748,7 +748,7 @@ def _hashed_indexing_key(self, key): we need to stringify if we have mixed levels """ - from pandas.core.util.hashing import hash_tuples + from pandas.core.util.hashing import hash_tuples, hash_tuple if not isinstance(key, tuple): return hash_tuples(key) @@ -762,7 +762,7 @@ def f(k, stringify): return k key = tuple([f(k, stringify) for k, stringify in zip(key, self._have_mixed_levels)]) - return hash_tuples(key) + return hash_tuple(key) @Appender(base._shared_docs['duplicated'] % _index_doc_kwargs) def duplicated(self, keep='first'): diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index f0829adc94500..771c181ac5a90 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -164,6 +164,29 @@ def hash_tuples(vals, encoding='utf8', hash_key=None): return h +def hash_tuple(val, encoding='utf8', hash_key=None): + """ + Hash a single tuple efficiently + + Parameters + ---------- + val : single tuple + encoding : string, default 'utf8' + hash_key : string key to encode, default to _default_hash_key + + Returns + ------- + hash + + """ + hashes = (hash_array(np.array([v]), encoding=encoding, hash_key=hash_key, + categorize=False) + for v in val) + h = _combine_hash_arrays(hashes, len(val))[0] + + return h + + def _hash_categorical(c, encoding, hash_key): """ Hash a Categorical by hashing its categories, and then mapping the codes @@ -264,7 +287,7 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): try: vals = hashing.hash_object_array(vals, hash_key, encoding) - except TypeError: + except (TypeError, ValueError): # we have mixed types vals = hashing.hash_object_array(vals.astype(str).astype(object), hash_key, encoding) diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py index e1e6e43529a7d..d1a604e4acecd 100644 --- a/pandas/tests/util/test_hashing.py +++ b/pandas/tests/util/test_hashing.py @@ -6,7 +6,7 @@ from pandas import DataFrame, Series, Index, MultiIndex from pandas.util import hash_array, hash_pandas_object -from pandas.core.util.hashing import hash_tuples +from pandas.core.util.hashing import hash_tuples, hash_tuple import pandas.util.testing as tm @@ -79,6 +79,13 @@ def test_hash_tuples(self): result = hash_tuples(tups[0]) assert result == expected[0] + def test_hash_tuple(self): + # test equivalence between hash_tuples and hash_tuple + tup = (1, 'one') + result = hash_tuple(tup) + expected = hash_tuples([tup])[0] + assert result == expected + def test_hash_tuples_err(self): for val in [5, 'foo', pd.Timestamp('20130101')]: From 664d2b35bd0ec240c4d638ccf85f21761dcf8fee Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 15 May 2017 12:12:16 +0200 Subject: [PATCH 3/8] feedback --- pandas/_libs/hashtable.pxd | 2 + pandas/_libs/hashtable_class_helper.pxi.in | 10 ++- pandas/core/util/hashing.py | 75 +++++++++++++++++++++- pandas/tests/util/test_hashing.py | 20 ++++-- 4 files changed, 97 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd index 3366751af144d..014da22df3382 100644 --- a/pandas/_libs/hashtable.pxd +++ b/pandas/_libs/hashtable.pxd @@ -38,6 +38,8 @@ cdef class MultiIndexHashTable(HashTable): cpdef get_item(self, object val) cpdef set_item(self, object key, Py_ssize_t val) + cdef inline void _check_for_collision(self, Py_ssize_t loc, object label) + cdef class StringHashTable(HashTable): cdef kh_str_t *table diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 049345bed0fae..3ef52c5c59c9d 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -4,7 +4,8 @@ Template for each `dtype` helper function for hashtable WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in """ -from pandas.core.dtypes.missing import array_equivalent +from lib cimport is_null_datetimelike + #---------------------------------------------------------------------- # VectorData @@ -923,12 +924,15 @@ cdef class MultiIndexHashTable(HashTable): "hash collision\nlocs:\n{}\n" "result:\n{}\nmi:\n{}".format(alocs, result, mi)) - def _check_for_collision(self, Py_ssize_t loc, object label): + cdef inline void _check_for_collision(self, Py_ssize_t loc, object label): # validate that the loc maps to the actual value # version of _check_for_collisions above for single label (tuple) result = self.mi[loc] - if not array_equivalent(result, label): + + if not all(l == r or (is_null_datetimelike(l) + and is_null_datetimelike(r)) + for l, r in zip(result, label)): raise AssertionError( "hash collision\nloc:\n{}\n" "result:\n{}\nmi:\n{}".format(loc, result, label)) diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index 771c181ac5a90..522b3b9d7bfb8 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -5,6 +5,7 @@ import numpy as np from pandas._libs import hashing +from pandas.compat import string_and_binary_types, text_type from pandas.core.dtypes.generic import ( ABCMultiIndex, ABCIndexClass, @@ -12,6 +13,8 @@ ABCDataFrame) from pandas.core.dtypes.common import ( is_categorical_dtype, is_list_like) +from pandas.core.dtypes.missing import isnull + # 16 byte long hashing key _default_hash_key = '0123456789123456' @@ -179,9 +182,17 @@ def hash_tuple(val, encoding='utf8', hash_key=None): hash """ - hashes = (hash_array(np.array([v]), encoding=encoding, hash_key=hash_key, - categorize=False) + #def to_array(v): + # dtype, arr = infer_dtype_from_array([v]) + # return np.asarray(arr, dtype=dtype) + + #hashes = (hash_array(to_array(v), encoding=encoding, hash_key=hash_key, + # categorize=False) + # for v in val) + + hashes = (_hash_scalar(v, encoding=encoding, hash_key=hash_key) for v in val) + h = _combine_hash_arrays(hashes, len(val))[0] return h @@ -299,3 +310,63 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): vals *= np.uint64(0x94d049bb133111eb) vals ^= vals >> 31 return vals + + +def _hash_scalar(val, encoding='utf8', hash_key=None): + """ + Hash scalar value + + Returns + ------- + 1d uint64 numpy array of hash value, of length 1 + """ + + if hash_key is None: + hash_key = _default_hash_key + + if isnull(val): + # this is to be consistent with the _hash_categorical implementation + return np.array([np.iinfo(np.uint64).max], dtype='u8') + + if isinstance(val, string_and_binary_types + (text_type,)): + vals = np.array([val], dtype=object) + string_like = True + else: + vals = np.array([val]) + string_like = False + + dtype = vals.dtype + + #dtype, vals = infer_dtype_from_array([vals]) + #if dtype == np.object_: + # vals = np.asarray(vals, dtype='object') + # dtype = vals.dtype + + # we'll be working with everything as 64-bit values, so handle this + # 128-bit value early + if np.issubdtype(dtype, np.complex128): + return hash_array(vals.real) + 23 * hash_array(vals.imag) + + # First, turn whatever array this is into unsigned 64-bit ints, if we can + # manage it. + elif isinstance(dtype, np.bool): + vals = vals.astype('u8') + elif issubclass(dtype.type, (np.datetime64, np.timedelta64)): + vals = vals.view('i8').astype('u8', copy=False) + elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8: + vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8') + else: + if not string_like: + from pandas import Index + vals = Index(vals).values + return hash_array(vals, hash_key=hash_key, encoding=encoding, + categorize=False) + vals = hashing.hash_object_array(vals, hash_key, encoding) + + # Then, redistribute these 64-bit ints within the space of 64-bit ints + vals ^= vals >> 30 + vals *= np.uint64(0xbf58476d1ce4e5b9) + vals ^= vals >> 27 + vals *= np.uint64(0x94d049bb133111eb) + vals ^= vals >> 31 + return vals diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py index d1a604e4acecd..f7de891e0e7d9 100644 --- a/pandas/tests/util/test_hashing.py +++ b/pandas/tests/util/test_hashing.py @@ -6,7 +6,7 @@ from pandas import DataFrame, Series, Index, MultiIndex from pandas.util import hash_array, hash_pandas_object -from pandas.core.util.hashing import hash_tuples, hash_tuple +from pandas.core.util.hashing import hash_tuples, hash_tuple, _hash_scalar import pandas.util.testing as tm @@ -81,10 +81,20 @@ def test_hash_tuples(self): def test_hash_tuple(self): # test equivalence between hash_tuples and hash_tuple - tup = (1, 'one') - result = hash_tuple(tup) - expected = hash_tuples([tup])[0] - assert result == expected + for tup in [(1, 'one'), (1, np.nan)]: + result = hash_tuple(tup) + expected = hash_tuples([tup])[0] + assert result == expected + + def test_hash_scalar(self): + for val in [1, 1.4, 'A', b'A', u'A', pd.Timestamp("2012-01-01"), + pd.Timestamp("2012-01-01", tz='Europe/Brussels'), + pd.Period('2012-01-01', freq='D'), pd.Timedelta('1 days'), + pd.Interval(0, 1), np.nan, pd.NaT, None]: + result = _hash_scalar(val) + expected = hash_array(np.array([val], dtype=object), + categorize=True) + assert result[0] == expected[0] def test_hash_tuples_err(self): From 7cd3cc16b75f4cf1dbf7a90c97f93e59cb703ecc Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 15 May 2017 19:33:22 +0200 Subject: [PATCH 4/8] Simplify to reduce code duplication --- pandas/core/util/hashing.py | 62 ++++++++----------------------------- 1 file changed, 13 insertions(+), 49 deletions(-) diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index 522b3b9d7bfb8..cec8ef35c3aaa 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -182,14 +182,6 @@ def hash_tuple(val, encoding='utf8', hash_key=None): hash """ - #def to_array(v): - # dtype, arr = infer_dtype_from_array([v]) - # return np.asarray(arr, dtype=dtype) - - #hashes = (hash_array(to_array(v), encoding=encoding, hash_key=hash_key, - # categorize=False) - # for v in val) - hashes = (_hash_scalar(v, encoding=encoding, hash_key=hash_key) for v in val) @@ -298,7 +290,7 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): try: vals = hashing.hash_object_array(vals, hash_key, encoding) - except (TypeError, ValueError): + except TypeError: # we have mixed types vals = hashing.hash_object_array(vals.astype(str).astype(object), hash_key, encoding) @@ -321,52 +313,24 @@ def _hash_scalar(val, encoding='utf8', hash_key=None): 1d uint64 numpy array of hash value, of length 1 """ - if hash_key is None: - hash_key = _default_hash_key - if isnull(val): # this is to be consistent with the _hash_categorical implementation return np.array([np.iinfo(np.uint64).max], dtype='u8') if isinstance(val, string_and_binary_types + (text_type,)): vals = np.array([val], dtype=object) - string_like = True else: vals = np.array([val]) - string_like = False - - dtype = vals.dtype - - #dtype, vals = infer_dtype_from_array([vals]) - #if dtype == np.object_: - # vals = np.asarray(vals, dtype='object') - # dtype = vals.dtype - # we'll be working with everything as 64-bit values, so handle this - # 128-bit value early - if np.issubdtype(dtype, np.complex128): - return hash_array(vals.real) + 23 * hash_array(vals.imag) - - # First, turn whatever array this is into unsigned 64-bit ints, if we can - # manage it. - elif isinstance(dtype, np.bool): - vals = vals.astype('u8') - elif issubclass(dtype.type, (np.datetime64, np.timedelta64)): - vals = vals.view('i8').astype('u8', copy=False) - elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8: - vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8') - else: - if not string_like: - from pandas import Index - vals = Index(vals).values - return hash_array(vals, hash_key=hash_key, encoding=encoding, - categorize=False) - vals = hashing.hash_object_array(vals, hash_key, encoding) - - # Then, redistribute these 64-bit ints within the space of 64-bit ints - vals ^= vals >> 30 - vals *= np.uint64(0xbf58476d1ce4e5b9) - vals ^= vals >> 27 - vals *= np.uint64(0x94d049bb133111eb) - vals ^= vals >> 31 - return vals + if vals.dtype == np.object_: + from pandas import Timestamp, Timedelta, Period, Interval + if isinstance(val, (Timestamp, Timedelta)): + vals = np.array([val.value]) + elif isinstance(val, (Period, Interval)): + pass + else: + from pandas import Index + vals = Index(vals).values + + return hash_array(vals, hash_key=hash_key, encoding=encoding, + categorize=False) From 3bd0404f2582403bc660facd5d1932b306b624e4 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 16 May 2017 10:10:50 +0200 Subject: [PATCH 5/8] use infer_dtype_from_scalar --- pandas/core/dtypes/cast.py | 4 ++-- pandas/core/util/hashing.py | 18 +++--------------- pandas/tests/util/test_hashing.py | 4 ++-- 3 files changed, 7 insertions(+), 19 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 19d3792f73de7..0089cc94fe6f4 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -333,7 +333,7 @@ def maybe_promote(dtype, fill_value=np.nan): return dtype, fill_value -def infer_dtype_from_scalar(val, pandas_dtype=False): +def infer_dtype_from_scalar(val, pandas_dtype=False, use_datetimetz=True): """ interpret the dtype from a scalar @@ -368,7 +368,7 @@ def infer_dtype_from_scalar(val, pandas_dtype=False): elif isinstance(val, (np.datetime64, datetime)): val = tslib.Timestamp(val) - if val is tslib.NaT or val.tz is None: + if val is tslib.NaT or val.tz is None or not use_datetimetz: dtype = np.dtype('M8[ns]') else: if pandas_dtype: diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index cec8ef35c3aaa..87231ab0b7754 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -5,7 +5,6 @@ import numpy as np from pandas._libs import hashing -from pandas.compat import string_and_binary_types, text_type from pandas.core.dtypes.generic import ( ABCMultiIndex, ABCIndexClass, @@ -14,6 +13,7 @@ from pandas.core.dtypes.common import ( is_categorical_dtype, is_list_like) from pandas.core.dtypes.missing import isnull +from pandas.core.dtypes.cast import infer_dtype_from_scalar # 16 byte long hashing key @@ -317,20 +317,8 @@ def _hash_scalar(val, encoding='utf8', hash_key=None): # this is to be consistent with the _hash_categorical implementation return np.array([np.iinfo(np.uint64).max], dtype='u8') - if isinstance(val, string_and_binary_types + (text_type,)): - vals = np.array([val], dtype=object) - else: - vals = np.array([val]) - - if vals.dtype == np.object_: - from pandas import Timestamp, Timedelta, Period, Interval - if isinstance(val, (Timestamp, Timedelta)): - vals = np.array([val.value]) - elif isinstance(val, (Period, Interval)): - pass - else: - from pandas import Index - vals = Index(vals).values + dtype, val = infer_dtype_from_scalar(val, use_datetimetz=False) + vals = np.array([val], dtype=dtype) return hash_array(vals, hash_key=hash_key, encoding=encoding, categorize=False) diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py index f7de891e0e7d9..c0efe65371651 100644 --- a/pandas/tests/util/test_hashing.py +++ b/pandas/tests/util/test_hashing.py @@ -81,13 +81,13 @@ def test_hash_tuples(self): def test_hash_tuple(self): # test equivalence between hash_tuples and hash_tuple - for tup in [(1, 'one'), (1, np.nan)]: + for tup in [(1, 'one'), (1, np.nan), (1.0, pd.NaT, 'A')]: result = hash_tuple(tup) expected = hash_tuples([tup])[0] assert result == expected def test_hash_scalar(self): - for val in [1, 1.4, 'A', b'A', u'A', pd.Timestamp("2012-01-01"), + for val in [1, 1.4, 'A', b'A', u'A', pd.Timestamp("2012-01-01"), pd.Timestamp("2012-01-01", tz='Europe/Brussels'), pd.Period('2012-01-01', freq='D'), pd.Timedelta('1 days'), pd.Interval(0, 1), np.nan, pd.NaT, None]: From 287817a8b9c49eb3c15bc2e49f41b19e7d912084 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 16 May 2017 16:35:46 +0200 Subject: [PATCH 6/8] move check for datetime tz to hashing function --- pandas/core/dtypes/cast.py | 4 ++-- pandas/core/util/hashing.py | 12 ++++++++++-- pandas/tests/util/test_hashing.py | 11 ++++++++--- 3 files changed, 20 insertions(+), 7 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 0089cc94fe6f4..19d3792f73de7 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -333,7 +333,7 @@ def maybe_promote(dtype, fill_value=np.nan): return dtype, fill_value -def infer_dtype_from_scalar(val, pandas_dtype=False, use_datetimetz=True): +def infer_dtype_from_scalar(val, pandas_dtype=False): """ interpret the dtype from a scalar @@ -368,7 +368,7 @@ def infer_dtype_from_scalar(val, pandas_dtype=False, use_datetimetz=True): elif isinstance(val, (np.datetime64, datetime)): val = tslib.Timestamp(val) - if val is tslib.NaT or val.tz is None or not use_datetimetz: + if val is tslib.NaT or val.tz is None: dtype = np.dtype('M8[ns]') else: if pandas_dtype: diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index 87231ab0b7754..e41ffae9d03c2 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -4,7 +4,7 @@ import itertools import numpy as np -from pandas._libs import hashing +from pandas._libs import hashing, tslib from pandas.core.dtypes.generic import ( ABCMultiIndex, ABCIndexClass, @@ -317,7 +317,15 @@ def _hash_scalar(val, encoding='utf8', hash_key=None): # this is to be consistent with the _hash_categorical implementation return np.array([np.iinfo(np.uint64).max], dtype='u8') - dtype, val = infer_dtype_from_scalar(val, use_datetimetz=False) + if getattr(val, 'tzinfo', None) is not None: + # for tz-aware datetimes, we need the underlying naive UTC value and + # not the tz aware object or pd extension type (as + # infer_dtype_from_scalar would do) + if not isinstance(val, tslib.Timestamp): + val = tslib.Timestamp(val) + val = val.tz_convert(None) + + dtype, val = infer_dtype_from_scalar(val) vals = np.array([val], dtype=dtype) return hash_array(vals, hash_key=hash_key, encoding=encoding, diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py index c0efe65371651..289592939e3da 100644 --- a/pandas/tests/util/test_hashing.py +++ b/pandas/tests/util/test_hashing.py @@ -1,4 +1,5 @@ import pytest +import datetime from warnings import catch_warnings import numpy as np @@ -81,7 +82,8 @@ def test_hash_tuples(self): def test_hash_tuple(self): # test equivalence between hash_tuples and hash_tuple - for tup in [(1, 'one'), (1, np.nan), (1.0, pd.NaT, 'A')]: + for tup in [(1, 'one'), (1, np.nan), (1.0, pd.NaT, 'A'), + ('A', pd.Timestamp("2012-01-01"))]: result = hash_tuple(tup) expected = hash_tuples([tup])[0] assert result == expected @@ -89,8 +91,11 @@ def test_hash_tuple(self): def test_hash_scalar(self): for val in [1, 1.4, 'A', b'A', u'A', pd.Timestamp("2012-01-01"), pd.Timestamp("2012-01-01", tz='Europe/Brussels'), - pd.Period('2012-01-01', freq='D'), pd.Timedelta('1 days'), - pd.Interval(0, 1), np.nan, pd.NaT, None]: + datetime.datetime(2012, 1, 1), + pd.Timestamp("2012-01-01", tz='EST').to_pydatetime(), + pd.Timedelta('1 days'), datetime.timedelta(1), + pd.Period('2012-01-01', freq='D'), pd.Interval(0, 1), + np.nan, pd.NaT, None]: result = _hash_scalar(val) expected = hash_array(np.array([val], dtype=object), categorize=True) From 638f011c4deaf4d54fe09df3151f27dcd189a029 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 16 May 2017 16:38:35 +0200 Subject: [PATCH 7/8] update whatsnew --- doc/source/whatsnew/v0.20.2.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt index 10a6b4354290d..7773f5abfb0ba 100644 --- a/doc/source/whatsnew/v0.20.2.txt +++ b/doc/source/whatsnew/v0.20.2.txt @@ -27,9 +27,10 @@ Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Performance regression fix when indexing with a list-like (:issue:`16285`) -- Performance regression fix for small MultiIndexes (:issuse:`16319`) +- Performance regression fix for MultiIndexes (:issue:`16319`, :issue:`16346`) - Improved performance of ``.clip()`` with scalar arguments (:issue:`15400`) + .. _whatsnew_0202.bug_fixes: Bug Fixes From 8acc9e84c64009ddb3ec87126f0cd814e24260c3 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 16 May 2017 17:15:59 +0200 Subject: [PATCH 8/8] add benchmarks --- asv_bench/benchmarks/indexing.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index e1676715853a4..6a2c9d48c4a28 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -227,12 +227,24 @@ def time_multiindex_get_indexer(self): def time_multiindex_large_get_loc(self): self.mi_large.get_loc((999, 19, 'Z')) + def time_multiindex_large_get_loc_warm(self): + for _ in range(1000): + self.mi_large.get_loc((999, 19, 'Z')) + def time_multiindex_med_get_loc(self): self.mi_med.get_loc((999, 9, 'A')) + def time_multiindex_med_get_loc_warm(self): + for _ in range(1000): + self.mi_med.get_loc((999, 9, 'A')) + def time_multiindex_string_get_loc(self): self.mi_small.get_loc((99, 'A', 'A')) + def time_multiindex_small_get_loc_warm(self): + for _ in range(1000): + self.mi_small.get_loc((99, 'A', 'A')) + def time_is_monotonic(self): self.miint.is_monotonic