diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 3303483c50e20..42d530508eb87 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -29,10 +29,14 @@ class IsIn(object): def setup(self, dtype): self.s = Series(np.random.randint(1, 10, 100000)).astype(dtype) self.values = [1, 2] + self.values_set = set(self.values) def time_isin(self, dtypes): self.s.isin(self.values) + def time_isin_set(self, dtypes): + self.s.isin(self.values_set) + class IsInFloat64(object): diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index ccf5c43280765..dd2eb4d5256d4 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -176,6 +176,7 @@ Performance Improvements int8/int16/int32 and the searched key is within the integer bounds for the dtype (:issue:`22034`) - Improved performance of :meth:`pandas.core.groupby.GroupBy.quantile` (:issue:`20405`) - Improved performance of :meth:`read_csv` by faster tokenizing and faster parsing of small float numbers (:issue:`25784`) +- Improved performance of :meth:`Series.isin` and :meth:`DataFrame.isin` when passing a set (:issue:`25507`) - Improved performance of :meth:`read_csv` by faster parsing of N/A and boolean values (:issue:`25804`) .. 
_whatsnew_0250.bug_fixes: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 5ed2e3efe26a1..a147616fffb94 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -21,9 +21,9 @@ is_complex_dtype, is_datetime64_any_dtype, is_datetime64tz_dtype, is_datetimelike, is_extension_array_dtype, is_float_dtype, is_integer, is_integer_dtype, is_interval_dtype, is_list_like, is_numeric_dtype, - is_object_dtype, is_period_dtype, is_scalar, is_signed_integer_dtype, - is_sparse, is_timedelta64_dtype, is_unsigned_integer_dtype, - needs_i8_conversion) + is_object_dtype, is_period_dtype, is_scalar, is_set_like, + is_signed_integer_dtype, is_sparse, is_timedelta64_dtype, + is_unsigned_integer_dtype, needs_i8_conversion) from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna, na_value_for_dtype @@ -395,6 +395,14 @@ def isin(comps, values): " to isin(), you passed a [{values_type}]" .format(values_type=type(values).__name__)) + # GH 25507 + # if `values` is a set, directly use it instead of hashing a list + if is_set_like(values): + result = np.empty_like(comps, dtype=bool) + for i, comp in enumerate(comps): + result[i] = comp in values + return result + if not isinstance(values, (ABCIndex, ABCSeries, np.ndarray)): values = construct_1d_object_array_from_listlike(list(values)) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 4be7eb8ddb890..3f2bf373a4851 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -17,7 +17,7 @@ is_array_like, is_bool, is_complex, is_decimal, is_dict_like, is_file_like, is_float, is_hashable, is_integer, is_interval, is_iterator, is_list_like, is_named_tuple, is_nested_list_like, is_number, is_re, is_re_compilable, - is_scalar, is_sequence, is_string_like) + is_scalar, is_sequence, is_set_like, is_string_like) _POSSIBLY_CAST_DTYPES = {np.dtype(t).name for t in ['O', 'int8', 'uint8', 'int16', 'uint16', 
diff --git a/pandas/core/dtypes/inference.py b/pandas/core/dtypes/inference.py index 1a02623fa6072..efcd2f3ef6009 100644 --- a/pandas/core/dtypes/inference.py +++ b/pandas/core/dtypes/inference.py @@ -301,6 +301,36 @@ def is_list_like(obj, allow_sets=True): and not (allow_sets is False and isinstance(obj, Set))) +def is_set_like(obj): + """ + Check if the object is set-like. + + Parameters + ---------- + obj : The object to check + + Returns + ------- + is_set_like : bool + Whether `obj` is a ``set`` or ``frozenset`` instance. + + Examples + -------- + >>> is_set_like({1, 2}) + True + >>> is_set_like(frozenset([1, 2])) + True + >>> is_set_like(set()) + True + >>> is_set_like(set) + False + >>> is_set_like({1: 2, 3: 4}) + False + """ + + return isinstance(obj, (set, frozenset)) + + def is_array_like(obj): """ Check if the object is array-like. diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 68857d6cc6902..8ddbee3ea2e8b 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -24,7 +24,7 @@ from pandas.core.dtypes.common import ( ensure_categorical, ensure_int32, is_bool, is_datetime64_any_dtype, is_datetime64_dtype, is_datetime64_ns_dtype, is_datetime64tz_dtype, - is_float, is_integer, is_number, is_scalar, is_scipy_sparse, + is_float, is_integer, is_number, is_scalar, is_scipy_sparse, is_set_like, is_timedelta64_dtype, is_timedelta64_ns_dtype) import pandas as pd @@ -103,6 +103,22 @@ def test_is_list_like_disallow_sets(maybe_list_like): assert inference.is_list_like(obj, allow_sets=False) == expected +@pytest.mark.parametrize('obj,expected', [ + ({1, 2}, True), + (set(), True), + (set, False), + ([1, 2], False), + ({1: 2}, False), + (frozenset([1, 2]), True), + (frozenset(), True), + (frozenset, False), + ((1, 2), False), + ([], False), +]) +def test_is_set_like(obj, expected): + assert is_set_like(obj) == expected + + def test_is_sequence(): is_seq = inference.is_sequence assert (is_seq((1, 
2))) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 232771750838e..2a15c210559dd 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -803,6 +803,21 @@ def test_different_nans_as_float64(self): expected = np.array([True, True]) tm.assert_numpy_array_equal(result, expected) + @pytest.mark.parametrize("comps,values,expected", [ + (np.array([1, 2, 3]), {1, 2}, np.array([True, True, False])), + (np.array(['a', 'b']), {1, 'b'}, np.array([False, True])), + (np.array([1.0, 2.0]), {1, 2}, np.array([True, True])), + (pd.date_range("2019-01-01", "2019-01-03"), + {datetime(2019, 1, 2)}, np.array([False, True, False])), + (pd.Categorical(['a', 'b']), {0, 'b'}, np.array([False, True])), + (np.array([np.nan, float('nan')]), {float('nan')}, + np.array([False, False])) + ]) + def test_set(self, comps, values, expected): + # GH 25507: isin accepts set-like `values` + actual = algos.isin(comps, values) + tm.assert_numpy_array_equal(actual, expected) + class TestValueCounts(object):