From 12fb5ac31cf3f86232be9c6617ead0ce25a33f69 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Thu, 21 Jul 2016 16:26:50 +0900 Subject: [PATCH] PERF: Improve duplicated perf --- asv_bench/benchmarks/algorithms.py | 14 +++ doc/source/whatsnew/v0.19.0.txt | 1 + pandas/core/algorithms.py | 50 +++++++++- pandas/core/base.py | 20 ++-- pandas/hashtable.pyx | 84 ++++++++++++++++ pandas/lib.pyx | 40 -------- pandas/tests/indexes/test_multi.py | 2 +- pandas/tests/test_algos.py | 150 +++++++++++++++++++++++++++++ pandas/tests/test_lib.py | 39 -------- 9 files changed, 312 insertions(+), 88 deletions(-) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 310a4c5549e4f..6eac7b4831f0f 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -7,6 +7,11 @@ class algorithm(object): def setup(self): N = 100000 + + self.int_unique = pd.Int64Index(np.arange(N * 5)) + # cache is_unique + self.int_unique.is_unique + self.int = pd.Int64Index(np.arange(N).repeat(5)) self.float = pd.Float64Index(np.random.randn(N).repeat(5)) @@ -15,3 +20,12 @@ def time_int_factorize(self): def time_float_factorize(self): self.int.factorize() + + def time_int_unique_duplicated(self): + self.int_unique.duplicated() + + def time_int_duplicated(self): + self.int.duplicated() + + def time_float_duplicated(self): + self.float.duplicated() diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index f3acf403a1d65..e372b051db7ae 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -650,6 +650,7 @@ Performance Improvements - Improved performance of float64 hash table operations, fixing some very slow indexing and groupby operations in python 3 (:issue:`13166`, :issue:`13334`) - Improved performance of ``DataFrameGroupBy.transform`` (:issue:`12737`) +- Improved performance of ``Index`` and ``Series`` ``.duplicated`` (:issue:`10235`) - Improved performance of ``Index.difference`` (:issue:`12044`) - Improved performance of datetime string parsing in ``DatetimeIndex`` (:issue:`13692`) - Improved performance of hashing ``Period`` (:issue:`12817`) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 96a8582102cc9..b4a77cf78a22d 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -8,7 +8,8 @@ from pandas import compat, lib, tslib, _np_version_under1p8 from pandas.types.cast import _maybe_promote -from pandas.types.generic import ABCPeriodIndex, ABCDatetimeIndex +from pandas.types.generic import (ABCSeries, ABCIndex, ABCPeriodIndex, + ABCDatetimeIndex) from pandas.types.common import (is_integer_dtype, is_int64_dtype, is_categorical_dtype, @@ -448,6 +449,53 @@ def _value_counts_arraylike(values, dropna=True): return keys, counts +def duplicated(values, keep='first'): + """ + Return boolean ndarray denoting duplicate values + + Parameters + ---------- + keep : {'first', 'last', False}, default 'first' + - ``first`` : Mark duplicates as ``True`` except for the first + occurrence. + - ``last`` : Mark duplicates as ``True`` except for the last + occurrence. + - False : Mark all duplicates as ``True``. + + Returns + ------- + duplicated : ndarray + """ + + dtype = values.dtype + + # no need to revert to original type + if is_datetime_or_timedelta_dtype(dtype) or is_datetimetz(dtype): + if isinstance(values, (ABCSeries, ABCIndex)): + values = values.values.view(np.int64) + else: + values = values.view(np.int64) + elif is_period_arraylike(values): + from pandas.tseries.period import PeriodIndex + values = PeriodIndex(values).asi8 + elif is_categorical_dtype(dtype): + values = values.values.codes + elif isinstance(values, (ABCSeries, ABCIndex)): + values = values.values + + if is_integer_dtype(dtype): + values = _ensure_int64(values) + duplicated = htable.duplicated_int64(values, keep=keep) + elif is_float_dtype(dtype): + values = _ensure_float64(values) + duplicated = htable.duplicated_float64(values, keep=keep) + else: + values = _ensure_object(values) + duplicated = htable.duplicated_object(values, keep=keep) + + return duplicated + + def mode(values): """Returns the mode or mode(s) of the passed Series or ndarray (sorted)""" # must sort because hash order isn't necessarily defined. diff --git a/pandas/core/base.py b/pandas/core/base.py index 8c150d9fbb07e..0f9eb14be40db 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -7,7 +7,7 @@ from pandas.types.missing import isnull from pandas.types.generic import ABCDataFrame, ABCSeries, ABCIndexClass -from pandas.types.common import (_ensure_object, is_object_dtype, +from pandas.types.common import (is_object_dtype, is_list_like, is_scalar) from pandas.core import common as com @@ -1014,6 +1014,7 @@ def is_monotonic(self): """ from pandas import Index return Index(self).is_monotonic + is_monotonic_increasing = is_monotonic @property @@ -1171,6 +1172,10 @@ def searchsorted(self, key, side='left', sorter=None): False: 'first'}) @Appender(_shared_docs['drop_duplicates'] % _indexops_doc_kwargs) def drop_duplicates(self, keep='first', inplace=False): + if isinstance(self, ABCIndexClass): + if self.is_unique: + return self._shallow_copy() + duplicated = self.duplicated(keep=keep) result = self[np.logical_not(duplicated)] if inplace: @@ -1200,13 +1205,14 @@ def drop_duplicates(self, keep='first', inplace=False): False: 'first'}) @Appender(_shared_docs['duplicated'] % _indexops_doc_kwargs) def duplicated(self, keep='first'): - keys = com._values_from_object(_ensure_object(self.values)) - duplicated = lib.duplicated(keys, keep=keep) - try: - return self._constructor(duplicated, + from pandas.core.algorithms import duplicated + if isinstance(self, ABCIndexClass): + if self.is_unique: + return np.zeros(len(self), dtype=np.bool) + return duplicated(self, keep=keep) + else: + return self._constructor(duplicated(self, keep=keep), index=self.index).__finalize__(self) - except AttributeError: - return np.array(duplicated, dtype=bool) # ---------------------------------------------------------------------- # abstracts diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx index e1c3733a0449d..18e54621e8bf5 100644 --- a/pandas/hashtable.pyx +++ b/pandas/hashtable.pyx @@ -1073,6 +1073,90 @@ def mode_int64(int64_t[:] values): return modes[:j+1] + +def duplicated_object(ndarray[object] values, object keep='first'): + cdef: + Py_ssize_t i, n + dict seen = dict() + object row + + n = len(values) + cdef ndarray[uint8_t] result = np.zeros(n, dtype=np.uint8) + + if keep == 'last': + for i from n > i >= 0: + row = values[i] + if row in seen: + result[i] = 1 + else: + seen[row] = i + result[i] = 0 + elif keep == 'first': + for i from 0 <= i < n: + row = values[i] + if row in seen: + result[i] = 1 + else: + seen[row] = i + result[i] = 0 + elif keep is False: + for i from 0 <= i < n: + row = values[i] + if row in seen: + result[i] = 1 + result[seen[row]] = 1 + else: + seen[row] = i + result[i] = 0 + else: + raise ValueError('keep must be either "first", "last" or False') + + return result.view(np.bool_) + + +@cython.wraparound(False) +@cython.boundscheck(False) +def duplicated_float64(ndarray[float64_t, ndim=1] values, + object keep='first'): + cdef: + int ret = 0, k + float64_t value + Py_ssize_t i, n = len(values) + kh_float64_t * table = kh_init_float64() + ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool') + + kh_resize_float64(table, min(n, _SIZE_HINT_LIMIT)) + + if keep not in ('last', 'first', False): + raise ValueError('keep must be either "first", "last" or False') + + if keep == 'last': + with nogil: + for i from n > i >=0: + kh_put_float64(table, values[i], &ret) + out[i] = ret == 0 + elif keep == 'first': + with nogil: + for i from 0 <= i < n: + kh_put_float64(table, values[i], &ret) + out[i] = ret == 0 + else: + with nogil: + for i from 0 <= i < n: + value = values[i] + k = kh_get_float64(table, value) + if k != table.n_buckets: + out[table.vals[k]] = 1 + out[i] = 1 + else: + k = kh_put_float64(table, value, &ret) + table.keys[k] = value + table.vals[k] = i + out[i] = 0 + kh_destroy_float64(table) + return out + + @cython.wraparound(False) @cython.boundscheck(False) def duplicated_int64(ndarray[int64_t, ndim=1] values, diff --git a/pandas/lib.pyx b/pandas/lib.pyx index bf1dd1246120b..0473ae79adce5 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -1394,46 +1394,6 @@ def fast_zip_fillna(list ndarrays, fill_value=pandas_null): return result -def duplicated(ndarray[object] values, object keep='first'): - cdef: - Py_ssize_t i, n - dict seen = dict() - object row - - n = len(values) - cdef ndarray[uint8_t] result = np.zeros(n, dtype=np.uint8) - - if keep == 'last': - for i from n > i >= 0: - row = values[i] - if row in seen: - result[i] = 1 - else: - seen[row] = i - result[i] = 0 - elif keep == 'first': - for i from 0 <= i < n: - row = values[i] - if row in seen: - result[i] = 1 - else: - seen[row] = i - result[i] = 0 - elif keep is False: - for i from 0 <= i < n: - row = values[i] - if row in seen: - result[i] = 1 - result[seen[row]] = 1 - else: - seen[row] = i - result[i] = 0 - else: - raise ValueError('keep must be either "first", "last" or False') - - return result.view(np.bool_) - - def generate_slices(ndarray[int64_t] labels, Py_ssize_t ngroups): cdef: Py_ssize_t i, group_size, n, start diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 0b65b6a9d09f5..408f81fe1e982 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -1860,7 +1860,7 @@ def check(nlevels, with_nulls): for keep in ['first', 'last', False]: left = mi.duplicated(keep=keep) - right = pd.lib.duplicated(mi.values, keep=keep) + right = pd.hashtable.duplicated_object(mi.values, keep=keep) tm.assert_numpy_array_equal(left, right) # GH5873 diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 3c77d19aa7f3c..9535a3f97955c 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -667,6 +667,156 @@ def test_value_counts_normalized(self): tm.assert_series_equal(result, expected) +class TestDuplicated(tm.TestCase): + + _multiprocess_can_split_ = True + + def test_duplicated_with_nas(self): + keys = np.array([0, 1, np.nan, 0, 2, np.nan], dtype=object) + + result = algos.duplicated(keys) + expected = np.array([False, False, False, True, False, True]) + tm.assert_numpy_array_equal(result, expected) + + result = algos.duplicated(keys, keep='first') + expected = np.array([False, False, False, True, False, True]) + tm.assert_numpy_array_equal(result, expected) + + result = algos.duplicated(keys, keep='last') + expected = np.array([True, False, True, False, False, False]) + tm.assert_numpy_array_equal(result, expected) + + result = algos.duplicated(keys, keep=False) + expected = np.array([True, False, True, True, False, True]) + tm.assert_numpy_array_equal(result, expected) + + keys = np.empty(8, dtype=object) + for i, t in enumerate(zip([0, 0, np.nan, np.nan] * 2, + [0, np.nan, 0, np.nan] * 2)): + keys[i] = t + + result = algos.duplicated(keys) + falses = [False] * 4 + trues = [True] * 4 + expected = np.array(falses + trues) + tm.assert_numpy_array_equal(result, expected) + + result = algos.duplicated(keys, keep='last') + expected = np.array(trues + falses) + tm.assert_numpy_array_equal(result, expected) + + result = algos.duplicated(keys, keep=False) + expected = np.array(trues + trues) + tm.assert_numpy_array_equal(result, expected) + + def test_numeric_object_likes(self): + cases = [np.array([1, 2, 1, 5, 3, + 2, 4, 1, 5, 6]), + np.array([1.1, 2.2, 1.1, np.nan, 3.3, + 2.2, 4.4, 1.1, np.nan, 6.6]), + np.array([1 + 1j, 2 + 2j, 1 + 1j, 5 + 5j, 3 + 3j, + 2 + 2j, 4 + 4j, 1 + 1j, 5 + 5j, 6 + 6j]), + np.array(['a', 'b', 'a', 'e', 'c', + 'b', 'd', 'a', 'e', 'f'], dtype=object)] + + exp_first = np.array([False, False, True, False, False, + True, False, True, True, False]) + exp_last = np.array([True, True, True, True, False, + False, False, False, False, False]) + exp_false = exp_first | exp_last + + for case in cases: + res_first = algos.duplicated(case, keep='first') + tm.assert_numpy_array_equal(res_first, exp_first) + + res_last = algos.duplicated(case, keep='last') + tm.assert_numpy_array_equal(res_last, exp_last) + + res_false = algos.duplicated(case, keep=False) + tm.assert_numpy_array_equal(res_false, exp_false) + + # index + for idx in [pd.Index(case), pd.Index(case, dtype='category')]: + res_first = idx.duplicated(keep='first') + tm.assert_numpy_array_equal(res_first, exp_first) + + res_last = idx.duplicated(keep='last') + tm.assert_numpy_array_equal(res_last, exp_last) + + res_false = idx.duplicated(keep=False) + tm.assert_numpy_array_equal(res_false, exp_false) + + # series + for s in [pd.Series(case), pd.Series(case, dtype='category')]: + res_first = s.duplicated(keep='first') + tm.assert_series_equal(res_first, pd.Series(exp_first)) + + res_last = s.duplicated(keep='last') + tm.assert_series_equal(res_last, pd.Series(exp_last)) + + res_false = s.duplicated(keep=False) + tm.assert_series_equal(res_false, pd.Series(exp_false)) + + def test_datetime_likes(self): + + dt = ['2011-01-01', '2011-01-02', '2011-01-01', 'NaT', '2011-01-03', + '2011-01-02', '2011-01-04', '2011-01-01', 'NaT', '2011-01-06'] + td = ['1 days', '2 days', '1 days', 'NaT', '3 days', + '2 days', '4 days', '1 days', 'NaT', '6 days'] + + cases = [np.array([pd.Timestamp(d) for d in dt]), + np.array([pd.Timestamp(d, tz='US/Eastern') for d in dt]), + np.array([pd.Period(d, freq='D') for d in dt]), + np.array([np.datetime64(d) for d in dt]), + np.array([pd.Timedelta(d) for d in td])] + + exp_first = np.array([False, False, True, False, False, + True, False, True, True, False]) + exp_last = np.array([True, True, True, True, False, + False, False, False, False, False]) + exp_false = exp_first | exp_last + + for case in cases: + print(case) + res_first = algos.duplicated(case, keep='first') + tm.assert_numpy_array_equal(res_first, exp_first) + + res_last = algos.duplicated(case, keep='last') + tm.assert_numpy_array_equal(res_last, exp_last) + + res_false = algos.duplicated(case, keep=False) + tm.assert_numpy_array_equal(res_false, exp_false) + + # index + for idx in [pd.Index(case), pd.Index(case, dtype='category')]: + res_first = idx.duplicated(keep='first') + tm.assert_numpy_array_equal(res_first, exp_first) + + res_last = idx.duplicated(keep='last') + tm.assert_numpy_array_equal(res_last, exp_last) + + res_false = idx.duplicated(keep=False) + tm.assert_numpy_array_equal(res_false, exp_false) + + # series + for s in [pd.Series(case), pd.Series(case, dtype='category')]: + res_first = s.duplicated(keep='first') + tm.assert_series_equal(res_first, pd.Series(exp_first)) + + res_last = s.duplicated(keep='last') + tm.assert_series_equal(res_last, pd.Series(exp_last)) + + res_false = s.duplicated(keep=False) + tm.assert_series_equal(res_false, pd.Series(exp_false)) + + def test_unique_index(self): + cases = [pd.Index([1, 2, 3]), pd.RangeIndex(0, 3)] + for case in cases: + self.assertTrue(case.is_unique) + tm.assert_numpy_array_equal(case.duplicated(), + np.array([False, False, False])) + + class GroupVarTestMixin(object): def test_group_var_generic_1d(self): diff --git a/pandas/tests/test_lib.py b/pandas/tests/test_lib.py index 80b5e41e881cd..945f8004687cd 100644 --- a/pandas/tests/test_lib.py +++ b/pandas/tests/test_lib.py @@ -234,45 +234,6 @@ def test_empty_like(self): self._check_behavior(arr, expected) -def test_duplicated_with_nas(): - keys = np.array([0, 1, np.nan, 0, 2, np.nan], dtype=object) - - result = lib.duplicated(keys) - expected = [False, False, False, True, False, True] - assert (np.array_equal(result, expected)) - - result = lib.duplicated(keys, keep='first') - expected = [False, False, False, True, False, True] - assert (np.array_equal(result, expected)) - - result = lib.duplicated(keys, keep='last') - expected = [True, False, True, False, False, False] - assert (np.array_equal(result, expected)) - - result = lib.duplicated(keys, keep=False) - expected = [True, False, True, True, False, True] - assert (np.array_equal(result, expected)) - - keys = np.empty(8, dtype=object) - for i, t in enumerate(zip([0, 0, np.nan, np.nan] * 2, - [0, np.nan, 0, np.nan] * 2)): - keys[i] = t - - result = lib.duplicated(keys) - falses = [False] * 4 - trues = [True] * 4 - expected = falses + trues - assert (np.array_equal(result, expected)) - - result = lib.duplicated(keys, keep='last') - expected = trues + falses - assert (np.array_equal(result, expected)) - - result = lib.duplicated(keys, keep=False) - expected = trues + trues - assert (np.array_equal(result, expected)) - - if __name__ == '__main__': import nose