From 12fb5ac31cf3f86232be9c6617ead0ce25a33f69 Mon Sep 17 00:00:00 2001
From: sinhrks <sinhrks@gmail.com>
Date: Thu, 21 Jul 2016 16:26:50 +0900
Subject: [PATCH] PERF: Improve duplicated perf

---
 asv_bench/benchmarks/algorithms.py |  14 +++
 doc/source/whatsnew/v0.19.0.txt    |   1 +
 pandas/core/algorithms.py          |  50 +++++++++-
 pandas/core/base.py                |  20 ++--
 pandas/hashtable.pyx               |  84 ++++++++++++++++
 pandas/lib.pyx                     |  40 --------
 pandas/tests/indexes/test_multi.py |   2 +-
 pandas/tests/test_algos.py         | 150 +++++++++++++++++++++++++++++
 pandas/tests/test_lib.py           |  39 --------
 9 files changed, 312 insertions(+), 88 deletions(-)

diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py
index 310a4c5549e4f..6eac7b4831f0f 100644
--- a/asv_bench/benchmarks/algorithms.py
+++ b/asv_bench/benchmarks/algorithms.py
@@ -7,6 +7,11 @@ class algorithm(object):
 
     def setup(self):
         N = 100000
+
+        self.int_unique = pd.Int64Index(np.arange(N * 5))
+        # cache is_unique
+        self.int_unique.is_unique
+
         self.int = pd.Int64Index(np.arange(N).repeat(5))
         self.float = pd.Float64Index(np.random.randn(N).repeat(5))
 
@@ -15,3 +20,12 @@ def time_int_factorize(self):
 
     def time_float_factorize(self):
         self.int.factorize()
+
+    def time_int_unique_duplicated(self):
+        self.int_unique.duplicated()
+
+    def time_int_duplicated(self):
+        self.int.duplicated()
+
+    def time_float_duplicated(self):
+        self.float.duplicated()
diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
index f3acf403a1d65..e372b051db7ae 100644
--- a/doc/source/whatsnew/v0.19.0.txt
+++ b/doc/source/whatsnew/v0.19.0.txt
@@ -650,6 +650,7 @@ Performance Improvements
 
 - Improved performance of float64 hash table operations, fixing some very slow indexing and groupby operations in python 3 (:issue:`13166`, :issue:`13334`)
 - Improved performance of ``DataFrameGroupBy.transform`` (:issue:`12737`)
+- Improved performance of ``Index`` and ``Series`` ``.duplicated`` (:issue:`10235`)
 - Improved performance of ``Index.difference`` (:issue:`12044`)
 - Improved performance of datetime string parsing in ``DatetimeIndex`` (:issue:`13692`)
 - Improved performance of hashing ``Period`` (:issue:`12817`)
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 96a8582102cc9..b4a77cf78a22d 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -8,7 +8,8 @@
 
 from pandas import compat, lib, tslib, _np_version_under1p8
 from pandas.types.cast import _maybe_promote
-from pandas.types.generic import ABCPeriodIndex, ABCDatetimeIndex
+from pandas.types.generic import (ABCSeries, ABCIndex, ABCPeriodIndex,
+                                  ABCDatetimeIndex)
 from pandas.types.common import (is_integer_dtype,
                                  is_int64_dtype,
                                  is_categorical_dtype,
@@ -448,6 +449,53 @@ def _value_counts_arraylike(values, dropna=True):
     return keys, counts
 
 
+def duplicated(values, keep='first'):
+    """
+    Return boolean ndarray denoting duplicate values
+
+    Parameters
+    ----------
+    values : ndarray, Series or Index
+        Values to check for duplicates.
+    keep : {'first', 'last', False}, default 'first'
+        - ``first`` : Mark duplicates as ``True`` except the first occurrence.
+        - ``last`` : Mark duplicates as ``True`` except the last occurrence.
+        - False : Mark all duplicates as ``True``.
+
+    Returns
+    -------
+    duplicated : ndarray
+    """
+
+    dtype = values.dtype
+    # no need to revert to original type
+    if is_datetime_or_timedelta_dtype(dtype) or is_datetimetz(dtype):
+        if isinstance(values, (ABCSeries, ABCIndex)):
+            values = values.values.view(np.int64)
+        else:
+            values = values.view(np.int64)
+    elif is_period_arraylike(values):
+        from pandas.tseries.period import PeriodIndex
+        values = PeriodIndex(values).asi8
+    elif is_categorical_dtype(dtype):
+        values = values.values.codes
+    elif isinstance(values, (ABCSeries, ABCIndex)):
+        values = values.values
+
+    # dispatch on the converted values (i8 views / codes), not the input dtype
+    if is_integer_dtype(values):
+        values = _ensure_int64(values)
+        duplicated = htable.duplicated_int64(values, keep=keep)
+    elif is_float_dtype(values):
+        values = _ensure_float64(values)
+        duplicated = htable.duplicated_float64(values, keep=keep)
+    else:
+        values = _ensure_object(values)
+        duplicated = htable.duplicated_object(values, keep=keep)
+
+    return duplicated
+
+
 def mode(values):
     """Returns the mode or mode(s) of the passed Series or ndarray (sorted)"""
     # must sort because hash order isn't necessarily defined.
diff --git a/pandas/core/base.py b/pandas/core/base.py
index 8c150d9fbb07e..0f9eb14be40db 100644
--- a/pandas/core/base.py
+++ b/pandas/core/base.py
@@ -7,7 +7,7 @@
 
 from pandas.types.missing import isnull
 from pandas.types.generic import ABCDataFrame, ABCSeries, ABCIndexClass
-from pandas.types.common import (_ensure_object, is_object_dtype,
+from pandas.types.common import (is_object_dtype,
                                  is_list_like, is_scalar)
 
 from pandas.core import common as com
@@ -1014,6 +1014,7 @@ def is_monotonic(self):
         """
         from pandas import Index
         return Index(self).is_monotonic
+
     is_monotonic_increasing = is_monotonic
 
     @property
@@ -1171,6 +1172,10 @@ def searchsorted(self, key, side='left', sorter=None):
                                                    False: 'first'})
     @Appender(_shared_docs['drop_duplicates'] % _indexops_doc_kwargs)
     def drop_duplicates(self, keep='first', inplace=False):
+        if isinstance(self, ABCIndexClass):
+            if self.is_unique:
+                return self._shallow_copy()
+
         duplicated = self.duplicated(keep=keep)
         result = self[np.logical_not(duplicated)]
         if inplace:
@@ -1200,13 +1205,14 @@ def drop_duplicates(self, keep='first', inplace=False):
                                                    False: 'first'})
     @Appender(_shared_docs['duplicated'] % _indexops_doc_kwargs)
     def duplicated(self, keep='first'):
-        keys = com._values_from_object(_ensure_object(self.values))
-        duplicated = lib.duplicated(keys, keep=keep)
-        try:
-            return self._constructor(duplicated,
+        from pandas.core.algorithms import duplicated
+        if isinstance(self, ABCIndexClass):
+            if self.is_unique:
+                return np.zeros(len(self), dtype=bool)
+            return duplicated(self, keep=keep)
+        else:
+            return self._constructor(duplicated(self, keep=keep),
                                      index=self.index).__finalize__(self)
-        except AttributeError:
-            return np.array(duplicated, dtype=bool)
 
     # ----------------------------------------------------------------------
     # abstracts
diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx
index e1c3733a0449d..18e54621e8bf5 100644
--- a/pandas/hashtable.pyx
+++ b/pandas/hashtable.pyx
@@ -1073,6 +1073,90 @@ def mode_int64(int64_t[:] values):
 
     return modes[:j+1]
 
+
+def duplicated_object(ndarray[object] values, object keep='first'):
+    cdef:
+        Py_ssize_t i, n
+        dict seen = dict()
+        object row
+
+    n = len(values)
+    cdef ndarray[uint8_t] result = np.zeros(n, dtype=np.uint8)
+
+    if keep == 'last':
+        for i from n > i >= 0:
+            row = values[i]
+            if row in seen:
+                result[i] = 1
+            else:
+                seen[row] = i
+                result[i] = 0
+    elif keep == 'first':
+        for i from 0 <= i < n:
+            row = values[i]
+            if row in seen:
+                result[i] = 1
+            else:
+                seen[row] = i
+                result[i] = 0
+    elif keep is False:
+        for i from 0 <= i < n:
+            row = values[i]
+            if row in seen:
+                result[i] = 1
+                result[seen[row]] = 1
+            else:
+                seen[row] = i
+                result[i] = 0
+    else:
+        raise ValueError('keep must be either "first", "last" or False')
+
+    return result.view(np.bool_)
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def duplicated_float64(ndarray[float64_t, ndim=1] values,
+                       object keep='first'):
+    cdef:
+        int ret = 0, k
+        float64_t value
+        Py_ssize_t i, n = len(values)
+        kh_float64_t * table  # created only once ``keep`` has been validated
+        ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
+
+    if keep not in ('last', 'first', False):
+        raise ValueError('keep must be either "first", "last" or False')
+
+    table = kh_init_float64()
+    kh_resize_float64(table, min(n, _SIZE_HINT_LIMIT))
+    if keep == 'last':
+        with nogil:
+            for i from n > i >= 0:
+                kh_put_float64(table, values[i], &ret)
+                out[i] = ret == 0
+    elif keep == 'first':
+        with nogil:
+            for i from 0 <= i < n:
+                kh_put_float64(table, values[i], &ret)
+                out[i] = ret == 0
+    else:
+        with nogil:
+            for i from 0 <= i < n:
+                value = values[i]
+                k = kh_get_float64(table, value)
+                if k != table.n_buckets:
+                    out[table.vals[k]] = 1
+                    out[i] = 1
+                else:
+                    k = kh_put_float64(table, value, &ret)
+                    table.keys[k] = value
+                    table.vals[k] = i
+                    out[i] = 0
+    kh_destroy_float64(table)
+    return out
+
+
 @cython.wraparound(False)
 @cython.boundscheck(False)
 def duplicated_int64(ndarray[int64_t, ndim=1] values,
diff --git a/pandas/lib.pyx b/pandas/lib.pyx
index bf1dd1246120b..0473ae79adce5 100644
--- a/pandas/lib.pyx
+++ b/pandas/lib.pyx
@@ -1394,46 +1394,6 @@ def fast_zip_fillna(list ndarrays, fill_value=pandas_null):
     return result
 
 
-def duplicated(ndarray[object] values, object keep='first'):
-    cdef:
-        Py_ssize_t i, n
-        dict seen = dict()
-        object row
-
-    n = len(values)
-    cdef ndarray[uint8_t] result = np.zeros(n, dtype=np.uint8)
-
-    if keep == 'last':
-        for i from n > i >= 0:
-            row = values[i]
-            if row in seen:
-                result[i] = 1
-            else:
-                seen[row] = i
-                result[i] = 0
-    elif keep == 'first':
-        for i from 0 <= i < n:
-            row = values[i]
-            if row in seen:
-                result[i] = 1
-            else:
-                seen[row] = i
-                result[i] = 0
-    elif keep is False:
-        for i from 0 <= i < n:
-            row = values[i]
-            if row in seen:
-                result[i] = 1
-                result[seen[row]] = 1
-            else:
-                seen[row] = i
-                result[i] = 0
-    else:
-        raise ValueError('keep must be either "first", "last" or False')
-
-    return result.view(np.bool_)
-
-
 def generate_slices(ndarray[int64_t] labels, Py_ssize_t ngroups):
     cdef:
         Py_ssize_t i, group_size, n, start
diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py
index 0b65b6a9d09f5..408f81fe1e982 100644
--- a/pandas/tests/indexes/test_multi.py
+++ b/pandas/tests/indexes/test_multi.py
@@ -1860,7 +1860,7 @@ def check(nlevels, with_nulls):
 
         for keep in ['first', 'last', False]:
             left = mi.duplicated(keep=keep)
-            right = pd.lib.duplicated(mi.values, keep=keep)
+            right = pd.hashtable.duplicated_object(mi.values, keep=keep)
             tm.assert_numpy_array_equal(left, right)
 
         # GH5873
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index 3c77d19aa7f3c..9535a3f97955c 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -667,6 +667,156 @@ def test_value_counts_normalized(self):
             tm.assert_series_equal(result, expected)
 
 
+class TestDuplicated(tm.TestCase):
+
+    _multiprocess_can_split_ = True
+
+    def test_duplicated_with_nas(self):
+        keys = np.array([0, 1, np.nan, 0, 2, np.nan], dtype=object)
+
+        result = algos.duplicated(keys)
+        expected = np.array([False, False, False, True, False, True])
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = algos.duplicated(keys, keep='first')
+        expected = np.array([False, False, False, True, False, True])
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = algos.duplicated(keys, keep='last')
+        expected = np.array([True, False, True, False, False, False])
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = algos.duplicated(keys, keep=False)
+        expected = np.array([True, False, True, True, False, True])
+        tm.assert_numpy_array_equal(result, expected)
+
+        keys = np.empty(8, dtype=object)
+        for i, t in enumerate(zip([0, 0, np.nan, np.nan] * 2,
+                                  [0, np.nan, 0, np.nan] * 2)):
+            keys[i] = t
+
+        result = algos.duplicated(keys)
+        falses = [False] * 4
+        trues = [True] * 4
+        expected = np.array(falses + trues)
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = algos.duplicated(keys, keep='last')
+        expected = np.array(trues + falses)
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = algos.duplicated(keys, keep=False)
+        expected = np.array(trues + trues)
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_numeric_object_likes(self):
+        cases = [np.array([1, 2, 1, 5, 3,
+                           2, 4, 1, 5, 6]),
+                 np.array([1.1, 2.2, 1.1, np.nan, 3.3,
+                           2.2, 4.4, 1.1, np.nan, 6.6]),
+                 np.array([1 + 1j, 2 + 2j, 1 + 1j, 5 + 5j, 3 + 3j,
+                           2 + 2j, 4 + 4j, 1 + 1j, 5 + 5j, 6 + 6j]),
+                 np.array(['a', 'b', 'a', 'e', 'c',
+                           'b', 'd', 'a', 'e', 'f'], dtype=object)]
+
+        exp_first = np.array([False, False, True, False, False,
+                              True, False, True, True, False])
+        exp_last = np.array([True, True, True, True, False,
+                             False, False, False, False, False])
+        exp_false = exp_first | exp_last
+
+        for case in cases:
+            res_first = algos.duplicated(case, keep='first')
+            tm.assert_numpy_array_equal(res_first, exp_first)
+
+            res_last = algos.duplicated(case, keep='last')
+            tm.assert_numpy_array_equal(res_last, exp_last)
+
+            res_false = algos.duplicated(case, keep=False)
+            tm.assert_numpy_array_equal(res_false, exp_false)
+
+            # index
+            for idx in [pd.Index(case), pd.Index(case, dtype='category')]:
+                res_first = idx.duplicated(keep='first')
+                tm.assert_numpy_array_equal(res_first, exp_first)
+
+                res_last = idx.duplicated(keep='last')
+                tm.assert_numpy_array_equal(res_last, exp_last)
+
+                res_false = idx.duplicated(keep=False)
+                tm.assert_numpy_array_equal(res_false, exp_false)
+
+            # series
+            for s in [pd.Series(case), pd.Series(case, dtype='category')]:
+                res_first = s.duplicated(keep='first')
+                tm.assert_series_equal(res_first, pd.Series(exp_first))
+
+                res_last = s.duplicated(keep='last')
+                tm.assert_series_equal(res_last, pd.Series(exp_last))
+
+                res_false = s.duplicated(keep=False)
+                tm.assert_series_equal(res_false, pd.Series(exp_false))
+
+    def test_datetime_likes(self):
+        # datetime-likes incl. NaT: raw arrays, then Index and Series wrappers
+        dt = ['2011-01-01', '2011-01-02', '2011-01-01', 'NaT', '2011-01-03',
+              '2011-01-02', '2011-01-04', '2011-01-01', 'NaT', '2011-01-06']
+        td = ['1 days', '2 days', '1 days', 'NaT', '3 days',
+              '2 days', '4 days', '1 days', 'NaT', '6 days']
+
+        cases = [np.array([pd.Timestamp(d) for d in dt]),
+                 np.array([pd.Timestamp(d, tz='US/Eastern') for d in dt]),
+                 np.array([pd.Period(d, freq='D') for d in dt]),
+                 np.array([np.datetime64(d) for d in dt]),
+                 np.array([pd.Timedelta(d) for d in td])]
+
+        exp_first = np.array([False, False, True, False, False,
+                              True, False, True, True, False])
+        exp_last = np.array([True, True, True, True, False,
+                             False, False, False, False, False])
+        exp_false = exp_first | exp_last
+
+        for case in cases:
+            # ndarray
+            res_first = algos.duplicated(case, keep='first')
+            tm.assert_numpy_array_equal(res_first, exp_first)
+
+            res_last = algos.duplicated(case, keep='last')
+            tm.assert_numpy_array_equal(res_last, exp_last)
+
+            res_false = algos.duplicated(case, keep=False)
+            tm.assert_numpy_array_equal(res_false, exp_false)
+
+            # index
+            for idx in [pd.Index(case), pd.Index(case, dtype='category')]:
+                res_first = idx.duplicated(keep='first')
+                tm.assert_numpy_array_equal(res_first, exp_first)
+
+                res_last = idx.duplicated(keep='last')
+                tm.assert_numpy_array_equal(res_last, exp_last)
+
+                res_false = idx.duplicated(keep=False)
+                tm.assert_numpy_array_equal(res_false, exp_false)
+
+            # series
+            for s in [pd.Series(case), pd.Series(case, dtype='category')]:
+                res_first = s.duplicated(keep='first')
+                tm.assert_series_equal(res_first, pd.Series(exp_first))
+
+                res_last = s.duplicated(keep='last')
+                tm.assert_series_equal(res_last, pd.Series(exp_last))
+
+                res_false = s.duplicated(keep=False)
+                tm.assert_series_equal(res_false, pd.Series(exp_false))
+
+    def test_unique_index(self):
+        # unique indexes take the ``is_unique`` shortcut in ``duplicated``
+        expected = np.array([False, False, False])
+        for idx in [pd.Index([1, 2, 3]), pd.RangeIndex(0, 3)]:
+            self.assertTrue(idx.is_unique)
+            tm.assert_numpy_array_equal(idx.duplicated(), expected)
+
+
 class GroupVarTestMixin(object):
 
     def test_group_var_generic_1d(self):
diff --git a/pandas/tests/test_lib.py b/pandas/tests/test_lib.py
index 80b5e41e881cd..945f8004687cd 100644
--- a/pandas/tests/test_lib.py
+++ b/pandas/tests/test_lib.py
@@ -234,45 +234,6 @@ def test_empty_like(self):
         self._check_behavior(arr, expected)
 
 
-def test_duplicated_with_nas():
-    keys = np.array([0, 1, np.nan, 0, 2, np.nan], dtype=object)
-
-    result = lib.duplicated(keys)
-    expected = [False, False, False, True, False, True]
-    assert (np.array_equal(result, expected))
-
-    result = lib.duplicated(keys, keep='first')
-    expected = [False, False, False, True, False, True]
-    assert (np.array_equal(result, expected))
-
-    result = lib.duplicated(keys, keep='last')
-    expected = [True, False, True, False, False, False]
-    assert (np.array_equal(result, expected))
-
-    result = lib.duplicated(keys, keep=False)
-    expected = [True, False, True, True, False, True]
-    assert (np.array_equal(result, expected))
-
-    keys = np.empty(8, dtype=object)
-    for i, t in enumerate(zip([0, 0, np.nan, np.nan] * 2,
-                              [0, np.nan, 0, np.nan] * 2)):
-        keys[i] = t
-
-    result = lib.duplicated(keys)
-    falses = [False] * 4
-    trues = [True] * 4
-    expected = falses + trues
-    assert (np.array_equal(result, expected))
-
-    result = lib.duplicated(keys, keep='last')
-    expected = trues + falses
-    assert (np.array_equal(result, expected))
-
-    result = lib.duplicated(keys, keep=False)
-    expected = trues + trues
-    assert (np.array_equal(result, expected))
-
-
 if __name__ == '__main__':
     import nose