diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index d2167a8b6e9e1..37969a6949157 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -5,22 +5,21 @@ class series_isin_int64(object): goal_time = 0.2 def setup(self): - self.s1 = Series(np.random.randn(10000)) - self.s2 = Series(np.random.randint(1, 10, 10000)) self.s3 = Series(np.random.randint(1, 10, 100000)).astype('int64') + self.s4 = Series(np.random.randint(1, 100, 10000000)).astype('int64') self.values = [1, 2] - self.s4 = self.s3.astype('object') def time_series_isin_int64(self): self.s3.isin(self.values) + def time_series_isin_int64_large(self): + self.s4.isin(self.values) + class series_isin_object(object): goal_time = 0.2 def setup(self): - self.s1 = Series(np.random.randn(10000)) - self.s2 = Series(np.random.randint(1, 10, 10000)) self.s3 = Series(np.random.randint(1, 10, 100000)).astype('int64') self.values = [1, 2] self.s4 = self.s3.astype('object') @@ -71,4 +70,4 @@ def setup(self): def time_series_nsmallest2(self): self.s2.nsmallest(3, take_last=True) - self.s2.nsmallest(3, take_last=False) \ No newline at end of file + self.s2.nsmallest(3, take_last=False) diff --git a/ci/requirements-2.6.build b/ci/requirements-2.6.build index f8cbd8cef3fef..85148069a9e6a 100644 --- a/ci/requirements-2.6.build +++ b/ci/requirements-2.6.build @@ -1,4 +1,4 @@ -numpy=1.7.0 +numpy=1.7.1 cython=0.19.1 dateutil=1.5 pytz=2013b diff --git a/ci/requirements-2.6.run b/ci/requirements-2.6.run index 6521ca4122ef3..5f8a2fde1409f 100644 --- a/ci/requirements-2.6.run +++ b/ci/requirements-2.6.run @@ -1,14 +1,16 @@ -numpy=1.7.0 +numpy=1.7.1 dateutil=1.5 pytz=2013b scipy=0.11.0 xlwt=0.7.5 xlrd=0.9.2 statsmodels=0.4.3 +bottleneck=0.8.0 +numexpr=2.2.2 +pytables=3.0.0 html5lib=1.0b2 beautiful-soup=4.2.0 psycopg2=2.5.1 -numexpr=1.4.2 pymysql=0.6.0 sqlalchemy=0.7.8 xlsxwriter=0.4.6 diff --git a/ci/requirements-2.7.build 
b/ci/requirements-2.7.build index df543aaf40f69..3fe6f60aee98b 100644 --- a/ci/requirements-2.7.build +++ b/ci/requirements-2.7.build @@ -1,4 +1,4 @@ dateutil=2.1 pytz=2013b -numpy=1.7.1 +numpy cython=0.19.1 diff --git a/ci/requirements-2.7.run b/ci/requirements-2.7.run index a740966684ab2..86e5934539ebf 100644 --- a/ci/requirements-2.7.run +++ b/ci/requirements-2.7.run @@ -1,10 +1,10 @@ dateutil=2.1 pytz=2013b -numpy=1.7.1 +numpy xlwt=0.7.5 -numexpr=2.2.2 -pytables=3.0.0 -matplotlib=1.3.1 +numexpr +pytables +matplotlib openpyxl=1.6.2 xlrd=0.9.2 sqlalchemy=0.9.6 @@ -12,7 +12,7 @@ lxml=3.2.1 scipy xlsxwriter=0.4.6 boto=2.36.0 -bottleneck=0.8.0 +bottleneck psycopg2=2.5.2 patsy pymysql=0.6.3 diff --git a/ci/requirements-2.7_SLOW.build b/ci/requirements-2.7_SLOW.build index 9558cf00ddf5c..664e8b418def7 100644 --- a/ci/requirements-2.7_SLOW.build +++ b/ci/requirements-2.7_SLOW.build @@ -1,4 +1,4 @@ python-dateutil pytz -numpy +numpy=1.8.2 cython diff --git a/ci/requirements-2.7_SLOW.run b/ci/requirements-2.7_SLOW.run index b6c9250dd775e..f02a7cb8a309a 100644 --- a/ci/requirements-2.7_SLOW.run +++ b/ci/requirements-2.7_SLOW.run @@ -1,7 +1,7 @@ python-dateutil pytz -numpy -matplotlib +numpy=1.8.2 +matplotlib=1.3.1 scipy patsy statsmodels diff --git a/doc/source/install.rst b/doc/source/install.rst index 3c624a9d25a0c..54e7b2d4df350 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -214,7 +214,7 @@ Dependencies ------------ * `setuptools `__ -* `NumPy `__: 1.7.0 or higher +* `NumPy `__: 1.7.1 or higher * `python-dateutil `__ 1.5 or higher * `pytz `__ * Needed for time zone support diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index ec074dfa28bf5..1e240d0786082 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -1034,6 +1034,7 @@ Bug Fixes ~~~~~~~~~ - Bug in incorrection computation of ``.mean()`` on ``timedelta64[ns]`` because of overflow (:issue:`9442`) +- Bug in ``.isin`` on older 
numpies (:issue:`11232`) - Bug in ``DataFrame.to_html(index=False)`` renders unnecessary ``name`` row (:issue:`10344`) - Bug in ``DataFrame.to_latex()`` the ``column_format`` argument could not be passed (:issue:`9402`) - Bug in ``DatetimeIndex`` when localizing with ``NaT`` (:issue:`10477`) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 34bf173d63860..e5347f03b5462 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -6,6 +6,7 @@ from warnings import warn import numpy as np +from pandas import compat, lib, _np_version_under1p8 import pandas.core.common as com import pandas.algos as algos import pandas.hashtable as htable @@ -66,6 +67,54 @@ def unique(values): return _hashtable_algo(f, values.dtype) +def isin(comps, values): + """ + Compute the isin boolean array + + Parameters + ---------- + comps: array-like + values: array-like + + Returns + ------- + boolean array same length as comps + """ + + if not com.is_list_like(comps): + raise TypeError("only list-like objects are allowed to be passed" + " to isin(), you passed a " + "[{0}]".format(type(comps).__name__)) + comps = np.asarray(comps) + if not com.is_list_like(values): + raise TypeError("only list-like objects are allowed to be passed" + " to isin(), you passed a " + "[{0}]".format(type(values).__name__)) + + # GH11232 + # work-around for numpy < 1.8 and comparisons on py3 + # faster for larger cases to use np.in1d + if (_np_version_under1p8 and compat.PY3) or len(comps) > 1000000: + f = lambda x, y: np.in1d(x,np.asarray(list(y))) + else: + f = lambda x, y: lib.ismember_int64(x,set(y)) + + # may need i8 conversion for proper membership testing + if com.is_datetime64_dtype(comps): + from pandas.tseries.tools import to_datetime + values = to_datetime(values)._values.view('i8') + comps = comps.view('i8') + elif com.is_timedelta64_dtype(comps): + from pandas.tseries.timedeltas import to_timedelta + values = to_timedelta(values)._values.view('i8') + comps = 
comps.view('i8') + elif com.is_int64_dtype(comps): + pass + else: + f = lambda x, y: lib.ismember(x, set(values)) + + return f(comps, values) + def _hashtable_algo(f, dtype, return_dtype=None): """ f(HashTable, type_caster) -> result diff --git a/pandas/core/index.py b/pandas/core/index.py index 1daa0e1b52d02..256ece6539b6f 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -14,6 +14,7 @@ from pandas.compat import range, zip, lrange, lzip, u, map from pandas import compat +from pandas.core import algorithms from pandas.core.base import PandasObject, FrozenList, FrozenNDArray, IndexOpsMixin, _shared_docs, PandasDelegate from pandas.util.decorators import (Appender, Substitution, cache_readonly, deprecate, deprecate_kwarg) @@ -108,7 +109,6 @@ class Index(IndexOpsMixin, PandasObject): _is_numeric_dtype = False _engine_type = _index.ObjectEngine - _isin_type = lib.ismember def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False, tupleize_cols=True, **kwargs): @@ -1443,7 +1443,7 @@ def __add__(self, other): return Index(np.array(self) + other) def __radd__(self, other): - if com.is_list_like(other): + if is_list_like(other): warnings.warn("using '+' to provide set union with Indexes is deprecated, " "use '|' or .union()", FutureWarning, stacklevel=2) return Index(other + np.array(self)) @@ -1995,10 +1995,9 @@ def isin(self, values, level=None): is_contained : ndarray (boolean dtype) """ - value_set = set(values) if level is not None: self._validate_index_level(level) - return self._isin_type(np.array(self), value_set) + return algorithms.isin(np.array(self), values) def _can_reindex(self, indexer): """ @@ -3097,6 +3096,8 @@ def _is_dtype_compat(self, other): raise TypeError("categories must match existing categories when appending") else: values = other + if not is_list_like(values): + values = [ values ] other = CategoricalIndex(self._create_categorical(self, other, categories=self.categories, ordered=self.ordered)) if not 
other.isin(values).all(): raise TypeError("cannot append a non-category item to a CategoricalIndex") @@ -3580,7 +3581,6 @@ class Int64Index(NumericIndex): _outer_indexer = _algos.outer_join_indexer_int64 _engine_type = _index.Int64Engine - _isin_type = lib.ismember_int64 def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False, **kwargs): diff --git a/pandas/core/series.py b/pandas/core/series.py index 11645311467d5..f4e3374626011 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -42,7 +42,7 @@ from pandas.compat import zip, u, OrderedDict, StringIO import pandas.core.ops as ops -from pandas.core.algorithms import select_n +from pandas.core import algorithms import pandas.core.common as com import pandas.core.datetools as datetools @@ -1156,8 +1156,7 @@ def mode(self): modes : Series (sorted) """ # TODO: Add option for bins like value_counts() - from pandas.core.algorithms import mode - return mode(self) + return algorithms.mode(self) @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'}) @Appender(base._shared_docs['drop_duplicates'] % _shared_doc_kwargs) @@ -1812,9 +1811,8 @@ def rank(self, method='average', na_option='keep', ascending=True, ------- ranks : Series """ - from pandas.core.algorithms import rank - ranks = rank(self._values, method=method, na_option=na_option, - ascending=ascending, pct=pct) + ranks = algorithms.rank(self._values, method=method, na_option=na_option, + ascending=ascending, pct=pct) return self._constructor(ranks, index=self.index).__finalize__(self) @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'}) @@ -1852,7 +1850,7 @@ def nlargest(self, n=5, keep='first'): >>> s = pd.Series(np.random.randn(1e6)) >>> s.nlargest(10) # only sorts up to the N requested """ - return select_n(self, n=n, keep=keep, method='nlargest') + return algorithms.select_n(self, n=n, keep=keep, method='nlargest') @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 
'first'}) def nsmallest(self, n=5, keep='first'): @@ -1889,7 +1887,7 @@ def nsmallest(self, n=5, keep='first'): >>> s = pd.Series(np.random.randn(1e6)) >>> s.nsmallest(10) # only sorts up to the N requested """ - return select_n(self, n=n, keep=keep, method='nsmallest') + return algorithms.select_n(self, n=n, keep=keep, method='nsmallest') def sortlevel(self, level=0, ascending=True, sort_remaining=True): """ @@ -2353,29 +2351,7 @@ def isin(self, values): dtype: bool """ - if not com.is_list_like(values): - raise TypeError("only list-like objects are allowed to be passed" - " to Series.isin(), you passed a " - "{0!r}".format(type(values).__name__)) - - # may need i8 conversion for proper membership testing - comps = _values_from_object(self) - f = lib.ismember - if com.is_datetime64_dtype(self): - from pandas.tseries.tools import to_datetime - values = Series(to_datetime(values))._values.view('i8') - comps = comps.view('i8') - f = lib.ismember_int64 - elif com.is_timedelta64_dtype(self): - from pandas.tseries.timedeltas import to_timedelta - values = Series(to_timedelta(values))._values.view('i8') - comps = comps.view('i8') - f = lib.ismember_int64 - elif is_int64_dtype(self): - f = lib.ismember_int64 - - value_set = set(values) - result = f(comps, value_set) + result = algorithms.isin(_values_from_object(self), values) return self._constructor(result, index=self.index).__finalize__(self) def between(self, left, right, inclusive=True): diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 3c9931b93f9f9..b18bd7b2b3978 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -278,7 +278,69 @@ def test_timedelta64_dtype_array_returned(self): tm.assert_numpy_array_equal(result, expected) self.assertEqual(result.dtype, expected.dtype) +class TestIsin(tm.TestCase): + _multiprocess_can_split_ = True + + def test_invalid(self): + + self.assertRaises(TypeError, lambda : algos.isin(1,1)) + self.assertRaises(TypeError, lambda : 
algos.isin(1,[1])) + self.assertRaises(TypeError, lambda : algos.isin([1],1)) + + def test_basic(self): + + result = algos.isin([1,2],[1]) + expected = np.array([True,False]) + tm.assert_numpy_array_equal(result, expected) + + result = algos.isin(np.array([1,2]),[1]) + expected = np.array([True,False]) + tm.assert_numpy_array_equal(result, expected) + + result = algos.isin(pd.Series([1,2]),[1]) + expected = np.array([True,False]) + tm.assert_numpy_array_equal(result, expected) + + result = algos.isin(pd.Series([1,2]),pd.Series([1])) + expected = np.array([True,False]) + tm.assert_numpy_array_equal(result, expected) + + result = algos.isin(['a','b'],['a']) + expected = np.array([True,False]) + tm.assert_numpy_array_equal(result, expected) + result = algos.isin(pd.Series(['a','b']),pd.Series(['a'])) + expected = np.array([True,False]) + tm.assert_numpy_array_equal(result, expected) + + result = algos.isin(['a','b'],[1]) + expected = np.array([False,False]) + tm.assert_numpy_array_equal(result, expected) + + arr = pd.date_range('20130101',periods=3).values + result = algos.isin(arr,[arr[0]]) + expected = np.array([True,False,False]) + tm.assert_numpy_array_equal(result, expected) + + result = algos.isin(arr,arr[0:2]) + expected = np.array([True,True,False]) + tm.assert_numpy_array_equal(result, expected) + + arr = pd.timedelta_range('1 day',periods=3).values + result = algos.isin(arr,[arr[0]]) + expected = np.array([True,False,False]) + tm.assert_numpy_array_equal(result, expected) + + + + def test_large(self): + + s = pd.date_range('20000101',periods=2000000,freq='s').values + result = algos.isin(s,s[0:2]) + expected = np.zeros(len(s),dtype=bool) + expected[0] = True + expected[1] = True + tm.assert_numpy_array_equal(result, expected) class TestValueCounts(tm.TestCase): _multiprocess_can_split_ = True diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py index ebf3af5f46c47..2f4858300293e 100644 --- a/pandas/tseries/base.py +++ b/pandas/tseries/base.py @@ -7,7 
+7,7 @@ from pandas import compat import numpy as np -from pandas.core import common as com +from pandas.core import common as com, algorithms from pandas.core.common import is_integer, is_float, AbstractMethodError import pandas.tslib as tslib import pandas.lib as lib @@ -486,8 +486,7 @@ def isin(self, values): except ValueError: return self.asobject.isin(values) - value_set = set(values.asi8) - return lib.ismember_int64(self.asi8, value_set) + return algorithms.isin(self.asi8, values.asi8) def shift(self, n, freq=None): """