diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 70d616ca72c1b..68a7c105be138 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -548,6 +548,7 @@ Performance Improvements - 4x improvement in ``timedelta`` string parsing (:issue:`6755`, :issue:`10426`) - 8x improvement in ``timedelta64`` and ``datetime64`` ops (:issue:`6755`) - Significantly improved performance of indexing ``MultiIndex`` with slicers (:issue:`10287`) +- 8x improvement in ``iloc`` using list-like input (:issue:`10791`) - Improved performance of ``Series.isin`` for datetimelike/integer Series (:issue:`10287`) - 20x improvement in ``concat`` of Categoricals when categories are identical (:issue:`10587`) - Improved performance of ``to_datetime`` when specified format string is ISO8601 (:issue:`10178`) @@ -624,7 +625,7 @@ Bug Fixes - Bug in ``read_msgpack`` where DataFrame to decode has duplicate column names (:issue:`9618`) - Bug in ``io.common.get_filepath_or_buffer`` which caused reading of valid S3 files to fail if the bucket also contained keys for which the user does not have read permission (:issue:`10604`) - Bug in vectorised setting of timestamp columns with python ``datetime.date`` and numpy ``datetime64`` (:issue:`10408`, :issue:`10412`) - +- Bug in ``Index.take`` that may add an unnecessary ``freq`` attribute (:issue:`10791`) - Bug in ``pd.DataFrame`` when constructing an empty DataFrame with a string dtype (:issue:`9428`) - Bug in ``pd.unique`` for arrays with the ``datetime64`` or ``timedelta64`` dtype that meant an array with object dtype was returned instead the original dtype (:issue: `9431`) diff --git a/pandas/core/index.py b/pandas/core/index.py index febcfa37994a3..12ad8a590c304 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -1194,7 +1194,7 @@ def _ensure_compat_concat(indexes): return indexes - def take(self, indexer, axis=0): + def take(self, indices, axis=0): """ return a new Index of the values 
selected by the indexer @@ -1203,11 +1203,9 @@ def take(self, indexer, axis=0): numpy.ndarray.take """ - indexer = com._ensure_platform_int(indexer) - taken = np.array(self).take(indexer) - - # by definition cannot propogate freq - return self._shallow_copy(taken, freq=None) + indices = com._ensure_platform_int(indices) + taken = self.values.take(indices) + return self._shallow_copy(taken) def putmask(self, mask, value): """ diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 3c988943301c0..9a3576a8fd846 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -276,6 +276,11 @@ def test_take(self): expected = ind[indexer] self.assertTrue(result.equals(expected)) + if not isinstance(ind, (DatetimeIndex, PeriodIndex, TimedeltaIndex)): + # GH 10791 + with tm.assertRaises(AttributeError): + ind.freq + def test_setops_errorcases(self): for name, idx in compat.iteritems(self.indices): # # non-iterable input @@ -4775,7 +4780,7 @@ def test_repr_roundtrip(self): mi = MultiIndex.from_product([list('ab'),range(3)],names=['first','second']) str(mi) - + if compat.PY3: tm.assert_index_equal(eval(repr(mi)), mi, exact=True) else: @@ -4784,11 +4789,11 @@ def test_repr_roundtrip(self): tm.assert_index_equal(result, mi, exact=False) self.assertEqual(mi.get_level_values('first').inferred_type, 'string') self.assertEqual(result.get_level_values('first').inferred_type, 'unicode') - + mi_u = MultiIndex.from_product([list(u'ab'),range(3)],names=['first','second']) result = eval(repr(mi_u)) - tm.assert_index_equal(result, mi_u, exact=True) - + tm.assert_index_equal(result, mi_u, exact=True) + # formatting if compat.PY3: str(mi) @@ -4810,7 +4815,7 @@ def test_repr_roundtrip(self): mi = MultiIndex.from_product([list(u'abcdefg'),range(10)],names=['first','second']) result = eval(repr(mi_u)) - tm.assert_index_equal(result, mi_u, exact=True) + tm.assert_index_equal(result, mi_u, exact=True) def test_str(self): # tested elsewhere diff --git 
a/pandas/tseries/base.py b/pandas/tseries/base.py index 6d20b0128f164..addd070c23eeb 100644 --- a/pandas/tseries/base.py +++ b/pandas/tseries/base.py @@ -182,10 +182,12 @@ def take(self, indices, axis=0): """ Analogous to ndarray.take """ - maybe_slice = lib.maybe_indices_to_slice(com._ensure_int64(indices), len(self)) + indices = com._ensure_int64(indices) + maybe_slice = lib.maybe_indices_to_slice(indices, len(self)) if isinstance(maybe_slice, slice): return self[maybe_slice] - return super(DatetimeIndexOpsMixin, self).take(indices, axis) + taken = self.asi8.take(indices) + return self._shallow_copy(taken, freq=None) def get_duplicates(self): values = Index.get_duplicates(self) diff --git a/vb_suite/indexing.py b/vb_suite/indexing.py index 9fbc070ac3b9d..f2236c48fb002 100644 --- a/vb_suite/indexing.py +++ b/vb_suite/indexing.py @@ -265,3 +265,16 @@ multiindex_slicers = Benchmark('mdt2.loc[idx[test_A-eps_A:test_A+eps_A,test_B-eps_B:test_B+eps_B,test_C-eps_C:test_C+eps_C,test_D-eps_D:test_D+eps_D],:]', setup, start_date=datetime(2015, 1, 1)) + +#---------------------------------------------------------------------- +# take + +setup = common_setup + """ +s = Series(np.random.rand(100000)) +ts = Series(np.random.rand(100000), + index=date_range('2011-01-01', freq='S', periods=100000)) +indexer = [True, False, True, True, False] * 20000 +""" + +series_take_intindex = Benchmark("s.take(indexer)", setup) +series_take_dtindex = Benchmark("ts.take(indexer)", setup)