diff --git a/asv_bench/benchmarks/ctors.py b/asv_bench/benchmarks/ctors.py index 65af7b077d80f..3f9016787aab4 100644 --- a/asv_bench/benchmarks/ctors.py +++ b/asv_bench/benchmarks/ctors.py @@ -1,7 +1,8 @@ import numpy as np -from pandas import Series, Index, DatetimeIndex, Timestamp +import pandas.util.testing as tm +from pandas import Series, Index, DatetimeIndex, Timestamp, MultiIndex -from .pandas_vb_common import setup # noqa +from .pandas_vb_common import setup # noqa class SeriesConstructors(object): @@ -21,7 +22,6 @@ class SeriesConstructors(object): def setup(self, data_fmt, with_index): N = 10**4 - np.random.seed(1234) arr = np.random.randn(N) self.data = data_fmt(arr) self.index = np.arange(N) if with_index else None @@ -35,21 +35,32 @@ class SeriesDtypesConstructors(object): goal_time = 0.2 def setup(self): - N = 10**2 + N = 10**4 self.arr = np.random.randn(N, N) self.arr_str = np.array(['foo', 'bar', 'baz'], dtype=object) - - self.data = np.random.randn(N) - self.index = Index(np.arange(N)) - self.s = Series([Timestamp('20110101'), Timestamp('20120101'), Timestamp('20130101')] * N * 10) def time_index_from_array_string(self): Index(self.arr_str) + def time_index_from_array_floats(self): + Index(self.arr) + def time_dtindex_from_series(self): DatetimeIndex(self.s) def time_dtindex_from_index_with_series(self): Index(self.s) + + +class MultiIndexConstructor(object): + + goal_time = 0.2 + + def setup(self): + N = 10**4 + self.iterables = [tm.makeStringIndex(N), range(20)] + + def time_multiindex_from_iterables(self): + MultiIndex.from_product(self.iterables) diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index a607168ea0457..d73b216478ad5 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -1,207 +1,151 @@ -from .pandas_vb_common import * +import numpy as np +import pandas.util.testing as tm +from pandas import (Series, date_range, DatetimeIndex, Index, MultiIndex, + RangeIndex) + +from .pandas_vb_common import setup # noqa class SetOperations(object): + + goal_time = 0.2 + params = (['datetime', 'date_string', 'int', 'strings'], + ['intersection', 'union', 'symmetric_difference']) + param_names = ['dtype', 'method'] + + def setup(self, dtype, method): + N = 10**5 + dates_left = date_range('1/1/2000', periods=N, freq='T') + fmt = '%Y-%m-%d %H:%M:%S' + date_str_left = Index(dates_left.strftime(fmt)) + int_left = Index(np.arange(N)) + str_left = tm.makeStringIndex(N) + data = {'datetime': {'left': dates_left, 'right': dates_left[:-1]}, + 'date_string': {'left': date_str_left, + 'right': date_str_left[:-1]}, + 'int': {'left': int_left, 'right': int_left[:-1]}, + 'strings': {'left': str_left, 'right': str_left[:-1]}} + self.left = data[dtype]['left'] + self.right = data[dtype]['right'] + + def time_operation(self, dtype, method): + getattr(self.left, method)(self.right) + + +class SetDisjoint(object): + goal_time = 0.2 def setup(self): - self.rng = date_range('1/1/2000', periods=10000, freq='T') - self.rng2 = self.rng[:(-1)] - - # object index with datetime values - if (self.rng.dtype == object): - self.idx_rng = self.rng.view(Index) - else: - self.idx_rng = self.rng.astype(object) - self.idx_rng2 = self.idx_rng[:(-1)] - - # other datetime - N = 100000 - A = N - 20000 + N = 10**5 B = N + 20000 - self.dtidx1 = DatetimeIndex(range(N)) - self.dtidx2 = DatetimeIndex(range(A, B)) - self.dtidx3 = DatetimeIndex(range(N, B)) - - # integer - self.N = 1000000 - self.options = np.arange(self.N) - self.left = Index( - self.options.take(np.random.permutation(self.N)[:(self.N // 2)])) - self.right = Index( - self.options.take(np.random.permutation(self.N)[:(self.N // 2)])) - - # strings - N = 10000 - strs = tm.rands_array(10, N) - self.leftstr = Index(strs[:N * 2 // 3]) - self.rightstr = Index(strs[N // 3:]) - - def time_datetime_intersection(self): - self.rng.intersection(self.rng2) - - def time_datetime_union(self): - self.rng.union(self.rng2) - - def time_datetime_difference(self): - self.dtidx1.difference(self.dtidx2) + self.datetime_left = DatetimeIndex(range(N)) + self.datetime_right = DatetimeIndex(range(N, B)) def time_datetime_difference_disjoint(self): - self.dtidx1.difference(self.dtidx3) - - def time_datetime_symmetric_difference(self): - self.dtidx1.symmetric_difference(self.dtidx2) - - def time_index_datetime_intersection(self): - self.idx_rng.intersection(self.idx_rng2) - - def time_index_datetime_union(self): - self.idx_rng.union(self.idx_rng2) - - def time_int64_intersection(self): - self.left.intersection(self.right) - - def time_int64_union(self): - self.left.union(self.right) - - def time_int64_difference(self): - self.left.difference(self.right) - - def time_int64_symmetric_difference(self): - self.left.symmetric_difference(self.right) - - def time_str_difference(self): - self.leftstr.difference(self.rightstr) - - def time_str_symmetric_difference(self): - self.leftstr.symmetric_difference(self.rightstr) + self.datetime_left.difference(self.datetime_right) class Datetime(object): + goal_time = 0.2 def setup(self): - self.dr = pd.date_range('20000101', freq='D', periods=10000) + self.dr = date_range('20000101', freq='D', periods=10000) def time_is_dates_only(self): self.dr._is_dates_only -class Float64(object): - goal_time = 0.2 - - def setup(self): - self.idx = tm.makeFloatIndex(1000000) - self.mask = ((np.arange(self.idx.size) % 3) == 0) - self.series_mask = Series(self.mask) - - self.baseidx = np.arange(1000000.0) - - def time_boolean_indexer(self): - self.idx[self.mask] - - def time_boolean_series_indexer(self): - self.idx[self.series_mask] - - def time_construct(self): - Index(self.baseidx) - - def time_div(self): - (self.idx / 2) - - def time_get(self): - self.idx[1] +class Ops(object): - def time_mul(self): - (self.idx * 2) + sample_time = 0.2 + params = ['float', 'int'] + param_names = ['dtype'] - def time_slice_indexer_basic(self): - self.idx[:(-1)] + def setup(self, dtype): + N = 10**6 + indexes = {'int': 'makeIntIndex', 'float': 'makeFloatIndex'} + self.index = getattr(tm, indexes[dtype])(N) - def time_slice_indexer_even(self): - self.idx[::2] + def time_add(self, dtype): + self.index + 2 + def time_subtract(self, dtype): + self.index - 2 -class StringIndex(object): - goal_time = 0.2 + def time_multiply(self, dtype): + self.index * 2 - def setup(self): - self.idx = tm.makeStringIndex(1000000) - self.mask = ((np.arange(1000000) % 3) == 0) - self.series_mask = Series(self.mask) - - def time_boolean_indexer(self): - self.idx[self.mask] - - def time_boolean_series_indexer(self): - self.idx[self.series_mask] + def time_divide(self, dtype): + self.index / 2 - def time_slice_indexer_basic(self): - self.idx[:(-1)] + def time_modulo(self, dtype): + self.index % 2 - def time_slice_indexer_even(self): - self.idx[::2] +class Duplicated(object): -class Multi1(object): goal_time = 0.2 def setup(self): - (n, k) = (200, 5000) - self.levels = [np.arange(n), tm.makeStringIndex(n).values, (1000 + np.arange(n))] - self.labels = [np.random.choice(n, (k * n)) for lev in self.levels] - self.mi = MultiIndex(levels=self.levels, labels=self.labels) - - self.iterables = [tm.makeStringIndex(10000), range(20)] + n, k = 200, 5000 + levels = [np.arange(n), + tm.makeStringIndex(n).values, + 1000 + np.arange(n)] + labels = [np.random.choice(n, (k * n)) for lev in levels] + self.mi = MultiIndex(levels=levels, labels=labels) def time_duplicated(self): self.mi.duplicated() - def time_from_product(self): - MultiIndex.from_product(self.iterables) +class Sortlevel(object): -class Multi2(object): goal_time = 0.2 def setup(self): - self.n = ((((3 * 5) * 7) * 11) * (1 << 10)) - (low, high) = (((-1) << 12), (1 << 12)) - self.f = (lambda k: np.repeat(np.random.randint(low, high, (self.n // k)), k)) - self.i = np.random.permutation(self.n) - self.mi = MultiIndex.from_arrays([self.f(11), self.f(7), self.f(5), self.f(3), self.f(1)])[self.i] + n = 1182720 + low, high = -4096, 4096 + arrs = [np.repeat(np.random.randint(low, high, (n // k)), k) + for k in [11, 7, 5, 3, 1]] + self.mi_int = MultiIndex.from_arrays(arrs)[np.random.permutation(n)] - self.a = np.repeat(np.arange(100), 1000) - self.b = np.tile(np.arange(1000), 100) - self.midx2 = MultiIndex.from_arrays([self.a, self.b]) - self.midx2 = self.midx2.take(np.random.permutation(np.arange(100000))) + a = np.repeat(np.arange(100), 1000) + b = np.tile(np.arange(1000), 100) + self.mi = MultiIndex.from_arrays([a, b]) + self.mi = self.mi.take(np.random.permutation(np.arange(100000))) def time_sortlevel_int64(self): - self.mi.sortlevel() + self.mi_int.sortlevel() def time_sortlevel_zero(self): - self.midx2.sortlevel(0) + self.mi.sortlevel(0) def time_sortlevel_one(self): - self.midx2.sortlevel(1) + self.mi.sortlevel(1) -class Multi3(object): +class MultiIndexValues(object): + goal_time = 0.2 - def setup(self): - self.level1 = range(1000) - self.level2 = date_range(start='1/1/2012', periods=100) - self.mi = MultiIndex.from_product([self.level1, self.level2]) + def setup_cache(self): - def time_datetime_level_values_full(self): - self.mi.copy().values + level1 = range(1000) + level2 = date_range(start='1/1/2012', periods=100) + mi = MultiIndex.from_product([level1, level2]) + return mi - def time_datetime_level_values_sliced(self): - self.mi[:10].values + def time_datetime_level_values_copy(self, mi): + mi.copy().values + + def time_datetime_level_values_sliced(self, mi): + mi[:10].values class Range(object): + goal_time = 0.2 def setup(self): @@ -221,20 +165,60 @@ def time_min_trivial(self): self.idx_inc.min() -class IndexOps(object): +class IndexAppend(object): + goal_time = 0.2 def setup(self): + N = 10000 - self.ridx = [RangeIndex(i * 100, (i + 1) * 100) for i in range(N)] - self.iidx = [idx.astype(int) for idx in self.ridx] - self.oidx = [idx.astype(str) for idx in self.iidx] + self.range_idx = RangeIndex(0, 100) + self.int_idx = self.range_idx.astype(int) + self.obj_idx = self.int_idx.astype(str) + self.range_idxs = [] + self.int_idxs = [] + self.object_idxs = [] + for i in range(1, N): + r_idx = RangeIndex(i * 100, (i + 1) * 100) + self.range_idxs.append(r_idx) + i_idx = r_idx.astype(int) + self.int_idxs.append(i_idx) + o_idx = i_idx.astype(str) + self.object_idxs.append(o_idx) + + def time_append_range_list(self): + self.range_idx.append(self.range_idxs) + + def time_append_int_list(self): + self.int_idx.append(self.int_idxs) + + def time_append_obj_list(self): + self.obj_idx.append(self.object_idxs) + + +class Indexing(object): + + goal_time = 0.2 + params = ['String', 'Float', 'Int'] + param_names = ['dtype'] + + def setup(self, dtype): + N = 10**6 + self.idx = getattr(tm, 'make{}Index'.format(dtype))(N) + self.array_mask = (np.arange(N) % 3) == 0 + self.series_mask = Series(self.array_mask) - def time_concat_range(self): - self.ridx[0].append(self.ridx[1:]) + def time_boolean_array(self, dtype): + self.idx[self.array_mask] - def time_concat_int(self): - self.iidx[0].append(self.iidx[1:]) + def time_boolean_series(self, dtype): + self.idx[self.series_mask] - def time_concat_obj(self): - self.oidx[0].append(self.oidx[1:]) + def time_get(self, dtype): + self.idx[1] + + def time_slice(self, dtype): + self.idx[:-1] + + def time_slice_step(self, dtype): + self.idx[::2] diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index f271b82c758ee..5b12f6ea89614 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -367,5 +367,3 @@ def time_assign_with_setitem(self): np.random.seed(1234) for i in range(100): self.df[i] = np.random.randn(self.N) - -