from pandas import *
from pandas.util.testing import rands
import pandas._tseries as lib
import numpy as np
import matplotlib.pyplot as plt

N = 50000
K = 10000

groups = np.array([rands(10) for _ in xrange(K)], dtype='O')
groups2 = np.array([rands(10) for _ in xrange(K)], dtype='O')

labels = np.tile(groups, N // K)
labels2 = np.tile(groups2, N // K)
data = np.random.randn(N)

def timeit(f, niter):
    # mean wall-clock seconds per call, with the GC disabled to cut noise
    import gc, time
    gc.disable()
    start = time.time()
    for _ in xrange(niter):
        f()
    elapsed = (time.time() - start) / niter
    gc.enable()
    return elapsed
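
# For example, timeit(algo1, 10) returns the mean seconds per call of
# algo1 over ten runs.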

def algo1():
    # baseline: one full boolean-mask pass over data per unique label
    unique_labels = np.unique(labels)
    result = np.empty(len(unique_labels))
    for i, label in enumerate(unique_labels):
        result[i] = data[labels == label].sum()

def algo2():
    # precompute each label's integer indices, then sum via take
    unique_labels = np.unique(labels)
    indices = lib.groupby_indices(labels)
    result = np.empty(len(unique_labels))

    for i, label in enumerate(unique_labels):
        result[i] = data.take(indices[label]).sum()

def algo3_nosort():
    # factorize labels to integer codes, then one grouped-add pass
    rizer = lib.DictFactorizer()
    labs, counts = rizer.factorize(labels, sort=False)
    k = len(rizer.uniques)
    out = np.empty(k)
    lib.group_add(out, counts, data, labs)

def algo3_sort():
    # same as algo3_nosort, but with the unique labels sorted
    rizer = lib.DictFactorizer()
    labs, counts = rizer.factorize(labels, sort=True)
    k = len(rizer.uniques)
    out = np.empty(k)
    lib.group_add(out, counts, data, labs)
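
# A minimal comparison driver, left commented out so importing this
# module stays side-effect free:
#
#   for name, func in [('algo1', algo1), ('algo2', algo2),
#                      ('algo3, no sort', algo3_nosort),
#                      ('algo3, sort', algo3_sort)]:
#       print '%s: %.4f s per call' % (name, timeit(func, 10))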

import random
from itertools import izip

# dict to hold results
counts = {}

# a hack to generate random key, value pairs:
# 5k keys, 100k values (note this rebinds the module-level data)
x = np.tile(np.arange(5000, dtype='O'), 20)
random.shuffle(x)
xarr = x
x = [int(y) for y in x]
data = np.random.uniform(0, 1, 100000)

def f():
    # groupby-sum with a plain Python dict
    for k, v in izip(x, data):
        try:
            counts[k] += v
        except KeyError:
            counts[k] = v

def f2():
    # the same reduction via factorize + group_add
    rizer = lib.DictFactorizer()
    labs, counts = rizer.factorize(xarr, sort=False)
    k = len(rizer.uniques)
    out = np.empty(k)
    lib.group_add(out, counts, data, labs)
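
# Head-to-head timing sketch (run by hand):
#
#   print 'dict: %.4fs  factorize: %.4fs' % (timeit(f, 10), timeit(f2, 10))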

def algo4():
    # two-key groupby: factorize each key, then combine the integer codes
    rizer = lib.DictFactorizer()
    labs1, _ = rizer.factorize(labels, sort=False)
    k1 = len(rizer.uniques)

    rizer = lib.DictFactorizer()
    labs2, _ = rizer.factorize(labels2, sort=False)
    k2 = len(rizer.uniques)

    # labs1 * k2 + labs2 assigns each (label1, label2) pair a unique id
    group_id = labs1 * k2 + labs2
    max_group = k1 * k2

    if max_group > 1e6:
        # too many potential groups: re-factorize the combined ids to
        # compress them down to the groups actually observed
        rizer = lib.Int64Factorizer(len(group_id))
        group_id, _ = rizer.factorize(group_id.astype('i8'), sort=True)
        max_group = len(rizer.uniques)

    out = np.empty(max_group)
    counts = np.zeros(max_group, dtype='i4')
    lib.group_add(out, counts, data, group_id)
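
# Worked example of the pair encoding: with k2 = 3, the pair
# (labs1=2, labs2=1) maps to group_id = 2 * 3 + 1 = 7; since labs2 < k2,
# distinct pairs can never collide.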

# cumtime  percall  filename:lineno(function)
# 0.592    0.592    <string>:1(<module>)
# 0.584    0.006    groupby_ex.py:37(algo3_nosort)
# 0.535    0.005    {method 'factorize' of 'DictFactorizer' objects}
# 0.047    0.000    {pandas._tseries.group_add}
# 0.002    0.000    numeric.py:65(zeros_like)
# 0.001    0.000    {method 'fill' of 'numpy.ndarray' objects}
# 0.000    0.000    {numpy.core.multiarray.empty_like}
# 0.000    0.000    {numpy.core.multiarray.empty}

# UNIQUE timings

# N = 10000000
# K = 500000

# groups = np.array([rands(10) for _ in xrange(K)], dtype='O')

# labels = np.tile(groups, N // K)

data = np.random.randn(N)

Ks = [100, 1000, 5000, 10000, 25000, 50000, 100000]

# Ks = [500000, 1000000, 2500000, 5000000, 10000000]

import psutil
import os
import gc

# handle on the current process; RSS deltas approximate memory use below
pid = os.getpid()
proc = psutil.Process(pid)

def dict_unique(values, expected_K, sort=False, memory=False):
    if memory:
        gc.collect()
        before_mem = proc.get_memory_info().rss

    rizer = lib.DictFactorizer()
    result = rizer.unique_int64(values)

    if memory:
        # in memory mode, return the RSS delta rather than the uniques
        result = proc.get_memory_info().rss - before_mem
        return result

    if sort:
        result.sort()
    assert(len(result) == expected_K)
    return result

def khash_unique(values, expected_K, size_hint=False, sort=False,
                 memory=False):
    if memory:
        gc.collect()
        before_mem = proc.get_memory_info().rss

    if size_hint:
        rizer = lib.Factorizer(len(values))
    else:
        rizer = lib.Factorizer(100)

    result = rizer.unique(values)

    if memory:
        result = proc.get_memory_info().rss - before_mem
        return result

    if sort:
        result.sort()
    assert(len(result) == expected_K)

def khash_unique_str(values, expected_K, size_hint=False, sort=False,
                     memory=False):
    if memory:
        gc.collect()
        before_mem = proc.get_memory_info().rss

    if size_hint:
        rizer = lib.StringHashTable(len(values))
    else:
        rizer = lib.StringHashTable(100)

    result = rizer.unique(values)

    if memory:
        result = proc.get_memory_info().rss - before_mem
        return result

    if sort:
        result.sort()
    assert(len(result) == expected_K)

def khash_unique_int64(values, expected_K, size_hint=False, sort=False):
    if size_hint:
        rizer = lib.Int64HashTable(len(values))
    else:
        rizer = lib.Int64HashTable(100)

    result = rizer.unique(values)

    if sort:
        result.sort()
    assert(len(result) == expected_K)
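
# The size_hint variants preallocate len(values) hash buckets up front,
# trading memory for fewer rehashes as the table fills; the no-hint
# variants start from a 100-entry table.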

def hash_bench():
    # note: labels = np.tile(groups, N // K) is empty when K > N, so N
    # should be at least max(Ks) (e.g. the commented N = 10000000 above)
    numpy = []
    dict_based = []
    dict_based_sort = []
    khash_hint = []
    khash_nohint = []
    for K in Ks:
        print K
        # groups = np.array([rands(10) for _ in xrange(K)])
        # labels = np.tile(groups, N // K).astype('O')

        groups = np.random.randint(0, 100000000000L, size=K)
        labels = np.tile(groups, N // K)
        dict_based.append(timeit(lambda: dict_unique(labels, K), 20))
        khash_nohint.append(timeit(lambda: khash_unique_int64(labels, K), 20))
        khash_hint.append(timeit(lambda: khash_unique_int64(labels, K,
                                                            size_hint=True), 20))

        # memory is hard to measure reliably; rough RSS-based attempt
        # dict_based.append(np.mean([dict_unique(labels, K, memory=True)
        #                            for _ in xrange(10)]))
        # khash_nohint.append(np.mean([khash_unique(labels, K, memory=True)
        #                              for _ in xrange(10)]))
        # khash_hint.append(np.mean([khash_unique(labels, K, size_hint=True,
        #                                         memory=True)
        #                            for _ in xrange(10)]))

        # dict_based_sort.append(timeit(lambda: dict_unique(labels, K,
        #                                                   sort=True), 10))
        # numpy.append(timeit(lambda: np.unique(labels), 10))

    # unique_timings = DataFrame({'numpy.unique' : numpy,
    #                             'dict, no sort' : dict_based,
    #                             'dict, sort' : dict_based_sort},
    #                            columns=['dict, no sort',
    #                                     'dict, sort', 'numpy.unique'],
    #                            index=Ks)

    unique_timings = DataFrame({'dict' : dict_based,
                                'khash, preallocate' : khash_hint,
                                'khash' : khash_nohint},
                               columns=['khash, preallocate', 'khash', 'dict'],
                               index=Ks)

    unique_timings.plot(kind='bar', legend=False)
    plt.legend(loc='best')
    plt.title('Unique on 100,000 values, int64')
    plt.xlabel('Number of unique labels')
    plt.ylabel('Mean execution time (seconds)')

    plt.show()
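
# Run by hand to produce the timing plot:
#
#   hash_bench()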