Skip to content

Commit 271407c

Browse files
committed
TST: add bench_unique.py
1 parent fcb2005 commit 271407c

File tree

1 file changed

+264
-0
lines changed

1 file changed

+264
-0
lines changed

bench/bench_unique.py

Lines changed: 264 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,264 @@
1+
from pandas import *
2+
from pandas.util.testing import rands
3+
import pandas._tseries as lib
4+
import numpy as np
5+
import matplotlib.pyplot as plt
6+
7+
# Benchmark fixtures: N total observations drawn from K distinct labels,
# so every label repeats N // K times.
N = 50000
K = 10000

# Two independent pools of K random 10-character strings (object dtype);
# rands() comes from pandas.util.testing.
groups = np.array([rands(10) for _ in xrange(K)], dtype='O')
groups2 = np.array([rands(10) for _ in xrange(K)], dtype='O')

# Tile each pool out to length N; these module-level arrays are read by the
# algo*() benchmark functions below.
labels = np.tile(groups, N // K)
labels2 = np.tile(groups2, N // K)
data = np.random.randn(N)
17+
def timeit(f, niter):
    """Return the mean wall-clock time in seconds of a single call to *f*,
    averaged over *niter* calls.

    GC is disabled for the duration of the measurement so collection pauses
    do not pollute the timings.
    """
    import gc, time
    gc.disable()
    try:
        start = time.time()
        # range() iterates identically under Python 2 and 3 for this use
        for _ in range(niter):
            f()
        elapsed = (time.time() - start) / niter
    finally:
        # Original called gc.enable() unconditionally after the loop; do it
        # in a finally so an exception in f() cannot leave GC disabled for
        # the rest of the process.
        gc.enable()
    return elapsed
27+
def algo1():
    # Baseline groupby-sum: one full boolean-mask scan of `data` for every
    # distinct label, i.e. O(n_unique * N) work.
    uniques = np.unique(labels)
    sums = np.empty(len(uniques))
    for pos, lab in enumerate(uniques):
        sums[pos] = data[labels == lab].sum()
33+
def algo2():
    # Groupby-sum via a precomputed label -> integer-positions mapping from
    # pandas._tseries, replacing the per-label boolean mask of algo1 with a
    # fancy-indexed take.
    uniques = np.unique(labels)
    positions = lib.groupby_indices(labels)
    sums = np.empty(len(uniques))
    for pos, lab in enumerate(uniques):
        sums[pos] = data.take(positions[lab]).sum()
41+
def algo3_nosort():
    # Factorize labels to integer codes (unsorted) and do a single
    # C-level grouped add over `data`.
    factorizer = lib.DictFactorizer()
    codes, group_sizes = factorizer.factorize(labels, sort=False)
    ngroups = len(factorizer.uniques)
    sums = np.empty(ngroups)
    lib.group_add(sums, group_sizes, data, codes)
48+
def algo3_sort():
    # Identical to algo3_nosort except the factorization codes are sorted,
    # to measure the cost of sorting during factorization.
    factorizer = lib.DictFactorizer()
    codes, group_sizes = factorizer.factorize(labels, sort=True)
    ngroups = len(factorizer.uniques)
    sums = np.empty(ngroups)
    lib.group_add(sums, group_sizes, data, codes)
55+
# --- fixtures for the dict-vs-Cython groupby comparison (f / f2 below) ---
import numpy as np
import random

# dict to hold results
counts = {}

# a hack to generate random key, value pairs.
# 5k keys, 100k values
x = np.tile(np.arange(5000, dtype='O'), 20)
random.shuffle(x)        # in-place shuffle of the object ndarray
xarr = x                 # keep the ndarray form for the Cython path (f2)
x = [int(y) for y in x]  # plain-int list form for the pure-dict path (f)
# NOTE(review): rebinds module-level `data` from randn(N) above to 100k
# uniforms -- the algo*() functions defined earlier will see this new array
# if called after this point.
data = np.random.uniform(0, 1, 100000)
70+
def f():
    # Pure-Python groupby-sum: accumulate into the module-level `counts`
    # dict, EAFP-style -- try the in-place add, fall back to initialising
    # on the first sighting of a key.
    from itertools import izip
    for key, val in izip(x, data):
        try:
            counts[key] += val
        except KeyError:
            counts[key] = val
79+
def f2():
    # Cython groupby-sum: factorize the object array once, then a single
    # C-level grouped add.  Locals renamed so nothing shadows the
    # module-level `counts` dict used by f().
    factorizer = lib.DictFactorizer()
    codes, group_sizes = factorizer.factorize(xarr, sort=False)
    ngroups = len(factorizer.uniques)
    sums = np.empty(ngroups)
    lib.group_add(sums, group_sizes, data, codes)
86+
def algo4():
    """Two-key groupby-sum: factorize each key column separately, combine
    the codes into a single compound group id, then one grouped add."""
    rizer = lib.DictFactorizer()
    labs1, _ = rizer.factorize(labels, sort=False)
    k1 = len(rizer.uniques)

    # fresh factorizer for the second key column
    rizer = lib.DictFactorizer()
    labs2, _ = rizer.factorize(labels2, sort=False)
    k2 = len(rizer.uniques)

    # compound id: unique per (label, label2) pair since labs2 < k2
    group_id = labs1 * k2 + labs2
    max_group = k1 * k2

    # if the cartesian id space is large, re-factorize the compound ids so
    # the output arrays are sized by observed groups, not potential ones
    if max_group > 1e6:
        rizer = lib.Int64Factorizer(len(group_id))
        group_id, _ = rizer.factorize(group_id.astype('i8'), sort=True)
        max_group = len(rizer.uniques)

    out = np.empty(max_group)
    counts = np.zeros(max_group, dtype='i4')
    lib.group_add(out, counts, data, group_id)
107+
# cumtime percall filename:lineno(function)
108+
# 0.592 0.592 <string>:1(<module>)
109+
# 0.584 0.006 groupby_ex.py:37(algo3_nosort)
110+
# 0.535 0.005 {method 'factorize' of DictFactorizer' objects}
111+
# 0.047 0.000 {pandas._tseries.group_add}
112+
# 0.002 0.000 numeric.py:65(zeros_like)
113+
# 0.001 0.000 {method 'fill' of 'numpy.ndarray' objects}
114+
# 0.000 0.000 {numpy.core.multiarray.empty_like}
115+
# 0.000 0.000 {numpy.core.multiarray.empty}
116+
117+
# UNIQUE timings

# N = 10000000
# K = 500000

# groups = np.array([rands(10) for _ in xrange(K)], dtype='O')

# labels = np.tile(groups, N // K)
# Regenerate the values array: the dict-benchmark section above rebound
# `data` to 100k uniforms; the unique benchmarks want N normals again.
# (The original assigned this identical statement twice in a row; the
# redundant duplicate is dropped.)
data = np.random.randn(N)

# Numbers of distinct labels to sweep over in hash_bench()
Ks = [100, 1000, 5000, 10000, 25000, 50000, 100000]

# Ks = [500000, 1000000, 2500000, 5000000, 10000000]

import psutil
import os
import gc

# Handle on the current process so the *_unique helpers can sample RSS
# memory before/after a factorization.
pid = os.getpid()
proc = psutil.Process(pid)
140+
def dict_unique(values, expected_K, sort=False, memory=False):
    """Unique via the dict-based factorizer.

    Returns the unique values; when memory=True, instead returns the RSS
    delta in bytes measured across the factorization (sort/assert skipped).
    """
    if memory:
        gc.collect()  # settle the heap so the RSS delta reflects this call
        before_mem = proc.get_memory_info().rss

    rizer = lib.DictFactorizer()
    # NOTE(review): uses unique_int64 even though other benchmarks feed
    # string labels through DictFactorizer -- confirm intended for the
    # int64 sweep in hash_bench()
    result = rizer.unique_int64(values)

    if memory:
        result = proc.get_memory_info().rss - before_mem
        return result

    if sort:
        result.sort()
    assert(len(result) == expected_K)
    return result
157+
def khash_unique(values, expected_K, size_hint=False, sort=False,
                 memory=False):
    """Unique via the khash-based Factorizer.

    Returns the unique values; when memory=True, instead returns the RSS
    delta in bytes measured across the call.  size_hint=True preallocates
    the hash table to len(values).
    """
    if memory:
        gc.collect()
        before_mem = proc.get_memory_info().rss

    if size_hint:
        rizer = lib.Factorizer(len(values))
    else:
        rizer = lib.Factorizer(100)

    # dead `result = []` pre-assignment removed
    result = rizer.unique(values)

    if memory:
        result = proc.get_memory_info().rss - before_mem
        return result

    if sort:
        result.sort()
    assert(len(result) == expected_K)
    # original fell off the end returning None here; return the uniques for
    # consistency with dict_unique
    return result
179+
def khash_unique_str(values, expected_K, size_hint=False, sort=False,
                     memory=False):
    """Unique via the string-specialized khash table.

    Returns the unique values; when memory=True, instead returns the RSS
    delta in bytes measured across the call.  size_hint=True preallocates
    the hash table to len(values).
    """
    if memory:
        gc.collect()
        before_mem = proc.get_memory_info().rss

    if size_hint:
        rizer = lib.StringHashTable(len(values))
    else:
        rizer = lib.StringHashTable(100)

    # dead `result = []` pre-assignment removed
    result = rizer.unique(values)

    if memory:
        result = proc.get_memory_info().rss - before_mem
        return result

    if sort:
        result.sort()
    assert(len(result) == expected_K)
    # original fell off the end returning None here; return the uniques for
    # consistency with dict_unique
    return result
201+
def khash_unique_int64(values, expected_K, size_hint=False, sort=False):
    """Unique via the int64-specialized khash table.

    Returns the unique values.  size_hint=True preallocates the hash table
    to len(values).
    """
    if size_hint:
        rizer = lib.Int64HashTable(len(values))
    else:
        rizer = lib.Int64HashTable(100)

    # dead `result = []` pre-assignment removed
    result = rizer.unique(values)

    if sort:
        result.sort()
    assert(len(result) == expected_K)
    # original fell off the end returning None here; return the uniques for
    # consistency with dict_unique
    return result
214+
def hash_bench():
    """Sweep K over Ks, timing dict-based vs khash-based unique on int64
    labels, then plot the mean timings as a bar chart."""
    # NOTE(review): indentation reconstructed from a scrape that lost it --
    # confirm block structure against the original file.
    numpy = []           # local list; shadows nothing (module uses `np`)
    dict_based = []
    dict_based_sort = []
    khash_hint = []
    khash_nohint = []
    for K in Ks:
        print K
        # groups = np.array([rands(10) for _ in xrange(K)])
        # labels = np.tile(groups, N // K).astype('O')

        # K distinct int64 labels, each repeated N // K times
        groups = np.random.randint(0, 100000000000L, size=K)
        labels = np.tile(groups, N // K)
        # lambdas are consumed by timeit() within this iteration, so the
        # late-binding of `labels`/`K` is not an issue here
        dict_based.append(timeit(lambda: dict_unique(labels, K), 20))
        khash_nohint.append(timeit(lambda: khash_unique_int64(labels, K), 20))
        khash_hint.append(timeit(lambda: khash_unique_int64(labels, K,
                                                            size_hint=True), 20))

        # memory, hard to get
        # dict_based.append(np.mean([dict_unique(labels, K, memory=True)
        #                            for _ in xrange(10)]))
        # khash_nohint.append(np.mean([khash_unique(labels, K, memory=True)
        #                              for _ in xrange(10)]))
        # khash_hint.append(np.mean([khash_unique(labels, K, size_hint=True, memory=True)
        #                            for _ in xrange(10)]))

        # dict_based_sort.append(timeit(lambda: dict_unique(labels, K,
        #                                                   sort=True), 10))
        # numpy.append(timeit(lambda: np.unique(labels), 10))

    # unique_timings = DataFrame({'numpy.unique' : numpy,
    #                             'dict, no sort' : dict_based,
    #                             'dict, sort' : dict_based_sort},
    #                            columns=['dict, no sort',
    #                                     'dict, sort', 'numpy.unique'],
    #                            index=Ks)

    unique_timings = DataFrame({'dict' : dict_based,
                                'khash, preallocate' : khash_hint,
                                'khash' : khash_nohint},
                               columns=['khash, preallocate', 'khash', 'dict'],
                               index=Ks)

    unique_timings.plot(kind='bar', legend=False)
    plt.legend(loc='best')
    plt.title('Unique on 100,000 values, int64')
    plt.xlabel('Number of unique labels')
    plt.ylabel('Mean execution time')
263+
264+
# Display whatever figure was produced (a no-op window if hash_bench() was
# never called).  NOTE(review): placed at module level based on the blank
# lines preceding it in the scrape -- confirm against the original file.
plt.show()

0 commit comments

Comments
 (0)