Skip to content

Commit 517cc63

Browse files
committed
ENH: more merge benchmarks
1 parent 35a18f4 commit 517cc63

File tree

3 files changed

+135
-44
lines changed

3 files changed

+135
-44
lines changed

bench/bench_merge.R

Lines changed: 92 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -7,16 +7,23 @@ for (i in 1:N) {
77
indices[i] <- paste(sample(letters, 10), collapse="")
88
indices2[i] <- paste(sample(letters, 10), collapse="")
99
}
10-
left <- data.frame(key=rep(indices, 10),
11-
key2=rep(indices2, 10),
12-
value=rnorm(100000))
13-
right <- data.frame(key=indices,
14-
key2=indices2,
15-
value2=rnorm(10000))
10+
left <- data.frame(key=rep(indices[1:8000], 10),
11+
key2=rep(indices2[1:8000], 10),
12+
value=rnorm(80000))
13+
right <- data.frame(key=indices[2001:10000],
14+
key2=indices2[2001:10000],
15+
value2=rnorm(8000))
1616

17-
right2 <- data.frame(key=rep(indices, 2),
18-
key2=rep(indices2, 2),
19-
value2=rnorm(20000))
17+
right2 <- data.frame(key=rep(right$key, 2),
18+
key2=rep(right$key2, 2),
19+
value2=rnorm(16000))
20+
21+
left.dt <- data.table(left, key=c("key", "key2"))
22+
right.dt <- data.table(right, key=c("key", "key2"))
23+
right2.dt <- data.table(right2, key=c("key", "key2"))
24+
25+
# left.dt2 <- data.table(left)
26+
# right.dt2 <- data.table(right)
2027

2128
## left <- data.frame(key=rep(indices[1:1000], 10),
2229
## key2=rep(indices2[1:1000], 10),
@@ -47,7 +54,23 @@ outer.join <- function(sort=FALSE) {
4754
}
4855

4956
inner.join <- function(sort=FALSE) {
50-
result <- base::merge(left, right, sort=sort)
57+
result <- base::merge(left, right, all=FALSE, sort=sort)
58+
}
59+
60+
left.join.dt <- function(sort=FALSE) {
61+
result <- merge(left.dt, right.dt, all.x=TRUE, sort=sort)
62+
}
63+
64+
right.join.dt <- function(sort=FALSE) {
65+
result <- merge(left.dt, right.dt, all.y=TRUE, sort=sort)
66+
}
67+
68+
outer.join.dt <- function(sort=FALSE) {
69+
result <- merge(left.dt, right.dt, all=TRUE, sort=sort)
70+
}
71+
72+
inner.join.dt <- function(sort=FALSE) {
73+
result <- merge(left.dt, right.dt, all=FALSE, sort=sort)
5174
}
5275

5376
plyr.join <- function(type) {
@@ -57,6 +80,8 @@ plyr.join <- function(type) {
5780

5881
sort.options <- c(FALSE, TRUE)
5982

83+
# many-to-one
84+
6085
results <- matrix(nrow=3, ncol=3)
6186
colnames(results) <- c("base::merge", "plyr", "data.table")
6287
rownames(results) <- c("inner", "outer", "left")
@@ -65,25 +90,68 @@ base.functions <- c(inner.join, outer.join, left.join)
6590
plyr.functions <- c(function() plyr.join("inner"),
6691
function() plyr.join("full"),
6792
function() plyr.join("left"))
68-
dt.functions <- c(inner.join, outer.join, left.join)
93+
dt.functions <- c(inner.join.dt, outer.join.dt, left.join.dt)
6994
for (i in 1:3) {
7095
base.func <- base.functions[[i]]
7196
plyr.func <- plyr.functions[[i]]
72-
## dt.func <- dt.functions[[i]]
97+
dt.func <- dt.functions[[i]]
7398
results[i, 1] <- timeit(base.func)
7499
results[i, 2] <- timeit(plyr.func)
100+
results[i, 3] <- timeit(dt.func)
101+
}
102+
103+
104+
# many-to-many
105+
106+
left.join <- function(sort=FALSE) {
107+
result <- base::merge(left, right2, all.x=TRUE, sort=sort)
108+
}
109+
110+
right.join <- function(sort=FALSE) {
111+
result <- base::merge(left, right2, all.y=TRUE, sort=sort)
112+
}
113+
114+
outer.join <- function(sort=FALSE) {
115+
result <- base::merge(left, right2, all=TRUE, sort=sort)
116+
}
117+
118+
inner.join <- function(sort=FALSE) {
119+
result <- base::merge(left, right2, all=FALSE, sort=sort)
120+
}
121+
122+
left.join.dt <- function(sort=FALSE) {
123+
result <- merge(left.dt, right2.dt, all.x=TRUE, sort=sort)
124+
}
125+
126+
right.join.dt <- function(sort=FALSE) {
127+
result <- merge(left.dt, right2.dt, all.y=TRUE, sort=sort)
75128
}
76129

77-
## do.something <- function(df, f) {
78-
## f(df)
79-
## }
80-
## df <- matrix(nrow=4, ncol=2)
81-
## functions <- c(colSums, rowSums)
82-
## g <- functions[1]
83-
## do.something(df, function(df) g(df))
130+
outer.join.dt <- function(sort=FALSE) {
131+
result <- merge(left.dt, right2.dt, all=TRUE, sort=sort)
132+
}
133+
134+
inner.join.dt <- function(sort=FALSE) {
135+
result <- merge(left.dt, right2.dt, all=FALSE, sort=sort)
136+
}
137+
138+
sort.options <- c(FALSE, TRUE)
139+
140+
# many-to-many
84141

85-
## dont_sort sort
86-
## inner 0.2297 0.2286
87-
## outer 1.1811 1.2843
88-
## left 0.6706 0.7766
89-
## right 0.2995 0.3371
142+
results <- matrix(nrow=3, ncol=2)
143+
colnames(results) <- c("base::merge", "data.table")
144+
rownames(results) <- c("inner", "outer", "left")
145+
146+
base.functions <- c(inner.join, outer.join, left.join)
147+
plyr.functions <- c(function() plyr.join("inner"),
148+
function() plyr.join("full"),
149+
function() plyr.join("left"))
150+
dt.functions <- c(inner.join.dt, outer.join.dt, left.join.dt)
151+
for (i in 1:3) {
152+
base.func <- base.functions[[i]]
153+
plyr.func <- plyr.functions[[i]]
154+
dt.func <- dt.functions[[i]]
155+
results[i, 1] <- timeit(base.func)
156+
results[i, 2] <- timeit(dt.func)
157+
}

bench/bench_merge.py

Lines changed: 38 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -32,42 +32,50 @@ def get_test_data(ngroups=100, n=N):
3232
import time
3333
from pandas.util.testing import rands
3434
N = 10000
35+
3536
indices = np.array([rands(10) for _ in xrange(N)], dtype='O')
37+
indices2 = np.array([rands(10) for _ in xrange(N)], dtype='O')
38+
key = np.tile(indices[:8000], 10)
39+
key2 = np.tile(indices2[:8000], 10)
3640

37-
key = np.tile(indices, 10)
38-
key2 = key.copy()
39-
random.shuffle(key2)
40-
indices2 = indices.copy()
41-
random.shuffle(indices2)
4241
left = DataFrame({'key' : key, 'key2':key2,
43-
'value' : np.random.randn(100000)})
44-
right = DataFrame({'key': indices, 'key2':indices2,
45-
'value2' : np.random.randn(10000)})
46-
join_methods = ['inner', 'outer', 'left', 'right']
47-
results = DataFrame(index=join_methods, columns=[False, True])
42+
'value' : np.random.randn(80000)})
43+
right = DataFrame({'key': indices[2000:], 'key2':indices2[2000:],
44+
'value2' : np.random.randn(8000)})
45+
46+
right2 = right.append(right, ignore_index=True)
47+
48+
49+
join_methods = ['inner', 'outer', 'left'] #, 'right']
50+
results = DataFrame(index=join_methods, columns=[False])
4851
niter = 10
49-
for sort in [False, True]:
52+
for sort in [False]:
5053
for join_method in join_methods:
51-
f = lambda: merge(left, right, how=join_method, sort=sort)
54+
f = lambda: merge(left, right2, how=join_method, sort=sort)
5255
gc.disable()
5356
start = time.time()
5457
for _ in xrange(niter):
5558
f()
5659
elapsed = (time.time() - start) / niter
5760
gc.enable()
5861
results[sort][join_method] = elapsed
59-
results.columns = ['dont_sort', 'sort']
62+
results.columns = ['pandas']
63+
# results.columns = ['dont_sort', 'sort']
6064

6165

6266
# R results
6367
from StringIO import StringIO
64-
r_results = read_table(StringIO("""dont_sort sort
65-
inner 0.2297 0.2286
66-
outer 1.1811 1.2843
67-
left 0.6706 0.7766
68-
right 0.2995 0.3371
68+
# many to one
69+
r_results = read_table(StringIO("""base::merge plyr data.table
70+
inner 0.2172 0.1197 0.1035
71+
outer 0.3362 0.1658 0.1930
72+
left 0.2559 0.1217 0.1559
6973
"""), sep='\s+')
7074

75+
all_results = results.join(r_results)
76+
77+
all_results = all_results.div(all_results['pandas'], axis=0)
78+
7179
sort_results = DataFrame.from_items([('pandas', results['sort']),
7280
('R', r_results['sort'])])
7381
sort_results['Ratio'] = sort_results['R'] / sort_results['pandas']
@@ -77,3 +85,15 @@ def get_test_data(ngroups=100, n=N):
7785
('R', r_results['dont_sort'])])
7886
nosort_results['Ratio'] = sort_results['R'] / sort_results['pandas']
7987

88+
# many to many
89+
90+
from StringIO import StringIO
91+
# many to many
92+
r_results = read_table(StringIO("""base::merge data.table
93+
inner 0.4503 0.1278
94+
outer 0.7973 0.2347
95+
left 0.5433 0.1877
96+
"""), sep='\s+')
97+
98+
all_results = results.join(r_results)
99+
all_results = all_results.div(all_results['pandas'], axis=0)

pandas/stats/plm.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -367,8 +367,11 @@ def resid(self):
367367
@cache_readonly
368368
def _rmse_raw(self):
369369
"""Returns the raw rmse values."""
370-
X = self._x_trans.values
371-
Y = self._y_trans.values.squeeze()
370+
X = self._x.values
371+
Y = self._y.values.squeeze()
372+
373+
# X = self._x_trans.values
374+
# Y = self._y_trans.values.squeeze()
372375

373376
resid = Y - np.dot(X, self._beta_raw)
374377
ss = (resid ** 2).sum()

0 commit comments

Comments
 (0)