Skip to content

Commit bb3231e

Browse files
committed
Merge PR #2632
2 parents 4fc78a4 + 5f4ae2a commit bb3231e

File tree

167 files changed

+5442
-4556
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

167 files changed

+5442
-4556
lines changed

bench/bench_dense_to_sparse.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,3 @@
1212
this_rng = rng2[:-i]
1313
data[100:] = np.nan
1414
series[i] = SparseSeries(data, index=this_rng)
15-

bench/bench_get_put_value.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,39 +4,46 @@
44
N = 1000
55
K = 50
66

7+
78
def _random_index(howmany):
89
return Index([rands(10) for _ in xrange(howmany)])
910

1011
df = DataFrame(np.random.randn(N, K), index=_random_index(N),
1112
columns=_random_index(K))
1213

14+
1315
def get1():
1416
for col in df.columns:
1517
for row in df.index:
1618
_ = df[col][row]
1719

20+
1821
def get2():
1922
for col in df.columns:
2023
for row in df.index:
2124
_ = df.get_value(row, col)
2225

26+
2327
def put1():
2428
for col in df.columns:
2529
for row in df.index:
2630
df[col][row] = 0
2731

32+
2833
def put2():
2934
for col in df.columns:
3035
for row in df.index:
3136
df.set_value(row, col, 0)
3237

38+
3339
def resize1():
3440
buf = DataFrame()
3541
for col in df.columns:
3642
for row in df.index:
3743
buf = buf.set_value(row, col, 5.)
3844
return buf
3945

46+
4047
def resize2():
4148
from collections import defaultdict
4249

bench/bench_groupby.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,16 +12,19 @@
1212
random.shuffle(foo)
1313
random.shuffle(foo2)
1414

15-
df = DataFrame({'A' : foo,
16-
'B' : foo2,
17-
'C' : np.random.randn(n * k)})
15+
df = DataFrame({'A': foo,
16+
'B': foo2,
17+
'C': np.random.randn(n * k)})
1818

1919
import pandas._sandbox as sbx
2020

21+
2122
def f():
2223
table = sbx.StringHashTable(len(df))
2324
ret = table.factorize(df['A'])
2425
return ret
26+
27+
2528
def g():
2629
table = sbx.PyObjectHashTable(len(df))
2730
ret = table.factorize(df['A'])

bench/bench_join_panel.py

Lines changed: 26 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,49 +1,55 @@
1-
# reasonably effecient
1+
# reasonably efficient
2+
23

34
def create_panels_append(cls, panels):
45
""" return an append list of panels """
5-
panels = [ a for a in panels if a is not None ]
6+
panels = [a for a in panels if a is not None]
67
# corner cases
78
if len(panels) == 0:
89
return None
910
elif len(panels) == 1:
1011
return panels[0]
1112
elif len(panels) == 2 and panels[0] == panels[1]:
1213
return panels[0]
13-
#import pdb; pdb.set_trace()
14+
# import pdb; pdb.set_trace()
1415
# create a joint index for the axis
16+
1517
def joint_index_for_axis(panels, axis):
1618
s = set()
1719
for p in panels:
18-
s.update(list(getattr(p,axis)))
20+
s.update(list(getattr(p, axis)))
1921
return sorted(list(s))
22+
2023
def reindex_on_axis(panels, axis, axis_reindex):
2124
new_axis = joint_index_for_axis(panels, axis)
22-
new_panels = [ p.reindex(**{ axis_reindex : new_axis, 'copy' : False}) for p in panels ]
25+
new_panels = [p.reindex(**{axis_reindex: new_axis,
26+
'copy': False}) for p in panels]
2327
return new_panels, new_axis
24-
# create the joint major index, dont' reindex the sub-panels - we are appending
28+
# create the joint major index, don't reindex the sub-panels - we are
29+
# appending
2530
major = joint_index_for_axis(panels, 'major_axis')
2631
# reindex on minor axis
2732
panels, minor = reindex_on_axis(panels, 'minor_axis', 'minor')
2833
# reindex on items
2934
panels, items = reindex_on_axis(panels, 'items', 'items')
3035
# concatenate values
3136
try:
32-
values = np.concatenate([ p.values for p in panels ],axis=1)
37+
values = np.concatenate([p.values for p in panels], axis=1)
3338
except (Exception), detail:
34-
raise Exception("cannot append values that dont' match dimensions! -> [%s] %s" % (','.join([ "%s" % p for p in panels ]),str(detail)))
35-
#pm('append - create_panel')
36-
p = Panel(values, items = items, major_axis = major, minor_axis = minor )
37-
#pm('append - done')
39+
raise Exception("cannot append values that dont' match dimensions! -> [%s] %s"
40+
% (','.join(["%s" % p for p in panels]), str(detail)))
41+
# pm('append - create_panel')
42+
p = Panel(values, items=items, major_axis=major,
43+
minor_axis=minor)
44+
# pm('append - done')
3845
return p
3946

4047

41-
42-
# does the job but inefficient (better to handle like you read a table in pytables...e.g create a LongPanel then convert to Wide)
43-
48+
# does the job but inefficient (better to handle like you read a table in
49+
# pytables... e.g. create a LongPanel then convert to Wide)
4450
def create_panels_join(cls, panels):
4551
""" given an array of panels, create a single panel """
46-
panels = [ a for a in panels if a is not None ]
52+
panels = [a for a in panels if a is not None]
4753
# corner cases
4854
if len(panels) == 0:
4955
return None
@@ -62,16 +68,18 @@ def create_panels_join(cls, panels):
6268
for minor_i, minor_index in panel.minor_axis.indexMap.items():
6369
for major_i, major_index in panel.major_axis.indexMap.items():
6470
try:
65-
d[(minor_i,major_i,item)] = values[item_index,major_index,minor_index]
71+
d[(minor_i, major_i, item)] = values[item_index, major_index, minor_index]
6672
except:
6773
pass
6874
# stack the values
6975
minor = sorted(list(minor))
7076
major = sorted(list(major))
7177
items = sorted(list(items))
7278
# create the 3d stack (items x columns x indices)
73-
data = np.dstack([ np.asarray([ np.asarray([ d.get((minor_i,major_i,item),np.nan) for item in items ]) for major_i in major ]).transpose() for minor_i in minor ])
79+
data = np.dstack([np.asarray([np.asarray([d.get((minor_i, major_i, item), np.nan)
80+
for item in items])
81+
for major_i in major]).transpose()
82+
for minor_i in minor])
7483
# construct the panel
7584
return Panel(data, items, major, minor)
7685
add_class_method(Panel, create_panels_join, 'join_many')
77-

bench/bench_khash_dict.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,15 @@
1616
pid = os.getpid()
1717
proc = psutil.Process(pid)
1818

19+
1920
def object_test_data(n):
2021
pass
2122

23+
2224
def string_test_data(n):
2325
return np.array([rands(10) for _ in xrange(n)], dtype='O')
2426

27+
2528
def int_test_data(n):
2629
return np.arange(n, dtype='i8')
2730

@@ -30,17 +33,21 @@ def int_test_data(n):
3033
#----------------------------------------------------------------------
3134
# Benchmark 1: map_locations
3235

36+
3337
def map_locations_python_object():
3438
arr = string_test_data(N)
3539
return _timeit(lambda: lib.map_indices_object(arr))
3640

41+
3742
def map_locations_khash_object():
3843
arr = string_test_data(N)
44+
3945
def f():
4046
table = sbx.PyObjectHashTable(len(arr))
4147
table.map_locations(arr)
4248
return _timeit(f)
4349

50+
4451
def _timeit(f, iterations=10):
4552
start = time.time()
4653
for _ in xrange(iterations):
@@ -51,17 +58,20 @@ def _timeit(f, iterations=10):
5158
#----------------------------------------------------------------------
5259
# Benchmark 2: lookup_locations
5360

61+
5462
def lookup_python(values):
5563
table = lib.map_indices_object(values)
5664
return _timeit(lambda: lib.merge_indexer_object(values, table))
5765

66+
5867
def lookup_khash(values):
5968
table = sbx.PyObjectHashTable(len(values))
6069
table.map_locations(values)
6170
locs = table.lookup_locations(values)
6271
# elapsed = _timeit(lambda: table.lookup_locations2(values))
6372
return table
6473

74+
6575
def leak(values):
6676
for _ in xrange(100):
6777
print proc.get_memory_info()
@@ -75,4 +85,3 @@ def leak(values):
7585

7686
#----------------------------------------------------------------------
7787
# Benchmark 4: factorize
78-

bench/bench_merge.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
N = 10000
66
ngroups = 10
77

8+
89
def get_test_data(ngroups=100, n=N):
910
unique_groups = range(ngroups)
1011
arr = np.asarray(np.tile(unique_groups, n / ngroups), dtype=object)
@@ -38,10 +39,10 @@ def get_test_data(ngroups=100, n=N):
3839
key = np.tile(indices[:8000], 10)
3940
key2 = np.tile(indices2[:8000], 10)
4041

41-
left = DataFrame({'key' : key, 'key2':key2,
42-
'value' : np.random.randn(80000)})
43-
right = DataFrame({'key': indices[2000:], 'key2':indices2[2000:],
44-
'value2' : np.random.randn(8000)})
42+
left = DataFrame({'key': key, 'key2': key2,
43+
'value': np.random.randn(80000)})
44+
right = DataFrame({'key': indices[2000:], 'key2': indices2[2000:],
45+
'value2': np.random.randn(8000)})
4546

4647
right2 = right.append(right, ignore_index=True)
4748

@@ -78,7 +79,8 @@ def get_test_data(ngroups=100, n=N):
7879

7980
all_results = all_results.div(all_results['pandas'], axis=0)
8081

81-
all_results = all_results.ix[:, ['pandas', 'data.table', 'plyr', 'base::merge']]
82+
all_results = all_results.ix[:, ['pandas', 'data.table', 'plyr',
83+
'base::merge']]
8284

8385
sort_results = DataFrame.from_items([('pandas', results['sort']),
8486
('R', r_results['base::merge'])])
@@ -102,4 +104,5 @@ def get_test_data(ngroups=100, n=N):
102104

103105
all_results = presults.join(r_results)
104106
all_results = all_results.div(all_results['pandas'], axis=0)
105-
all_results = all_results.ix[:, ['pandas', 'data.table', 'plyr', 'base::merge']]
107+
all_results = all_results.ix[:, ['pandas', 'data.table', 'plyr',
108+
'base::merge']]

bench/bench_merge_sqlite.py

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,10 @@
1313
key = np.tile(indices[:8000], 10)
1414
key2 = np.tile(indices2[:8000], 10)
1515

16-
left = DataFrame({'key' : key, 'key2':key2,
17-
'value' : np.random.randn(80000)})
18-
right = DataFrame({'key': indices[2000:], 'key2':indices2[2000:],
19-
'value2' : np.random.randn(8000)})
16+
left = DataFrame({'key': key, 'key2': key2,
17+
'value': np.random.randn(80000)})
18+
right = DataFrame({'key': indices[2000:], 'key2': indices2[2000:],
19+
'value2': np.random.randn(8000)})
2020

2121
# right2 = right.append(right, ignore_index=True)
2222
# right = right2
@@ -30,8 +30,10 @@
3030
create_sql_indexes = True
3131

3232
conn = sqlite3.connect(':memory:')
33-
conn.execute('create table left( key varchar(10), key2 varchar(10), value int);')
34-
conn.execute('create table right( key varchar(10), key2 varchar(10), value2 int);')
33+
conn.execute(
34+
'create table left( key varchar(10), key2 varchar(10), value int);')
35+
conn.execute(
36+
'create table right( key varchar(10), key2 varchar(10), value2 int);')
3537
conn.executemany('insert into left values (?, ?, ?)',
3638
zip(key, key2, left['value']))
3739
conn.executemany('insert into right values (?, ?, ?)',
@@ -43,7 +45,7 @@
4345
conn.execute('create index right_ix on right(key, key2)')
4446

4547

46-
join_methods = ['inner', 'left outer', 'left'] # others not supported
48+
join_methods = ['inner', 'left outer', 'left'] # others not supported
4749
sql_results = DataFrame(index=join_methods, columns=[False])
4850
niter = 5
4951
for sort in [False]:
@@ -61,8 +63,8 @@
6163

6264
if sort:
6365
sql = '%s order by key, key2' % sql
64-
f = lambda: list(conn.execute(sql)) # list fetches results
65-
g = lambda: conn.execute(sql) # list fetches results
66+
f = lambda: list(conn.execute(sql)) # list fetches results
67+
g = lambda: conn.execute(sql) # list fetches results
6668
gc.disable()
6769
start = time.time()
6870
# for _ in xrange(niter):
@@ -74,7 +76,7 @@
7476
conn.commit()
7577

7678
sql_results[sort][join_method] = elapsed
77-
sql_results.columns = ['sqlite3'] # ['dont_sort', 'sort']
79+
sql_results.columns = ['sqlite3'] # ['dont_sort', 'sort']
7880
sql_results.index = ['inner', 'outer', 'left']
7981

8082
sql = """select *

bench/bench_sparse.py

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,13 @@
1111
arr1 = np.arange(N)
1212
index = Index(np.arange(N))
1313

14-
off = N//10
15-
arr1[off : 2 * off] = np.NaN
16-
arr1[4*off: 5 * off] = np.NaN
17-
arr1[8*off: 9 * off] = np.NaN
14+
off = N // 10
15+
arr1[off: 2 * off] = np.NaN
16+
arr1[4 * off: 5 * off] = np.NaN
17+
arr1[8 * off: 9 * off] = np.NaN
1818

1919
arr2 = np.arange(N)
20-
arr2[3 * off // 2: 2 * off + off // 2] = np.NaN
20+
arr2[3 * off // 2: 2 * off + off // 2] = np.NaN
2121
arr2[8 * off + off // 2: 9 * off + off // 2] = np.NaN
2222

2323
s1 = SparseSeries(arr1, index=index)
@@ -38,6 +38,7 @@
3838

3939
sdf = dm.to_sparse()
4040

41+
4142
def new_data_like(sdf):
4243
new_data = {}
4344
for col, series in sdf.iteritems():
@@ -52,22 +53,22 @@ def new_data_like(sdf):
5253
# for col, ser in dm.iteritems():
5354
# data[col] = SparseSeries(ser)
5455

55-
dwp = Panel.fromDict({'foo' : dm})
56+
dwp = Panel.fromDict({'foo': dm})
5657
# sdf = SparseDataFrame(data)
5758

5859

5960
lp = stack_sparse_frame(sdf)
6061

6162

62-
swp = SparsePanel({'A' : sdf})
63-
swp = SparsePanel({'A' : sdf,
64-
'B' : sdf,
65-
'C' : sdf,
66-
'D' : sdf})
63+
swp = SparsePanel({'A': sdf})
64+
swp = SparsePanel({'A': sdf,
65+
'B': sdf,
66+
'C': sdf,
67+
'D': sdf})
6768

6869
y = sdf
69-
x = SparsePanel({'x1' : sdf + new_data_like(sdf) / 10,
70-
'x2' : sdf + new_data_like(sdf) / 10})
70+
x = SparsePanel({'x1': sdf + new_data_like(sdf) / 10,
71+
'x2': sdf + new_data_like(sdf) / 10})
7172

7273
dense_y = sdf
7374
dense_x = x.to_dense()
@@ -89,4 +90,3 @@ def new_data_like(sdf):
8990
reload(face)
9091

9192
# model = face.ols(y=y, x=x)
92-

0 commit comments

Comments
 (0)