
Commit 63defad

Benchmarks tsne rfr (#91)
1 parent 9f28ce7 commit 63defad

10 files changed (+389, -125 lines)


bench.py

Lines changed: 4 additions & 0 deletions
@@ -503,6 +503,10 @@ def gen_basic_dict(library, algorithm, stage, params, data, alg_instance=None,
             alg_instance_params = dict(alg_instance.attributes())
         else:
             alg_instance_params = dict(alg_instance.get_params())
+        if ('min_samples_split' in alg_instance_params
+                and 'handle' in alg_instance_params):
+            alg_instance_params['dtype'] = str(
+                alg_instance_params['dtype'])
         result['algorithm_parameters'].update(alg_instance_params)
     if alg_params is not None:
         result['algorithm_parameters'].update(alg_params)
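
Note on the new branch: the 'handle' key is what identifies a cuML estimator here, and for the cuML random forest the accompanying 'dtype' value is a NumPy type object that cannot be written out when the result record is dumped as JSON. A minimal sketch of the effect, using a hand-built parameter dict rather than a real cuML estimator (the values below are illustrative assumptions):

import json
import numpy as np

# Hypothetical stand-in for what a cuML RandomForestRegressor's get_params()
# might return: a 'handle' entry plus a NumPy dtype object under 'dtype'.
alg_instance_params = {'min_samples_split': 2, 'handle': None,
                       'dtype': np.float32}

# json.dumps() cannot encode the NumPy type object, so the benchmark keeps
# only its string form in the result record.
if ('min_samples_split' in alg_instance_params
        and 'handle' in alg_instance_params):
    alg_instance_params['dtype'] = str(alg_instance_params['dtype'])

print(json.dumps(alg_instance_params))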

configs/cuml_config.json

Lines changed: 54 additions & 0 deletions
@@ -136,6 +136,40 @@
             "max-leaf-nodes": 131072,
             "max-features": 0.2
         },
+        {
+            "algorithm": "df_regr",
+            "dtype": "float32",
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "year_prediction_msd",
+                    "training":
+                    {
+                        "x": "data/year_prediction_msd_x_train.npy",
+                        "y": "data/year_prediction_msd_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/year_prediction_msd_x_test.npy",
+                        "y": "data/year_prediction_msd_y_test.npy"
+                    }
+                },
+                {
+                    "source": "npy",
+                    "name": "airline_regression",
+                    "training":
+                    {
+                        "x": "data/airline_regression_x_train.npy",
+                        "y": "data/airline_regression_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/airline_regression_x_test.npy",
+                        "y": "data/airline_regression_y_test.npy"
+                    }
+                }
+            ]
+        },
         {
             "algorithm": "ridge",
             "dataset": [
@@ -564,6 +598,26 @@
             "alpha": 2.0,
             "l1_ratio": 0.5,
             "tol": 1e-4
+        },
+        {
+            "algorithm": "tsne",
+            "dtype": "float32",
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "mnist",
+                    "training":
+                    {
+                        "x": "data/mnist_x_train.npy",
+                        "y": "data/mnist_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/mnist_x_test.npy",
+                        "y": "data/mnist_y_test.npy"
+                    }
+                }
+            ]
         }
     ]
 }
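
Both new cases follow the existing file-based pattern: each dataset entry names pre-converted .npy arrays for the training and testing splits. A hypothetical inspection snippet for listing those files (the top-level 'cases' key below is an assumption, since the enclosing structure of the config is outside this diff):

import json

# Hypothetical helper for listing the file-based datasets of selected cases;
# the 'cases' key is assumed, not shown in this hunk.
with open('configs/cuml_config.json') as f:
    config = json.load(f)

for case in config.get('cases', []):
    if case.get('algorithm') in ('df_regr', 'tsne'):
        for ds in case.get('dataset', []):
            print(case['algorithm'], ds['name'], ds['training']['x'])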

configs/skl_config.json

File mode changed from 100755 to 100644
Lines changed: 53 additions & 0 deletions

@@ -148,6 +148,40 @@
             "max-leaf-nodes": 131072,
             "max-features": 0.2
         },
+        {
+            "algorithm": "df_regr",
+            "dtype": "float32",
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "year_prediction_msd",
+                    "training":
+                    {
+                        "x": "data/year_prediction_msd_x_train.npy",
+                        "y": "data/year_prediction_msd_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/year_prediction_msd_x_test.npy",
+                        "y": "data/year_prediction_msd_y_test.npy"
+                    }
+                },
+                {
+                    "source": "npy",
+                    "name": "airline_regression",
+                    "training":
+                    {
+                        "x": "data/airline_regression_x_train.npy",
+                        "y": "data/airline_regression_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/airline_regression_x_test.npy",
+                        "y": "data/airline_regression_y_test.npy"
+                    }
+                }
+            ]
+        },
         {
             "algorithm": "ridge",
             "dataset": [
@@ -731,6 +765,25 @@
             "alpha": 2.0,
             "l1_ratio": 0.5,
             "tol": 1e-4
+        },
+        {
+            "algorithm": "tsne",
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "mnist",
+                    "training":
+                    {
+                        "x": "data/mnist_x_train.npy",
+                        "y": "data/mnist_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/mnist_x_test.npy",
+                        "y": "data/mnist_y_test.npy"
+                    }
+                }
+            ]
         }
     ]
 }
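
The scikit-learn config mirrors the cuML one: the same two df_regr cases, plus a tsne case that omits the "dtype": "float32" override. The referenced .npy files are expected to exist before a run; loading them is plain NumPy, for example (paths copied from the config above, files assumed to have been produced beforehand by the repo's data-preparation scripts):

import numpy as np

# Paths come verbatim from the year_prediction_msd case; the files themselves
# are assumed to have been generated in advance.
X_train = np.load('data/year_prediction_msd_x_train.npy')
y_train = np.load('data/year_prediction_msd_y_train.npy')
X_test = np.load('data/year_prediction_msd_x_test.npy')
y_test = np.load('data/year_prediction_msd_y_test.npy')
print(X_train.shape, X_test.shape)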

configs/testing/sklearn.json

File mode changed from 100755 to 100644
Lines changed: 18 additions & 1 deletion

@@ -244,7 +244,7 @@
                     "n_features": 10,
                     "training": {
                         "n_samples": 1000
-                    }
+                    }
                 }
             ],
             "include-y": "",

(The change at line 247 is whitespace-only.)

@@ -323,6 +323,23 @@
                     }
                 }
             ]
+        },
+        {
+            "algorithm": "tsne",
+            "dataset": [
+                {
+                    "source": "synthetic",
+                    "type": "classification",
+                    "n_classes": 5,
+                    "n_features": 10,
+                    "training": {
+                        "n_samples": 1000
+                    },
+                    "testing": {
+                        "n_samples": 20
+                    }
+                }
+            ]
         }
     ]
 }
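
The smoke-test case uses a synthetic classification dataset instead of files on disk: 5 classes, 10 features, 1000 training samples, and 20 testing samples. Something equivalent can be produced with scikit-learn's make_classification; the repo's own generator may pick different arguments, so n_informative and the seed below are illustrative assumptions:

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# 5 classes, 10 features, 1000 train + 20 test samples, matching the case
# above; n_informative=5 and random_state=0 are illustrative choices only.
X, y = make_classification(n_samples=1020, n_features=10, n_classes=5,
                           n_informative=5, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=20,
                                                    random_state=0)
print(X_train.shape, X_test.shape)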

cuml_bench/df_regr.py

Lines changed: 7 additions & 17 deletions
@@ -15,44 +15,35 @@
 # ===============================================================================
 
 import argparse
-
 import bench
 from cuml.ensemble import RandomForestRegressor
 
 parser = argparse.ArgumentParser(description='cuml random forest '
                                              'regression benchmark')
 
-parser.add_argument('--criterion', type=str, default='mse',
-                    choices=('mse', 'mae'),
-                    help='The function to measure the quality of a split')
 parser.add_argument('--split-algorithm', type=str, default='hist',
                     choices=('hist', 'global_quantile'),
                     help='The algorithm to determine how '
                          'nodes are split in the tree')
 parser.add_argument('--num-trees', type=int, default=100,
                     help='Number of trees in the forest')
-parser.add_argument('--max-features', type=bench.float_or_int, default=None,
+parser.add_argument('--max-features', type=bench.float_or_int, default=1.0,
                     help='Upper bound on features used at each split')
-parser.add_argument('--max-depth', type=int, default=None,
+parser.add_argument('--max-depth', type=int, default=16,
                     help='Upper bound on depth of constructed trees')
 parser.add_argument('--min-samples-split', type=bench.float_or_int, default=2,
                     help='Minimum samples number for node splitting')
 parser.add_argument('--max-leaf-nodes', type=int, default=-1,
                     help='Maximum leaf nodes per tree')
-parser.add_argument('--min-impurity-decrease', type=float, default=0.,
+parser.add_argument('--min-impurity-decrease', type=float, default=0.0,
                     help='Needed impurity decrease for node splitting')
 parser.add_argument('--no-bootstrap', dest='bootstrap', default=True,
                     action='store_false', help="Don't control bootstraping")
 
 params = bench.parse_args(parser)
 
 # Load and convert data
-X_train, X_test, y_train, y_test = bench.load_data(params)
-
-if params.criterion == 'mse':
-    params.criterion = 2
-else:
-    params.criterion = 3
+X_train, X_test, y_train, y_test = bench.load_data(params, int_label=True)
 
 if params.split_algorithm == 'hist':
     params.split_algorithm = 0
@@ -61,15 +52,15 @@
 
 # Create our random forest regressor
 regr = RandomForestRegressor(
-    split_criterion=params.criterion,
-    split_algo=params.split_algorithm,
     n_estimators=params.num_trees,
-    max_depth=params.max_depth,
+    split_algo=params.split_algorithm,
     max_features=params.max_features,
     min_samples_split=params.min_samples_split,
+    max_depth=params.max_depth,
     max_leaves=params.max_leaf_nodes,
     min_impurity_decrease=params.min_impurity_decrease,
     bootstrap=params.bootstrap,
+
 )
 
 
@@ -82,7 +73,6 @@ def predict(regr, X):
 
 
 fit_time, _ = bench.measure_function_time(fit, regr, X_train, y_train, params=params)
-
 y_pred = predict(regr, X_train)
 train_rmse = bench.rmse_score(y_pred, y_train)
 

cuml_bench/tsne.py

Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
+import argparse
+import bench
+from cuml.manifold import TSNE
+
+parser = argparse.ArgumentParser(description='cuml tsne')
+
+parser.add_argument('--n-components', type=int, default=2,
+                    help='The dimension of the embedded space.')
+parser.add_argument('--early-exaggeration', type=float, default=12.0,
+                    help='This factor increases the attractive forces between points '
+                         'and allows points to move around more freely, '
+                         'finding their nearest neighbors more easily.')
+parser.add_argument('--learning-rate', type=float, default=200.0,
+                    help='The learning rate for t-SNE is usually in the range [10.0, 1000.0].')
+parser.add_argument('--angle', type=float, default=0.5,
+                    help='Angular size. This is the trade-off between speed and accuracy.')
+parser.add_argument('--min-grad-norm', type=float, default=1e-7,
+                    help='If the gradient norm is below this threshold, '
+                         'the optimization is stopped.')
+parser.add_argument('--random-state', type=int, default=1234)
+params = bench.parse_args(parser)
+
+# Load and convert data
+X, _, _, _ = bench.load_data(params)
+
+# Create the t-SNE estimator
+tsne = TSNE(n_components=params.n_components, early_exaggeration=params.early_exaggeration,
+            learning_rate=params.learning_rate, angle=params.angle,
+            min_grad_norm=params.min_grad_norm, random_state=params.random_state)
+
+fit_time, _ = bench.measure_function_time(tsne.fit, X, params=params)
+# Need to investigate how to compare sklearn and cuml metrics for tsne
+
+bench.print_output(library='cuml', algorithm='tsne',
+                   stages=['training'], params=params,
+                   functions=['tsne.fit'],
+                   times=[fit_time], metric_type=None,
+                   metrics=None, data=[X],
+                   alg_instance=tsne)
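
Since metric_type and metrics are left as None, this script currently reports training time only; the comment about comparing metrics is still open. One candidate quality measure is the final KL divergence, which scikit-learn's TSNE exposes as kl_divergence_ after fitting. A hedged sketch of what a scikit-learn counterpart could look like (illustrative only, not taken from this repository; whether the pinned cuML version exposes a comparable attribute still needs checking):

import argparse

import bench
from sklearn.manifold import TSNE

# Hypothetical scikit-learn counterpart, reusing the same bench helpers.
parser = argparse.ArgumentParser(description='sklearn tsne')
parser.add_argument('--n-components', type=int, default=2)
parser.add_argument('--learning-rate', type=float, default=200.0)
parser.add_argument('--random-state', type=int, default=1234)
params = bench.parse_args(parser)

X, _, _, _ = bench.load_data(params)

tsne = TSNE(n_components=params.n_components,
            learning_rate=params.learning_rate,
            random_state=params.random_state)

fit_time, _ = bench.measure_function_time(tsne.fit, X, params=params)

# scikit-learn's TSNE records the final KL divergence of the embedding, which
# is one candidate for comparing result quality across libraries.
print('kl_divergence_:', tsne.kl_divergence_)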
