Skip to content

Commit 621d986

Browse files
itearsl
authored and
Shvets Kirill
committed
TSNE and RF: fix cuml RF regressor benchmark and add cuml TSNE benchmark (without divergence)
1 parent f72f3a5 commit 621d986

File tree

5 files changed

+21
-36
lines changed

5 files changed

+21
-36
lines changed

bench.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -503,6 +503,8 @@ def gen_basic_dict(library, algorithm, stage, params, data, alg_instance=None,
503503
alg_instance_params = dict(alg_instance.attributes())
504504
else:
505505
alg_instance_params = dict(alg_instance.get_params())
506+
if 'min_samples_leaf' in alg_instance_params:
507+
alg_instance_params['dtype'] = str(alg_instance_params['dtype'])
506508
result['algorithm_parameters'].update(alg_instance_params)
507509
if alg_params is not None:
508510
result['algorithm_parameters'].update(alg_params)

cuml_bench/df_regr.py

Lines changed: 9 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -15,46 +15,36 @@
1515
# ===============================================================================
1616

1717
import argparse
18-
import pandas as pd
1918
import bench
19+
import cuml
2020
from cuml.ensemble import RandomForestRegressor
2121

2222
parser = argparse.ArgumentParser(description='cuml random forest '
2323
'regression benchmark')
2424

25-
parser.add_argument('--criterion', type=str, default='mse',
26-
choices=('mse', 'mae'),
27-
help='The function to measure the quality of a split')
2825
parser.add_argument('--split-algorithm', type=str, default='hist',
2926
choices=('hist', 'global_quantile'),
3027
help='The algorithm to determine how '
3128
'nodes are split in the tree')
3229
parser.add_argument('--num-trees', type=int, default=100,
3330
help='Number of trees in the forest')
34-
parser.add_argument('--max-features', type=bench.float_or_int, default=None,
31+
parser.add_argument('--max-features', type=bench.float_or_int, default=1.0,
3532
help='Upper bound on features used at each split')
36-
parser.add_argument('--max-depth', type=int, default=None,
33+
parser.add_argument('--max-depth', type=int, default=16,
3734
help='Upper bound on depth of constructed trees')
3835
parser.add_argument('--min-samples-split', type=bench.float_or_int, default=2,
3936
help='Minimum samples number for node splitting')
4037
parser.add_argument('--max-leaf-nodes', type=int, default=-1,
4138
help='Maximum leaf nodes per tree')
42-
parser.add_argument('--min-impurity-decrease', type=float, default=0.,
39+
parser.add_argument('--min-impurity-decrease', type=float, default=0.0,
4340
help='Needed impurity decrease for node splitting')
4441
parser.add_argument('--no-bootstrap', dest='bootstrap', default=True,
4542
action='store_false', help="Don't control bootstraping")
4643

4744
params = bench.parse_args(parser)
4845

4946
# Load and convert data
50-
X_train, X_test, y_train, y_test = bench.load_data(params)
51-
y_train = y_test.values.ravel()
52-
y_train = y_test.values.ravel()
53-
54-
if params.criterion == 'mse':
55-
params.criterion = 2
56-
else:
57-
params.criterion = 3
47+
X_train, X_test, y_train, y_test = bench.load_data(params, int_label=True)
5848

5949
if params.split_algorithm == 'hist':
6050
params.split_algorithm = 0
@@ -63,18 +53,19 @@
6353

6454
# Create our random forest regressor
6555
regr = RandomForestRegressor(
66-
split_criterion=params.criterion,
67-
split_algo=params.split_algorithm,
6856
n_estimators=params.num_trees,
69-
max_depth=params.max_depth,
57+
split_algo=params.split_algorithm,
7058
max_features=params.max_features,
7159
min_samples_split=params.min_samples_split,
60+
max_depth=params.max_depth,
7261
max_leaves=params.max_leaf_nodes,
7362
min_impurity_decrease=params.min_impurity_decrease,
7463
bootstrap=params.bootstrap,
64+
7565
)
7666

7767

68+
7869
def fit(regr, X, y):
7970
return regr.fit(X, y)
8071

@@ -84,7 +75,6 @@ def predict(regr, X):
8475

8576

8677
fit_time, _ = bench.measure_function_time(fit, regr, X_train, y_train, params=params)
87-
8878
y_pred = predict(regr, X_train)
8979
train_rmse = bench.rmse_score(y_pred, y_train)
9080

cuml_bench/tsne.py

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import argparse
22
import pandas as pd
3+
import cuml
34
import bench
45
from cuml.manifold import TSNE
56

@@ -16,25 +17,22 @@
1617
parser.add_argument('--min-grad-norm', type=float, default=1e-7,
1718
help='If the gradient norm is below this threshold, the optimization will be stopped.')
1819
parser.add_argument('--random-state', type=int, default=1234)
19-
2020
params = bench.parse_args(parser)
2121

2222
# Load and convert data
23-
X_train, X_test, _, _ = bench.load_data(params)
24-
full_x = pd.concat([X_train, X_test])
23+
X, _, _, _ = bench.load_data(params)
2524

2625
# Create our random forest regressor
2726
tsne = TSNE(n_components=params.n_components, early_exaggeration=params.early_exaggeration,
2827
learning_rate=params.learning_rate, angle=params.angle,
2928
min_grad_norm=params.min_grad_norm, random_state=params.random_state)
3029

31-
fit_time, _ = bench.measure_function_time(tsne.fit, full_x, params=params)
32-
33-
divergence = tsne.kl_divergence_
30+
fit_time, _ = bench.measure_function_time(tsne.fit, X, params=params)
31+
# divergence = tsne.kl_divergence_
3432

3533
bench.print_output(library='cuml', algorithm='tsne',
3634
stages=['training'], params=params,
3735
functions=['tsne.fit'],
38-
times=[fit_time], metric_type='divergence',
39-
metrics=[divergence], data=[full_x],
36+
times=[fit_time], metric_type=None,
37+
metrics=None, data=[X],
4038
alg_instance=tsne)

sklearn_bench/df_regr.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515
# ===============================================================================
1616

1717
import argparse
18-
1918
import bench
2019

2120

sklearn_bench/tsne.py

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -15,25 +15,21 @@
1515
# ===============================================================================
1616

1717
import argparse
18-
1918
import bench
2019
import pandas as pd
2120

2221
def main():
2322
from sklearn.manifold import TSNE
2423

2524
# Load and convert data
26-
X_train, X_test, _, _ = bench.load_data(params)
27-
full_x = pd.concat([X_train, X_test])
25+
X, _, _, _ = bench.load_data(params)
2826

2927
# Create our TSNE model
3028
tsne = TSNE(n_components=params.n_components, early_exaggeration=params.early_exaggeration,
3129
learning_rate=params.learning_rate, angle=params.angle,
3230
min_grad_norm=params.min_grad_norm, random_state=params.random_state)
3331

34-
fit_time, _ = bench.measure_function_time(tsne.fit, full_x, params=params)
35-
36-
divergence = tsne.kl_divergence_
32+
fit_time, _ = bench.measure_function_time(tsne.fit, X, params=params)
3733

3834
bench.print_output(
3935
library='sklearn',
@@ -44,7 +40,7 @@ def main():
4440
times=[fit_time],
4541
metric_type='divergence',
4642
metrics=[divergence],
47-
data=[full_x],
43+
data=[X],
4844
alg_instance=tsne,
4945
)
5046

@@ -64,6 +60,6 @@ def main():
6460
parser.add_argument('--min-grad-norm', type=float, default=1e-7,
6561
help='If the gradient norm is below this threshold, the optimization will be stopped.')
6662
parser.add_argument('--random-state', type=int, default=1234)
67-
63+
6864
params = bench.parse_args(parser)
6965
bench.run_with_context(params, main)

0 commit comments

Comments (0)