Benchmarks tsne rfr #91

Merged: 34 commits, Sep 24, 2021

Commits (changes from all commits)
4d10210
Benchmarks: add Random Forest Regression and TSNE benchmarks. 10M sam…
itearsl Aug 17, 2021
4565974
benchmarks: add cuml benchmarks for Random Forest Regressor and TSNE
itearsl Aug 17, 2021
e871445
benchmarks: add cuml benchmarks for Random Forest Regressor and TSNE
itearsl Sep 9, 2021
59ae6ed
bugfix
itearsl Sep 9, 2021
f72f3a5
bugfix
itearsl Sep 15, 2021
621d986
TSNE and RF: fix cuml RF regressor benchmark and add cuml TSNE benchm…
itearsl Sep 16, 2021
d133005
Merge master, resolve conflicts
itearsl Sep 16, 2021
a8281c0
Bugfix
itearsl Sep 16, 2021
32c2412
add prints in bench.py
itearsl Sep 16, 2021
f9291ab
Add tsne in azure tests
Sep 16, 2021
6fd18d9
bugfix
Sep 16, 2021
9926f3d
fix pep8 and 'float32' bug
itearsl Sep 17, 2021
13aaeac
fix pep8
itearsl Sep 17, 2021
6ec6c52
fix pep8
itearsl Sep 17, 2021
1ea9d66
fix bench errors
itearsl Sep 17, 2021
823ef34
fix bench errors
itearsl Sep 17, 2021
80986e8
test
itearsl Sep 17, 2021
f127efd
test
itearsl Sep 17, 2021
d262328
test
itearsl Sep 17, 2021
fa3401c
fixed bug with JSON parsing
itearsl Sep 17, 2021
e5114af
fix codefactor
itearsl Sep 17, 2021
31c8aad
Fix arguments description
itearsl Sep 20, 2021
bac85b5
Fix batch.py
itearsl Sep 20, 2021
8acb449
fix bench.py
itearsl Sep 20, 2021
836982e
fix pep8
itearsl Sep 20, 2021
4d29feb
final fix cuml bench
itearsl Sep 24, 2021
433c3b9
fix pep8
itearsl Sep 24, 2021
6ee5ca9
fix codefactor
itearsl Sep 24, 2021
75a0076
fix codefactor
itearsl Sep 24, 2021
07a7581
fix codefactor
itearsl Sep 24, 2021
e448b40
fix codefactor
itearsl Sep 24, 2021
5f3066a
fix codefactor
itearsl Sep 24, 2021
7ee5d02
Update cuml_bench/tsne.py
itearsl Sep 24, 2021
b4e1519
fix codefactor
itearsl Sep 24, 2021
4 changes: 4 additions & 0 deletions bench.py
@@ -503,6 +503,10 @@ def gen_basic_dict(library, algorithm, stage, params, data, alg_instance=None,
alg_instance_params = dict(alg_instance.attributes())
else:
alg_instance_params = dict(alg_instance.get_params())
if ('min_samples_split' in alg_instance_params
and 'handle' in alg_instance_params):
alg_instance_params['dtype'] = str(
alg_instance_params['dtype'])
result['algorithm_parameters'].update(alg_instance_params)
if alg_params is not None:
result['algorithm_parameters'].update(alg_params)
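The new branch above stringifies the regressor's dtype attribute because cuml's RandomForestRegressor reports it as a numpy dtype object, which the standard json encoder rejects. A minimal sketch of the failure and the fix (a standalone illustration, not part of this PR):

import json

import numpy as np

params = {'n_estimators': 100, 'dtype': np.dtype('float32')}

try:
    json.dumps(params)
except TypeError as exc:
    # numpy dtype objects are not JSON serializable
    print(f'serialization failed: {exc}')

params['dtype'] = str(params['dtype'])  # 'float32'
print(json.dumps(params))  # now serializes cleanly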
54 changes: 54 additions & 0 deletions configs/cuml_config.json
@@ -136,6 +136,40 @@
"max-leaf-nodes": 131072,
"max-features": 0.2
},
{
"algorithm": "df_regr",
"dtype": "float32",
"dataset": [
{
"source": "npy",
"name": "year_prediction_msd",
"training":
{
"x": "data/year_prediction_msd_x_train.npy",
"y": "data/year_prediction_msd_y_train.npy"
},
"testing":
{
"x": "data/year_prediction_msd_x_test.npy",
"y": "data/year_prediction_msd_y_test.npy"
}
},
{
"source": "npy",
"name": "airline_regression",
"training":
{
"x": "data/airline_regression_x_train.npy",
"y": "data/airline_regression_y_train.npy"
},
"testing":
{
"x": "data/airline_regression_x_test.npy",
"y": "data/airline_regression_y_test.npy"
}
}
]
},
{
"algorithm": "ridge",
"dataset": [
@@ -564,6 +598,26 @@
"alpha": 2.0,
"l1_ratio": 0.5,
"tol": 1e-4
},
{
"algorithm": "tsne",
"dtype": "float32",
"dataset": [
{
"source": "npy",
"name": "mnist",
"training":
{
"x": "data/mnist_x_train.npy",
"y": "data/mnist_y_train.npy"
},
"testing":
{
"x": "data/mnist_x_test.npy",
"y": "data/mnist_y_test.npy"
}
}
]
}
]
}
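Both new cases point at pre-generated .npy files rather than synthetic data. A hypothetical helper like the sketch below (not part of this PR) could verify that every file referenced by a config exists before a run; it assumes the benchmark cases sit under a top-level "cases" key, which is not shown in this diff:

import json
from pathlib import Path


def missing_dataset_files(config_path):
    """Return referenced dataset paths that are not present on disk."""
    with open(config_path) as f:
        config = json.load(f)

    missing = set()
    # Assumes the benchmark cases live under a top-level "cases" key.
    for case in config.get('cases', []):
        for dataset in case.get('dataset', []):
            for split in ('training', 'testing'):
                for path in dataset.get(split, {}).values():
                    if not Path(path).exists():
                        missing.add(path)
    return sorted(missing)


print(missing_dataset_files('configs/cuml_config.json'))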
53 changes: 53 additions & 0 deletions configs/skl_config.json
100755 → 100644
@@ -148,6 +148,40 @@
"max-leaf-nodes": 131072,
"max-features": 0.2
},
{
"algorithm": "df_regr",
"dtype": "float32",
"dataset": [
{
"source": "npy",
"name": "year_prediction_msd",
"training":
{
"x": "data/year_prediction_msd_x_train.npy",
"y": "data/year_prediction_msd_y_train.npy"
},
"testing":
{
"x": "data/year_prediction_msd_x_test.npy",
"y": "data/year_prediction_msd_y_test.npy"
}
},
{
"source": "npy",
"name": "airline_regression",
"training":
{
"x": "data/airline_regression_x_train.npy",
"y": "data/airline_regression_y_train.npy"
},
"testing":
{
"x": "data/airline_regression_x_test.npy",
"y": "data/airline_regression_y_test.npy"
}
}
]
},
{
"algorithm": "ridge",
"dataset": [
@@ -731,6 +765,25 @@
"alpha": 2.0,
"l1_ratio": 0.5,
"tol": 1e-4
},
{
"algorithm": "tsne",
"dataset": [
{
"source": "npy",
"name": "mnist",
"training":
{
"x": "data/mnist_x_train.npy",
"y": "data/mnist_y_train.npy"
},
"testing":
{
"x": "data/mnist_x_test.npy",
"y": "data/mnist_y_test.npy"
}
}
]
}
]
}
19 changes: 18 additions & 1 deletion configs/testing/sklearn.json
100755 → 100644
@@ -244,7 +244,7 @@
"n_features": 10,
"training": {
"n_samples": 1000
}
}
}
],
"include-y": "",
@@ -323,6 +323,23 @@
}
}
]
},
{
"algorithm": "tsne",
"dataset": [
{
"source": "synthetic",
"type": "classification",
"n_classes": 5,
"n_features": 10,
"training": {
"n_samples": 1000
},
"testing": {
"n_samples": 20
}
}
]
}
]
}
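The smoke-test case above describes a synthetic 5-class, 10-feature dataset with 1000 training and 20 testing samples. Roughly equivalent data could be generated with scikit-learn as sketched below (an assumption about what the harness does for "source": "synthetic"; n_informative is raised here only to satisfy make_classification's constraint that n_classes * n_clusters_per_class <= 2**n_informative):

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# 5 classes over 10 features, mirroring the "tsne" testing case above.
X, y = make_classification(n_samples=1020, n_features=10, n_classes=5,
                           n_informative=5, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=20, random_state=0)
print(X_train.shape, X_test.shape)  # (1000, 10) (20, 10)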
24 changes: 7 additions & 17 deletions cuml_bench/df_regr.py
@@ -15,44 +15,35 @@
# ===============================================================================

import argparse

import bench
from cuml.ensemble import RandomForestRegressor

parser = argparse.ArgumentParser(description='cuml random forest '
'regression benchmark')

parser.add_argument('--criterion', type=str, default='mse',
choices=('mse', 'mae'),
help='The function to measure the quality of a split')
parser.add_argument('--split-algorithm', type=str, default='hist',
choices=('hist', 'global_quantile'),
help='The algorithm to determine how '
'nodes are split in the tree')
parser.add_argument('--num-trees', type=int, default=100,
help='Number of trees in the forest')
parser.add_argument('--max-features', type=bench.float_or_int, default=None,
parser.add_argument('--max-features', type=bench.float_or_int, default=1.0,
help='Upper bound on features used at each split')
parser.add_argument('--max-depth', type=int, default=None,
parser.add_argument('--max-depth', type=int, default=16,
help='Upper bound on depth of constructed trees')
parser.add_argument('--min-samples-split', type=bench.float_or_int, default=2,
help='Minimum samples number for node splitting')
parser.add_argument('--max-leaf-nodes', type=int, default=-1,
help='Maximum leaf nodes per tree')
parser.add_argument('--min-impurity-decrease', type=float, default=0.,
parser.add_argument('--min-impurity-decrease', type=float, default=0.0,
help='Needed impurity decrease for node splitting')
parser.add_argument('--no-bootstrap', dest='bootstrap', default=True,
action='store_false', help="Don't control bootstraping")

params = bench.parse_args(parser)

# Load and convert data
X_train, X_test, y_train, y_test = bench.load_data(params)

if params.criterion == 'mse':
params.criterion = 2
else:
params.criterion = 3
X_train, X_test, y_train, y_test = bench.load_data(params, int_label=True)

if params.split_algorithm == 'hist':
params.split_algorithm = 0
@@ -61,15 +52,15 @@

# Create our random forest regressor
regr = RandomForestRegressor(
split_criterion=params.criterion,
split_algo=params.split_algorithm,
n_estimators=params.num_trees,
max_depth=params.max_depth,
split_algo=params.split_algorithm,
max_features=params.max_features,
min_samples_split=params.min_samples_split,
max_depth=params.max_depth,
max_leaves=params.max_leaf_nodes,
min_impurity_decrease=params.min_impurity_decrease,
bootstrap=params.bootstrap,

)


@@ -82,7 +73,6 @@ def predict(regr, X):


fit_time, _ = bench.measure_function_time(fit, regr, X_train, y_train, params=params)

y_pred = predict(regr, X_train)
train_rmse = bench.rmse_score(y_pred, y_train)

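bench.rmse_score is used to evaluate the regressor on both the training and testing splits. A minimal stand-in with the same intent (hypothetical; the repository's implementation may differ):

import numpy as np


def rmse_score(y_pred, y_true):
    """Root-mean-squared error between predictions and ground truth."""
    y_pred = np.asarray(y_pred, dtype=np.float64)
    y_true = np.asarray(y_true, dtype=np.float64)
    return float(np.sqrt(np.mean((y_pred - y_true) ** 2)))


print(rmse_score([2.5, 0.0, 2.0], [3.0, -0.5, 2.0]))  # ~0.408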
39 changes: 39 additions & 0 deletions cuml_bench/tsne.py
@@ -0,0 +1,39 @@
import argparse
import bench
from cuml.manifold import TSNE

parser = argparse.ArgumentParser(description='cuml tsne')

parser.add_argument('--n-components', type=int, default=2,
help='The dimension of the embedded space.')
parser.add_argument('--early-exaggeration', type=float, default=12.0,
help='This factor increases the attractive forces between points '
'and allows points to move around more freely, '
'finding their nearest neighbors more easily.')
parser.add_argument('--learning-rate', type=float, default=200.0,
help='The learning rate for t-SNE is usually in the range [10.0, 1000.0].')
parser.add_argument('--angle', type=float, default=0.5,
help='Angular size. This is the trade-off between speed and accuracy.')
parser.add_argument('--min-grad-norm', type=float, default=1e-7,
help='If the gradient norm is below this threshold,'
'the optimization is stopped.')
parser.add_argument('--random-state', type=int, default=1234)
params = bench.parse_args(parser)

# Load and convert data
X, _, _, _ = bench.load_data(params)

# Create our t-SNE transformer
tsne = TSNE(n_components=params.n_components, early_exaggeration=params.early_exaggeration,
learning_rate=params.learning_rate, angle=params.angle,
min_grad_norm=params.min_grad_norm, random_state=params.random_state)

fit_time, _ = bench.measure_function_time(tsne.fit, X, params=params)
# Need to investigate how to compare sklearn and cuml metrics for tsne

bench.print_output(library='cuml', algorithm='tsne',
stages=['training'], params=params,
functions=['tsne.fit'],
times=[fit_time], metric_type=None,
metrics=None, data=[X],
alg_instance=tsne)
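Regarding the in-code note on comparing sklearn and cuml metrics: scikit-learn's TSNE exposes the final KL divergence of the embedding as kl_divergence_ after fitting, which could serve as a common yardstick; whether the installed cuml version exposes an equivalent attribute would need checking. A minimal sketch on random data (editor illustration, not part of this PR):

import numpy as np
from sklearn.manifold import TSNE

rng = np.random.RandomState(1234)
X = rng.rand(200, 20).astype(np.float32)

tsne = TSNE(n_components=2, learning_rate=200.0, angle=0.5,
            min_grad_norm=1e-7, random_state=1234)
embedding = tsne.fit_transform(X)

# Final KL divergence of the optimized embedding; lower generally means a better fit.
print(embedding.shape, tsne.kl_divergence_)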