Skip to content

Commit f64ae68

Browse files
author
Igor Rukhovich
committed
Merge branch 'mypy-applying' into xgb-nvidia-datasets
2 parents 670c289 + dc0e9c9 commit f64ae68

File tree

11 files changed

+436
-396
lines changed

11 files changed

+436
-396
lines changed

datasets/load_datasets.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,8 +59,8 @@ def try_load_dataset(dataset_name: str, output_directory: Path) -> bool:
5959
if dataset_name in dataset_loaders:
6060
try:
6161
return dataset_loaders[dataset_name](output_directory)
62-
except BaseException:
63-
logging.warning("Internal error loading dataset")
62+
except BaseException as ex:
63+
logging.warning(f"Internal error loading dataset:\n{ex}")
6464
return False
6565
else:
6666
logging.warning(f"There is no script to download the dataset: {dataset_name}. "

sklearn_bench/df_clsf.py

Lines changed: 52 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -20,60 +20,64 @@
2020
import numpy as np
2121
from sklearn.metrics import accuracy_score
2222

23-
parser = argparse.ArgumentParser(description='scikit-learn random forest '
24-
'classification benchmark')
2523

26-
parser.add_argument('--criterion', type=str, default='gini',
27-
choices=('gini', 'entropy'),
28-
help='The function to measure the quality of a split')
29-
parser.add_argument('--num-trees', type=int, default=100,
30-
help='Number of trees in the forest')
31-
parser.add_argument('--max-features', type=bench.float_or_int, default=None,
32-
help='Upper bound on features used at each split')
33-
parser.add_argument('--max-depth', type=int, default=None,
34-
help='Upper bound on depth of constructed trees')
35-
parser.add_argument('--min-samples-split', type=bench.float_or_int, default=2,
36-
help='Minimum samples number for node splitting')
37-
parser.add_argument('--max-leaf-nodes', type=int, default=None,
38-
help='Maximum leaf nodes per tree')
39-
parser.add_argument('--min-impurity-decrease', type=float, default=0.,
40-
help='Needed impurity decrease for node splitting')
41-
parser.add_argument('--no-bootstrap', dest='bootstrap', default=True,
42-
action='store_false', help="Don't control bootstrapping")
24+
def main():
25+
from sklearn.ensemble import RandomForestClassifier
4326

44-
params = bench.parse_args(parser)
27+
# Load and convert data
28+
X_train, X_test, y_train, y_test = bench.load_data(params)
4529

46-
if not params.no_intel_optimized:
47-
from sklearn.ensemble import RandomForestClassifier
30+
# Create our random forest classifier
31+
clf = RandomForestClassifier(criterion=params.criterion,
32+
n_estimators=params.num_trees,
33+
max_depth=params.max_depth,
34+
max_features=params.max_features,
35+
min_samples_split=params.min_samples_split,
36+
max_leaf_nodes=params.max_leaf_nodes,
37+
min_impurity_decrease=params.min_impurity_decrease,
38+
bootstrap=params.bootstrap,
39+
random_state=params.seed,
40+
n_jobs=params.n_jobs)
41+
42+
params.n_classes = len(np.unique(y_train))
43+
44+
fit_time, _ = bench.measure_function_time(clf.fit, X_train, y_train, params=params)
45+
y_pred = clf.predict(X_train)
46+
train_acc = 100 * accuracy_score(y_pred, y_train)
4847

49-
# Load and convert data
50-
X_train, X_test, y_train, y_test = bench.load_data(params)
48+
predict_time, y_pred = bench.measure_function_time(
49+
clf.predict, X_test, params=params)
50+
test_acc = 100 * accuracy_score(y_pred, y_test)
5151

52-
# Create our random forest classifier
53-
clf = RandomForestClassifier(criterion=params.criterion,
54-
n_estimators=params.num_trees,
55-
max_depth=params.max_depth,
56-
max_features=params.max_features,
57-
min_samples_split=params.min_samples_split,
58-
max_leaf_nodes=params.max_leaf_nodes,
59-
min_impurity_decrease=params.min_impurity_decrease,
60-
bootstrap=params.bootstrap,
61-
random_state=params.seed,
62-
n_jobs=params.n_jobs)
52+
bench.print_output(library='sklearn', algorithm='decision_forest_classification',
53+
stages=['training', 'prediction'], params=params,
54+
functions=['df_clsf.fit', 'df_clsf.predict'],
55+
times=[fit_time, predict_time], accuracy_type='accuracy[%]',
56+
accuracies=[train_acc, test_acc], data=[X_train, X_test],
57+
alg_instance=clf)
6358

64-
params.n_classes = len(np.unique(y_train))
6559

66-
fit_time, _ = bench.measure_function_time(clf.fit, X_train, y_train, params=params)
67-
y_pred = clf.predict(X_train)
68-
train_acc = 100 * accuracy_score(y_pred, y_train)
60+
if __name__ == "__main__":
61+
parser = argparse.ArgumentParser(description='scikit-learn random forest '
62+
'classification benchmark')
6963

70-
predict_time, y_pred = bench.measure_function_time(
71-
clf.predict, X_test, params=params)
72-
test_acc = 100 * accuracy_score(y_pred, y_test)
64+
parser.add_argument('--criterion', type=str, default='gini',
65+
choices=('gini', 'entropy'),
66+
help='The function to measure the quality of a split')
67+
parser.add_argument('--num-trees', type=int, default=100,
68+
help='Number of trees in the forest')
69+
parser.add_argument('--max-features', type=bench.float_or_int, default=None,
70+
help='Upper bound on features used at each split')
71+
parser.add_argument('--max-depth', type=int, default=None,
72+
help='Upper bound on depth of constructed trees')
73+
parser.add_argument('--min-samples-split', type=bench.float_or_int, default=2,
74+
help='Minimum samples number for node splitting')
75+
parser.add_argument('--max-leaf-nodes', type=int, default=None,
76+
help='Maximum leaf nodes per tree')
77+
parser.add_argument('--min-impurity-decrease', type=float, default=0.,
78+
help='Needed impurity decrease for node splitting')
79+
parser.add_argument('--no-bootstrap', dest='bootstrap', default=True,
80+
action='store_false', help="Don't control bootstrapping")
7381

74-
bench.print_output(library='sklearn', algorithm='decision_forest_classification',
75-
stages=['training', 'prediction'], params=params,
76-
functions=['df_clsf.fit', 'df_clsf.predict'],
77-
times=[fit_time, predict_time], accuracy_type='accuracy[%]',
78-
accuracies=[train_acc, test_acc], data=[X_train, X_test],
79-
alg_instance=clf)
82+
params = bench.parse_args(parser)
83+
bench.run_with_context(params, main)

sklearn_bench/df_regr.py

Lines changed: 52 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -18,60 +18,64 @@
1818

1919
import bench
2020

21-
parser = argparse.ArgumentParser(description='scikit-learn random forest '
22-
'regression benchmark')
2321

24-
parser.add_argument('--criterion', type=str, default='mse',
25-
choices=('mse', 'mae'),
26-
help='The function to measure the quality of a split')
27-
parser.add_argument('--num-trees', type=int, default=100,
28-
help='Number of trees in the forest')
29-
parser.add_argument('--max-features', type=bench.float_or_int, default=None,
30-
help='Upper bound on features used at each split')
31-
parser.add_argument('--max-depth', type=int, default=None,
32-
help='Upper bound on depth of constructed trees')
33-
parser.add_argument('--min-samples-split', type=bench.float_or_int, default=2,
34-
help='Minimum samples number for node splitting')
35-
parser.add_argument('--max-leaf-nodes', type=int, default=None,
36-
help='Grow trees with max_leaf_nodes in best-first fashion'
37-
'if it is not None')
38-
parser.add_argument('--min-impurity-decrease', type=float, default=0.,
39-
help='Needed impurity decrease for node splitting')
40-
parser.add_argument('--no-bootstrap', dest='bootstrap', default=True,
41-
action='store_false', help="Don't control bootstrapping")
22+
def main():
23+
from sklearn.ensemble import RandomForestRegressor
4224

43-
params = bench.parse_args(parser)
25+
# Load and convert data
26+
X_train, X_test, y_train, y_test = bench.load_data(params)
4427

45-
if not params.no_intel_optimized:
46-
from sklearn.ensemble import RandomForestRegressor
28+
# Create our random forest regressor
29+
regr = RandomForestRegressor(criterion=params.criterion,
30+
n_estimators=params.num_trees,
31+
max_depth=params.max_depth,
32+
max_features=params.max_features,
33+
min_samples_split=params.min_samples_split,
34+
max_leaf_nodes=params.max_leaf_nodes,
35+
min_impurity_decrease=params.min_impurity_decrease,
36+
bootstrap=params.bootstrap,
37+
random_state=params.seed,
38+
n_jobs=params.n_jobs)
39+
40+
fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train, params=params)
41+
42+
y_pred = regr.predict(X_train)
43+
train_rmse = bench.rmse_score(y_pred, y_train)
4744

48-
# Load and convert data
49-
X_train, X_test, y_train, y_test = bench.load_data(params)
45+
predict_time, y_pred = bench.measure_function_time(
46+
regr.predict, X_test, params=params)
47+
test_rmse = bench.rmse_score(y_pred, y_test)
5048

51-
# Create our random forest regressor
52-
regr = RandomForestRegressor(criterion=params.criterion,
53-
n_estimators=params.num_trees,
54-
max_depth=params.max_depth,
55-
max_features=params.max_features,
56-
min_samples_split=params.min_samples_split,
57-
max_leaf_nodes=params.max_leaf_nodes,
58-
min_impurity_decrease=params.min_impurity_decrease,
59-
bootstrap=params.bootstrap,
60-
random_state=params.seed,
61-
n_jobs=params.n_jobs)
49+
bench.print_output(library='sklearn', algorithm='decision_forest_regression',
50+
stages=['training', 'prediction'], params=params,
51+
functions=['df_regr.fit', 'df_regr.predict'],
52+
times=[fit_time, predict_time], accuracy_type='rmse',
53+
accuracies=[train_rmse, test_rmse], data=[X_train, X_test],
54+
alg_instance=regr)
6255

63-
fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train, params=params)
6456

65-
y_pred = regr.predict(X_train)
66-
train_rmse = bench.rmse_score(y_pred, y_train)
57+
if __name__ == "__main__":
58+
parser = argparse.ArgumentParser(description='scikit-learn random forest '
59+
'regression benchmark')
6760

68-
predict_time, y_pred = bench.measure_function_time(
69-
regr.predict, X_test, params=params)
70-
test_rmse = bench.rmse_score(y_pred, y_test)
61+
parser.add_argument('--criterion', type=str, default='mse',
62+
choices=('mse', 'mae'),
63+
help='The function to measure the quality of a split')
64+
parser.add_argument('--num-trees', type=int, default=100,
65+
help='Number of trees in the forest')
66+
parser.add_argument('--max-features', type=bench.float_or_int, default=None,
67+
help='Upper bound on features used at each split')
68+
parser.add_argument('--max-depth', type=int, default=None,
69+
help='Upper bound on depth of constructed trees')
70+
parser.add_argument('--min-samples-split', type=bench.float_or_int, default=2,
71+
help='Minimum samples number for node splitting')
72+
parser.add_argument('--max-leaf-nodes', type=int, default=None,
73+
help='Grow trees with max_leaf_nodes in best-first fashion'
74+
'if it is not None')
75+
parser.add_argument('--min-impurity-decrease', type=float, default=0.,
76+
help='Needed impurity decrease for node splitting')
77+
parser.add_argument('--no-bootstrap', dest='bootstrap', default=True,
78+
action='store_false', help="Don't control bootstrapping")
7179

72-
bench.print_output(library='sklearn', algorithm='decision_forest_regression',
73-
stages=['training', 'prediction'], params=params,
74-
functions=['df_regr.fit', 'df_regr.predict'],
75-
times=[fit_time, predict_time], accuracy_type='rmse',
76-
accuracies=[train_rmse, test_rmse], data=[X_train, X_test],
77-
alg_instance=regr)
80+
params = bench.parse_args(parser)
81+
bench.run_with_context(params, main)

sklearn_bench/distances.py

Lines changed: 20 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -18,23 +18,27 @@
1818

1919
import bench
2020

21-
parser = argparse.ArgumentParser(description='scikit-learn pairwise distances '
22-
'benchmark')
23-
parser.add_argument('--metric', default='cosine',
24-
choices=['cosine', 'correlation'],
25-
help='Metric to test for pairwise distances')
26-
params = bench.parse_args(parser)
27-
28-
if not params.no_intel_optimized:
21+
22+
def main():
2923
from sklearn.metrics.pairwise import pairwise_distances
3024

31-
# Load data
32-
X, _, _, _ = bench.load_data(params, generated_data=['X_train'], add_dtype=True)
25+
# Load data
26+
X, _, _, _ = bench.load_data(params, generated_data=['X_train'], add_dtype=True)
27+
28+
time, _ = bench.measure_function_time(pairwise_distances, X, metric=params.metric,
29+
n_jobs=params.n_jobs, params=params)
30+
31+
bench.print_output(library='sklearn', algorithm='distances', stages=['computation'],
32+
params=params, functions=[params.metric.capitalize()], times=[time],
33+
accuracy_type=None, accuracies=[None], data=[X],
34+
alg_params={'metric': params.metric})
3335

34-
time, _ = bench.measure_function_time(pairwise_distances, X, metric=params.metric,
35-
n_jobs=params.n_jobs, params=params)
3636

37-
bench.print_output(library='sklearn', algorithm='distances', stages=['computation'],
38-
params=params, functions=[params.metric.capitalize()], times=[time],
39-
accuracy_type=None, accuracies=[None], data=[X],
40-
alg_params={'metric': params.metric})
37+
if __name__ == "__main__":
38+
parser = argparse.ArgumentParser(description='scikit-learn pairwise distances '
39+
'benchmark')
40+
parser.add_argument('--metric', default='cosine',
41+
choices=['cosine', 'correlation'],
42+
help='Metric to test for pairwise distances')
43+
params = bench.parse_args(parser)
44+
bench.run_with_context(params, main)

sklearn_bench/elasticnet.py

Lines changed: 44 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -18,45 +18,49 @@
1818

1919
import bench
2020

21-
parser = argparse.ArgumentParser(description='scikit-learn elastic-net regression '
22-
'benchmark')
23-
parser.add_argument('--no-fit-intercept', dest='fit_intercept', default=False,
24-
action='store_false',
25-
help="Don't fit intercept (assume data already centered)")
26-
parser.add_argument('--alpha', dest='alpha', type=float, default=1.0,
27-
help='Regularization parameter')
28-
parser.add_argument('--maxiter', type=int, default=1000,
29-
help='Maximum iterations for the iterative solver')
30-
parser.add_argument('--l1_ratio', dest='l1_ratio', type=float, default=0.5,
31-
help='Regularization parameter')
32-
parser.add_argument('--tol', type=float, default=0.0,
33-
help='Tolerance for solver.')
34-
params = bench.parse_args(parser)
35-
36-
if not params.no_intel_optimized:
21+
22+
def main():
3723
from sklearn.linear_model import ElasticNet
3824

39-
# Load data
40-
X_train, X_test, y_train, y_test = bench.load_data(params)
41-
42-
# Create our regression object
43-
regr = ElasticNet(fit_intercept=params.fit_intercept, l1_ratio=params.l1_ratio,
44-
alpha=params.alpha, tol=params.tol,
45-
max_iter=params.maxiter, copy_X=False)
46-
# Time fit
47-
fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train, params=params)
48-
49-
# Time predict
50-
predict_time, pred_train = bench.measure_function_time(regr.predict,
51-
X_train, params=params)
52-
53-
train_rmse = bench.rmse_score(pred_train, y_train)
54-
pred_test = regr.predict(X_test)
55-
test_rmse = bench.rmse_score(pred_test, y_test)
56-
57-
bench.print_output(library='sklearn', algorithm='elastic-net',
58-
stages=['training', 'prediction'], params=params,
59-
functions=['ElasticNet.fit', 'ElasticNet.predict'],
60-
times=[fit_time, predict_time], accuracy_type='rmse',
61-
accuracies=[train_rmse, test_rmse], data=[X_train, X_train],
62-
alg_instance=regr)
25+
# Load data
26+
X_train, X_test, y_train, y_test = bench.load_data(params)
27+
28+
# Create our regression object
29+
regr = ElasticNet(fit_intercept=params.fit_intercept, l1_ratio=params.l1_ratio,
30+
alpha=params.alpha, tol=params.tol,
31+
max_iter=params.maxiter, copy_X=False)
32+
# Time fit
33+
fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train, params=params)
34+
35+
# Time predict
36+
predict_time, pred_train = bench.measure_function_time(regr.predict,
37+
X_train, params=params)
38+
39+
train_rmse = bench.rmse_score(pred_train, y_train)
40+
pred_test = regr.predict(X_test)
41+
test_rmse = bench.rmse_score(pred_test, y_test)
42+
43+
bench.print_output(library='sklearn', algorithm='elastic-net',
44+
stages=['training', 'prediction'], params=params,
45+
functions=['ElasticNet.fit', 'ElasticNet.predict'],
46+
times=[fit_time, predict_time], accuracy_type='rmse',
47+
accuracies=[train_rmse, test_rmse], data=[X_train, X_train],
48+
alg_instance=regr)
49+
50+
51+
if __name__ == "__main__":
52+
parser = argparse.ArgumentParser(description='scikit-learn elastic-net regression '
53+
'benchmark')
54+
parser.add_argument('--no-fit-intercept', dest='fit_intercept', default=False,
55+
action='store_false',
56+
help="Don't fit intercept (assume data already centered)")
57+
parser.add_argument('--alpha', dest='alpha', type=float, default=1.0,
58+
help='Regularization parameter')
59+
parser.add_argument('--maxiter', type=int, default=1000,
60+
help='Maximum iterations for the iterative solver')
61+
parser.add_argument('--l1_ratio', dest='l1_ratio', type=float, default=0.5,
62+
help='Regularization parameter')
63+
parser.add_argument('--tol', type=float, default=0.0,
64+
help='Tolerance for solver.')
65+
params = bench.parse_args(parser)
66+
bench.run_with_context(params, main)

0 commit comments

Comments
 (0)