From c9f616593fec5be29d03c759eb2936923fd2735f Mon Sep 17 00:00:00 2001 From: OnlyDeniko Date: Tue, 20 Jul 2021 15:52:39 +0300 Subject: [PATCH 01/27] minor fixes --- configs/blogs/skl_2021_3.json | 2 +- sklearn_bench/kmeans.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/configs/blogs/skl_2021_3.json b/configs/blogs/skl_2021_3.json index 30ced64e4..c3e2f409b 100644 --- a/configs/blogs/skl_2021_3.json +++ b/configs/blogs/skl_2021_3.json @@ -307,7 +307,7 @@ } ], "nu": [0.25], - "kernel": ["sigmoid"] + "kernel": ["poly"] }, { "algorithm": "svr", diff --git a/sklearn_bench/kmeans.py b/sklearn_bench/kmeans.py index 4df5ba03f..a7da397b0 100644 --- a/sklearn_bench/kmeans.py +++ b/sklearn_bench/kmeans.py @@ -48,7 +48,8 @@ def main(): def fit_kmeans(X, X_init): alg = KMeans(n_clusters=params.n_clusters, tol=params.tol, - max_iter=params.maxiter, init=X_init, n_init=1) + max_iter=params.maxiter, init=X_init, n_init=params.n_init, + algorithm=params.algorithm) alg.fit(X) return alg @@ -83,5 +84,9 @@ def fit_kmeans(X, X_init): parser.add_argument('--maxiter', type=int, default=100, help='Maximum number of iterations') parser.add_argument('--n-clusters', type=int, help='Number of clusters') + parser.add_argument('--algorithm', type=str, default='full', + help='K-means algorithm to use') + parser.add_argument('--n_init', type=int, default=10, + help='Number of time the k-means algorithm will be run with different centroid seeds') params = bench.parse_args(parser) bench.run_with_context(params, main) From 7c33dcebfd10a98e318d985e5b309f453e320101 Mon Sep 17 00:00:00 2001 From: OnlyDeniko Date: Tue, 20 Jul 2021 15:56:22 +0300 Subject: [PATCH 02/27] pep8 --- sklearn_bench/kmeans.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn_bench/kmeans.py b/sklearn_bench/kmeans.py index a7da397b0..076af45e2 100644 --- a/sklearn_bench/kmeans.py +++ b/sklearn_bench/kmeans.py @@ -87,6 +87,7 @@ def fit_kmeans(X, X_init): parser.add_argument('--algorithm', type=str, default='full', help='K-means algorithm to use') parser.add_argument('--n_init', type=int, default=10, - help='Number of time the k-means algorithm will be run with different centroid seeds') + help='Number of time the k-means algorithm ' + 'will be run with different centroid seeds') params = bench.parse_args(parser) bench.run_with_context(params, main) From 46bf47841ecad96ace1da719636872739a7a507f Mon Sep 17 00:00:00 2001 From: OnlyDeniko Date: Tue, 20 Jul 2021 16:03:11 +0300 Subject: [PATCH 03/27] random state --- sklearn_bench/kmeans.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn_bench/kmeans.py b/sklearn_bench/kmeans.py index 076af45e2..6d3afa0e3 100644 --- a/sklearn_bench/kmeans.py +++ b/sklearn_bench/kmeans.py @@ -49,7 +49,7 @@ def main(): def fit_kmeans(X, X_init): alg = KMeans(n_clusters=params.n_clusters, tol=params.tol, max_iter=params.maxiter, init=X_init, n_init=params.n_init, - algorithm=params.algorithm) + algorithm=params.algorithm, random_state=params.random_state) alg.fit(X) return alg @@ -89,5 +89,7 @@ def fit_kmeans(X, X_init): parser.add_argument('--n_init', type=int, default=10, help='Number of time the k-means algorithm ' 'will be run with different centroid seeds') + parser.add_argument('--random_state', type=int, default=777, + help='Random state') params = bench.parse_args(parser) bench.run_with_context(params, main) From e739ebafed97c98a0d751a04659f1aae2bfe4c72 Mon Sep 17 00:00:00 2001 From: "denis.kulandin" Date: Thu, 29 Jul 2021 11:01:46 +0300 Subject: [PATCH 04/27] size of datasets --- datasets/loader_classification.py | 41 ++++++++++++++++++++----------- datasets/loader_multiclass.py | 24 +++++++++++------- datasets/loader_regression.py | 14 ++++++----- 3 files changed, 49 insertions(+), 30 deletions(-) diff --git a/datasets/loader_classification.py b/datasets/loader_classification.py index 5c8a91121..e4f0417b9 100644 --- a/datasets/loader_classification.py +++ b/datasets/loader_classification.py @@ -68,9 +68,11 @@ def airline(dataset_dir: Path) -> bool: Airline dataset http://kt.ijs.si/elena_ikonomovska/data.html - TaskType:binclass - NumberOfFeatures:13 - NumberOfInstances:115M + Classification task. n_classes = 2. + airline X train dataset (92055213, 13) + airline y train dataset (92055213, 1) + airline X test dataset (23013804, 13) + airline y test dataset (23013804, 1) """ dataset_name = 'airline' os.makedirs(dataset_dir, exist_ok=True) @@ -126,9 +128,12 @@ def airline(dataset_dir: Path) -> bool: def airline_ohe(dataset_dir: Path) -> bool: """ Dataset from szilard benchmarks: https://github.com/szilard/GBM-perf - TaskType:binclass - NumberOfFeatures:700 - NumberOfInstances:10100000 + + Classification task. n_classes = 2. + airline-ohe X train dataset (1000000, 692) + airline-ohe y train dataset (1000000, 1) + airline-ohe X test dataset (100000, 692) + airline-ohe y test dataset (100000, 1) """ dataset_name = 'airline-ohe' os.makedirs(dataset_dir, exist_ok=True) @@ -289,9 +294,11 @@ def epsilon(dataset_dir: Path) -> bool: Epsilon dataset https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html - TaskType:binclass - NumberOfFeatures:2000 - NumberOfInstances:500K + Classification task. n_classes = 2. + epsilon X train dataset (400000, 2000) + epsilon y train dataset (400000, 1) + epsilon X test dataset (100000, 2000) + epsilon y test dataset (100000, 1) """ dataset_name = 'epsilon' os.makedirs(dataset_dir, exist_ok=True) @@ -444,9 +451,11 @@ def higgs(dataset_dir: Path) -> bool: Higgs dataset from UCI machine learning repository https://archive.ics.uci.edu/ml/datasets/HIGGS - TaskType:binclass - NumberOfFeatures:28 - NumberOfInstances:11M + Classification task. n_classes = 2. + higgs X train dataset (8799999, 28) + higgs y train dataset (8799999, 1) + higgs X test dataset (2200000, 28) + higgs y test dataset (2200000, 1) """ dataset_name = 'higgs' os.makedirs(dataset_dir, exist_ok=True) @@ -479,9 +488,11 @@ def higgs_one_m(dataset_dir: Path) -> bool: Only first 1.5M samples is taken - TaskType:binclass - NumberOfFeatures:28 - NumberOfInstances:1.5M + Classification task. n_classes = 2. + higgs1m X train dataset (1000000, 28) + higgs1m y train dataset (1000000, 1) + higgs1m X test dataset (500000, 28) + higgs1m y test dataset (500000, 1) """ dataset_name = 'higgs1m' os.makedirs(dataset_dir, exist_ok=True) diff --git a/datasets/loader_multiclass.py b/datasets/loader_multiclass.py index 0c2013a1f..cd58b21b0 100644 --- a/datasets/loader_multiclass.py +++ b/datasets/loader_multiclass.py @@ -99,9 +99,11 @@ def covtype(dataset_dir: Path) -> bool: https://archive.ics.uci.edu/ml/datasets/covertype y contains 7 unique class labels from 1 to 7 inclusive. - TaskType:multiclass - NumberOfFeatures:54 - NumberOfInstances:581012 + Classification task. n_classes = 7. + covtype X train dataset (464809, 54) + covtype y train dataset (464809, 1) + covtype X test dataset (116203, 54) + covtype y test dataset (116203, 1) """ dataset_name = 'covtype' os.makedirs(dataset_dir, exist_ok=True) @@ -125,9 +127,11 @@ def letters(dataset_dir: Path) -> bool: """ http://archive.ics.uci.edu/ml/datasets/Letter+Recognition - TaskType:multiclass - NumberOfFeatures:16 - NumberOfInstances:20.000 + Classification task. n_classes = 26. + letters X train dataset (16000, 16) + letters y train dataset (16000, 1) + letters X test dataset (4000, 16) + letters y test dataset (4000, 1) """ dataset_name = 'letters' os.makedirs(dataset_dir, exist_ok=True) @@ -204,9 +208,11 @@ def msrank(dataset_dir: Path) -> bool: """ Dataset from szilard benchmarks: https://github.com/szilard/GBM-perf - TaskType:multiclass - NumberOfFeatures:137 - NumberOfInstances:1.2M + Multiclass classification task + msrank X train dataset (958671, 137) + msrank y train dataset (958671, 1) + msrank X test dataset (241521, 137) + msrank y test dataset (241521, 1) """ dataset_name = 'msrank' os.makedirs(dataset_dir, exist_ok=True) diff --git a/datasets/loader_regression.py b/datasets/loader_regression.py index 2f330f799..4d90da2c6 100644 --- a/datasets/loader_regression.py +++ b/datasets/loader_regression.py @@ -32,9 +32,10 @@ def abalone(dataset_dir: Path) -> bool: """ https://archive.ics.uci.edu/ml/machine-learning-databases/abalone - TaskType:regression - NumberOfFeatures:8 - NumberOfInstances:4177 + abalone x train dataset (3341, 8) + abalone y train dataset (3341, 1) + abalone x test dataset (836, 8) + abalone y train dataset (836, 1) """ dataset_name = 'abalone' os.makedirs(dataset_dir, exist_ok=True) @@ -196,9 +197,10 @@ def year_prediction_msd(dataset_dir: Path) -> bool: YearPredictionMSD dataset from UCI repository https://archive.ics.uci.edu/ml/datasets/yearpredictionmsd - TaskType:regression - NumberOfFeatures:90 - NumberOfInstances:515345 + year_prediction_msd x train dataset (463715, 11) + year_prediction_msd y train dataset (463715, 1) + year_prediction_msd x test dataset (51630, 11) + year_prediction_msd y train dataset (51630, 1) """ dataset_name = 'year_prediction_msd' os.makedirs(dataset_dir, exist_ok=True) From 957465f4b3f2f3d7c705437e70d40bac3e257c47 Mon Sep 17 00:00:00 2001 From: OnlyDeniko Date: Thu, 29 Jul 2021 13:23:06 +0300 Subject: [PATCH 05/27] apply comments --- datasets/loader_classification.py | 2 +- datasets/loader_multiclass.py | 4 ++-- datasets/loader_regression.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/datasets/loader_classification.py b/datasets/loader_classification.py index e4f0417b9..5a5d9df74 100644 --- a/datasets/loader_classification.py +++ b/datasets/loader_classification.py @@ -128,7 +128,7 @@ def airline(dataset_dir: Path) -> bool: def airline_ohe(dataset_dir: Path) -> bool: """ Dataset from szilard benchmarks: https://github.com/szilard/GBM-perf - + Classification task. n_classes = 2. airline-ohe X train dataset (1000000, 692) airline-ohe y train dataset (1000000, 1) diff --git a/datasets/loader_multiclass.py b/datasets/loader_multiclass.py index cd58b21b0..34033a714 100644 --- a/datasets/loader_multiclass.py +++ b/datasets/loader_multiclass.py @@ -208,7 +208,7 @@ def msrank(dataset_dir: Path) -> bool: """ Dataset from szilard benchmarks: https://github.com/szilard/GBM-perf - Multiclass classification task + Classification task. n_classes = 5. msrank X train dataset (958671, 137) msrank y train dataset (958671, 1) msrank X test dataset (241521, 137) @@ -270,7 +270,7 @@ def sensit(dataset_dir: Path) -> bool: Author: M. Duarte, Y. H. Hu Source: [original](http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets) - Multiclass classification task + Classification task. n_classes = 3. sensit X train dataset (78822, 100) sensit y train dataset (78822, 1) sensit X test dataset (19706, 100) diff --git a/datasets/loader_regression.py b/datasets/loader_regression.py index 4d90da2c6..3a5553e61 100644 --- a/datasets/loader_regression.py +++ b/datasets/loader_regression.py @@ -197,9 +197,9 @@ def year_prediction_msd(dataset_dir: Path) -> bool: YearPredictionMSD dataset from UCI repository https://archive.ics.uci.edu/ml/datasets/yearpredictionmsd - year_prediction_msd x train dataset (463715, 11) + year_prediction_msd x train dataset (463715, 90) year_prediction_msd y train dataset (463715, 1) - year_prediction_msd x test dataset (51630, 11) + year_prediction_msd x test dataset (51630, 90) year_prediction_msd y train dataset (51630, 1) """ dataset_name = 'year_prediction_msd' From bc7e96480a5b267df8e1aca32b28b844fb349444 Mon Sep 17 00:00:00 2001 From: OnlyDeniko Date: Fri, 30 Jul 2021 10:27:26 +0300 Subject: [PATCH 06/27] dbscan eps fix --- sklearn_bench/dbscan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn_bench/dbscan.py b/sklearn_bench/dbscan.py index 6765b2970..bf7f9da95 100644 --- a/sklearn_bench/dbscan.py +++ b/sklearn_bench/dbscan.py @@ -50,7 +50,7 @@ def main(): if __name__ == "__main__": parser = argparse.ArgumentParser(description='scikit-learn DBSCAN benchmark') - parser.add_argument('-e', '--eps', '--epsilon', type=float, default=10., + parser.add_argument('-e', '--eps', '--epsilon', type=float, default=0.5, help='Radius of neighborhood of a point') parser.add_argument('-m', '--min-samples', default=5, type=int, help='The minimum number of samples required in a ' From a37921cdae47cc4228493deb42ed6c23340dec8e Mon Sep 17 00:00:00 2001 From: OnlyDeniko Date: Mon, 2 Aug 2021 08:46:59 +0300 Subject: [PATCH 07/27] extend output result --- bench.py | 42 ++++++++++++++++++++++++++---- sklearn_bench/dbscan.py | 5 +++- sklearn_bench/df_clsf.py | 31 +++++++++++++++------- sklearn_bench/df_regr.py | 24 +++++++++++------ sklearn_bench/elasticnet.py | 28 ++++++++++++-------- sklearn_bench/kmeans.py | 21 +++++++++------ sklearn_bench/knn_clsf.py | 51 ++++++++++++++++++++++++------------- sklearn_bench/lasso.py | 28 ++++++++++++-------- sklearn_bench/linear.py | 25 ++++++++++-------- sklearn_bench/log_reg.py | 31 +++++++++++++++------- sklearn_bench/nusvc.py | 39 +++++++++++++++------------- sklearn_bench/nusvr.py | 23 ++++++++++++----- sklearn_bench/pca.py | 18 ++++++++----- sklearn_bench/ridge.py | 26 ++++++++++++------- sklearn_bench/svm.py | 40 ++++++++++++++++------------- sklearn_bench/svr.py | 22 +++++++++++----- 16 files changed, 304 insertions(+), 150 deletions(-) diff --git a/bench.py b/bench.py index cd26c166e..199bd52a5 100644 --- a/bench.py +++ b/bench.py @@ -339,19 +339,46 @@ def columnwise_score(y, yp, score_func): def accuracy_score(y, yp): - return columnwise_score(y, yp, lambda y1, y2: np.mean(y1 == y2)) + from sklearn.metrics import accuracy_score as sklearn_accuracy + y = convert_to_numpy(y) + yp = convert_to_numpy(yp) + return sklearn_accuracy(y, yp) def log_loss(y, yp): from sklearn.metrics import log_loss as sklearn_log_loss y = convert_to_numpy(y) yp = convert_to_numpy(yp) - return sklearn_log_loss(y, yp) + try: + res = sklearn_log_loss(y, yp) + except Exception: + res = -1 + return res + + +def roc_auc_score(y, yp, multi_class='ovr'): + from sklearn.metrics import roc_auc_score as sklearn_roc_auc + y = convert_to_numpy(y) + yp = convert_to_numpy(yp) + try: + res = sklearn_roc_auc(y, yp, multi_class=multi_class) + except: + res = -1 + return res def rmse_score(y, yp): - return columnwise_score( - y, yp, lambda y1, y2: float(np.sqrt(np.mean((y1 - y2)**2)))) + from sklearn.metrics import mean_squared_error as sklearn_mse + y = convert_to_numpy(y) + yp = convert_to_numpy(yp) + return sklearn_mse(y, yp) + + +def r2_score(y, yp): + from sklearn.metrics import r2_score as sklearn_r2_score + y = convert_to_numpy(y) + yp = convert_to_numpy(yp) + return sklearn_r2_score(y, yp) def convert_data(data, dtype, data_order, data_format): @@ -497,7 +524,12 @@ def print_output(library, algorithm, stages, params, functions, data[i], alg_instance, alg_params) result.update({'time[s]': times[i]}) if accuracy_type is not None: - result.update({f'{accuracy_type}': accuracies[i]}) + if isinstance(accuracy_type, str): + result.update({f'{accuracy_type}': accuracies[i]}) + elif isinstance(accuracy_type, list): + for j in range(len(accuracy_type)): + if accuracies[j][i] is not None: + result.update({f'{accuracy_type[j]}': accuracies[j][i]}) if hasattr(params, 'n_classes'): result['input_data'].update({'classes': params.n_classes}) if hasattr(params, 'n_clusters'): diff --git a/sklearn_bench/dbscan.py b/sklearn_bench/dbscan.py index bf7f9da95..e8191c0db 100644 --- a/sklearn_bench/dbscan.py +++ b/sklearn_bench/dbscan.py @@ -40,7 +40,10 @@ def main(): labels = dbscan.labels_ params.n_clusters = len(set(labels)) - (1 if -1 in labels else 0) - acc = davies_bouldin_score(X, labels) + try: + acc = davies_bouldin_score(X, labels) + except: + acc = -1 bench.print_output(library='sklearn', algorithm='dbscan', stages=['training'], params=params, functions=['DBSCAN'], times=[time], diff --git a/sklearn_bench/df_clsf.py b/sklearn_bench/df_clsf.py index bb69185a3..0d48b82f5 100644 --- a/sklearn_bench/df_clsf.py +++ b/sklearn_bench/df_clsf.py @@ -18,7 +18,6 @@ import bench import numpy as np -from sklearn.metrics import accuracy_score def main(): @@ -43,18 +42,32 @@ def main(): fit_time, _ = bench.measure_function_time(clf.fit, X_train, y_train, params=params) y_pred = clf.predict(X_train) - train_acc = 100 * accuracy_score(y_pred, y_train) + train_acc = bench.accuracy_score(y_train, y_pred) + train_log_loss = bench.log_loss(y_train, y_pred) + train_roc_auc = bench.roc_auc_score(y_train, y_pred) predict_time, y_pred = bench.measure_function_time( clf.predict, X_test, params=params) - test_acc = 100 * accuracy_score(y_pred, y_test) + test_acc = bench.accuracy_score(y_test, y_pred) + test_log_loss = bench.log_loss(y_test, y_pred) + test_roc_auc = bench.roc_auc_score(y_test, y_pred) - bench.print_output(library='sklearn', algorithm='decision_forest_classification', - stages=['training', 'prediction'], params=params, - functions=['df_clsf.fit', 'df_clsf.predict'], - times=[fit_time, predict_time], accuracy_type='accuracy[%]', - accuracies=[train_acc, test_acc], data=[X_train, X_test], - alg_instance=clf) + bench.print_output( + library='sklearn', + algorithm='decision_forest_classification', + stages=['training', 'prediction'], + params=params, + functions=['df_clsf.fit', 'df_clsf.predict'], + times=[fit_time, predict_time], + accuracy_type=['accuracy', 'log_loss', 'roc_auc'], + accuracies=[ + [train_acc, test_acc], + [train_log_loss, test_log_loss], + [train_roc_auc, test_roc_auc], + ], + data=[X_train, X_test], + alg_instance=clf, + ) if __name__ == "__main__": diff --git a/sklearn_bench/df_regr.py b/sklearn_bench/df_regr.py index 53d3c8afd..61a24f918 100644 --- a/sklearn_bench/df_regr.py +++ b/sklearn_bench/df_regr.py @@ -40,18 +40,26 @@ def main(): fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train, params=params) y_pred = regr.predict(X_train) - train_rmse = bench.rmse_score(y_pred, y_train) + train_rmse = bench.rmse_score(y_train, y_pred) + train_r2 = bench.r2_score(y_train, y_pred) predict_time, y_pred = bench.measure_function_time( regr.predict, X_test, params=params) - test_rmse = bench.rmse_score(y_pred, y_test) + test_rmse = bench.rmse_score(y_test, y_pred) + test_r2 = bench.r2_score(y_test, y_pred) - bench.print_output(library='sklearn', algorithm='decision_forest_regression', - stages=['training', 'prediction'], params=params, - functions=['df_regr.fit', 'df_regr.predict'], - times=[fit_time, predict_time], accuracy_type='rmse', - accuracies=[train_rmse, test_rmse], data=[X_train, X_test], - alg_instance=regr) + bench.print_output( + library='sklearn', + algorithm='decision_forest_regression', + stages=['training', 'prediction'], + params=params, + functions=['df_regr.fit', 'df_regr.predict'], + times=[fit_time, predict_time], + accuracy_type=['rmse', 'r2_score'], + accuracies=[[train_rmse, test_rmse], [train_r2, test_r2]], + data=[X_train, X_test], + alg_instance=regr, + ) if __name__ == "__main__": diff --git a/sklearn_bench/elasticnet.py b/sklearn_bench/elasticnet.py index ac7f34050..bcd33b242 100755 --- a/sklearn_bench/elasticnet.py +++ b/sklearn_bench/elasticnet.py @@ -33,19 +33,27 @@ def main(): fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train, params=params) # Time predict - predict_time, pred_train = bench.measure_function_time(regr.predict, + predict_time, y_pred = bench.measure_function_time(regr.predict, X_train, params=params) - train_rmse = bench.rmse_score(pred_train, y_train) - pred_test = regr.predict(X_test) - test_rmse = bench.rmse_score(pred_test, y_test) + train_rmse = bench.rmse_score(y_train, y_pred) + train_r2 = bench.r2_score(y_train, y_pred) + y_pred = regr.predict(X_test) + test_rmse = bench.rmse_score(y_test, y_pred) + test_r2 = bench.r2_score(y_test, y_pred) - bench.print_output(library='sklearn', algorithm='elastic-net', - stages=['training', 'prediction'], params=params, - functions=['ElasticNet.fit', 'ElasticNet.predict'], - times=[fit_time, predict_time], accuracy_type='rmse', - accuracies=[train_rmse, test_rmse], data=[X_train, X_train], - alg_instance=regr) + bench.print_output( + library='sklearn', + algorithm='elastic-net', + stages=['training', 'prediction'], + params=params, + functions=['ElasticNet.fit', 'ElasticNet.predict'], + times=[fit_time, predict_time], + accuracy_type=['rmse', 'r2_score'], + accuracies=[[train_rmse, test_rmse], [train_r2, test_r2]], + data=[X_train, X_train], + alg_instance=regr, + ) if __name__ == "__main__": diff --git a/sklearn_bench/kmeans.py b/sklearn_bench/kmeans.py index 6d3afa0e3..0fa96b0dc 100644 --- a/sklearn_bench/kmeans.py +++ b/sklearn_bench/kmeans.py @@ -66,13 +66,18 @@ def fit_kmeans(X, X_init): acc_test = davies_bouldin_score(X_test, test_predict) - bench.print_output(library='sklearn', algorithm='kmeans', - stages=['training', 'prediction'], - params=params, functions=['KMeans.fit', 'KMeans.predict'], - times=[fit_time, predict_time], - accuracy_type='davies_bouldin_score', - accuracies=[acc_train, acc_test], data=[X_train, X_test], - alg_instance=kmeans) + bench.print_output( + library='sklearn', + algorithm='kmeans', + stages=['training', 'prediction'], + params=params, + functions=['KMeans.fit', 'KMeans.predict'], + times=[fit_time, predict_time], + accuracy_type=['davies_bouldin_score', 'inertia'], + accuracies=[[acc_train, acc_test], [kmeans.inertia_, kmeans.inertia_]], + data=[X_train, X_test], + alg_instance=kmeans, + ) if __name__ == "__main__": @@ -86,7 +91,7 @@ def fit_kmeans(X, X_init): parser.add_argument('--n-clusters', type=int, help='Number of clusters') parser.add_argument('--algorithm', type=str, default='full', help='K-means algorithm to use') - parser.add_argument('--n_init', type=int, default=10, + parser.add_argument('--n_init', type=int, default=1, help='Number of time the k-means algorithm ' 'will be run with different centroid seeds') parser.add_argument('--random_state', type=int, default=777, diff --git a/sklearn_bench/knn_clsf.py b/sklearn_bench/knn_clsf.py index 6c2c28af8..0f98d2f0b 100755 --- a/sklearn_bench/knn_clsf.py +++ b/sklearn_bench/knn_clsf.py @@ -18,7 +18,6 @@ import bench import numpy as np -from sklearn.metrics import accuracy_score def main(): @@ -40,33 +39,51 @@ def main(): knn_clsf.fit, X_train, y_train, params=params) if params.task == 'classification': y_pred = knn_clsf.predict(X_train) - train_acc = 100 * accuracy_score(y_pred, y_train) + train_acc = bench.accuracy_score(y_train, y_pred) + train_log_loss = bench.log_loss(y_train, y_pred) + train_roc_auc = bench.roc_auc_score(y_train, y_pred) # Measure time and accuracy on prediction if params.task == 'classification': predict_time, yp = bench.measure_function_time(knn_clsf.predict, X_test, params=params) - test_acc = 100 * accuracy_score(yp, y_test) + test_acc = bench.accuracy_score(y_test, yp) + test_log_loss = bench.log_loss(y_test, yp) + test_roc_auc = bench.roc_auc_score(y_test, yp) else: predict_time, _ = bench.measure_function_time(knn_clsf.kneighbors, X_test, params=params) if params.task == 'classification': - bench.print_output(library='sklearn', - algorithm=knn_clsf._fit_method + '_knn_classification', - stages=['training', 'prediction'], params=params, - functions=['knn_clsf.fit', 'knn_clsf.predict'], - times=[train_time, predict_time], - accuracies=[train_acc, test_acc], accuracy_type='accuracy[%]', - data=[X_train, X_test], alg_instance=knn_clsf) + bench.print_output( + library='sklearn', + algorithm=knn_clsf._fit_method + '_knn_classification', + stages=['training', 'prediction'], + params=params, + functions=['knn_clsf.fit', 'knn_clsf.predict'], + times=[train_time, predict_time], + accuracy_type=['accuracy', 'log_loss', 'roc_auc'], + accuracies=[ + [train_acc, test_acc], + [train_log_loss, test_log_loss], + [train_roc_auc, test_roc_auc], + ], + data=[X_train, X_test], + alg_instance=knn_clsf, + ) else: - bench.print_output(library='sklearn', - algorithm=knn_clsf._fit_method + '_knn_search', - stages=['training', 'search'], params=params, - functions=['knn_clsf.fit', 'knn_clsf.kneighbors'], - times=[train_time, predict_time], - accuracies=[], accuracy_type=None, - data=[X_train, X_test], alg_instance=knn_clsf) + bench.print_output( + library='sklearn', + algorithm=knn_clsf._fit_method + '_knn_search', + stages=['training', 'search'], + params=params, + functions=['knn_clsf.fit', 'knn_clsf.kneighbors'], + times=[train_time, predict_time], + accuracy_type=None, + accuracies=[], + data=[X_train, X_test], + alg_instance=knn_clsf, + ) if __name__ == "__main__": diff --git a/sklearn_bench/lasso.py b/sklearn_bench/lasso.py index 32fd0d591..741060ccf 100755 --- a/sklearn_bench/lasso.py +++ b/sklearn_bench/lasso.py @@ -33,19 +33,27 @@ def main(): fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train, params=params) # Time predict - predict_time, pred_train = bench.measure_function_time( + predict_time, yp = bench.measure_function_time( regr.predict, X_train, params=params) - train_rmse = bench.rmse_score(pred_train, y_train) - pred_test = regr.predict(X_test) - test_rmse = bench.rmse_score(pred_test, y_test) + train_rmse = bench.rmse_score(y_train, yp) + train_r2 = bench.r2_score(y_train, yp) + yp = regr.predict(X_test) + test_rmse = bench.rmse_score(y_test, yp) + test_r2 = bench.r2_score(y_test, yp) - bench.print_output(library='sklearn', algorithm='lasso', - stages=['training', 'prediction'], params=params, - functions=['Lasso.fit', 'Lasso.predict'], - times=[fit_time, predict_time], accuracy_type='rmse', - accuracies=[train_rmse, test_rmse], data=[X_train, X_test], - alg_instance=regr) + bench.print_output( + library='sklearn', + algorithm='lasso', + stages=['training', 'prediction'], + params=params, + functions=['Lasso.fit', 'Lasso.predict'], + times=[fit_time, predict_time], + accuracy_type=['rmse', 'r2_score'], + accuracies=[[train_rmse, test_rmse], [train_r2, test_r2]], + data=[X_train, X_test], + alg_instance=regr, + ) if __name__ == "__main__": diff --git a/sklearn_bench/linear.py b/sklearn_bench/linear.py index f926c6ce8..4da59e9ae 100644 --- a/sklearn_bench/linear.py +++ b/sklearn_bench/linear.py @@ -15,7 +15,6 @@ # =============================================================================== import argparse - import bench @@ -36,16 +35,22 @@ def main(): # Time predict predict_time, yp = bench.measure_function_time(regr.predict, X_test, params=params) - test_rmse = bench.rmse_score(yp, y_test) + test_rmse = bench.rmse_score(y_test, yp) + test_r2 = bench.r2_score(y_test, yp) yp = regr.predict(X_train) - train_rmse = bench.rmse_score(yp, y_train) - - bench.print_output(library='sklearn', algorithm='linear_regression', - stages=['training', 'prediction'], - params=params, functions=['Linear.fit', 'Linear.predict'], - times=[fit_time, predict_time], accuracy_type='rmse', - accuracies=[train_rmse, test_rmse], data=[X_train, X_test], - alg_instance=regr) + train_rmse = bench.rmse_score(y_train, yp) + train_r2 = bench.r2_score(y_train, yp) + + bench.print_output( + library='sklearn', algorithm='linear_regression', + stages=['training', 'prediction'], + params=params, functions=['Linear.fit', 'Linear.predict'], + times=[fit_time, predict_time], + accuracy_type=['rmse', 'r2_score'], + accuracies=[[train_rmse, test_rmse], [train_r2, test_r2]], + data=[X_train, X_test], + alg_instance=regr, + ) if __name__ == "__main__": diff --git a/sklearn_bench/log_reg.py b/sklearn_bench/log_reg.py index dc9d750bd..22a0a11d7 100644 --- a/sklearn_bench/log_reg.py +++ b/sklearn_bench/log_reg.py @@ -22,7 +22,6 @@ def main(): from sklearn.linear_model import LogisticRegression - from sklearn.metrics import accuracy_score # Load generated data X_train, X_test, y_train, y_test = bench.load_data(params) @@ -45,18 +44,32 @@ def main(): fit_time, _ = bench.measure_function_time(clf.fit, X_train, y_train, params=params) y_pred = clf.predict(X_train) - train_acc = 100 * accuracy_score(y_pred, y_train) + train_acc = bench.accuracy_score(y_train, y_pred) + train_log_loss = bench.log_loss(y_train, y_pred) + train_roc_auc = bench.roc_auc_score(y_train, y_pred) predict_time, y_pred = bench.measure_function_time( clf.predict, X_test, params=params) - test_acc = 100 * accuracy_score(y_pred, y_test) + test_acc = bench.accuracy_score(y_test, y_pred) + test_log_loss = bench.log_loss(y_test, y_pred) + test_roc_auc = bench.roc_auc_score(y_test, y_pred) - bench.print_output(library='sklearn', algorithm='logistic_regression', - stages=['training', 'prediction'], params=params, - functions=['LogReg.fit', 'LogReg.predict'], - times=[fit_time, predict_time], accuracy_type='accuracy[%]', - accuracies=[train_acc, test_acc], data=[X_train, X_test], - alg_instance=clf) + bench.print_output( + library='sklearn', + algorithm='logistic_regression', + stages=['training', 'prediction'], + params=params, + functions=['LogReg.fit', 'LogReg.predict'], + times=[fit_time, predict_time], + accuracy_type=['accuracy', 'log_loss', 'roc_auc'], + accuracies=[ + [train_acc, test_acc], + [train_log_loss, test_log_loss], + [train_roc_auc, test_roc_auc], + ], + data=[X_train, X_test], + alg_instance=clf, + ) if __name__ == "__main__": diff --git a/sklearn_bench/nusvc.py b/sklearn_bench/nusvc.py index 52f5b6d3d..e22124480 100644 --- a/sklearn_bench/nusvc.py +++ b/sklearn_bench/nusvc.py @@ -43,33 +43,38 @@ def main(): if params.probability: state_predict = 'predict_proba' - accuracy_type = 'log_loss' clf_predict = clf.predict_proba - - def metric_call(x, y): - return bench.log_loss(x, y) else: state_predict = 'prediction' - accuracy_type = 'accuracy[%]' clf_predict = clf.predict - def metric_call(x, y): - return bench.accuracy_score(x, y) - predict_train_time, y_pred = bench.measure_function_time( clf_predict, X_train, params=params) - train_acc = metric_call(y_train, y_pred) + train_acc = bench.accuracy_score(y_train, y_pred) + train_log_loss = bench.log_loss(y_train, y_pred) + train_roc_auc = bench.roc_auc_score(y_train, y_pred) _, y_pred = bench.measure_function_time( clf_predict, X_test, params=params) - test_acc = metric_call(y_test, y_pred) - - bench.print_output(library='sklearn', algorithm='nusvc', - stages=['training', state_predict], - params=params, functions=['NuSVC.fit', f'NuSVC.{state_predict}'], - times=[fit_time, predict_train_time], accuracy_type=accuracy_type, - accuracies=[train_acc, test_acc], data=[X_train, X_train], - alg_instance=clf) + test_acc = bench.accuracy_score(y_test, y_pred) + test_log_loss = bench.log_loss(y_test, y_pred) + test_roc_auc = bench.roc_auc_score(y_test, y_pred) + + bench.print_output( + library='sklearn', + algorithm='nusvc', + stages=['training', state_predict], + params=params, functions=['NuSVC.fit', f'NuSVC.{state_predict}'], + times=[fit_time, predict_train_time], + accuracy_type=['accuracy', 'log_loss', 'roc_auc'], + accuracies=[ + [train_acc, test_acc], + [train_log_loss, test_log_loss], + [train_roc_auc, test_roc_auc], + ], + data=[X_train, X_train], + alg_instance=clf, + ) if __name__ == "__main__": diff --git a/sklearn_bench/nusvr.py b/sklearn_bench/nusvr.py index 3d9a23780..4f4c113d1 100644 --- a/sklearn_bench/nusvr.py +++ b/sklearn_bench/nusvr.py @@ -22,6 +22,7 @@ def main(): from sklearn.svm import NuSVR + from sklearn.metrics import r2_score X_train, X_test, y_train, y_test = bench.load_data(params) y_train = np.asfortranarray(y_train).ravel() @@ -44,17 +45,25 @@ def main(): predict_train_time, y_pred = bench.measure_function_time( regr.predict, X_train, params=params) train_rmse = bench.rmse_score(y_train, y_pred) + train_r2 = bench.r2_score(y_train, y_pred) _, y_pred = bench.measure_function_time( regr.predict, X_test, params=params) test_rmse = bench.rmse_score(y_test, y_pred) - - bench.print_output(library='sklearn', algorithm='nusvr', - stages=['training', 'prediction'], - params=params, functions=['NuSVR.fit', 'NuSVR.predict'], - times=[fit_time, predict_train_time], accuracy_type='rmse', - accuracies=[train_rmse, test_rmse], data=[X_train, X_train], - alg_instance=regr) + test_r2 = bench.r2_score(y_test, y_pred) + + bench.print_output( + library='sklearn', + algorithm='nusvr', + stages=['training', 'prediction'], + params=params, + functions=['NuSVR.fit', 'NuSVR.predict'], + times=[fit_time, predict_train_time], + accuracy_type=['rmse', 'r2_score'], + accuracies=[[train_rmse, test_rmse], [train_r2, test_r2]], + data=[X_train, X_train], + alg_instance=regr, + ) if __name__ == "__main__": diff --git a/sklearn_bench/pca.py b/sklearn_bench/pca.py index b810603a8..367638633 100644 --- a/sklearn_bench/pca.py +++ b/sklearn_bench/pca.py @@ -40,12 +40,18 @@ def main(): transform_time, _ = bench.measure_function_time( pca.transform, X_train, params=params) - bench.print_output(library='sklearn', algorithm='pca', - stages=['training', 'transformation'], - params=params, functions=['PCA.fit', 'PCA.transform'], - times=[fit_time, transform_time], accuracy_type=None, - accuracies=[None, None], data=[X_train, X_test], - alg_instance=pca) + bench.print_output( + library='sklearn', + algorithm='pca', + stages=['training', 'transformation'], + params=params, + functions=['PCA.fit', 'PCA.transform'], + times=[fit_time, transform_time], + accuracy_type='noise_variance', + accuracies=[pca.noise_variance_, pca.noise_variance_], + data=[X_train, X_test], + alg_instance=pca, + ) if __name__ == "__main__": diff --git a/sklearn_bench/ridge.py b/sklearn_bench/ridge.py index ee47a616d..48489086c 100644 --- a/sklearn_bench/ridge.py +++ b/sklearn_bench/ridge.py @@ -36,16 +36,24 @@ def main(): # Time predict predict_time, yp = bench.measure_function_time(regr.predict, X_test, params=params) - test_rmse = bench.rmse_score(yp, y_test) + test_rmse = bench.rmse_score(y_test, yp) + test_r2 = bench.r2_score(y_test, yp) yp = regr.predict(X_train) - train_rmse = bench.rmse_score(yp, y_train) - - bench.print_output(library='sklearn', algorithm='ridge_regression', - stages=['training', 'prediction'], params=params, - functions=['Ridge.fit', 'Ridge.predict'], - times=[fit_time, predict_time], accuracy_type='rmse', - accuracies=[train_rmse, test_rmse], data=[X_train, X_test], - alg_instance=regr) + train_rmse = bench.rmse_score(y_train, yp) + train_r2 = bench.r2_score(y_train, yp) + + bench.print_output( + library='sklearn', + algorithm='ridge_regression', + stages=['training', 'prediction'], + params=params, + functions=['Ridge.fit', 'Ridge.predict'], + times=[fit_time, predict_time], + accuracy_type=['rmse', 'r2_score'], + accuracies=[[train_rmse, test_rmse], [train_r2, test_r2]], + data=[X_train, X_test], + alg_instance=regr, + ) if __name__ == "__main__": diff --git a/sklearn_bench/svm.py b/sklearn_bench/svm.py index 9491ed136..db4cb149f 100644 --- a/sklearn_bench/svm.py +++ b/sklearn_bench/svm.py @@ -43,33 +43,39 @@ def main(): if params.probability: state_predict = 'predict_proba' - accuracy_type = 'log_loss' clf_predict = clf.predict_proba - - def metric_call(x, y): - return bench.log_loss(x, y) else: state_predict = 'prediction' - accuracy_type = 'accuracy[%]' clf_predict = clf.predict - def metric_call(x, y): - return bench.accuracy_score(x, y) - predict_train_time, y_pred = bench.measure_function_time( clf_predict, X_train, params=params) - train_acc = metric_call(y_train, y_pred) + train_acc = bench.accuracy_score(y_train, y_pred) + train_log_loss = bench.log_loss(y_train, y_pred) + train_roc_auc = bench.roc_auc_score(y_train, y_pred) _, y_pred = bench.measure_function_time( clf_predict, X_test, params=params) - test_acc = metric_call(y_test, y_pred) - - bench.print_output(library='sklearn', algorithm='svc', - stages=['training', state_predict], - params=params, functions=['SVM.fit', f'SVM.{state_predict}'], - times=[fit_time, predict_train_time], accuracy_type=accuracy_type, - accuracies=[train_acc, test_acc], data=[X_train, X_train], - alg_instance=clf) + test_acc = bench.accuracy_score(y_test, y_pred) + test_log_loss = bench.log_loss(y_test, y_pred) + test_roc_auc = bench.roc_auc_score(y_test, y_pred) + + bench.print_output( + library='sklearn', + algorithm='svc', + stages=['training', state_predict], + params=params, + functions=['SVM.fit', f'SVM.{state_predict}'], + times=[fit_time, predict_train_time], + accuracy_type=['accuracy', 'log_loss', 'roc_auc'], + accuracies=[ + [train_acc, test_acc], + [train_log_loss, test_log_loss], + [train_roc_auc, test_roc_auc], + ], + data=[X_train, X_train], + alg_instance=clf, + ) if __name__ == "__main__": diff --git a/sklearn_bench/svr.py b/sklearn_bench/svr.py index 0a20c4638..f9bf9407c 100644 --- a/sklearn_bench/svr.py +++ b/sklearn_bench/svr.py @@ -44,17 +44,25 @@ def main(): predict_train_time, y_pred = bench.measure_function_time( regr.predict, X_train, params=params) train_rmse = bench.rmse_score(y_train, y_pred) + train_r2 = bench.r2_score(y_train, y_pred) _, y_pred = bench.measure_function_time( regr.predict, X_test, params=params) test_rmse = bench.rmse_score(y_test, y_pred) - - bench.print_output(library='sklearn', algorithm='svr', - stages=['training', 'prediction'], - params=params, functions=['SVR.fit', 'SVR.predict'], - times=[fit_time, predict_train_time], accuracy_type='rmse', - accuracies=[train_rmse, test_rmse], data=[X_train, X_train], - alg_instance=regr) + test_r2 = bench.r2_score(y_test, y_pred) + + bench.print_output( + library='sklearn', + algorithm='svr', + stages=['training', 'prediction'], + params=params, + functions=['SVR.fit', 'SVR.predict'], + times=[fit_time, predict_train_time], + accuracy_type=['rmse', 'r2_score'], + accuracies=[[train_rmse, test_rmse], [train_r2, test_r2]], + data=[X_train, X_train], + alg_instance=regr, + ) if __name__ == "__main__": From 4b22fb6e8176c457b4f2e48efa87ed50111bb144 Mon Sep 17 00:00:00 2001 From: OnlyDeniko Date: Mon, 2 Aug 2021 08:54:51 +0300 Subject: [PATCH 08/27] codefactor.io --- bench.py | 46 ++++++++++++---------------------------------- 1 file changed, 12 insertions(+), 34 deletions(-) diff --git a/bench.py b/bench.py index 199bd52a5..82134bd5e 100644 --- a/bench.py +++ b/bench.py @@ -30,9 +30,9 @@ def get_dtype(data): ''' if hasattr(data, 'dtype'): return data.dtype - elif hasattr(data, 'dtypes'): + if hasattr(data, 'dtypes'): return str(data.dtypes[0]) - elif hasattr(data, 'values'): + if hasattr(data, 'values'): return data.values.dtype else: raise ValueError(f'Impossible to get data type of {type(data)}') @@ -66,10 +66,7 @@ def _parse_size(string, dim=2): def float_or_int(string): - if '.' in string: - return float(string) - else: - return int(string) + return float(string) if '.' in string else int(string) def get_optimal_cache_size(n_rows, dtype=np.double, max_cache=64): @@ -90,10 +87,8 @@ def get_optimal_cache_size(n_rows, dtype=np.double, max_cache=64): optimal_cache_size_bytes = byte_size * (n_rows ** 2) one_gb = 2 ** 30 max_cache_bytes = max_cache * one_gb - if optimal_cache_size_bytes > max_cache_bytes: - return max_cache_bytes - else: - return optimal_cache_size_bytes + return max_cache_bytes if optimal_cache_size_bytes > max_cache_bytes \ + else optimal_cache_size_bytes def parse_args(parser, size=None, loop_types=(), @@ -324,20 +319,6 @@ def convert_to_numpy(data): return data -def columnwise_score(y, yp, score_func): - y = convert_to_numpy(y) - yp = convert_to_numpy(yp) - if y.ndim + yp.ndim > 2: - if 1 in (y.shape + yp.shape)[1:]: - if y.ndim > 1: - y = y[:, 0] - if yp.ndim > 1: - yp = yp[:, 0] - else: - return [score_func(y[i], yp[i]) for i in range(y.shape[1])] - return score_func(y, yp) - - def accuracy_score(y, yp): from sklearn.metrics import accuracy_score as sklearn_accuracy y = convert_to_numpy(y) @@ -362,7 +343,7 @@ def roc_auc_score(y, yp, multi_class='ovr'): yp = convert_to_numpy(yp) try: res = sklearn_roc_auc(y, yp, multi_class=multi_class) - except: + except Exception: res = -1 return res @@ -394,14 +375,11 @@ def convert_data(data, dtype, data_order, data_format): # Secondly, change format of data if data_format == 'numpy': return data - elif data_format == 'pandas': + if data_format == 'pandas': import pandas as pd - if data.ndim == 1: - return pd.Series(data) - else: - return pd.DataFrame(data) - elif data_format == 'cudf': + return pd.Series(data) if data.ndim == 1 else pd.DataFrame(data) + if data_format == 'cudf': import cudf import pandas as pd @@ -527,9 +505,9 @@ def print_output(library, algorithm, stages, params, functions, if isinstance(accuracy_type, str): result.update({f'{accuracy_type}': accuracies[i]}) elif isinstance(accuracy_type, list): - for j in range(len(accuracy_type)): - if accuracies[j][i] is not None: - result.update({f'{accuracy_type[j]}': accuracies[j][i]}) + for ind, val in enumerate(accuracy_type): + if accuracies[ind][i] is not None: + result.update({f'{val}': accuracies[ind][i]}) if hasattr(params, 'n_classes'): result['input_data'].update({'classes': params.n_classes}) if hasattr(params, 'n_clusters'): From 33794ed6e0681437067a190d4f1f27afb881463b Mon Sep 17 00:00:00 2001 From: OnlyDeniko Date: Mon, 2 Aug 2021 08:58:23 +0300 Subject: [PATCH 09/27] pep8 --- bench.py | 4 ++-- sklearn_bench/dbscan.py | 2 +- sklearn_bench/elasticnet.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/bench.py b/bench.py index 82134bd5e..bd8c96428 100644 --- a/bench.py +++ b/bench.py @@ -352,14 +352,14 @@ def rmse_score(y, yp): from sklearn.metrics import mean_squared_error as sklearn_mse y = convert_to_numpy(y) yp = convert_to_numpy(yp) - return sklearn_mse(y, yp) + return sklearn_mse(y, yp) def r2_score(y, yp): from sklearn.metrics import r2_score as sklearn_r2_score y = convert_to_numpy(y) yp = convert_to_numpy(yp) - return sklearn_r2_score(y, yp) + return sklearn_r2_score(y, yp) def convert_data(data, dtype, data_order, data_format): diff --git a/sklearn_bench/dbscan.py b/sklearn_bench/dbscan.py index e8191c0db..5da2e3301 100644 --- a/sklearn_bench/dbscan.py +++ b/sklearn_bench/dbscan.py @@ -42,7 +42,7 @@ def main(): params.n_clusters = len(set(labels)) - (1 if -1 in labels else 0) try: acc = davies_bouldin_score(X, labels) - except: + except Exception: acc = -1 bench.print_output(library='sklearn', algorithm='dbscan', stages=['training'], diff --git a/sklearn_bench/elasticnet.py b/sklearn_bench/elasticnet.py index bcd33b242..082963146 100755 --- a/sklearn_bench/elasticnet.py +++ b/sklearn_bench/elasticnet.py @@ -34,7 +34,7 @@ def main(): # Time predict predict_time, y_pred = bench.measure_function_time(regr.predict, - X_train, params=params) + X_train, params=params) train_rmse = bench.rmse_score(y_train, y_pred) train_r2 = bench.r2_score(y_train, y_pred) From 28b8185ff57f1fe448dcc2258228f16f30ba0aef Mon Sep 17 00:00:00 2001 From: OnlyDeniko Date: Mon, 2 Aug 2021 11:26:45 +0300 Subject: [PATCH 10/27] fix ci --- bench.py | 4 ++-- sklearn_bench/nusvr.py | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/bench.py b/bench.py index bd8c96428..2019dff26 100644 --- a/bench.py +++ b/bench.py @@ -333,7 +333,7 @@ def log_loss(y, yp): try: res = sklearn_log_loss(y, yp) except Exception: - res = -1 + res = None return res @@ -344,7 +344,7 @@ def roc_auc_score(y, yp, multi_class='ovr'): try: res = sklearn_roc_auc(y, yp, multi_class=multi_class) except Exception: - res = -1 + res = None return res diff --git a/sklearn_bench/nusvr.py b/sklearn_bench/nusvr.py index 4f4c113d1..e439c7b28 100644 --- a/sklearn_bench/nusvr.py +++ b/sklearn_bench/nusvr.py @@ -22,7 +22,6 @@ def main(): from sklearn.svm import NuSVR - from sklearn.metrics import r2_score X_train, X_test, y_train, y_test = bench.load_data(params) y_train = np.asfortranarray(y_train).ravel() From 069e15f112e6983d0b9db16208197d2f1db274b1 Mon Sep 17 00:00:00 2001 From: OnlyDeniko Date: Mon, 2 Aug 2021 15:41:59 +0300 Subject: [PATCH 11/27] fix --- bench.py | 48 +++++++++++++++++++-------------------- sklearn_bench/df_clsf.py | 4 ++-- sklearn_bench/knn_clsf.py | 4 ++-- sklearn_bench/log_reg.py | 4 ++-- sklearn_bench/nusvc.py | 6 +++-- sklearn_bench/svm.py | 6 +++-- 6 files changed, 38 insertions(+), 34 deletions(-) diff --git a/bench.py b/bench.py index 2019dff26..52f7cbc20 100644 --- a/bench.py +++ b/bench.py @@ -30,10 +30,10 @@ def get_dtype(data): ''' if hasattr(data, 'dtype'): return data.dtype - if hasattr(data, 'dtypes'): + elif hasattr(data, 'dtypes'): return str(data.dtypes[0]) - if hasattr(data, 'values'): - return data.values.dtype + elif hasattr(data, 'values'): + return data.values.dtype else: raise ValueError(f'Impossible to get data type of {type(data)}') @@ -66,7 +66,10 @@ def _parse_size(string, dim=2): def float_or_int(string): - return float(string) if '.' in string else int(string) + if '.' in string: + return float(string) + else: + return int(string) def get_optimal_cache_size(n_rows, dtype=np.double, max_cache=64): @@ -87,8 +90,10 @@ def get_optimal_cache_size(n_rows, dtype=np.double, max_cache=64): optimal_cache_size_bytes = byte_size * (n_rows ** 2) one_gb = 2 ** 30 max_cache_bytes = max_cache * one_gb - return max_cache_bytes if optimal_cache_size_bytes > max_cache_bytes \ - else optimal_cache_size_bytes + if optimal_cache_size_bytes > max_cache_bytes: + return max_cache_bytes + else: + return optimal_cache_size_bytes def parse_args(parser, size=None, loop_types=(), @@ -330,29 +335,21 @@ def log_loss(y, yp): from sklearn.metrics import log_loss as sklearn_log_loss y = convert_to_numpy(y) yp = convert_to_numpy(yp) - try: - res = sklearn_log_loss(y, yp) - except Exception: - res = None - return res + return sklearn_log_loss(y, yp) def roc_auc_score(y, yp, multi_class='ovr'): from sklearn.metrics import roc_auc_score as sklearn_roc_auc y = convert_to_numpy(y) yp = convert_to_numpy(yp) - try: - res = sklearn_roc_auc(y, yp, multi_class=multi_class) - except Exception: - res = None - return res + return sklearn_roc_auc(y, yp, multi_class=multi_class) -def rmse_score(y, yp): +def rmse_score(y, yp, squared=False): from sklearn.metrics import mean_squared_error as sklearn_mse y = convert_to_numpy(y) yp = convert_to_numpy(yp) - return sklearn_mse(y, yp) + return sklearn_mse(y, yp, squared=squared) def r2_score(y, yp): @@ -375,11 +372,14 @@ def convert_data(data, dtype, data_order, data_format): # Secondly, change format of data if data_format == 'numpy': return data - if data_format == 'pandas': + elif data_format == 'pandas': import pandas as pd - return pd.Series(data) if data.ndim == 1 else pd.DataFrame(data) - if data_format == 'cudf': + if data.ndim == 1: + return pd.Series(data) + else: + return pd.DataFrame(data) + elif data_format == 'cudf': import cudf import pandas as pd @@ -500,14 +500,14 @@ def print_output(library, algorithm, stages, params, functions, for i in range(len(stages)): result = gen_basic_dict(library, algorithm, stages[i], params, data[i], alg_instance, alg_params) - result.update({'time[s]': times[i]}) + result.update({'time[s]': str(times[i])}) if accuracy_type is not None: if isinstance(accuracy_type, str): - result.update({f'{accuracy_type}': accuracies[i]}) + result.update({f'{accuracy_type}': str(accuracies[i])}) elif isinstance(accuracy_type, list): for ind, val in enumerate(accuracy_type): if accuracies[ind][i] is not None: - result.update({f'{val}': accuracies[ind][i]}) + result.update({f'{val}': str(accuracies[ind][i])}) if hasattr(params, 'n_classes'): result['input_data'].update({'classes': params.n_classes}) if hasattr(params, 'n_clusters'): diff --git a/sklearn_bench/df_clsf.py b/sklearn_bench/df_clsf.py index 0d48b82f5..5102827ec 100644 --- a/sklearn_bench/df_clsf.py +++ b/sklearn_bench/df_clsf.py @@ -43,13 +43,13 @@ def main(): fit_time, _ = bench.measure_function_time(clf.fit, X_train, y_train, params=params) y_pred = clf.predict(X_train) train_acc = bench.accuracy_score(y_train, y_pred) - train_log_loss = bench.log_loss(y_train, y_pred) + train_log_loss = bench.log_loss(y_train, clf.predict_proba(X_train)) train_roc_auc = bench.roc_auc_score(y_train, y_pred) predict_time, y_pred = bench.measure_function_time( clf.predict, X_test, params=params) test_acc = bench.accuracy_score(y_test, y_pred) - test_log_loss = bench.log_loss(y_test, y_pred) + test_log_loss = bench.log_loss(y_test, clf.predict_proba(X_test)) test_roc_auc = bench.roc_auc_score(y_test, y_pred) bench.print_output( diff --git a/sklearn_bench/knn_clsf.py b/sklearn_bench/knn_clsf.py index 0f98d2f0b..228932d5b 100755 --- a/sklearn_bench/knn_clsf.py +++ b/sklearn_bench/knn_clsf.py @@ -40,7 +40,7 @@ def main(): if params.task == 'classification': y_pred = knn_clsf.predict(X_train) train_acc = bench.accuracy_score(y_train, y_pred) - train_log_loss = bench.log_loss(y_train, y_pred) + train_log_loss = bench.log_loss(y_train, knn_clsf.predict_proba(X_train)) train_roc_auc = bench.roc_auc_score(y_train, y_pred) # Measure time and accuracy on prediction @@ -48,7 +48,7 @@ def main(): predict_time, yp = bench.measure_function_time(knn_clsf.predict, X_test, params=params) test_acc = bench.accuracy_score(y_test, yp) - test_log_loss = bench.log_loss(y_test, yp) + test_log_loss = bench.log_loss(y_test, knn_clsf.predict_proba(X_test)) test_roc_auc = bench.roc_auc_score(y_test, yp) else: predict_time, _ = bench.measure_function_time(knn_clsf.kneighbors, X_test, diff --git a/sklearn_bench/log_reg.py b/sklearn_bench/log_reg.py index 22a0a11d7..9367286e7 100644 --- a/sklearn_bench/log_reg.py +++ b/sklearn_bench/log_reg.py @@ -45,13 +45,13 @@ def main(): y_pred = clf.predict(X_train) train_acc = bench.accuracy_score(y_train, y_pred) - train_log_loss = bench.log_loss(y_train, y_pred) + train_log_loss = bench.log_loss(y_train, clf.predict_proba(X_train)) train_roc_auc = bench.roc_auc_score(y_train, y_pred) predict_time, y_pred = bench.measure_function_time( clf.predict, X_test, params=params) test_acc = bench.accuracy_score(y_test, y_pred) - test_log_loss = bench.log_loss(y_test, y_pred) + test_log_loss = bench.log_loss(y_test, clf.predict_proba(X_test)) test_roc_auc = bench.roc_auc_score(y_test, y_pred) bench.print_output( diff --git a/sklearn_bench/nusvc.py b/sklearn_bench/nusvc.py index e22124480..a7001a3e9 100644 --- a/sklearn_bench/nusvc.py +++ b/sklearn_bench/nusvc.py @@ -44,20 +44,22 @@ def main(): if params.probability: state_predict = 'predict_proba' clf_predict = clf.predict_proba + train_log_loss = bench.log_loss(y_train, clf.predict_proba(X_train)) + test_log_loss = bench.log_loss(y_test, clf.predict_proba(X_test)) else: state_predict = 'prediction' clf_predict = clf.predict + train_log_loss = None + test_log_loss = None predict_train_time, y_pred = bench.measure_function_time( clf_predict, X_train, params=params) train_acc = bench.accuracy_score(y_train, y_pred) - train_log_loss = bench.log_loss(y_train, y_pred) train_roc_auc = bench.roc_auc_score(y_train, y_pred) _, y_pred = bench.measure_function_time( clf_predict, X_test, params=params) test_acc = bench.accuracy_score(y_test, y_pred) - test_log_loss = bench.log_loss(y_test, y_pred) test_roc_auc = bench.roc_auc_score(y_test, y_pred) bench.print_output( diff --git a/sklearn_bench/svm.py b/sklearn_bench/svm.py index db4cb149f..542ccccb6 100644 --- a/sklearn_bench/svm.py +++ b/sklearn_bench/svm.py @@ -44,20 +44,22 @@ def main(): if params.probability: state_predict = 'predict_proba' clf_predict = clf.predict_proba + train_log_loss = bench.log_loss(y_train, clf_predict(X_train)) + test_log_loss = bench.log_loss(y_test, clf_predict(X_test)) else: state_predict = 'prediction' clf_predict = clf.predict + train_log_loss = None + test_log_loss = None predict_train_time, y_pred = bench.measure_function_time( clf_predict, X_train, params=params) train_acc = bench.accuracy_score(y_train, y_pred) - train_log_loss = bench.log_loss(y_train, y_pred) train_roc_auc = bench.roc_auc_score(y_train, y_pred) _, y_pred = bench.measure_function_time( clf_predict, X_test, params=params) test_acc = bench.accuracy_score(y_test, y_pred) - test_log_loss = bench.log_loss(y_test, y_pred) test_roc_auc = bench.roc_auc_score(y_test, y_pred) bench.print_output( From e0a6d74d5d6bd9042ce75039cb4cc29a8cdae31e Mon Sep 17 00:00:00 2001 From: OnlyDeniko Date: Mon, 2 Aug 2021 15:45:42 +0300 Subject: [PATCH 12/27] whitespace --- bench.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bench.py b/bench.py index 52f7cbc20..91e950878 100644 --- a/bench.py +++ b/bench.py @@ -33,7 +33,7 @@ def get_dtype(data): elif hasattr(data, 'dtypes'): return str(data.dtypes[0]) elif hasattr(data, 'values'): - return data.values.dtype + return data.values.dtype else: raise ValueError(f'Impossible to get data type of {type(data)}') From 4a49d18df6b99bb43b09c7b55d2be175f069a654 Mon Sep 17 00:00:00 2001 From: OnlyDeniko Date: Mon, 2 Aug 2021 15:57:32 +0300 Subject: [PATCH 13/27] return to stock functions --- bench.py | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/bench.py b/bench.py index 91e950878..8cae5a1db 100644 --- a/bench.py +++ b/bench.py @@ -324,11 +324,22 @@ def convert_to_numpy(data): return data -def accuracy_score(y, yp): - from sklearn.metrics import accuracy_score as sklearn_accuracy +def columnwise_score(y, yp, score_func): y = convert_to_numpy(y) yp = convert_to_numpy(yp) - return sklearn_accuracy(y, yp) + if y.ndim + yp.ndim > 2: + if 1 in (y.shape + yp.shape)[1:]: + if y.ndim > 1: + y = y[:, 0] + if yp.ndim > 1: + yp = yp[:, 0] + else: + return [score_func(y[i], yp[i]) for i in range(y.shape[1])] + return score_func(y, yp) + + +def accuracy_score(y, yp): + return columnwise_score(y, yp, lambda y1, y2: np.mean(y1 == y2)) def log_loss(y, yp): @@ -346,10 +357,8 @@ def roc_auc_score(y, yp, multi_class='ovr'): def rmse_score(y, yp, squared=False): - from sklearn.metrics import mean_squared_error as sklearn_mse - y = convert_to_numpy(y) - yp = convert_to_numpy(yp) - return sklearn_mse(y, yp, squared=squared) + return columnwise_score( + y, yp, lambda y1, y2: float(np.sqrt(np.mean((y1 - y2)**2)))) def r2_score(y, yp): @@ -500,14 +509,14 @@ def print_output(library, algorithm, stages, params, functions, for i in range(len(stages)): result = gen_basic_dict(library, algorithm, stages[i], params, data[i], alg_instance, alg_params) - result.update({'time[s]': str(times[i])}) + result.update({'time[s]': times[i]}) if accuracy_type is not None: if isinstance(accuracy_type, str): - result.update({f'{accuracy_type}': str(accuracies[i])}) + result.update({f'{accuracy_type}': accuracies[i]}) elif isinstance(accuracy_type, list): - for ind, val in enumerate(accuracy_type): - if accuracies[ind][i] is not None: - result.update({f'{val}': str(accuracies[ind][i])}) + for j in range(len(accuracy_type)): + if accuracies[j][i] is not None: + result.update({f'{accuracy_type[j]}': accuracies[j][i]}) if hasattr(params, 'n_classes'): result['input_data'].update({'classes': params.n_classes}) if hasattr(params, 'n_clusters'): @@ -524,6 +533,7 @@ def print_output(library, algorithm, stages, params, functions, if 'handle' in result['algorithm_parameters'].keys(): del result['algorithm_parameters']['handle'] output.append(result) + print(output) print(json.dumps(output, indent=4)) From 2310156c11d6bfe9e08e2270f3e5ac3fbd8b3141 Mon Sep 17 00:00:00 2001 From: OnlyDeniko Date: Mon, 2 Aug 2021 16:30:57 +0300 Subject: [PATCH 14/27] remove debug putput --- bench.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bench.py b/bench.py index 8cae5a1db..d31c8683f 100644 --- a/bench.py +++ b/bench.py @@ -533,7 +533,6 @@ def print_output(library, algorithm, stages, params, functions, if 'handle' in result['algorithm_parameters'].keys(): del result['algorithm_parameters']['handle'] output.append(result) - print(output) print(json.dumps(output, indent=4)) From be16c37b396d71168a86aa02ccf4887ce56bcbd8 Mon Sep 17 00:00:00 2001 From: OnlyDeniko Date: Mon, 2 Aug 2021 17:02:51 +0300 Subject: [PATCH 15/27] apply comments --- bench.py | 23 ++++++++++++++++------- sklearn_bench/dbscan.py | 6 +----- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/bench.py b/bench.py index d31c8683f..d5d8b0466 100644 --- a/bench.py +++ b/bench.py @@ -324,7 +324,7 @@ def convert_to_numpy(data): return data -def columnwise_score(y, yp, score_func): +def columnwise_score(y, yp, score_func, **params): y = convert_to_numpy(y) yp = convert_to_numpy(yp) if y.ndim + yp.ndim > 2: @@ -334,8 +334,8 @@ def columnwise_score(y, yp, score_func): if yp.ndim > 1: yp = yp[:, 0] else: - return [score_func(y[i], yp[i]) for i in range(y.shape[1])] - return score_func(y, yp) + return [score_func(y[i], yp[i], **params) for i in range(y.shape[1])] + return score_func(y, yp, **params) def accuracy_score(y, yp): @@ -351,12 +351,10 @@ def log_loss(y, yp): def roc_auc_score(y, yp, multi_class='ovr'): from sklearn.metrics import roc_auc_score as sklearn_roc_auc - y = convert_to_numpy(y) - yp = convert_to_numpy(yp) - return sklearn_roc_auc(y, yp, multi_class=multi_class) + return columnwise_score(y, yp, sklearn_roc_auc, multi_class=multi_class) -def rmse_score(y, yp, squared=False): +def rmse_score(y, yp): return columnwise_score( y, yp, lambda y1, y2: float(np.sqrt(np.mean((y1 - y2)**2)))) @@ -368,6 +366,17 @@ def r2_score(y, yp): return sklearn_r2_score(y, yp) +def davies_bouldin_score(y, yp): + from sklearn.metrics.cluster import davies_bouldin_score as sklearn_dbs + y = convert_to_numpy(y) + yp = convert_to_numpy(yp) + try: + res = sklearn_dbs(y, yp) + except ValueError: # Number of labels is 1 + res = "Error" + return res + + def convert_data(data, dtype, data_order, data_format): ''' Convert input data (numpy array) to needed format, type and order diff --git a/sklearn_bench/dbscan.py b/sklearn_bench/dbscan.py index 5da2e3301..8f941c01f 100644 --- a/sklearn_bench/dbscan.py +++ b/sklearn_bench/dbscan.py @@ -21,7 +21,6 @@ def main(): from sklearn.cluster import DBSCAN - from sklearn.metrics.cluster import davies_bouldin_score # Load generated data X, _, _, _ = bench.load_data(params, add_dtype=True) @@ -40,10 +39,7 @@ def main(): labels = dbscan.labels_ params.n_clusters = len(set(labels)) - (1 if -1 in labels else 0) - try: - acc = davies_bouldin_score(X, labels) - except Exception: - acc = -1 + acc = bench.davies_bouldin_score(X, labels) bench.print_output(library='sklearn', algorithm='dbscan', stages=['training'], params=params, functions=['DBSCAN'], times=[time], From 6508ca2463971c6879557c3ee12104a564569bc8 Mon Sep 17 00:00:00 2001 From: OnlyDeniko Date: Mon, 2 Aug 2021 17:12:37 +0300 Subject: [PATCH 16/27] metrics & metric_type --- bench.py | 16 ++++++++-------- cuml_bench/dbscan.py | 2 +- cuml_bench/df_clsf.py | 4 ++-- cuml_bench/df_regr.py | 4 ++-- cuml_bench/elasticnet.py | 4 ++-- cuml_bench/kmeans.py | 4 ++-- cuml_bench/knn_clsf.py | 4 ++-- cuml_bench/lasso.py | 4 ++-- cuml_bench/linear.py | 4 ++-- cuml_bench/log_reg.py | 4 ++-- cuml_bench/pca.py | 4 ++-- cuml_bench/ridge.py | 4 ++-- cuml_bench/svm.py | 8 ++++---- cuml_bench/svr.py | 4 ++-- cuml_bench/train_test_split.py | 4 ++-- daal4py_bench/dbscan.py | 2 +- daal4py_bench/df_clsf.py | 4 ++-- daal4py_bench/df_regr.py | 4 ++-- daal4py_bench/distances.py | 2 +- daal4py_bench/kmeans.py | 4 ++-- daal4py_bench/linear.py | 4 ++-- daal4py_bench/pca.py | 4 ++-- daal4py_bench/ridge.py | 4 ++-- modelbuilders_bench/lgbm_mb.py | 2 +- modelbuilders_bench/mb_utils.py | 6 +++--- modelbuilders_bench/xgb_mb.py | 4 ++-- sklearn_bench/dbscan.py | 2 +- sklearn_bench/df_clsf.py | 4 ++-- sklearn_bench/df_regr.py | 4 ++-- sklearn_bench/distances.py | 2 +- sklearn_bench/elasticnet.py | 4 ++-- sklearn_bench/kmeans.py | 4 ++-- sklearn_bench/knn_clsf.py | 8 ++++---- sklearn_bench/lasso.py | 4 ++-- sklearn_bench/linear.py | 4 ++-- sklearn_bench/log_reg.py | 4 ++-- sklearn_bench/nusvc.py | 4 ++-- sklearn_bench/nusvr.py | 4 ++-- sklearn_bench/pca.py | 2 +- sklearn_bench/ridge.py | 4 ++-- sklearn_bench/svm.py | 4 ++-- sklearn_bench/svr.py | 4 ++-- sklearn_bench/train_test_split.py | 4 ++-- xgboost_bench/gbt.py | 4 ++-- 44 files changed, 92 insertions(+), 92 deletions(-) diff --git a/bench.py b/bench.py index d5d8b0466..5a4a0cd4b 100644 --- a/bench.py +++ b/bench.py @@ -511,7 +511,7 @@ def gen_basic_dict(library, algorithm, stage, params, data, alg_instance=None, def print_output(library, algorithm, stages, params, functions, - times, accuracy_type, accuracies, data, alg_instance=None, + times, metric_type, metrics, data, alg_instance=None, alg_params=None): if params.output_format == 'json': output = [] @@ -519,13 +519,13 @@ def print_output(library, algorithm, stages, params, functions, result = gen_basic_dict(library, algorithm, stages[i], params, data[i], alg_instance, alg_params) result.update({'time[s]': times[i]}) - if accuracy_type is not None: - if isinstance(accuracy_type, str): - result.update({f'{accuracy_type}': accuracies[i]}) - elif isinstance(accuracy_type, list): - for j in range(len(accuracy_type)): - if accuracies[j][i] is not None: - result.update({f'{accuracy_type[j]}': accuracies[j][i]}) + if metric_type is not None: + if isinstance(metric_type, str): + result.update({f'{metric_type}': metrics[i]}) + elif isinstance(metric_type, list): + for ind, val in enumerate(metric_type): + if metrics[ind][i] is not None: + result.update({f'{val}': metrics[ind][i]}) if hasattr(params, 'n_classes'): result['input_data'].update({'classes': params.n_classes}) if hasattr(params, 'n_clusters'): diff --git a/cuml_bench/dbscan.py b/cuml_bench/dbscan.py index 663a2fc10..03ecfc2e4 100644 --- a/cuml_bench/dbscan.py +++ b/cuml_bench/dbscan.py @@ -48,5 +48,5 @@ bench.print_output(library='cuml', algorithm='dbscan', stages=['training'], params=params, functions=['DBSCAN'], times=[time], - accuracies=[acc], accuracy_type='davies_bouldin_score', data=[X], + metrics=[acc], metric_type='davies_bouldin_score', data=[X], alg_instance=dbscan) diff --git a/cuml_bench/df_clsf.py b/cuml_bench/df_clsf.py index e3b133544..e4d265954 100755 --- a/cuml_bench/df_clsf.py +++ b/cuml_bench/df_clsf.py @@ -97,6 +97,6 @@ def predict(X): bench.print_output(library='cuml', algorithm='decision_forest_classification', stages=['training', 'prediction'], params=params, functions=['df_clsf.fit', 'df_clsf.predict'], - times=[fit_time, predict_time], accuracy_type='accuracy[%]', - accuracies=[train_acc, test_acc], data=[X_train, X_test], + times=[fit_time, predict_time], metric_type='accuracy[%]', + metrics=[train_acc, test_acc], data=[X_train, X_test], alg_instance=clf) diff --git a/cuml_bench/df_regr.py b/cuml_bench/df_regr.py index 62040791b..9e7298882 100644 --- a/cuml_bench/df_regr.py +++ b/cuml_bench/df_regr.py @@ -93,6 +93,6 @@ def predict(X): bench.print_output(library='cuml', algorithm='decision_forest_regression', stages=['training', 'prediction'], params=params, functions=['df_regr.fit', 'df_regr.predict'], - times=[fit_time, predict_time], accuracy_type='rmse', - accuracies=[train_rmse, test_rmse], data=[X_train, X_test], + times=[fit_time, predict_time], metric_type='rmse', + metrics=[train_rmse, test_rmse], data=[X_train, X_test], alg_instance=regr) diff --git a/cuml_bench/elasticnet.py b/cuml_bench/elasticnet.py index bd0684a09..2f4e3dd5e 100755 --- a/cuml_bench/elasticnet.py +++ b/cuml_bench/elasticnet.py @@ -56,6 +56,6 @@ bench.print_output(library='cuml', algorithm='elastic-net', stages=['training', 'prediction'], params=params, functions=['ElasticNet.fit', 'ElasticNet.predict'], - times=[fit_time, predict_time], accuracy_type='rmse', - accuracies=[train_rmse, test_rmse], data=[X_train, X_train], + times=[fit_time, predict_time], metric_type='rmse', + metrics=[train_rmse, test_rmse], data=[X_train, X_train], alg_instance=regr) diff --git a/cuml_bench/kmeans.py b/cuml_bench/kmeans.py index d0192ba4d..2e3e9d9ff 100644 --- a/cuml_bench/kmeans.py +++ b/cuml_bench/kmeans.py @@ -88,6 +88,6 @@ def kmeans_fit(X): bench.print_output(library='cuml', algorithm='kmeans', stages=['training', 'prediction'], params=params, functions=['KMeans.fit', 'KMeans.predict'], - times=[fit_time, predict_time], accuracy_type='davies_bouldin_score', - accuracies=[acc_train, acc_test], data=[X_train, X_test], + times=[fit_time, predict_time], metric_type='davies_bouldin_score', + metrics=[acc_train, acc_test], data=[X_train, X_test], alg_instance=kmeans) diff --git a/cuml_bench/knn_clsf.py b/cuml_bench/knn_clsf.py index 0460346bf..6ccf3aa47 100755 --- a/cuml_bench/knn_clsf.py +++ b/cuml_bench/knn_clsf.py @@ -68,7 +68,7 @@ stages=['training', 'prediction'], params=params, functions=['knn_clsf.fit', 'knn_clsf.predict'], times=[train_time, predict_time], - accuracies=[train_acc, test_acc], accuracy_type='accuracy[%]', + metrics=[train_acc, test_acc], metric_type='accuracy[%]', data=[X_train, X_test], alg_instance=knn_clsf) else: bench.print_output(library='cuml', @@ -76,5 +76,5 @@ stages=['training', 'search'], params=params, functions=['knn_clsf.fit', 'knn_clsf.kneighbors'], times=[train_time, predict_time], - accuracies=[], accuracy_type=None, + metrics=[], metric_type=None, data=[X_train, X_test], alg_instance=knn_clsf) diff --git a/cuml_bench/lasso.py b/cuml_bench/lasso.py index 373ea3f19..9dc9e9e1c 100755 --- a/cuml_bench/lasso.py +++ b/cuml_bench/lasso.py @@ -53,6 +53,6 @@ bench.print_output(library='sklearn', algorithm='lasso', stages=['training', 'prediction'], params=params, functions=['Lasso.fit', 'Lasso.predict'], - times=[fit_time, predict_time], accuracy_type='rmse', - accuracies=[train_rmse, test_rmse], data=[X_train, X_test], + times=[fit_time, predict_time], metric_type='rmse', + metrics=[train_rmse, test_rmse], data=[X_train, X_test], alg_instance=regr) diff --git a/cuml_bench/linear.py b/cuml_bench/linear.py index f80434917..bfe81991f 100644 --- a/cuml_bench/linear.py +++ b/cuml_bench/linear.py @@ -50,6 +50,6 @@ bench.print_output(library='cuml', algorithm='linear_regression', stages=['training', 'prediction'], params=params, functions=['Linear.fit', 'Linear.predict'], - times=[fit_time, predict_time], accuracy_type='rmse', - accuracies=[train_rmse, test_rmse], data=[X_train, X_test], + times=[fit_time, predict_time], metric_type='rmse', + metrics=[train_rmse, test_rmse], data=[X_train, X_test], alg_instance=regr) diff --git a/cuml_bench/log_reg.py b/cuml_bench/log_reg.py index f8e143d9b..599b1bfdf 100644 --- a/cuml_bench/log_reg.py +++ b/cuml_bench/log_reg.py @@ -61,6 +61,6 @@ bench.print_output(library='cuml', algorithm='logistic_regression', stages=['training', 'prediction'], params=params, functions=['LogReg.fit', 'LogReg.predict'], - times=[fit_time, predict_time], accuracy_type='accuracy[%]', - accuracies=[train_acc, test_acc], data=[X_train, X_test], + times=[fit_time, predict_time], metric_type='accuracy[%]', + metrics=[train_acc, test_acc], data=[X_train, X_test], alg_instance=clf) diff --git a/cuml_bench/pca.py b/cuml_bench/pca.py index c43f569ae..35f20f3b6 100644 --- a/cuml_bench/pca.py +++ b/cuml_bench/pca.py @@ -51,6 +51,6 @@ bench.print_output(library='cuml', algorithm='pca', stages=['training', 'transformation'], params=params, functions=['PCA.fit', 'PCA.transform'], - times=[fit_time, transform_time], accuracy_type=None, - accuracies=[None, None], data=[X_train, X_test], + times=[fit_time, transform_time], metric_type=None, + metrics=[None, None], data=[X_train, X_test], alg_instance=pca) diff --git a/cuml_bench/ridge.py b/cuml_bench/ridge.py index 6c1696ae7..d6d488673 100644 --- a/cuml_bench/ridge.py +++ b/cuml_bench/ridge.py @@ -52,6 +52,6 @@ bench.print_output(library='cuml', algorithm='ridge_regression', stages=['training', 'prediction'], params=params, functions=['Ridge.fit', 'Ridge.predict'], - times=[fit_time, predict_time], accuracy_type='rmse', - accuracies=[train_rmse, test_rmse], data=[X_train, X_test], + times=[fit_time, predict_time], metric_type='rmse', + metrics=[train_rmse, test_rmse], data=[X_train, X_test], alg_instance=regr) diff --git a/cuml_bench/svm.py b/cuml_bench/svm.py index 3067b175e..112427397 100644 --- a/cuml_bench/svm.py +++ b/cuml_bench/svm.py @@ -57,14 +57,14 @@ if params.probability: state_predict = 'predict_proba' - accuracy_type = 'log_loss' + metric_type = 'log_loss' clf_predict = clf.predict_proba def metric_call(x, y): return bench.log_loss(x, y) else: state_predict = 'prediction' - accuracy_type = 'accuracy[%]' + metric_type = 'accuracy[%]' clf_predict = clf.predict def metric_call(x, y): @@ -82,6 +82,6 @@ def metric_call(x, y): bench.print_output(library='cuml', algorithm='svc', stages=['training', state_predict], params=params, functions=['SVM.fit', 'SVM.predict'], - times=[fit_time, predict_train_time], accuracy_type=accuracy_type, - accuracies=[train_acc, test_acc], data=[X_train, X_train], + times=[fit_time, predict_train_time], metric_type=metric_type, + metrics=[train_acc, test_acc], data=[X_train, X_train], alg_instance=clf) diff --git a/cuml_bench/svr.py b/cuml_bench/svr.py index 9644a9362..7560ff103 100644 --- a/cuml_bench/svr.py +++ b/cuml_bench/svr.py @@ -66,6 +66,6 @@ bench.print_output(library='cuml', algorithm='svr', stages=['training', 'prediction'], params=params, functions=['SVR.fit', 'SVR.predict'], - times=[fit_time, predict_train_time], accuracy_type='rmse', - accuracies=[train_rmse, test_rmse], data=[X_train, X_train], + times=[fit_time, predict_train_time], metric_type='rmse', + metrics=[train_rmse, test_rmse], data=[X_train, X_train], alg_instance=regr) diff --git a/cuml_bench/train_test_split.py b/cuml_bench/train_test_split.py index d9e74ad94..d8f70f7e6 100644 --- a/cuml_bench/train_test_split.py +++ b/cuml_bench/train_test_split.py @@ -44,5 +44,5 @@ bench.print_output(library='cuml', algorithm='train_test_split', stages=['training'], params=params, - functions=['train_test_split'], times=[time], accuracies=[None], - accuracy_type=None, data=[X], alg_params=tts_params) + functions=['train_test_split'], times=[time], metrics=[None], + metric_type=None, data=[X], alg_params=tts_params) diff --git a/daal4py_bench/dbscan.py b/daal4py_bench/dbscan.py index 64a19d813..d5010cfd0 100644 --- a/daal4py_bench/dbscan.py +++ b/daal4py_bench/dbscan.py @@ -51,4 +51,4 @@ def test_dbscan(X): bench.print_output(library='daal4py', algorithm='dbscan', stages=['training'], params=params, functions=['DBSCAN'], times=[time], - accuracies=[None], accuracy_type=None, data=[X]) + metrics=[None], metric_type=None, data=[X]) diff --git a/daal4py_bench/df_clsf.py b/daal4py_bench/df_clsf.py index 0c149291f..0e32a1185 100644 --- a/daal4py_bench/df_clsf.py +++ b/daal4py_bench/df_clsf.py @@ -126,5 +126,5 @@ def df_clsf_predict(X, training_result, n_classes, verbose=False): bench.print_output(library='daal4py', algorithm='decision_forest_classification', stages=['training', 'prediction'], params=params, functions=['df_clsf.fit', 'df_clsf.predict'], - times=[fit_time, predict_time], accuracy_type='accuracy[%]', - accuracies=[train_acc, test_acc], data=[X_train, X_test]) + times=[fit_time, predict_time], metric_type='accuracy[%]', + metrics=[train_acc, test_acc], data=[X_train, X_test]) diff --git a/daal4py_bench/df_regr.py b/daal4py_bench/df_regr.py index 628e159fd..5ff2beb9b 100644 --- a/daal4py_bench/df_regr.py +++ b/daal4py_bench/df_regr.py @@ -123,5 +123,5 @@ def df_regr_predict(X, training_result): bench.print_output(library='daal4py', algorithm='decision_forest_regression', stages=['training', 'prediction'], params=params, functions=['df_regr.fit', 'df_regr.predict'], - times=[fit_time, predict_time], accuracy_type='rmse', - accuracies=[train_rmse, test_rmse], data=[X_train, X_test]) + times=[fit_time, predict_time], metric_type='rmse', + metrics=[train_rmse, test_rmse], data=[X_train, X_test]) diff --git a/daal4py_bench/distances.py b/daal4py_bench/distances.py index ed6896024..70408856d 100644 --- a/daal4py_bench/distances.py +++ b/daal4py_bench/distances.py @@ -43,5 +43,5 @@ def compute_distances(pairwise_distances, X): bench.print_output(library='daal4py', algorithm='distances', stages=['computation'], params=params, functions=[params.metric.capitalize()], times=[time], - accuracy_type=None, accuracies=[None], data=[X], + metric_type=None, metrics=[None], data=[X], alg_params={'metric': params.metric}) diff --git a/daal4py_bench/kmeans.py b/daal4py_bench/kmeans.py index 9a224713f..bc9b1afe4 100644 --- a/daal4py_bench/kmeans.py +++ b/daal4py_bench/kmeans.py @@ -87,5 +87,5 @@ def test_predict(X, X_init): bench.print_output(library='daal4py', algorithm='kmeans', stages=['training', 'prediction'], params=params, functions=['KMeans.fit', 'KMeans.predict'], - times=[fit_time, predict_time], accuracy_type='inertia', - accuracies=[train_inertia, test_inertia], data=[X_train, X_test]) + times=[fit_time, predict_time], metric_type='inertia', + metrics=[train_inertia, test_inertia], data=[X_train, X_test]) diff --git a/daal4py_bench/linear.py b/daal4py_bench/linear.py index 0b62a42a5..8cf076427 100644 --- a/daal4py_bench/linear.py +++ b/daal4py_bench/linear.py @@ -68,5 +68,5 @@ def test_predict(Xp, model): bench.print_output(library='daal4py', algorithm='linear_regression', stages=['training', 'prediction'], params=params, functions=['Linear.fit', 'Linear.predict'], - times=[fit_time, predict_time], accuracy_type='rmse', - accuracies=[train_rmse, test_rmse], data=[X_train, X_test]) + times=[fit_time, predict_time], metric_type='rmse', + metrics=[train_rmse, test_rmse], data=[X_train, X_test]) diff --git a/daal4py_bench/pca.py b/daal4py_bench/pca.py index 81161d1bd..98fb12e61 100644 --- a/daal4py_bench/pca.py +++ b/daal4py_bench/pca.py @@ -142,7 +142,7 @@ def test_transform(Xp, pca_result, eigenvalues, eigenvectors): bench.print_output(library='daal4py', algorithm='pca', stages=['training', 'transformation'], params=params, functions=['PCA.fit', 'PCA.transform'], - times=[fit_time, transform_time], accuracy_type=None, - accuracies=[None, None], data=[X_train, X_test], + times=[fit_time, transform_time], metric_type=None, + metrics=[None, None], data=[X_train, X_test], alg_params={'svd_solver': params.svd_solver, 'n_components': params.n_components}) diff --git a/daal4py_bench/ridge.py b/daal4py_bench/ridge.py index 1a04edf78..38722cb60 100644 --- a/daal4py_bench/ridge.py +++ b/daal4py_bench/ridge.py @@ -64,5 +64,5 @@ def test_predict(Xp, model): bench.print_output(library='daal4py', algorithm='ridge_regression', stages=['training', 'prediction'], params=params, functions=['Ridge.fit', 'Ridge.predict'], - times=[fit_time, predict_time], accuracy_type='rmse', - accuracies=[train_rmse, test_rmse], data=[X_train, X_test]) + times=[fit_time, predict_time], metric_type='rmse', + metrics=[train_rmse, test_rmse], data=[X_train, X_test]) diff --git a/modelbuilders_bench/lgbm_mb.py b/modelbuilders_bench/lgbm_mb.py index 2b4c29616..7eca91ce3 100644 --- a/modelbuilders_bench/lgbm_mb.py +++ b/modelbuilders_bench/lgbm_mb.py @@ -146,6 +146,6 @@ params=params, functions=['lgbm_dataset', 'lgbm_dataset', 'lgbm_train', 'lgbm_predict', 'lgbm_to_daal', 'daal_compute'], times=[t_creat_train, t_train, t_creat_test, t_lgbm_pred, t_trans, t_daal_pred], - accuracy_type=metric_name, accuracies=[train_metric, test_metric_lgbm, + metric_type=metric_name, metrics=[train_metric, test_metric_lgbm, test_metric_daal], data=[X_train, X_test, X_test]) diff --git a/modelbuilders_bench/mb_utils.py b/modelbuilders_bench/mb_utils.py index d66adaac8..2d659dc09 100644 --- a/modelbuilders_bench/mb_utils.py +++ b/modelbuilders_bench/mb_utils.py @@ -37,7 +37,7 @@ def get_accuracy(true_labels, prediction): def print_output(library, algorithm, stages, params, functions, - times, accuracy_type, accuracies, data): + times, metric_type, metrics, data): if params.output_format == 'json': output = [] output.append({ @@ -67,7 +67,7 @@ def print_output(library, algorithm, stages, params, functions, else: result.update({'matrix_creation_time': times[2 * i], 'prediction_time': times[2 * i + 1]}) - if accuracies[i] is not None: - result.update({f'{accuracy_type}': accuracies[i]}) + if metrics[i] is not None: + result.update({f'{metric_type}': metrics[i]}) output.append(result) print(json.dumps(output, indent=4)) diff --git a/modelbuilders_bench/xgb_mb.py b/modelbuilders_bench/xgb_mb.py index 92764142d..54bf4bd95 100644 --- a/modelbuilders_bench/xgb_mb.py +++ b/modelbuilders_bench/xgb_mb.py @@ -196,6 +196,6 @@ def predict(dmatrix): # type: ignore 'daal4py.get_gbt_model_from_xgboost', 'daal4py.compute'], times=[t_creat_train, fit_time, t_creat_test, predict_time, transform_time, predict_time_daal], - accuracy_type=metric_name, - accuracies=[None, train_metric, None, test_metric, None, test_metric_daal], + metric_type=metric_name, + metrics=[None, train_metric, None, test_metric, None, test_metric_daal], data=[X_train, X_train, X_test, X_test, X_test, X_test]) diff --git a/sklearn_bench/dbscan.py b/sklearn_bench/dbscan.py index 8f941c01f..0ef445f7b 100644 --- a/sklearn_bench/dbscan.py +++ b/sklearn_bench/dbscan.py @@ -43,7 +43,7 @@ def main(): bench.print_output(library='sklearn', algorithm='dbscan', stages=['training'], params=params, functions=['DBSCAN'], times=[time], - accuracies=[acc], accuracy_type='davies_bouldin_score', + metrics=[acc], metric_type='davies_bouldin_score', data=[X], alg_instance=dbscan) diff --git a/sklearn_bench/df_clsf.py b/sklearn_bench/df_clsf.py index 5102827ec..30610a7bd 100644 --- a/sklearn_bench/df_clsf.py +++ b/sklearn_bench/df_clsf.py @@ -59,8 +59,8 @@ def main(): params=params, functions=['df_clsf.fit', 'df_clsf.predict'], times=[fit_time, predict_time], - accuracy_type=['accuracy', 'log_loss', 'roc_auc'], - accuracies=[ + metric_type=['accuracy', 'log_loss', 'roc_auc'], + metrics=[ [train_acc, test_acc], [train_log_loss, test_log_loss], [train_roc_auc, test_roc_auc], diff --git a/sklearn_bench/df_regr.py b/sklearn_bench/df_regr.py index 61a24f918..2d12c65f7 100644 --- a/sklearn_bench/df_regr.py +++ b/sklearn_bench/df_regr.py @@ -55,8 +55,8 @@ def main(): params=params, functions=['df_regr.fit', 'df_regr.predict'], times=[fit_time, predict_time], - accuracy_type=['rmse', 'r2_score'], - accuracies=[[train_rmse, test_rmse], [train_r2, test_r2]], + metric_type=['rmse', 'r2_score'], + metrics=[[train_rmse, test_rmse], [train_r2, test_r2]], data=[X_train, X_test], alg_instance=regr, ) diff --git a/sklearn_bench/distances.py b/sklearn_bench/distances.py index a1eeaa989..c708513d1 100644 --- a/sklearn_bench/distances.py +++ b/sklearn_bench/distances.py @@ -30,7 +30,7 @@ def main(): bench.print_output(library='sklearn', algorithm='distances', stages=['computation'], params=params, functions=[params.metric.capitalize()], - times=[time], accuracy_type=None, accuracies=[None], data=[X], + times=[time], metric_type=None, metrics=[None], data=[X], alg_params={'metric': params.metric}) diff --git a/sklearn_bench/elasticnet.py b/sklearn_bench/elasticnet.py index 082963146..764995e0b 100755 --- a/sklearn_bench/elasticnet.py +++ b/sklearn_bench/elasticnet.py @@ -49,8 +49,8 @@ def main(): params=params, functions=['ElasticNet.fit', 'ElasticNet.predict'], times=[fit_time, predict_time], - accuracy_type=['rmse', 'r2_score'], - accuracies=[[train_rmse, test_rmse], [train_r2, test_r2]], + metric_type=['rmse', 'r2_score'], + metrics=[[train_rmse, test_rmse], [train_r2, test_r2]], data=[X_train, X_train], alg_instance=regr, ) diff --git a/sklearn_bench/kmeans.py b/sklearn_bench/kmeans.py index 0fa96b0dc..6028cd1e7 100644 --- a/sklearn_bench/kmeans.py +++ b/sklearn_bench/kmeans.py @@ -73,8 +73,8 @@ def fit_kmeans(X, X_init): params=params, functions=['KMeans.fit', 'KMeans.predict'], times=[fit_time, predict_time], - accuracy_type=['davies_bouldin_score', 'inertia'], - accuracies=[[acc_train, acc_test], [kmeans.inertia_, kmeans.inertia_]], + metric_type=['davies_bouldin_score', 'inertia'], + metrics=[[acc_train, acc_test], [kmeans.inertia_, kmeans.inertia_]], data=[X_train, X_test], alg_instance=kmeans, ) diff --git a/sklearn_bench/knn_clsf.py b/sklearn_bench/knn_clsf.py index 228932d5b..ad3de56dd 100755 --- a/sklearn_bench/knn_clsf.py +++ b/sklearn_bench/knn_clsf.py @@ -62,8 +62,8 @@ def main(): params=params, functions=['knn_clsf.fit', 'knn_clsf.predict'], times=[train_time, predict_time], - accuracy_type=['accuracy', 'log_loss', 'roc_auc'], - accuracies=[ + metric_type=['accuracy', 'log_loss', 'roc_auc'], + metrics=[ [train_acc, test_acc], [train_log_loss, test_log_loss], [train_roc_auc, test_roc_auc], @@ -79,8 +79,8 @@ def main(): params=params, functions=['knn_clsf.fit', 'knn_clsf.kneighbors'], times=[train_time, predict_time], - accuracy_type=None, - accuracies=[], + metric_type=None, + metrics=[], data=[X_train, X_test], alg_instance=knn_clsf, ) diff --git a/sklearn_bench/lasso.py b/sklearn_bench/lasso.py index 741060ccf..d0e10cb7c 100755 --- a/sklearn_bench/lasso.py +++ b/sklearn_bench/lasso.py @@ -49,8 +49,8 @@ def main(): params=params, functions=['Lasso.fit', 'Lasso.predict'], times=[fit_time, predict_time], - accuracy_type=['rmse', 'r2_score'], - accuracies=[[train_rmse, test_rmse], [train_r2, test_r2]], + metric_type=['rmse', 'r2_score'], + metrics=[[train_rmse, test_rmse], [train_r2, test_r2]], data=[X_train, X_test], alg_instance=regr, ) diff --git a/sklearn_bench/linear.py b/sklearn_bench/linear.py index 4da59e9ae..c7390efbe 100644 --- a/sklearn_bench/linear.py +++ b/sklearn_bench/linear.py @@ -46,8 +46,8 @@ def main(): stages=['training', 'prediction'], params=params, functions=['Linear.fit', 'Linear.predict'], times=[fit_time, predict_time], - accuracy_type=['rmse', 'r2_score'], - accuracies=[[train_rmse, test_rmse], [train_r2, test_r2]], + metric_type=['rmse', 'r2_score'], + metrics=[[train_rmse, test_rmse], [train_r2, test_r2]], data=[X_train, X_test], alg_instance=regr, ) diff --git a/sklearn_bench/log_reg.py b/sklearn_bench/log_reg.py index 9367286e7..eb7e65403 100644 --- a/sklearn_bench/log_reg.py +++ b/sklearn_bench/log_reg.py @@ -61,8 +61,8 @@ def main(): params=params, functions=['LogReg.fit', 'LogReg.predict'], times=[fit_time, predict_time], - accuracy_type=['accuracy', 'log_loss', 'roc_auc'], - accuracies=[ + metric_type=['accuracy', 'log_loss', 'roc_auc'], + metrics=[ [train_acc, test_acc], [train_log_loss, test_log_loss], [train_roc_auc, test_roc_auc], diff --git a/sklearn_bench/nusvc.py b/sklearn_bench/nusvc.py index a7001a3e9..1caab08a1 100644 --- a/sklearn_bench/nusvc.py +++ b/sklearn_bench/nusvc.py @@ -68,8 +68,8 @@ def main(): stages=['training', state_predict], params=params, functions=['NuSVC.fit', f'NuSVC.{state_predict}'], times=[fit_time, predict_train_time], - accuracy_type=['accuracy', 'log_loss', 'roc_auc'], - accuracies=[ + metric_type=['accuracy', 'log_loss', 'roc_auc'], + metrics=[ [train_acc, test_acc], [train_log_loss, test_log_loss], [train_roc_auc, test_roc_auc], diff --git a/sklearn_bench/nusvr.py b/sklearn_bench/nusvr.py index e439c7b28..ae400c93c 100644 --- a/sklearn_bench/nusvr.py +++ b/sklearn_bench/nusvr.py @@ -58,8 +58,8 @@ def main(): params=params, functions=['NuSVR.fit', 'NuSVR.predict'], times=[fit_time, predict_train_time], - accuracy_type=['rmse', 'r2_score'], - accuracies=[[train_rmse, test_rmse], [train_r2, test_r2]], + metric_type=['rmse', 'r2_score'], + metrics=[[train_rmse, test_rmse], [train_r2, test_r2]], data=[X_train, X_train], alg_instance=regr, ) diff --git a/sklearn_bench/pca.py b/sklearn_bench/pca.py index 367638633..1c031674d 100644 --- a/sklearn_bench/pca.py +++ b/sklearn_bench/pca.py @@ -48,7 +48,7 @@ def main(): functions=['PCA.fit', 'PCA.transform'], times=[fit_time, transform_time], accuracy_type='noise_variance', - accuracies=[pca.noise_variance_, pca.noise_variance_], + metrics=[pca.noise_variance_, pca.noise_variance_], data=[X_train, X_test], alg_instance=pca, ) diff --git a/sklearn_bench/ridge.py b/sklearn_bench/ridge.py index 48489086c..3b8f138d2 100644 --- a/sklearn_bench/ridge.py +++ b/sklearn_bench/ridge.py @@ -49,8 +49,8 @@ def main(): params=params, functions=['Ridge.fit', 'Ridge.predict'], times=[fit_time, predict_time], - accuracy_type=['rmse', 'r2_score'], - accuracies=[[train_rmse, test_rmse], [train_r2, test_r2]], + metric_type=['rmse', 'r2_score'], + metrics=[[train_rmse, test_rmse], [train_r2, test_r2]], data=[X_train, X_test], alg_instance=regr, ) diff --git a/sklearn_bench/svm.py b/sklearn_bench/svm.py index 542ccccb6..ba46fb3f7 100644 --- a/sklearn_bench/svm.py +++ b/sklearn_bench/svm.py @@ -69,8 +69,8 @@ def main(): params=params, functions=['SVM.fit', f'SVM.{state_predict}'], times=[fit_time, predict_train_time], - accuracy_type=['accuracy', 'log_loss', 'roc_auc'], - accuracies=[ + metric_type=['accuracy', 'log_loss', 'roc_auc'], + metrics=[ [train_acc, test_acc], [train_log_loss, test_log_loss], [train_roc_auc, test_roc_auc], diff --git a/sklearn_bench/svr.py b/sklearn_bench/svr.py index f9bf9407c..39b67a84f 100644 --- a/sklearn_bench/svr.py +++ b/sklearn_bench/svr.py @@ -58,8 +58,8 @@ def main(): params=params, functions=['SVR.fit', 'SVR.predict'], times=[fit_time, predict_train_time], - accuracy_type=['rmse', 'r2_score'], - accuracies=[[train_rmse, test_rmse], [train_r2, test_r2]], + metric_type=['rmse', 'r2_score'], + metrics=[[train_rmse, test_rmse], [train_r2, test_r2]], data=[X_train, X_train], alg_instance=regr, ) diff --git a/sklearn_bench/train_test_split.py b/sklearn_bench/train_test_split.py index aac1ec2e3..046719b48 100644 --- a/sklearn_bench/train_test_split.py +++ b/sklearn_bench/train_test_split.py @@ -47,8 +47,8 @@ def main(): bench.print_output(library='sklearn', algorithm='train_test_split', stages=['training'], params=params, - functions=['train_test_split'], times=[time], accuracies=[None], - accuracy_type=None, data=[X], alg_params=tts_params) + functions=['train_test_split'], times=[time], metrics=[None], + metric_type=None, data=[X], alg_params=tts_params) if __name__ == "__main__": diff --git a/xgboost_bench/gbt.py b/xgboost_bench/gbt.py index aa54a094d..8540f4f5a 100644 --- a/xgboost_bench/gbt.py +++ b/xgboost_bench/gbt.py @@ -176,6 +176,6 @@ def predict(dmatrix): # type: ignore bench.print_output(library='xgboost', algorithm=f'gradient_boosted_trees_{task}', stages=['training', 'prediction'], params=params, functions=['gbt.fit', 'gbt.predict'], - times=[fit_time, predict_time], accuracy_type=metric_name, - accuracies=[train_metric, test_metric], data=[X_train, X_test], + times=[fit_time, predict_time], metric_type=metric_name, + metrics=[train_metric, test_metric], data=[X_train, X_test], alg_instance=booster, alg_params=xgb_params) From a4f3b701ad93a259517429ba6440039a0e532933 Mon Sep 17 00:00:00 2001 From: OnlyDeniko Date: Mon, 2 Aug 2021 17:22:26 +0300 Subject: [PATCH 17/27] pep8 --- bench.py | 2 +- modelbuilders_bench/lgbm_mb.py | 15 +++++++++------ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/bench.py b/bench.py index 5a4a0cd4b..0266ba2bd 100644 --- a/bench.py +++ b/bench.py @@ -372,7 +372,7 @@ def davies_bouldin_score(y, yp): yp = convert_to_numpy(yp) try: res = sklearn_dbs(y, yp) - except ValueError: # Number of labels is 1 + except ValueError: # Number of labels is 1 res = "Error" return res diff --git a/modelbuilders_bench/lgbm_mb.py b/modelbuilders_bench/lgbm_mb.py index 7eca91ce3..d70f46772 100644 --- a/modelbuilders_bench/lgbm_mb.py +++ b/modelbuilders_bench/lgbm_mb.py @@ -141,11 +141,14 @@ test_metric_daal = metric_func(y_test, daal_pred.prediction) utils.print_output( - library='modelbuilders', algorithm=f'lightgbm_{task}_and_modelbuilder', + library='modelbuilders', + algorithm=f'lightgbm_{task}_and_modelbuilder', stages=['lgbm_train', 'lgbm_predict', 'daal4py_predict'], - params=params, functions=['lgbm_dataset', 'lgbm_dataset', 'lgbm_train', - 'lgbm_predict', 'lgbm_to_daal', 'daal_compute'], + params=params, + functions=['lgbm_dataset', 'lgbm_dataset', 'lgbm_train', + 'lgbm_predict', 'lgbm_to_daal', 'daal_compute'], times=[t_creat_train, t_train, t_creat_test, t_lgbm_pred, t_trans, t_daal_pred], - metric_type=metric_name, metrics=[train_metric, test_metric_lgbm, - test_metric_daal], - data=[X_train, X_test, X_test]) + metric_type=metric_name, + metrics=[train_metric, test_metric_lgbm, test_metric_daal], + data=[X_train, X_test, X_test], +) From 3afb8b77d2163cfcaa4952ad632858ce5e8e4f6f Mon Sep 17 00:00:00 2001 From: OnlyDeniko Date: Mon, 2 Aug 2021 17:27:12 +0300 Subject: [PATCH 18/27] pca --- sklearn_bench/pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn_bench/pca.py b/sklearn_bench/pca.py index 1c031674d..356bfed09 100644 --- a/sklearn_bench/pca.py +++ b/sklearn_bench/pca.py @@ -47,7 +47,7 @@ def main(): params=params, functions=['PCA.fit', 'PCA.transform'], times=[fit_time, transform_time], - accuracy_type='noise_variance', + metric_type='noise_variance', metrics=[pca.noise_variance_, pca.noise_variance_], data=[X_train, X_test], alg_instance=pca, From 501d3283e3cec2ae7a2886cf8d9f2e3a8bca9701 Mon Sep 17 00:00:00 2001 From: OnlyDeniko Date: Mon, 2 Aug 2021 18:29:14 +0300 Subject: [PATCH 19/27] roc_auc details section --- bench.py | 10 ++++++---- sklearn_bench/df_clsf.py | 10 ++++++---- sklearn_bench/knn_clsf.py | 10 ++++++---- sklearn_bench/log_reg.py | 10 ++++++---- sklearn_bench/nusvc.py | 12 ++++++++---- sklearn_bench/svm.py | 12 ++++++++---- 6 files changed, 40 insertions(+), 24 deletions(-) diff --git a/bench.py b/bench.py index 0266ba2bd..5544709bb 100644 --- a/bench.py +++ b/bench.py @@ -324,7 +324,7 @@ def convert_to_numpy(data): return data -def columnwise_score(y, yp, score_func, **params): +def columnwise_score(y, yp, score_func): y = convert_to_numpy(y) yp = convert_to_numpy(yp) if y.ndim + yp.ndim > 2: @@ -334,8 +334,8 @@ def columnwise_score(y, yp, score_func, **params): if yp.ndim > 1: yp = yp[:, 0] else: - return [score_func(y[i], yp[i], **params) for i in range(y.shape[1])] - return score_func(y, yp, **params) + return [score_func(y[i], yp[i]) for i in range(y.shape[1])] + return score_func(y, yp) def accuracy_score(y, yp): @@ -351,7 +351,9 @@ def log_loss(y, yp): def roc_auc_score(y, yp, multi_class='ovr'): from sklearn.metrics import roc_auc_score as sklearn_roc_auc - return columnwise_score(y, yp, sklearn_roc_auc, multi_class=multi_class) + y = convert_to_numpy(y) + yp = convert_to_numpy(yp) + return sklearn_roc_auc(y, yp, multi_class=multi_class) def rmse_score(y, yp): diff --git a/sklearn_bench/df_clsf.py b/sklearn_bench/df_clsf.py index 30610a7bd..5212057b3 100644 --- a/sklearn_bench/df_clsf.py +++ b/sklearn_bench/df_clsf.py @@ -42,15 +42,17 @@ def main(): fit_time, _ = bench.measure_function_time(clf.fit, X_train, y_train, params=params) y_pred = clf.predict(X_train) + y_proba = clf.predict_proba(X_train) train_acc = bench.accuracy_score(y_train, y_pred) - train_log_loss = bench.log_loss(y_train, clf.predict_proba(X_train)) - train_roc_auc = bench.roc_auc_score(y_train, y_pred) + train_log_loss = bench.log_loss(y_train, y_proba) + train_roc_auc = bench.roc_auc_score(y_train, y_proba[:,1]) predict_time, y_pred = bench.measure_function_time( clf.predict, X_test, params=params) + y_proba = clf.predict_proba(X_test) test_acc = bench.accuracy_score(y_test, y_pred) - test_log_loss = bench.log_loss(y_test, clf.predict_proba(X_test)) - test_roc_auc = bench.roc_auc_score(y_test, y_pred) + test_log_loss = bench.log_loss(y_test, y_proba) + test_roc_auc = bench.roc_auc_score(y_test, y_proba[:,1]) bench.print_output( library='sklearn', diff --git a/sklearn_bench/knn_clsf.py b/sklearn_bench/knn_clsf.py index ad3de56dd..b505410cd 100755 --- a/sklearn_bench/knn_clsf.py +++ b/sklearn_bench/knn_clsf.py @@ -39,17 +39,19 @@ def main(): knn_clsf.fit, X_train, y_train, params=params) if params.task == 'classification': y_pred = knn_clsf.predict(X_train) + y_proba = knn_clsf.predict_proba(X_train) train_acc = bench.accuracy_score(y_train, y_pred) - train_log_loss = bench.log_loss(y_train, knn_clsf.predict_proba(X_train)) - train_roc_auc = bench.roc_auc_score(y_train, y_pred) + train_log_loss = bench.log_loss(y_train, y_proba) + train_roc_auc = bench.roc_auc_score(y_train, y_proba[:,1]) # Measure time and accuracy on prediction if params.task == 'classification': predict_time, yp = bench.measure_function_time(knn_clsf.predict, X_test, params=params) + y_proba = knn_clsf.predict_proba(X_test) test_acc = bench.accuracy_score(y_test, yp) - test_log_loss = bench.log_loss(y_test, knn_clsf.predict_proba(X_test)) - test_roc_auc = bench.roc_auc_score(y_test, yp) + test_log_loss = bench.log_loss(y_test, y_proba) + test_roc_auc = bench.roc_auc_score(y_test, y_proba[:,1]) else: predict_time, _ = bench.measure_function_time(knn_clsf.kneighbors, X_test, params=params) diff --git a/sklearn_bench/log_reg.py b/sklearn_bench/log_reg.py index eb7e65403..8337942aa 100644 --- a/sklearn_bench/log_reg.py +++ b/sklearn_bench/log_reg.py @@ -44,15 +44,17 @@ def main(): fit_time, _ = bench.measure_function_time(clf.fit, X_train, y_train, params=params) y_pred = clf.predict(X_train) + y_proba = clf.predict_proba(X_train) train_acc = bench.accuracy_score(y_train, y_pred) - train_log_loss = bench.log_loss(y_train, clf.predict_proba(X_train)) - train_roc_auc = bench.roc_auc_score(y_train, y_pred) + train_log_loss = bench.log_loss(y_train, y_proba) + train_roc_auc = bench.roc_auc_score(y_train, y_proba[:,1]) predict_time, y_pred = bench.measure_function_time( clf.predict, X_test, params=params) + y_proba = clf.predict_proba(X_test) test_acc = bench.accuracy_score(y_test, y_pred) - test_log_loss = bench.log_loss(y_test, clf.predict_proba(X_test)) - test_roc_auc = bench.roc_auc_score(y_test, y_pred) + test_log_loss = bench.log_loss(y_test, y_proba) + test_roc_auc = bench.roc_auc_score(y_test, y_proba[:,1]) bench.print_output( library='sklearn', diff --git a/sklearn_bench/nusvc.py b/sklearn_bench/nusvc.py index 1caab08a1..daee5f640 100644 --- a/sklearn_bench/nusvc.py +++ b/sklearn_bench/nusvc.py @@ -44,23 +44,27 @@ def main(): if params.probability: state_predict = 'predict_proba' clf_predict = clf.predict_proba - train_log_loss = bench.log_loss(y_train, clf.predict_proba(X_train)) - test_log_loss = bench.log_loss(y_test, clf.predict_proba(X_test)) + y_proba_train = clf_predict(X_train) + y_proba_test = clf_predict(X_test) + train_log_loss = bench.log_loss(y_train, y_proba_train) + test_log_loss = bench.log_loss(y_test, y_proba_test) + train_roc_auc = bench.roc_auc_score(y_train, y_proba_train[:,1]) + test_roc_auc = bench.roc_auc_score(y_test, y_proba_test[:,1]) else: state_predict = 'prediction' clf_predict = clf.predict train_log_loss = None test_log_loss = None + train_roc_auc = None + test_roc_auc = None predict_train_time, y_pred = bench.measure_function_time( clf_predict, X_train, params=params) train_acc = bench.accuracy_score(y_train, y_pred) - train_roc_auc = bench.roc_auc_score(y_train, y_pred) _, y_pred = bench.measure_function_time( clf_predict, X_test, params=params) test_acc = bench.accuracy_score(y_test, y_pred) - test_roc_auc = bench.roc_auc_score(y_test, y_pred) bench.print_output( library='sklearn', diff --git a/sklearn_bench/svm.py b/sklearn_bench/svm.py index ba46fb3f7..dda1541e5 100644 --- a/sklearn_bench/svm.py +++ b/sklearn_bench/svm.py @@ -44,23 +44,27 @@ def main(): if params.probability: state_predict = 'predict_proba' clf_predict = clf.predict_proba - train_log_loss = bench.log_loss(y_train, clf_predict(X_train)) - test_log_loss = bench.log_loss(y_test, clf_predict(X_test)) + y_proba_train = clf_predict(X_train) + y_proba_test = clf_predict(X_test) + train_log_loss = bench.log_loss(y_train, y_proba_train) + test_log_loss = bench.log_loss(y_test, y_proba_test) + train_roc_auc = bench.roc_auc_score(y_train, y_proba_train[:,1]) + test_roc_auc = bench.roc_auc_score(y_test, y_proba_test[:,1]) else: state_predict = 'prediction' clf_predict = clf.predict train_log_loss = None test_log_loss = None + train_roc_auc = None + test_roc_auc = None predict_train_time, y_pred = bench.measure_function_time( clf_predict, X_train, params=params) train_acc = bench.accuracy_score(y_train, y_pred) - train_roc_auc = bench.roc_auc_score(y_train, y_pred) _, y_pred = bench.measure_function_time( clf_predict, X_test, params=params) test_acc = bench.accuracy_score(y_test, y_pred) - test_roc_auc = bench.roc_auc_score(y_test, y_pred) bench.print_output( library='sklearn', From 44124444a496f96385851c807d32da856173041f Mon Sep 17 00:00:00 2001 From: OnlyDeniko Date: Mon, 2 Aug 2021 18:32:06 +0300 Subject: [PATCH 20/27] pep8 --- sklearn_bench/df_clsf.py | 4 ++-- sklearn_bench/knn_clsf.py | 4 ++-- sklearn_bench/log_reg.py | 4 ++-- sklearn_bench/nusvc.py | 4 ++-- sklearn_bench/svm.py | 4 ++-- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/sklearn_bench/df_clsf.py b/sklearn_bench/df_clsf.py index 5212057b3..e2508ae26 100644 --- a/sklearn_bench/df_clsf.py +++ b/sklearn_bench/df_clsf.py @@ -45,14 +45,14 @@ def main(): y_proba = clf.predict_proba(X_train) train_acc = bench.accuracy_score(y_train, y_pred) train_log_loss = bench.log_loss(y_train, y_proba) - train_roc_auc = bench.roc_auc_score(y_train, y_proba[:,1]) + train_roc_auc = bench.roc_auc_score(y_train, y_proba[:, 1]) predict_time, y_pred = bench.measure_function_time( clf.predict, X_test, params=params) y_proba = clf.predict_proba(X_test) test_acc = bench.accuracy_score(y_test, y_pred) test_log_loss = bench.log_loss(y_test, y_proba) - test_roc_auc = bench.roc_auc_score(y_test, y_proba[:,1]) + test_roc_auc = bench.roc_auc_score(y_test, y_proba[:, 1]) bench.print_output( library='sklearn', diff --git a/sklearn_bench/knn_clsf.py b/sklearn_bench/knn_clsf.py index b505410cd..42b2975f0 100755 --- a/sklearn_bench/knn_clsf.py +++ b/sklearn_bench/knn_clsf.py @@ -42,7 +42,7 @@ def main(): y_proba = knn_clsf.predict_proba(X_train) train_acc = bench.accuracy_score(y_train, y_pred) train_log_loss = bench.log_loss(y_train, y_proba) - train_roc_auc = bench.roc_auc_score(y_train, y_proba[:,1]) + train_roc_auc = bench.roc_auc_score(y_train, y_proba[:, 1]) # Measure time and accuracy on prediction if params.task == 'classification': @@ -51,7 +51,7 @@ def main(): y_proba = knn_clsf.predict_proba(X_test) test_acc = bench.accuracy_score(y_test, yp) test_log_loss = bench.log_loss(y_test, y_proba) - test_roc_auc = bench.roc_auc_score(y_test, y_proba[:,1]) + test_roc_auc = bench.roc_auc_score(y_test, y_proba[:, 1]) else: predict_time, _ = bench.measure_function_time(knn_clsf.kneighbors, X_test, params=params) diff --git a/sklearn_bench/log_reg.py b/sklearn_bench/log_reg.py index 8337942aa..58a7a98fe 100644 --- a/sklearn_bench/log_reg.py +++ b/sklearn_bench/log_reg.py @@ -47,14 +47,14 @@ def main(): y_proba = clf.predict_proba(X_train) train_acc = bench.accuracy_score(y_train, y_pred) train_log_loss = bench.log_loss(y_train, y_proba) - train_roc_auc = bench.roc_auc_score(y_train, y_proba[:,1]) + train_roc_auc = bench.roc_auc_score(y_train, y_proba[:, 1]) predict_time, y_pred = bench.measure_function_time( clf.predict, X_test, params=params) y_proba = clf.predict_proba(X_test) test_acc = bench.accuracy_score(y_test, y_pred) test_log_loss = bench.log_loss(y_test, y_proba) - test_roc_auc = bench.roc_auc_score(y_test, y_proba[:,1]) + test_roc_auc = bench.roc_auc_score(y_test, y_proba[:, 1]) bench.print_output( library='sklearn', diff --git a/sklearn_bench/nusvc.py b/sklearn_bench/nusvc.py index daee5f640..54665952f 100644 --- a/sklearn_bench/nusvc.py +++ b/sklearn_bench/nusvc.py @@ -48,8 +48,8 @@ def main(): y_proba_test = clf_predict(X_test) train_log_loss = bench.log_loss(y_train, y_proba_train) test_log_loss = bench.log_loss(y_test, y_proba_test) - train_roc_auc = bench.roc_auc_score(y_train, y_proba_train[:,1]) - test_roc_auc = bench.roc_auc_score(y_test, y_proba_test[:,1]) + train_roc_auc = bench.roc_auc_score(y_train, y_proba_train[:, 1]) + test_roc_auc = bench.roc_auc_score(y_test, y_proba_test[:, 1]) else: state_predict = 'prediction' clf_predict = clf.predict diff --git a/sklearn_bench/svm.py b/sklearn_bench/svm.py index dda1541e5..8680a7fc6 100644 --- a/sklearn_bench/svm.py +++ b/sklearn_bench/svm.py @@ -48,8 +48,8 @@ def main(): y_proba_test = clf_predict(X_test) train_log_loss = bench.log_loss(y_train, y_proba_train) test_log_loss = bench.log_loss(y_test, y_proba_test) - train_roc_auc = bench.roc_auc_score(y_train, y_proba_train[:,1]) - test_roc_auc = bench.roc_auc_score(y_test, y_proba_test[:,1]) + train_roc_auc = bench.roc_auc_score(y_train, y_proba_train[:, 1]) + test_roc_auc = bench.roc_auc_score(y_test, y_proba_test[:, 1]) else: state_predict = 'prediction' clf_predict = clf.predict From 6f5888693ff679ea58fbeb107a959861eb2c48cb Mon Sep 17 00:00:00 2001 From: OnlyDeniko Date: Mon, 2 Aug 2021 21:48:17 +0300 Subject: [PATCH 21/27] finally solve roc_auc trouble --- sklearn_bench/df_clsf.py | 4 ++-- sklearn_bench/knn_clsf.py | 4 ++-- sklearn_bench/log_reg.py | 4 ++-- sklearn_bench/nusvc.py | 4 ++-- sklearn_bench/svm.py | 4 ++-- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/sklearn_bench/df_clsf.py b/sklearn_bench/df_clsf.py index e2508ae26..95709340c 100644 --- a/sklearn_bench/df_clsf.py +++ b/sklearn_bench/df_clsf.py @@ -45,14 +45,14 @@ def main(): y_proba = clf.predict_proba(X_train) train_acc = bench.accuracy_score(y_train, y_pred) train_log_loss = bench.log_loss(y_train, y_proba) - train_roc_auc = bench.roc_auc_score(y_train, y_proba[:, 1]) + train_roc_auc = bench.roc_auc_score(y_train, y_proba) predict_time, y_pred = bench.measure_function_time( clf.predict, X_test, params=params) y_proba = clf.predict_proba(X_test) test_acc = bench.accuracy_score(y_test, y_pred) test_log_loss = bench.log_loss(y_test, y_proba) - test_roc_auc = bench.roc_auc_score(y_test, y_proba[:, 1]) + test_roc_auc = bench.roc_auc_score(y_test, y_proba) bench.print_output( library='sklearn', diff --git a/sklearn_bench/knn_clsf.py b/sklearn_bench/knn_clsf.py index 42b2975f0..ef581f537 100755 --- a/sklearn_bench/knn_clsf.py +++ b/sklearn_bench/knn_clsf.py @@ -42,7 +42,7 @@ def main(): y_proba = knn_clsf.predict_proba(X_train) train_acc = bench.accuracy_score(y_train, y_pred) train_log_loss = bench.log_loss(y_train, y_proba) - train_roc_auc = bench.roc_auc_score(y_train, y_proba[:, 1]) + train_roc_auc = bench.roc_auc_score(y_train, y_proba) # Measure time and accuracy on prediction if params.task == 'classification': @@ -51,7 +51,7 @@ def main(): y_proba = knn_clsf.predict_proba(X_test) test_acc = bench.accuracy_score(y_test, yp) test_log_loss = bench.log_loss(y_test, y_proba) - test_roc_auc = bench.roc_auc_score(y_test, y_proba[:, 1]) + test_roc_auc = bench.roc_auc_score(y_test, y_proba) else: predict_time, _ = bench.measure_function_time(knn_clsf.kneighbors, X_test, params=params) diff --git a/sklearn_bench/log_reg.py b/sklearn_bench/log_reg.py index 58a7a98fe..1053d3819 100644 --- a/sklearn_bench/log_reg.py +++ b/sklearn_bench/log_reg.py @@ -47,14 +47,14 @@ def main(): y_proba = clf.predict_proba(X_train) train_acc = bench.accuracy_score(y_train, y_pred) train_log_loss = bench.log_loss(y_train, y_proba) - train_roc_auc = bench.roc_auc_score(y_train, y_proba[:, 1]) + train_roc_auc = bench.roc_auc_score(y_train, y_proba) predict_time, y_pred = bench.measure_function_time( clf.predict, X_test, params=params) y_proba = clf.predict_proba(X_test) test_acc = bench.accuracy_score(y_test, y_pred) test_log_loss = bench.log_loss(y_test, y_proba) - test_roc_auc = bench.roc_auc_score(y_test, y_proba[:, 1]) + test_roc_auc = bench.roc_auc_score(y_test, y_proba) bench.print_output( library='sklearn', diff --git a/sklearn_bench/nusvc.py b/sklearn_bench/nusvc.py index 54665952f..4cdf66490 100644 --- a/sklearn_bench/nusvc.py +++ b/sklearn_bench/nusvc.py @@ -48,8 +48,8 @@ def main(): y_proba_test = clf_predict(X_test) train_log_loss = bench.log_loss(y_train, y_proba_train) test_log_loss = bench.log_loss(y_test, y_proba_test) - train_roc_auc = bench.roc_auc_score(y_train, y_proba_train[:, 1]) - test_roc_auc = bench.roc_auc_score(y_test, y_proba_test[:, 1]) + train_roc_auc = bench.roc_auc_score(y_train, y_proba_train) + test_roc_auc = bench.roc_auc_score(y_test, y_proba_test) else: state_predict = 'prediction' clf_predict = clf.predict diff --git a/sklearn_bench/svm.py b/sklearn_bench/svm.py index 8680a7fc6..66356e741 100644 --- a/sklearn_bench/svm.py +++ b/sklearn_bench/svm.py @@ -48,8 +48,8 @@ def main(): y_proba_test = clf_predict(X_test) train_log_loss = bench.log_loss(y_train, y_proba_train) test_log_loss = bench.log_loss(y_test, y_proba_test) - train_roc_auc = bench.roc_auc_score(y_train, y_proba_train[:, 1]) - test_roc_auc = bench.roc_auc_score(y_test, y_proba_test[:, 1]) + train_roc_auc = bench.roc_auc_score(y_train, y_proba_train) + test_roc_auc = bench.roc_auc_score(y_test, y_proba_test) else: state_predict = 'prediction' clf_predict = clf.predict From 9fa4a9457e3ac14f5f2078e9658e92aca11972a5 Mon Sep 17 00:00:00 2001 From: OnlyDeniko Date: Mon, 2 Aug 2021 22:01:04 +0300 Subject: [PATCH 22/27] add kmeans.iter_ & done metrics in bench --- bench.py | 65 ++++++++++++++++++----------------------- sklearn_bench/kmeans.py | 8 +++-- 2 files changed, 34 insertions(+), 39 deletions(-) diff --git a/bench.py b/bench.py index 5544709bb..13cc05af4 100644 --- a/bench.py +++ b/bench.py @@ -324,58 +324,49 @@ def convert_to_numpy(data): return data -def columnwise_score(y, yp, score_func): - y = convert_to_numpy(y) - yp = convert_to_numpy(yp) - if y.ndim + yp.ndim > 2: - if 1 in (y.shape + yp.shape)[1:]: - if y.ndim > 1: - y = y[:, 0] - if yp.ndim > 1: - yp = yp[:, 0] - else: - return [score_func(y[i], yp[i]) for i in range(y.shape[1])] - return score_func(y, yp) - - -def accuracy_score(y, yp): - return columnwise_score(y, yp, lambda y1, y2: np.mean(y1 == y2)) +def accuracy_score(y_true, y_pred): + from sklearn.metrics import accuracy_score as sklearn_accuracy + y_true = convert_to_numpy(y_true) + y_pred = convert_to_numpy(y_pred) + return sklearn_accuracy(y_true, y_pred) -def log_loss(y, yp): +def log_loss(y_true, y_pred): from sklearn.metrics import log_loss as sklearn_log_loss - y = convert_to_numpy(y) - yp = convert_to_numpy(yp) - return sklearn_log_loss(y, yp) + y_true = convert_to_numpy(y_true) + y_pred = convert_to_numpy(y_pred) + return sklearn_log_loss(y_true, y_pred) -def roc_auc_score(y, yp, multi_class='ovr'): +def roc_auc_score(y_true, y_pred, multi_class='ovr'): from sklearn.metrics import roc_auc_score as sklearn_roc_auc - y = convert_to_numpy(y) - yp = convert_to_numpy(yp) - return sklearn_roc_auc(y, yp, multi_class=multi_class) + y_true = convert_to_numpy(y_true) + y_pred = convert_to_numpy(y_pred) + return sklearn_roc_auc(y_true, y_pred, multi_class=multi_class) -def rmse_score(y, yp): - return columnwise_score( - y, yp, lambda y1, y2: float(np.sqrt(np.mean((y1 - y2)**2)))) +def rmse_score(y_true, y_pred, squared=False): + from sklearn.metrics import mean_squared_error as sklearn_mse + y_true = convert_to_numpy(y_true) + y_pred = convert_to_numpy(y_pred) + return sklearn_mse(y_true, y_pred, squared=squared) -def r2_score(y, yp): +def r2_score(y_true, y_pred): from sklearn.metrics import r2_score as sklearn_r2_score - y = convert_to_numpy(y) - yp = convert_to_numpy(yp) - return sklearn_r2_score(y, yp) + y_true = convert_to_numpy(y_true) + y_pred = convert_to_numpy(y_pred) + return sklearn_r2_score(y_true, y_pred) -def davies_bouldin_score(y, yp): +def davies_bouldin_score(y_true, y_pred): from sklearn.metrics.cluster import davies_bouldin_score as sklearn_dbs - y = convert_to_numpy(y) - yp = convert_to_numpy(yp) + y_true = convert_to_numpy(y_true) + y_pred = convert_to_numpy(y_pred) try: - res = sklearn_dbs(y, yp) - except ValueError: # Number of labels is 1 - res = "Error" + res = sklearn_dbs(y_true, y_pred) + except ValueError: + res = "Number of labels is 1" return res diff --git a/sklearn_bench/kmeans.py b/sklearn_bench/kmeans.py index 6028cd1e7..2329adc1e 100644 --- a/sklearn_bench/kmeans.py +++ b/sklearn_bench/kmeans.py @@ -73,8 +73,12 @@ def fit_kmeans(X, X_init): params=params, functions=['KMeans.fit', 'KMeans.predict'], times=[fit_time, predict_time], - metric_type=['davies_bouldin_score', 'inertia'], - metrics=[[acc_train, acc_test], [kmeans.inertia_, kmeans.inertia_]], + metric_type=['davies_bouldin_score', 'inertia', 'iter'], + metrics=[ + [acc_train, acc_test], + [kmeans.inertia_, kmeans.inertia_], + [kmeans.iter_, kmeans.iter_] + ], data=[X_train, X_test], alg_instance=kmeans, ) From 070c4d22063e9ad40660e9885cd82d62541eb684 Mon Sep 17 00:00:00 2001 From: OnlyDeniko Date: Mon, 2 Aug 2021 22:03:26 +0300 Subject: [PATCH 23/27] n_iter_ --- sklearn_bench/kmeans.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn_bench/kmeans.py b/sklearn_bench/kmeans.py index 2329adc1e..b522a0e92 100644 --- a/sklearn_bench/kmeans.py +++ b/sklearn_bench/kmeans.py @@ -77,7 +77,7 @@ def fit_kmeans(X, X_init): metrics=[ [acc_train, acc_test], [kmeans.inertia_, kmeans.inertia_], - [kmeans.iter_, kmeans.iter_] + [kmeans.n_iter_, kmeans.n_iter_] ], data=[X_train, X_test], alg_instance=kmeans, From c7b7a5c2fdee13405fff414e78c2340ecdffc995 Mon Sep 17 00:00:00 2001 From: OnlyDeniko Date: Mon, 2 Aug 2021 22:14:01 +0300 Subject: [PATCH 24/27] stay columnwise_score because of xgb --- bench.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/bench.py b/bench.py index 13cc05af4..c857f9fdd 100644 --- a/bench.py +++ b/bench.py @@ -324,11 +324,22 @@ def convert_to_numpy(data): return data +def columnwise_score(y, yp, score_func): + y = convert_to_numpy(y) + yp = convert_to_numpy(yp) + if y.ndim + yp.ndim > 2: + if 1 in (y.shape + yp.shape)[1:]: + if y.ndim > 1: + y = y[:, 0] + if yp.ndim > 1: + yp = yp[:, 0] + else: + return [score_func(y[i], yp[i]) for i in range(y.shape[1])] + return score_func(y, yp) + + def accuracy_score(y_true, y_pred): - from sklearn.metrics import accuracy_score as sklearn_accuracy - y_true = convert_to_numpy(y_true) - y_pred = convert_to_numpy(y_pred) - return sklearn_accuracy(y_true, y_pred) + return columnwise_score(y_true, y_pred, lambda y1, y2: np.mean(y1 == y2)) def log_loss(y_true, y_pred): @@ -345,11 +356,9 @@ def roc_auc_score(y_true, y_pred, multi_class='ovr'): return sklearn_roc_auc(y_true, y_pred, multi_class=multi_class) -def rmse_score(y_true, y_pred, squared=False): - from sklearn.metrics import mean_squared_error as sklearn_mse - y_true = convert_to_numpy(y_true) - y_pred = convert_to_numpy(y_pred) - return sklearn_mse(y_true, y_pred, squared=squared) +def rmse_score(y_true, y_pred): + return columnwise_score( + y_true, y_pred, lambda y1, y2: float(np.sqrt(np.mean((y1 - y2)**2)))) def r2_score(y_true, y_pred): From 8c8f96496c1286f6886c95ba4457c527a4d370df Mon Sep 17 00:00:00 2001 From: OnlyDeniko Date: Mon, 2 Aug 2021 22:49:50 +0300 Subject: [PATCH 25/27] roc_auc_score binary case --- bench.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bench.py b/bench.py index c857f9fdd..94130b85d 100644 --- a/bench.py +++ b/bench.py @@ -353,6 +353,8 @@ def roc_auc_score(y_true, y_pred, multi_class='ovr'): from sklearn.metrics import roc_auc_score as sklearn_roc_auc y_true = convert_to_numpy(y_true) y_pred = convert_to_numpy(y_pred) + if y_pred.shape[1] == 2: # binary case + y_pred = y_pred[:, 1] return sklearn_roc_auc(y_true, y_pred, multi_class=multi_class) From ff9da3bd91248a82b0332867d1cbd5c366f87a17 Mon Sep 17 00:00:00 2001 From: OnlyDeniko Date: Tue, 3 Aug 2021 09:34:45 +0300 Subject: [PATCH 26/27] add n_sv in svms --- sklearn_bench/nusvc.py | 3 ++- sklearn_bench/nusvr.py | 8 ++++++-- sklearn_bench/svm.py | 3 ++- sklearn_bench/svr.py | 8 ++++++-- 4 files changed, 16 insertions(+), 6 deletions(-) diff --git a/sklearn_bench/nusvc.py b/sklearn_bench/nusvc.py index 4cdf66490..d98b184df 100644 --- a/sklearn_bench/nusvc.py +++ b/sklearn_bench/nusvc.py @@ -72,11 +72,12 @@ def main(): stages=['training', state_predict], params=params, functions=['NuSVC.fit', f'NuSVC.{state_predict}'], times=[fit_time, predict_train_time], - metric_type=['accuracy', 'log_loss', 'roc_auc'], + metric_type=['accuracy', 'log_loss', 'roc_auc', 'n_sv'], metrics=[ [train_acc, test_acc], [train_log_loss, test_log_loss], [train_roc_auc, test_roc_auc], + [int(clf.n_support_.sum()), int(clf.n_support_.sum())], ], data=[X_train, X_train], alg_instance=clf, diff --git a/sklearn_bench/nusvr.py b/sklearn_bench/nusvr.py index ae400c93c..d31b7d26e 100644 --- a/sklearn_bench/nusvr.py +++ b/sklearn_bench/nusvr.py @@ -58,8 +58,12 @@ def main(): params=params, functions=['NuSVR.fit', 'NuSVR.predict'], times=[fit_time, predict_train_time], - metric_type=['rmse', 'r2_score'], - metrics=[[train_rmse, test_rmse], [train_r2, test_r2]], + metric_type=['rmse', 'r2_score', 'n_sv'], + metrics=[ + [train_rmse, test_rmse], + [train_r2, test_r2], + [int(regr.n_support_.sum()), int(regr.n_support_.sum())], + ], data=[X_train, X_train], alg_instance=regr, ) diff --git a/sklearn_bench/svm.py b/sklearn_bench/svm.py index 66356e741..6e17ea00a 100644 --- a/sklearn_bench/svm.py +++ b/sklearn_bench/svm.py @@ -73,11 +73,12 @@ def main(): params=params, functions=['SVM.fit', f'SVM.{state_predict}'], times=[fit_time, predict_train_time], - metric_type=['accuracy', 'log_loss', 'roc_auc'], + metric_type=['accuracy', 'log_loss', 'roc_auc', 'n_sv'], metrics=[ [train_acc, test_acc], [train_log_loss, test_log_loss], [train_roc_auc, test_roc_auc], + [int(clf.n_support_.sum()), int(clf.n_support_.sum())], ], data=[X_train, X_train], alg_instance=clf, diff --git a/sklearn_bench/svr.py b/sklearn_bench/svr.py index 39b67a84f..a3447332b 100644 --- a/sklearn_bench/svr.py +++ b/sklearn_bench/svr.py @@ -58,8 +58,12 @@ def main(): params=params, functions=['SVR.fit', 'SVR.predict'], times=[fit_time, predict_train_time], - metric_type=['rmse', 'r2_score'], - metrics=[[train_rmse, test_rmse], [train_r2, test_r2]], + metric_type=['rmse', 'r2_score', 'n_sv'], + metrics=[ + [train_rmse, test_rmse], + [train_r2, test_r2], + [int(regr.n_support_.sum()), int(regr.n_support_.sum())], + ], data=[X_train, X_train], alg_instance=regr, ) From 9871f94b99614b7fa4eaca64ffaa34028bbf4177 Mon Sep 17 00:00:00 2001 From: OnlyDeniko Date: Tue, 3 Aug 2021 16:30:52 +0300 Subject: [PATCH 27/27] apply comments --- bench.py | 12 ++++++------ sklearn_bench/dbscan.py | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/bench.py b/bench.py index 94130b85d..52f25ca7d 100644 --- a/bench.py +++ b/bench.py @@ -370,14 +370,14 @@ def r2_score(y_true, y_pred): return sklearn_r2_score(y_true, y_pred) -def davies_bouldin_score(y_true, y_pred): +def davies_bouldin_score(X, labels): from sklearn.metrics.cluster import davies_bouldin_score as sklearn_dbs - y_true = convert_to_numpy(y_true) - y_pred = convert_to_numpy(y_pred) + X = convert_to_numpy(X) + labels = convert_to_numpy(labels) try: - res = sklearn_dbs(y_true, y_pred) - except ValueError: - res = "Number of labels is 1" + res = sklearn_dbs(X, labels) + except ValueError as ex: + res = ex return res diff --git a/sklearn_bench/dbscan.py b/sklearn_bench/dbscan.py index 0ef445f7b..94a55bafa 100644 --- a/sklearn_bench/dbscan.py +++ b/sklearn_bench/dbscan.py @@ -49,7 +49,7 @@ def main(): if __name__ == "__main__": parser = argparse.ArgumentParser(description='scikit-learn DBSCAN benchmark') - parser.add_argument('-e', '--eps', '--epsilon', type=float, default=0.5, + parser.add_argument('-e', '--eps', '--epsilon', type=float, default=10., help='Radius of neighborhood of a point') parser.add_argument('-m', '--min-samples', default=5, type=int, help='The minimum number of samples required in a '