From a4d1e13c00821f5b395b28520e0fe52e930b2d70 Mon Sep 17 00:00:00 2001 From: OnlyDeniko Date: Wed, 4 Aug 2021 14:43:27 +0300 Subject: [PATCH 1/3] codefactor --- bench.py | 28 ++++++++++--------------- cuml_bench/df_clsf.py | 37 ++++++++++++++++----------------- cuml_bench/df_regr.py | 35 +++++++++++++++---------------- daal4py_bench/pca.py | 5 ++--- modelbuilders_bench/mb_utils.py | 16 +++++++------- 5 files changed, 55 insertions(+), 66 deletions(-) diff --git a/bench.py b/bench.py index 52f25ca7d..c96925972 100644 --- a/bench.py +++ b/bench.py @@ -30,12 +30,11 @@ def get_dtype(data): ''' if hasattr(data, 'dtype'): return data.dtype - elif hasattr(data, 'dtypes'): + if hasattr(data, 'dtypes'): return str(data.dtypes[0]) - elif hasattr(data, 'values'): + if hasattr(data, 'values'): return data.values.dtype - else: - raise ValueError(f'Impossible to get data type of {type(data)}') + raise ValueError(f'Impossible to get data type of {type(data)}') def sklearn_disable_finiteness_check(): @@ -66,10 +65,7 @@ def _parse_size(string, dim=2): def float_or_int(string): - if '.' in string: - return float(string) - else: - return int(string) + return float(string) if '.' 
in string else int(string) def get_optimal_cache_size(n_rows, dtype=np.double, max_cache=64): @@ -90,10 +86,8 @@ def get_optimal_cache_size(n_rows, dtype=np.double, max_cache=64): optimal_cache_size_bytes = byte_size * (n_rows ** 2) one_gb = 2 ** 30 max_cache_bytes = max_cache * one_gb - if optimal_cache_size_bytes > max_cache_bytes: - return max_cache_bytes - else: - return optimal_cache_size_bytes + return max_cache_bytes \ + if optimal_cache_size_bytes > max_cache_bytes else optimal_cache_size_bytes def parse_args(parser, size=None, loop_types=(), @@ -175,9 +169,9 @@ def parse_args(parser, size=None, loop_types=(), help='Seed to pass as random_state') parser.add_argument('--dataset-name', type=str, default=None, help='Dataset name') - parser.add_argument('--no-intel-optimized', default=False, action='store_true', - help='Use no intel optimized version. ' - 'Now avalible for scikit-learn benchmarks'), + _ = parser.add_argument('--no-intel-optimized', default=False, action='store_true', + help='Use no intel optimized version. 
' + 'Now avalible for scikit-learn benchmarks'), parser.add_argument('--device', default='None', type=str, choices=('host', 'cpu', 'gpu', 'None'), help='Execution context device') @@ -519,8 +513,8 @@ def print_output(library, algorithm, stages, params, functions, alg_params=None): if params.output_format == 'json': output = [] - for i in range(len(stages)): - result = gen_basic_dict(library, algorithm, stages[i], params, + for i, stage in enumerate(stages): + result = gen_basic_dict(library, algorithm, stage, params, data[i], alg_instance, alg_params) result.update({'time[s]': times[i]}) if metric_type is not None: diff --git a/cuml_bench/df_clsf.py b/cuml_bench/df_clsf.py index e4d265954..2ebfcde5c 100755 --- a/cuml_bench/df_clsf.py +++ b/cuml_bench/df_clsf.py @@ -15,7 +15,6 @@ # =============================================================================== import argparse -from typing import Any import bench import cuml @@ -62,36 +61,36 @@ params.split_algorithm = 1 params.n_classes = y_train[y_train.columns[0]].nunique() -clf: Any - - -def fit(X, y): - global clf - clf = RandomForestClassifier(split_criterion=params.criterion, - split_algo=params.split_algorithm, - n_estimators=params.num_trees, - max_depth=params.max_depth, - max_features=params.max_features, - min_samples_split=params.min_samples_split, - max_leaves=params.max_leaf_nodes, - min_impurity_decrease=params.min_impurity_decrease, - bootstrap=params.bootstrap) + +clf = RandomForestClassifier( + split_criterion=params.criterion, + split_algo=params.split_algorithm, + n_estimators=params.num_trees, + max_depth=params.max_depth, + max_features=params.max_features, + min_samples_split=params.min_samples_split, + max_leaves=params.max_leaf_nodes, + min_impurity_decrease=params.min_impurity_decrease, + bootstrap=params.bootstrap, +) + + +def fit(clf, X, y): return clf.fit(X, y) -def predict(X): - global clf +def predict(clf, X): prediction_args = {'predict_model': 'GPU'} if 
int(cuml.__version__.split('.')[1]) <= 14: prediction_args.update({'num_classes': params.n_classes}) return clf.predict(X, **prediction_args) -fit_time, _ = bench.measure_function_time(fit, X_train, y_train, params=params) +fit_time, _ = bench.measure_function_time(fit, clf, X_train, y_train, params=params) y_pred = predict(X_train) train_acc = 100 * bench.accuracy_score(y_pred, y_train) -predict_time, y_pred = bench.measure_function_time(predict, X_test, params=params) +predict_time, y_pred = bench.measure_function_time(predict, clf, X_test, params=params) test_acc = 100 * bench.accuracy_score(y_pred, y_test) bench.print_output(library='cuml', algorithm='decision_forest_classification', diff --git a/cuml_bench/df_regr.py b/cuml_bench/df_regr.py index 9e7298882..24e76fc55 100644 --- a/cuml_bench/df_regr.py +++ b/cuml_bench/df_regr.py @@ -15,7 +15,6 @@ # =============================================================================== import argparse -from typing import Any import bench from cuml.ensemble import RandomForestRegressor @@ -59,35 +58,35 @@ params.split_algorithm = 0 else: params.split_algorithm = 1 -regr: Any - # Create our random forest regressor -def fit(X, y): - global regr - regr = RandomForestRegressor(split_criterion=params.criterion, - split_algo=params.split_algorithm, - n_estimators=params.num_trees, - max_depth=params.max_depth, - max_features=params.max_features, - min_samples_split=params.min_samples_split, - max_leaves=params.max_leaf_nodes, - min_impurity_decrease=params.min_impurity_decrease, - bootstrap=params.bootstrap) +regr = RandomForestRegressor( + split_criterion=params.criterion, + split_algo=params.split_algorithm, + n_estimators=params.num_trees, + max_depth=params.max_depth, + max_features=params.max_features, + min_samples_split=params.min_samples_split, + max_leaves=params.max_leaf_nodes, + min_impurity_decrease=params.min_impurity_decrease, + bootstrap=params.bootstrap, +) + + +def fit(regr, X, y): return regr.fit(X, y) -def 
predict(X): - global regr +def predict(regr, X): return regr.predict(X, predict_model='GPU') -fit_time, _ = bench.measure_function_time(fit, X_train, y_train, params=params) +fit_time, _ = bench.measure_function_time(fit, regr, X_train, y_train, params=params) y_pred = predict(X_train) train_rmse = bench.rmse_score(y_pred, y_train) -predict_time, y_pred = bench.measure_function_time(predict, X_test, params=params) +predict_time, y_pred = bench.measure_function_time(predict, regr, X_test, params=params) test_rmse = bench.rmse_score(y_pred, y_test) bench.print_output(library='cuml', algorithm='decision_forest_regression', diff --git a/daal4py_bench/pca.py b/daal4py_bench/pca.py index 98fb12e61..f2b4ab4d3 100644 --- a/daal4py_bench/pca.py +++ b/daal4py_bench/pca.py @@ -121,9 +121,8 @@ def pca_fit_full_daal(X, n_components): def test_fit(X): if params.svd_solver == 'full': return pca_fit_full_daal(X, params.n_components) - else: - method = 'correlationDense' if params.svd_solver == 'correlation' else 'svdDense' - return pca_fit_daal(X, params.n_components, method) + method = 'correlationDense' if params.svd_solver == 'correlation' else 'svdDense' + return pca_fit_daal(X, params.n_components, method) def test_transform(Xp, pca_result, eigenvalues, eigenvectors): diff --git a/modelbuilders_bench/mb_utils.py b/modelbuilders_bench/mb_utils.py index 2d659dc09..7c54efd92 100644 --- a/modelbuilders_bench/mb_utils.py +++ b/modelbuilders_bench/mb_utils.py @@ -21,17 +21,15 @@ def get_accuracy(true_labels, prediction): errors = 0 - for i in range(len(true_labels)): + for i, true_label in enumerate(true_labels): pred_label = 0 - if isinstance(prediction[i], float) or \ - isinstance(prediction[i], np.single) or \ - isinstance(prediction[i], np.float): + if isinstance(prediction[i], (float, np.single, np.float)): pred_label = prediction[i] > 0.5 elif prediction[i].shape[0] == 1: pred_label = prediction[i][0] else: pred_label = np.argmax(prediction[i]) - if true_labels[i] != 
pred_label: + if true_label != pred_label: errors += 1 return 100 * (1 - errors / len(true_labels)) @@ -54,14 +52,14 @@ def print_output(library, algorithm, stages, params, functions, }) if hasattr(params, 'n_classes'): output[-1]['input_data'].update({'classes': params.n_classes}) - for i in range(len(stages)): + for i, stage in enumerate(stages): result = { - 'stage': stages[i], + 'stage': stage, } - if 'daal' in stages[i]: + if 'daal' in stage: result.update({'conversion_to_daal4py': times[2 * i], 'prediction_time': times[2 * i + 1]}) - elif 'train' in stages[i]: + elif 'train' in stage: result.update({'matrix_creation_time': times[2 * i], 'training_time': times[2 * i + 1]}) else: From 65e712174edc451ef105d0c31f11bce748663448 Mon Sep 17 00:00:00 2001 From: OnlyDeniko Date: Wed, 4 Aug 2021 15:59:19 +0300 Subject: [PATCH 2/3] mypy --- cuml_bench/df_clsf.py | 2 +- cuml_bench/df_regr.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cuml_bench/df_clsf.py b/cuml_bench/df_clsf.py index 2ebfcde5c..848e97d7e 100755 --- a/cuml_bench/df_clsf.py +++ b/cuml_bench/df_clsf.py @@ -87,7 +87,7 @@ def predict(clf, X): fit_time, _ = bench.measure_function_time(fit, clf, X_train, y_train, params=params) -y_pred = predict(X_train) +y_pred = predict(clf, X_train) train_acc = 100 * bench.accuracy_score(y_pred, y_train) predict_time, y_pred = bench.measure_function_time(predict, clf, X_test, params=params) diff --git a/cuml_bench/df_regr.py b/cuml_bench/df_regr.py index 24e76fc55..61e08ce7b 100644 --- a/cuml_bench/df_regr.py +++ b/cuml_bench/df_regr.py @@ -83,7 +83,7 @@ def predict(regr, X): fit_time, _ = bench.measure_function_time(fit, regr, X_train, y_train, params=params) -y_pred = predict(X_train) +y_pred = predict(regr, X_train) train_rmse = bench.rmse_score(y_pred, y_train) predict_time, y_pred = bench.measure_function_time(predict, regr, X_test, params=params) From f3d5e16fd7b503dd05257ec0d313f116583b3371 Mon Sep 17 00:00:00 2001 From: OnlyDeniko Date: 
Wed, 4 Aug 2021 18:09:32 +0300 Subject: [PATCH 3/3] apply comments --- bench.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/bench.py b/bench.py index c96925972..e56630ac4 100644 --- a/bench.py +++ b/bench.py @@ -169,9 +169,10 @@ def parse_args(parser, size=None, loop_types=(), help='Seed to pass as random_state') parser.add_argument('--dataset-name', type=str, default=None, help='Dataset name') - _ = parser.add_argument('--no-intel-optimized', default=False, action='store_true', - help='Use no intel optimized version. ' - 'Now avalible for scikit-learn benchmarks'), + parser.add_argument('--no-intel-optimized', default=False, + action='store_true', + help='Use no intel optimized version. ' + 'Now avalible for scikit-learn benchmarks') parser.add_argument('--device', default='None', type=str, choices=('host', 'cpu', 'gpu', 'None'), help='Execution context device')