
Commit 964a267

Enable codefactor in master & cleaning code (#85)
* codefactor
* mypy
1 parent 972efac commit 964a267

File tree

* bench.py
* cuml_bench/df_clsf.py
* cuml_bench/df_regr.py
* daal4py_bench/pca.py
* modelbuilders_bench/mb_utils.py

5 files changed (+57, −67)


bench.py

Lines changed: 11 additions & 16 deletions
@@ -30,12 +30,11 @@ def get_dtype(data):
     '''
     if hasattr(data, 'dtype'):
         return data.dtype
-    elif hasattr(data, 'dtypes'):
+    if hasattr(data, 'dtypes'):
         return str(data.dtypes[0])
-    elif hasattr(data, 'values'):
+    if hasattr(data, 'values'):
         return data.values.dtype
-    else:
-        raise ValueError(f'Impossible to get data type of {type(data)}')
+    raise ValueError(f'Impossible to get data type of {type(data)}')


 def sklearn_disable_finiteness_check():
@@ -66,10 +65,7 @@ def _parse_size(string, dim=2):


 def float_or_int(string):
-    if '.' in string:
-        return float(string)
-    else:
-        return int(string)
+    return float(string) if '.' in string else int(string)


 def get_optimal_cache_size(n_rows, dtype=np.double, max_cache=64):
@@ -90,10 +86,8 @@ def get_optimal_cache_size(n_rows, dtype=np.double, max_cache=64):
     optimal_cache_size_bytes = byte_size * (n_rows ** 2)
     one_gb = 2 ** 30
     max_cache_bytes = max_cache * one_gb
-    if optimal_cache_size_bytes > max_cache_bytes:
-        return max_cache_bytes
-    else:
-        return optimal_cache_size_bytes
+    return max_cache_bytes \
+        if optimal_cache_size_bytes > max_cache_bytes else optimal_cache_size_bytes


 def parse_args(parser, size=None, loop_types=(),
@@ -175,9 +169,10 @@ def parse_args(parser, size=None, loop_types=(),
                         help='Seed to pass as random_state')
     parser.add_argument('--dataset-name', type=str, default=None,
                         help='Dataset name')
-    parser.add_argument('--no-intel-optimized', default=False, action='store_true',
+    parser.add_argument('--no-intel-optimized', default=False,
+                        action='store_true',
                         help='Use no intel optimized version. '
-                             'Now avalible for scikit-learn benchmarks'),
+                             'Now avalible for scikit-learn benchmarks')
     parser.add_argument('--device', default='None', type=str,
                         choices=('host', 'cpu', 'gpu', 'None'),
                         help='Execution context device')
@@ -519,8 +514,8 @@ def print_output(library, algorithm, stages, params, functions,
                  alg_params=None):
     if params.output_format == 'json':
         output = []
-        for i in range(len(stages)):
-            result = gen_basic_dict(library, algorithm, stages[i], params,
+        for i, stage in enumerate(stages):
+            result = gen_basic_dict(library, algorithm, stage, params,
                                     data[i], alg_instance, alg_params)
             result.update({'time[s]': times[i]})
             if metric_type is not None:
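Most of the bench.py edits are the flattening CodeFactor suggests (pylint calls it no-else-return, R1705): once every branch returns, the trailing elif/else arms are redundant and guard clauses behave identically. A minimal, self-contained version of the refactored get_dtype with a quick sanity check — assumes only numpy and pandas are available:

import numpy as np
import pandas as pd


def get_dtype(data):
    '''Return the dtype of an ndarray, DataFrame, or values-backed container.'''
    if hasattr(data, 'dtype'):    # numpy ndarray
        return data.dtype
    if hasattr(data, 'dtypes'):   # pandas DataFrame
        return str(data.dtypes[0])
    if hasattr(data, 'values'):   # other pandas-like containers
        return data.values.dtype
    raise ValueError(f'Impossible to get data type of {type(data)}')


print(get_dtype(np.zeros(3)))                     # float64
print(get_dtype(pd.DataFrame({'a': [1, 2, 3]})))  # int64

The get_optimal_cache_size rewrite compresses the same if/else into one conditional expression, and the cap it preserves is easy to motivate: at n_rows = 100_000 with np.double (8 bytes), an n_rows × n_rows cache would need 8 × (10⁵)² bytes ≈ 74.5 GB, beyond the default 64 GB ceiling, so max_cache_bytes is returned instead.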

cuml_bench/df_clsf.py

Lines changed: 19 additions & 20 deletions
@@ -15,7 +15,6 @@
 # ===============================================================================

 import argparse
-from typing import Any

 import bench
 import cuml
@@ -62,36 +61,36 @@
     params.split_algorithm = 1

 params.n_classes = y_train[y_train.columns[0]].nunique()
-clf: Any
-
-
-def fit(X, y):
-    global clf
-    clf = RandomForestClassifier(split_criterion=params.criterion,
-                                 split_algo=params.split_algorithm,
-                                 n_estimators=params.num_trees,
-                                 max_depth=params.max_depth,
-                                 max_features=params.max_features,
-                                 min_samples_split=params.min_samples_split,
-                                 max_leaves=params.max_leaf_nodes,
-                                 min_impurity_decrease=params.min_impurity_decrease,
-                                 bootstrap=params.bootstrap)
+
+clf = RandomForestClassifier(
+    split_criterion=params.criterion,
+    split_algo=params.split_algorithm,
+    n_estimators=params.num_trees,
+    max_depth=params.max_depth,
+    max_features=params.max_features,
+    min_samples_split=params.min_samples_split,
+    max_leaves=params.max_leaf_nodes,
+    min_impurity_decrease=params.min_impurity_decrease,
+    bootstrap=params.bootstrap,
+)
+
+
+def fit(clf, X, y):
     return clf.fit(X, y)


-def predict(X):
-    global clf
+def predict(clf, X):
     prediction_args = {'predict_model': 'GPU'}
     if int(cuml.__version__.split('.')[1]) <= 14:
         prediction_args.update({'num_classes': params.n_classes})
     return clf.predict(X, **prediction_args)


-fit_time, _ = bench.measure_function_time(fit, X_train, y_train, params=params)
-y_pred = predict(X_train)
+fit_time, _ = bench.measure_function_time(fit, clf, X_train, y_train, params=params)
+y_pred = predict(clf, X_train)
 train_acc = 100 * bench.accuracy_score(y_pred, y_train)

-predict_time, y_pred = bench.measure_function_time(predict, X_test, params=params)
+predict_time, y_pred = bench.measure_function_time(predict, clf, X_test, params=params)
 test_acc = 100 * bench.accuracy_score(y_pred, y_test)

 bench.print_output(library='cuml', algorithm='decision_forest_classification',
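The substantive change in both cuML benchmarks is the removal of module-level mutable state: the estimator is built once at script level and passed to fit/predict explicitly, which also lets the `from typing import Any` workaround disappear (the `clf: Any` annotation existed only to keep mypy happy about the `global`). The same refactor is applied to df_regr.py below. A sketch of why the call sites still work — assuming, as a simplification, that bench.measure_function_time essentially times func(*args); the real helper also consults `params` (repeat counts, time limits, ...), which this stub ignores:

from time import perf_counter


def measure_function_time(func, *args, params=None):
    # Simplified stand-in for bench.measure_function_time: time a single
    # call and return (elapsed_seconds, result).
    start = perf_counter()
    result = func(*args)
    return perf_counter() - start, result


def fit(clf, X, y):
    # The estimator arrives as a plain argument -- no `global clf` needed,
    # so the function has no hidden module state and any extra positional
    # arguments are forwarded by the timing helper unchanged:
    return clf.fit(X, y)


# fit_time, _ = measure_function_time(fit, clf, X_train, y_train, params=params)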

cuml_bench/df_regr.py

Lines changed: 18 additions & 19 deletions
@@ -15,7 +15,6 @@
 # ===============================================================================

 import argparse
-from typing import Any

 import bench
 from cuml.ensemble import RandomForestRegressor
@@ -59,35 +58,35 @@
     params.split_algorithm = 0
 else:
     params.split_algorithm = 1
-regr: Any
-

 # Create our random forest regressor
-def fit(X, y):
-    global regr
-    regr = RandomForestRegressor(split_criterion=params.criterion,
-                                 split_algo=params.split_algorithm,
-                                 n_estimators=params.num_trees,
-                                 max_depth=params.max_depth,
-                                 max_features=params.max_features,
-                                 min_samples_split=params.min_samples_split,
-                                 max_leaves=params.max_leaf_nodes,
-                                 min_impurity_decrease=params.min_impurity_decrease,
-                                 bootstrap=params.bootstrap)
+regr = RandomForestRegressor(
+    split_criterion=params.criterion,
+    split_algo=params.split_algorithm,
+    n_estimators=params.num_trees,
+    max_depth=params.max_depth,
+    max_features=params.max_features,
+    min_samples_split=params.min_samples_split,
+    max_leaves=params.max_leaf_nodes,
+    min_impurity_decrease=params.min_impurity_decrease,
+    bootstrap=params.bootstrap,
+)
+
+
+def fit(regr, X, y):
     return regr.fit(X, y)


-def predict(X):
-    global regr
+def predict(regr, X):
     return regr.predict(X, predict_model='GPU')


-fit_time, _ = bench.measure_function_time(fit, X_train, y_train, params=params)
+fit_time, _ = bench.measure_function_time(fit, regr, X_train, y_train, params=params)

-y_pred = predict(X_train)
+y_pred = predict(regr, X_train)
 train_rmse = bench.rmse_score(y_pred, y_train)

-predict_time, y_pred = bench.measure_function_time(predict, X_test, params=params)
+predict_time, y_pred = bench.measure_function_time(predict, regr, X_test, params=params)
 test_rmse = bench.rmse_score(y_pred, y_test)

 bench.print_output(library='cuml', algorithm='decision_forest_regression',

daal4py_bench/pca.py

Lines changed: 2 additions & 3 deletions
@@ -121,9 +121,8 @@ def pca_fit_full_daal(X, n_components):
 def test_fit(X):
     if params.svd_solver == 'full':
         return pca_fit_full_daal(X, params.n_components)
-    else:
-        method = 'correlationDense' if params.svd_solver == 'correlation' else 'svdDense'
-        return pca_fit_daal(X, params.n_components, method)
+    method = 'correlationDense' if params.svd_solver == 'correlation' else 'svdDense'
+    return pca_fit_daal(X, params.n_components, method)


 def test_transform(Xp, pca_result, eigenvalues, eigenvectors):
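Same no-else-return flattening as in bench.py, and the solver dispatch is untouched: 'full' takes the dedicated full-SVD path, 'correlation' maps to daal4py's correlationDense method, and any other value falls back to svdDense. A stubbed sketch of that mapping (the two pca_fit_* stubs and the simplified signature are placeholders for the real daal4py calls, which are not shown in this hunk):

def pca_fit_full_daal(X, n_components):      # stub for illustration
    return ('full-svd path', n_components)


def pca_fit_daal(X, n_components, method):   # stub for illustration
    return (method, n_components)


def test_fit(X, svd_solver, n_components=2):
    if svd_solver == 'full':
        return pca_fit_full_daal(X, n_components)
    method = 'correlationDense' if svd_solver == 'correlation' else 'svdDense'
    return pca_fit_daal(X, n_components, method)


print(test_fit(None, 'correlation'))  # ('correlationDense', 2)
print(test_fit(None, 'randomized'))   # ('svdDense', 2)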

modelbuilders_bench/mb_utils.py

Lines changed: 7 additions & 9 deletions
@@ -21,17 +21,15 @@

 def get_accuracy(true_labels, prediction):
     errors = 0
-    for i in range(len(true_labels)):
+    for i, true_label in enumerate(true_labels):
         pred_label = 0
-        if isinstance(prediction[i], float) or \
-                isinstance(prediction[i], np.single) or \
-                isinstance(prediction[i], np.float):
+        if isinstance(prediction[i], (float, np.single, np.float)):
             pred_label = prediction[i] > 0.5
         elif prediction[i].shape[0] == 1:
             pred_label = prediction[i][0]
         else:
             pred_label = np.argmax(prediction[i])
-        if true_labels[i] != pred_label:
+        if true_label != pred_label:
             errors += 1
     return 100 * (1 - errors / len(true_labels))

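Collapsing the chained isinstance calls into a tuple is the idiomatic form, but one caveat postdates this commit: np.float is merely an alias for Python's built-in float, deprecated in NumPy 1.20 and removed in 1.24, so on current NumPy this line raises AttributeError and would need something like (float, np.floating) instead. A small demonstration of the equivalence:

import numpy as np

x = np.float32(0.7)

# Chained form (as before this commit, with np.float64 standing in
# for the removed np.float alias):
old = isinstance(x, float) or isinstance(x, np.single) or isinstance(x, np.float64)

# Tuple form (after); np.floating additionally covers every NumPy
# float width, sidestepping the removed alias entirely:
new = isinstance(x, (float, np.floating))

assert old == new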

@@ -54,14 +52,14 @@ def print_output(library, algorithm, stages, params, functions,
         })
     if hasattr(params, 'n_classes'):
         output[-1]['input_data'].update({'classes': params.n_classes})
-    for i in range(len(stages)):
+    for i, stage in enumerate(stages):
         result = {
-            'stage': stages[i],
+            'stage': stage,
         }
-        if 'daal' in stages[i]:
+        if 'daal' in stage:
             result.update({'conversion_to_daal4py': times[2 * i],
                            'prediction_time': times[2 * i + 1]})
-        elif 'train' in stages[i]:
+        elif 'train' in stage:
             result.update({'matrix_creation_time': times[2 * i],
                            'training_time': times[2 * i + 1]})
         else:
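The indexing here relies on `times` holding exactly two measurements per stage, flattened in stage order, so stage i reads the pair times[2 * i] and times[2 * i + 1]; enumerate supplies both the stage name and that index. A toy illustration of the layout (the stage names and timings below are made up, chosen only so the 'train' and 'daal' substring checks each fire):

stages = ['lgbm_train', 'lgbm_predict', 'daal4py_predict']
# Two measurements per stage, flattened in stage order:
times = [0.08, 1.90,   # lgbm_train: matrix creation, training
         0.01, 0.30,   # lgbm_predict: matrix creation, prediction
         0.05, 0.12]   # daal4py_predict: conversion, prediction

for i, stage in enumerate(stages):
    first, second = times[2 * i], times[2 * i + 1]
    print(stage, first, second)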
