Enable CodeFactor in master & clean up code #85

Merged: 3 commits, Aug 4, 2021
bench.py (27 changes: 11 additions & 16 deletions)
@@ -30,12 +30,11 @@ def get_dtype(data):
'''
if hasattr(data, 'dtype'):
return data.dtype
elif hasattr(data, 'dtypes'):
if hasattr(data, 'dtypes'):
return str(data.dtypes[0])
elif hasattr(data, 'values'):
if hasattr(data, 'values'):
return data.values.dtype
else:
raise ValueError(f'Impossible to get data type of {type(data)}')
raise ValueError(f'Impossible to get data type of {type(data)}')


def sklearn_disable_finiteness_check():
@@ -66,10 +65,7 @@ def _parse_size(string, dim=2):


def float_or_int(string):
if '.' in string:
return float(string)
else:
return int(string)
return float(string) if '.' in string else int(string)


def get_optimal_cache_size(n_rows, dtype=np.double, max_cache=64):
@@ -90,10 +86,8 @@ def get_optimal_cache_size(n_rows, dtype=np.double, max_cache=64):
optimal_cache_size_bytes = byte_size * (n_rows ** 2)
one_gb = 2 ** 30
max_cache_bytes = max_cache * one_gb
if optimal_cache_size_bytes > max_cache_bytes:
return max_cache_bytes
else:
return optimal_cache_size_bytes
return max_cache_bytes \
if optimal_cache_size_bytes > max_cache_bytes else optimal_cache_size_bytes


def parse_args(parser, size=None, loop_types=(),
@@ -175,9 +169,10 @@ def parse_args(parser, size=None, loop_types=(),
help='Seed to pass as random_state')
parser.add_argument('--dataset-name', type=str, default=None,
help='Dataset name')
parser.add_argument('--no-intel-optimized', default=False, action='store_true',
parser.add_argument('--no-intel-optimized', default=False,
action='store_true',
help='Use no intel optimized version. '
'Now avalible for scikit-learn benchmarks'),
'Now avalible for scikit-learn benchmarks')
parser.add_argument('--device', default='None', type=str,
choices=('host', 'cpu', 'gpu', 'None'),
help='Execution context device')
@@ -519,8 +514,8 @@ def print_output(library, algorithm, stages, params, functions,
alg_params=None):
if params.output_format == 'json':
output = []
for i in range(len(stages)):
result = gen_basic_dict(library, algorithm, stages[i], params,
for i, stage in enumerate(stages):
result = gen_basic_dict(library, algorithm, stage, params,
data[i], alg_instance, alg_params)
result.update({'time[s]': times[i]})
if metric_type is not None:
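The bench.py edits follow the usual CodeFactor hints: drop `elif`/`else` branches that follow a `return`, collapse trivial branches into conditional expressions, and iterate with `enumerate` instead of `range(len(...))`. A minimal sketch of the same patterns on a hypothetical helper (the names below are illustrative, not taken from the benchmarks):

```python
# Before: every branch returns, so elif/else only add nesting.
def describe(value):
    if isinstance(value, int):
        return 'int'
    elif isinstance(value, float):
        return 'float'
    else:
        return 'other'


# After: early returns with a plain fallthrough, as in get_dtype().
def describe(value):
    if isinstance(value, int):
        return 'int'
    if isinstance(value, float):
        return 'float'
    return 'other'


# A one-line conditional expression, as in float_or_int(), and
# enumerate() instead of range(len(...)), as in print_output().
def float_or_int(string):
    return float(string) if '.' in string else int(string)


for i, stage in enumerate(['training', 'prediction']):
    print(i, stage, float_or_int('0.5'))
```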
cuml_bench/df_clsf.py (39 changes: 19 additions & 20 deletions)
@@ -15,7 +15,6 @@
# ===============================================================================

import argparse
from typing import Any

import bench
import cuml
@@ -62,36 +61,36 @@
params.split_algorithm = 1

params.n_classes = y_train[y_train.columns[0]].nunique()
clf: Any


def fit(X, y):
global clf
clf = RandomForestClassifier(split_criterion=params.criterion,
split_algo=params.split_algorithm,
n_estimators=params.num_trees,
max_depth=params.max_depth,
max_features=params.max_features,
min_samples_split=params.min_samples_split,
max_leaves=params.max_leaf_nodes,
min_impurity_decrease=params.min_impurity_decrease,
bootstrap=params.bootstrap)

clf = RandomForestClassifier(
split_criterion=params.criterion,
split_algo=params.split_algorithm,
n_estimators=params.num_trees,
max_depth=params.max_depth,
max_features=params.max_features,
min_samples_split=params.min_samples_split,
max_leaves=params.max_leaf_nodes,
min_impurity_decrease=params.min_impurity_decrease,
bootstrap=params.bootstrap,
)


def fit(clf, X, y):
return clf.fit(X, y)


def predict(X):
global clf
def predict(clf, X):
prediction_args = {'predict_model': 'GPU'}
if int(cuml.__version__.split('.')[1]) <= 14:
prediction_args.update({'num_classes': params.n_classes})
return clf.predict(X, **prediction_args)


fit_time, _ = bench.measure_function_time(fit, X_train, y_train, params=params)
y_pred = predict(X_train)
fit_time, _ = bench.measure_function_time(fit, clf, X_train, y_train, params=params)
y_pred = predict(clf, X_train)
train_acc = 100 * bench.accuracy_score(y_pred, y_train)

predict_time, y_pred = bench.measure_function_time(predict, X_test, params=params)
predict_time, y_pred = bench.measure_function_time(predict, clf, X_test, params=params)
test_acc = 100 * bench.accuracy_score(y_pred, y_test)

bench.print_output(library='cuml', algorithm='decision_forest_classification',
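In cuml_bench/df_clsf.py the classifier is no longer created inside `fit()` and published through a `global` annotated as `Any`; it is built once at module scope and handed to `fit()` and `predict()` as an explicit parameter, so `bench.measure_function_time` simply receives it as another argument. A minimal sketch of the same restructuring, using scikit-learn's `DummyClassifier` as a stand-in for the cuML `RandomForestClassifier`:

```python
from sklearn.dummy import DummyClassifier  # stand-in for cuml.ensemble.RandomForestClassifier

# Build the estimator once at module scope instead of inside fit() via `global`.
clf = DummyClassifier(strategy='most_frequent')


def fit(clf, X, y):
    # The model is an ordinary parameter, not module state.
    return clf.fit(X, y)


def predict(clf, X):
    return clf.predict(X)


X_train, y_train = [[0.0], [1.0], [2.0]], [0, 1, 1]
fit(clf, X_train, y_train)
print(predict(clf, X_train))  # predictions of the most frequent class, here 1
```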
cuml_bench/df_regr.py (37 changes: 18 additions & 19 deletions)
@@ -15,7 +15,6 @@
# ===============================================================================

import argparse
from typing import Any

import bench
from cuml.ensemble import RandomForestRegressor
@@ -59,35 +58,35 @@
params.split_algorithm = 0
else:
params.split_algorithm = 1
regr: Any


# Create our random forest regressor
def fit(X, y):
global regr
regr = RandomForestRegressor(split_criterion=params.criterion,
split_algo=params.split_algorithm,
n_estimators=params.num_trees,
max_depth=params.max_depth,
max_features=params.max_features,
min_samples_split=params.min_samples_split,
max_leaves=params.max_leaf_nodes,
min_impurity_decrease=params.min_impurity_decrease,
bootstrap=params.bootstrap)
regr = RandomForestRegressor(
split_criterion=params.criterion,
split_algo=params.split_algorithm,
n_estimators=params.num_trees,
max_depth=params.max_depth,
max_features=params.max_features,
min_samples_split=params.min_samples_split,
max_leaves=params.max_leaf_nodes,
min_impurity_decrease=params.min_impurity_decrease,
bootstrap=params.bootstrap,
)


def fit(regr, X, y):
return regr.fit(X, y)


def predict(X):
global regr
def predict(regr, X):
return regr.predict(X, predict_model='GPU')


fit_time, _ = bench.measure_function_time(fit, X_train, y_train, params=params)
fit_time, _ = bench.measure_function_time(fit, regr, X_train, y_train, params=params)

y_pred = predict(X_train)
y_pred = predict(regr, X_train)
train_rmse = bench.rmse_score(y_pred, y_train)

predict_time, y_pred = bench.measure_function_time(predict, X_test, params=params)
predict_time, y_pred = bench.measure_function_time(predict, regr, X_test, params=params)
test_rmse = bench.rmse_score(y_pred, y_test)

bench.print_output(library='cuml', algorithm='decision_forest_regression',
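cuml_bench/df_regr.py mirrors the classifier benchmark: the `global regr` and the `typing.Any` annotation go away, and the constructor call is reformatted to one keyword argument per line with a trailing comma, which keeps future diffs confined to the lines that actually change. A sketch of that call layout, using scikit-learn's `RandomForestRegressor` as a stand-in for the cuML class (the argument values are illustrative):

```python
from sklearn.ensemble import RandomForestRegressor  # stand-in for cuml.ensemble.RandomForestRegressor

# One keyword argument per line with a trailing comma, as in the refactored call.
regr = RandomForestRegressor(
    n_estimators=100,
    max_depth=8,
    max_features=1.0,
    min_samples_split=2,
    bootstrap=True,
)
```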
daal4py_bench/pca.py (5 changes: 2 additions & 3 deletions)
@@ -121,9 +121,8 @@ def pca_fit_full_daal(X, n_components):
def test_fit(X):
if params.svd_solver == 'full':
return pca_fit_full_daal(X, params.n_components)
else:
method = 'correlationDense' if params.svd_solver == 'correlation' else 'svdDense'
return pca_fit_daal(X, params.n_components, method)
method = 'correlationDense' if params.svd_solver == 'correlation' else 'svdDense'
return pca_fit_daal(X, params.n_components, method)


def test_transform(Xp, pca_result, eigenvalues, eigenvectors):
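daal4py_bench/pca.py drops the `else` that followed the early `return`, so the remaining method name is picked with a conditional expression at the top level of `test_fit`. The same shape on a hypothetical helper:

```python
def pick_method(svd_solver):
    # The special case returns early; no else branch is needed afterwards.
    if svd_solver == 'full':
        return 'full_daal'
    # A conditional expression selects between the two remaining dense methods.
    return 'correlationDense' if svd_solver == 'correlation' else 'svdDense'


print(pick_method('full'))         # full_daal
print(pick_method('correlation'))  # correlationDense
print(pick_method('auto'))         # svdDense
```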
modelbuilders_bench/mb_utils.py (16 changes: 7 additions & 9 deletions)
@@ -21,17 +21,15 @@

def get_accuracy(true_labels, prediction):
errors = 0
for i in range(len(true_labels)):
for i, true_label in enumerate(true_labels):
pred_label = 0
if isinstance(prediction[i], float) or \
isinstance(prediction[i], np.single) or \
isinstance(prediction[i], np.float):
if isinstance(prediction[i], (float, np.single, np.float)):
pred_label = prediction[i] > 0.5
elif prediction[i].shape[0] == 1:
pred_label = prediction[i][0]
else:
pred_label = np.argmax(prediction[i])
if true_labels[i] != pred_label:
if true_label != pred_label:
errors += 1
return 100 * (1 - errors / len(true_labels))

@@ -54,14 +52,14 @@ def print_output(library, algorithm, stages, params, functions,
})
if hasattr(params, 'n_classes'):
output[-1]['input_data'].update({'classes': params.n_classes})
for i in range(len(stages)):
for i, stage in enumerate(stages):
result = {
'stage': stages[i],
'stage': stage,
}
if 'daal' in stages[i]:
if 'daal' in stage:
result.update({'conversion_to_daal4py': times[2 * i],
'prediction_time': times[2 * i + 1]})
elif 'train' in stages[i]:
elif 'train' in stage:
result.update({'matrix_creation_time': times[2 * i],
'training_time': times[2 * i + 1]})
else:
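modelbuilders_bench/mb_utils.py merges the chained `isinstance` checks into one call with a tuple of types and swaps `range(len(...))` for `enumerate` in both loops. A small sketch of both idioms (the data and types below are illustrative; the PR itself checks `float`, `np.single`, and `np.float`):

```python
import numpy as np

true_labels = [1, 0, 1]
predictions = [0.9, np.single(0.2), np.array([1])]

errors = 0
for i, true_label in enumerate(true_labels):
    pred = predictions[i]
    # One isinstance call with a tuple of types replaces the chained checks.
    if isinstance(pred, (float, np.floating)):
        pred_label = pred > 0.5
    elif pred.shape[0] == 1:
        pred_label = pred[0]
    else:
        pred_label = np.argmax(pred)
    if true_label != pred_label:
        errors += 1

print(100 * (1 - errors / len(true_labels)))  # accuracy in percent
```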