Skip to content

Extend output result & minor fixes #81

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 27 commits into from
Aug 4, 2021
Merged
Show file tree
Hide file tree
Changes from 26 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 43 additions & 11 deletions bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -338,20 +338,47 @@ def columnwise_score(y, yp, score_func):
return score_func(y, yp)


def accuracy_score(y_true, y_pred):
    """Return the column-wise mean accuracy of *y_pred* against *y_true*.

    Delegates to ``columnwise_score`` so multi-output targets are scored
    per column; the per-column metric is the fraction of exact matches.
    """
    def _accuracy(col_true, col_pred):
        return np.mean(col_true == col_pred)

    return columnwise_score(y_true, y_pred, _accuracy)


def log_loss(y_true, y_pred):
    """Return sklearn's log loss after converting both inputs to NumPy.

    ``convert_to_numpy`` normalizes framework-specific arrays (e.g. GPU
    arrays) so the sklearn metric can consume them.
    """
    from sklearn.metrics import log_loss as sklearn_log_loss
    y_true, y_pred = convert_to_numpy(y_true), convert_to_numpy(y_pred)
    return sklearn_log_loss(y_true, y_pred)


def roc_auc_score(y_true, y_pred, multi_class='ovr'):
    """Return sklearn's ROC AUC for probability predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels; converted to NumPy before scoring.
    y_pred : array-like
        Predicted class probabilities, shape (n_samples, n_classes).
        For the binary case only the positive-class column is passed on,
        as sklearn expects a 1-D score array there.
    multi_class : str, default 'ovr'
        Forwarded to sklearn's ``roc_auc_score`` for the multiclass case.
    """
    from sklearn.metrics import roc_auc_score as sklearn_roc_auc
    y_true = convert_to_numpy(y_true)
    y_pred = convert_to_numpy(y_pred)
    # Guard on ndim: a 1-D y_pred (already a positive-class score vector)
    # would raise IndexError on shape[1]; pass it through unchanged.
    if y_pred.ndim == 2 and y_pred.shape[1] == 2:  # binary case
        y_pred = y_pred[:, 1]
    return sklearn_roc_auc(y_true, y_pred, multi_class=multi_class)

def rmse_score(y_true, y_pred):
    """Return the column-wise root-mean-squared error as a float."""
    def _rmse(col_true, col_pred):
        squared_errors = (col_true - col_pred) ** 2
        return float(np.sqrt(np.mean(squared_errors)))

    return columnwise_score(y_true, y_pred, _rmse)


def r2_score(y_true, y_pred):
    """Return sklearn's R^2 (coefficient of determination).

    Inputs are converted to NumPy first so framework-specific array
    types are accepted.
    """
    from sklearn.metrics import r2_score as sklearn_r2_score
    y_true, y_pred = convert_to_numpy(y_true), convert_to_numpy(y_pred)
    return sklearn_r2_score(y_true, y_pred)


def davies_bouldin_score(y_true, y_pred):
    """Return sklearn's Davies-Bouldin score for a clustering result.

    If the labeling contains a single cluster, sklearn raises
    ``ValueError``; in that case an explanatory string is returned
    instead of a number.

    NOTE(review): returning a string on failure mixes result types —
    callers must accept both float and str. Kept for compatibility
    with the existing output format.
    """
    from sklearn.metrics.cluster import davies_bouldin_score as sklearn_dbs
    y_true = convert_to_numpy(y_true)
    y_pred = convert_to_numpy(y_pred)
    try:
        return sklearn_dbs(y_true, y_pred)
    except ValueError:
        return "Number of labels is 1"

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm slightly concerned about this kind of exception handling. In my opinion, our own configs shouldn't cause any exceptions, and I would rather remove all such handling from the code.



def convert_data(data, dtype, data_order, data_format):
Expand Down Expand Up @@ -488,16 +515,21 @@ def gen_basic_dict(library, algorithm, stage, params, data, alg_instance=None,


def print_output(library, algorithm, stages, params, functions,
times, accuracy_type, accuracies, data, alg_instance=None,
times, metric_type, metrics, data, alg_instance=None,
alg_params=None):
if params.output_format == 'json':
output = []
for i in range(len(stages)):
result = gen_basic_dict(library, algorithm, stages[i], params,
data[i], alg_instance, alg_params)
result.update({'time[s]': times[i]})
if accuracy_type is not None:
result.update({f'{accuracy_type}': accuracies[i]})
if metric_type is not None:
if isinstance(metric_type, str):
result.update({f'{metric_type}': metrics[i]})
elif isinstance(metric_type, list):
for ind, val in enumerate(metric_type):
if metrics[ind][i] is not None:
result.update({f'{val}': metrics[ind][i]})
if hasattr(params, 'n_classes'):
result['input_data'].update({'classes': params.n_classes})
if hasattr(params, 'n_clusters'):
Expand Down
2 changes: 1 addition & 1 deletion configs/blogs/skl_2021_3.json
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@
}
],
"nu": [0.25],
"kernel": ["sigmoid"]
"kernel": ["poly"]
},
{
"algorithm": "svr",
Expand Down
2 changes: 1 addition & 1 deletion cuml_bench/dbscan.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,5 +48,5 @@

bench.print_output(library='cuml', algorithm='dbscan', stages=['training'],
params=params, functions=['DBSCAN'], times=[time],
accuracies=[acc], accuracy_type='davies_bouldin_score', data=[X],
metrics=[acc], metric_type='davies_bouldin_score', data=[X],
alg_instance=dbscan)
4 changes: 2 additions & 2 deletions cuml_bench/df_clsf.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,6 @@ def predict(X):
bench.print_output(library='cuml', algorithm='decision_forest_classification',
stages=['training', 'prediction'],
params=params, functions=['df_clsf.fit', 'df_clsf.predict'],
times=[fit_time, predict_time], accuracy_type='accuracy[%]',
accuracies=[train_acc, test_acc], data=[X_train, X_test],
times=[fit_time, predict_time], metric_type='accuracy[%]',
metrics=[train_acc, test_acc], data=[X_train, X_test],
alg_instance=clf)
4 changes: 2 additions & 2 deletions cuml_bench/df_regr.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,6 @@ def predict(X):
bench.print_output(library='cuml', algorithm='decision_forest_regression',
stages=['training', 'prediction'], params=params,
functions=['df_regr.fit', 'df_regr.predict'],
times=[fit_time, predict_time], accuracy_type='rmse',
accuracies=[train_rmse, test_rmse], data=[X_train, X_test],
times=[fit_time, predict_time], metric_type='rmse',
metrics=[train_rmse, test_rmse], data=[X_train, X_test],
alg_instance=regr)
4 changes: 2 additions & 2 deletions cuml_bench/elasticnet.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,6 @@
bench.print_output(library='cuml', algorithm='elastic-net',
stages=['training', 'prediction'], params=params,
functions=['ElasticNet.fit', 'ElasticNet.predict'],
times=[fit_time, predict_time], accuracy_type='rmse',
accuracies=[train_rmse, test_rmse], data=[X_train, X_train],
times=[fit_time, predict_time], metric_type='rmse',
metrics=[train_rmse, test_rmse], data=[X_train, X_train],
alg_instance=regr)
4 changes: 2 additions & 2 deletions cuml_bench/kmeans.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,6 @@ def kmeans_fit(X):
bench.print_output(library='cuml', algorithm='kmeans',
stages=['training', 'prediction'], params=params,
functions=['KMeans.fit', 'KMeans.predict'],
times=[fit_time, predict_time], accuracy_type='davies_bouldin_score',
accuracies=[acc_train, acc_test], data=[X_train, X_test],
times=[fit_time, predict_time], metric_type='davies_bouldin_score',
metrics=[acc_train, acc_test], data=[X_train, X_test],
alg_instance=kmeans)
4 changes: 2 additions & 2 deletions cuml_bench/knn_clsf.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,13 +68,13 @@
stages=['training', 'prediction'], params=params,
functions=['knn_clsf.fit', 'knn_clsf.predict'],
times=[train_time, predict_time],
accuracies=[train_acc, test_acc], accuracy_type='accuracy[%]',
metrics=[train_acc, test_acc], metric_type='accuracy[%]',
data=[X_train, X_test], alg_instance=knn_clsf)
else:
bench.print_output(library='cuml',
algorithm=knn_clsf.algorithm + '_knn_search',
stages=['training', 'search'], params=params,
functions=['knn_clsf.fit', 'knn_clsf.kneighbors'],
times=[train_time, predict_time],
accuracies=[], accuracy_type=None,
metrics=[], metric_type=None,
data=[X_train, X_test], alg_instance=knn_clsf)
4 changes: 2 additions & 2 deletions cuml_bench/lasso.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,6 @@
bench.print_output(library='sklearn', algorithm='lasso',
stages=['training', 'prediction'],
params=params, functions=['Lasso.fit', 'Lasso.predict'],
times=[fit_time, predict_time], accuracy_type='rmse',
accuracies=[train_rmse, test_rmse], data=[X_train, X_test],
times=[fit_time, predict_time], metric_type='rmse',
metrics=[train_rmse, test_rmse], data=[X_train, X_test],
alg_instance=regr)
4 changes: 2 additions & 2 deletions cuml_bench/linear.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,6 @@
bench.print_output(library='cuml', algorithm='linear_regression',
stages=['training', 'prediction'], params=params,
functions=['Linear.fit', 'Linear.predict'],
times=[fit_time, predict_time], accuracy_type='rmse',
accuracies=[train_rmse, test_rmse], data=[X_train, X_test],
times=[fit_time, predict_time], metric_type='rmse',
metrics=[train_rmse, test_rmse], data=[X_train, X_test],
alg_instance=regr)
4 changes: 2 additions & 2 deletions cuml_bench/log_reg.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,6 @@
bench.print_output(library='cuml', algorithm='logistic_regression',
stages=['training', 'prediction'], params=params,
functions=['LogReg.fit', 'LogReg.predict'],
times=[fit_time, predict_time], accuracy_type='accuracy[%]',
accuracies=[train_acc, test_acc], data=[X_train, X_test],
times=[fit_time, predict_time], metric_type='accuracy[%]',
metrics=[train_acc, test_acc], data=[X_train, X_test],
alg_instance=clf)
4 changes: 2 additions & 2 deletions cuml_bench/pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,6 @@
bench.print_output(library='cuml', algorithm='pca',
stages=['training', 'transformation'],
params=params, functions=['PCA.fit', 'PCA.transform'],
times=[fit_time, transform_time], accuracy_type=None,
accuracies=[None, None], data=[X_train, X_test],
times=[fit_time, transform_time], metric_type=None,
metrics=[None, None], data=[X_train, X_test],
alg_instance=pca)
4 changes: 2 additions & 2 deletions cuml_bench/ridge.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,6 @@
bench.print_output(library='cuml', algorithm='ridge_regression',
stages=['training', 'prediction'], params=params,
functions=['Ridge.fit', 'Ridge.predict'],
times=[fit_time, predict_time], accuracy_type='rmse',
accuracies=[train_rmse, test_rmse], data=[X_train, X_test],
times=[fit_time, predict_time], metric_type='rmse',
metrics=[train_rmse, test_rmse], data=[X_train, X_test],
alg_instance=regr)
8 changes: 4 additions & 4 deletions cuml_bench/svm.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,14 +57,14 @@

if params.probability:
state_predict = 'predict_proba'
accuracy_type = 'log_loss'
metric_type = 'log_loss'
clf_predict = clf.predict_proba

def metric_call(x, y):
return bench.log_loss(x, y)
else:
state_predict = 'prediction'
accuracy_type = 'accuracy[%]'
metric_type = 'accuracy[%]'
clf_predict = clf.predict

def metric_call(x, y):
Expand All @@ -82,6 +82,6 @@ def metric_call(x, y):
bench.print_output(library='cuml', algorithm='svc',
stages=['training', state_predict], params=params,
functions=['SVM.fit', 'SVM.predict'],
times=[fit_time, predict_train_time], accuracy_type=accuracy_type,
accuracies=[train_acc, test_acc], data=[X_train, X_train],
times=[fit_time, predict_train_time], metric_type=metric_type,
metrics=[train_acc, test_acc], data=[X_train, X_train],
alg_instance=clf)
4 changes: 2 additions & 2 deletions cuml_bench/svr.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,6 @@
bench.print_output(library='cuml', algorithm='svr',
stages=['training', 'prediction'], params=params,
functions=['SVR.fit', 'SVR.predict'],
times=[fit_time, predict_train_time], accuracy_type='rmse',
accuracies=[train_rmse, test_rmse], data=[X_train, X_train],
times=[fit_time, predict_train_time], metric_type='rmse',
metrics=[train_rmse, test_rmse], data=[X_train, X_train],
alg_instance=regr)
4 changes: 2 additions & 2 deletions cuml_bench/train_test_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,5 +44,5 @@

bench.print_output(library='cuml', algorithm='train_test_split',
stages=['training'], params=params,
functions=['train_test_split'], times=[time], accuracies=[None],
accuracy_type=None, data=[X], alg_params=tts_params)
functions=['train_test_split'], times=[time], metrics=[None],
metric_type=None, data=[X], alg_params=tts_params)
2 changes: 1 addition & 1 deletion daal4py_bench/dbscan.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,4 +51,4 @@ def test_dbscan(X):

bench.print_output(library='daal4py', algorithm='dbscan', stages=['training'],
params=params, functions=['DBSCAN'], times=[time],
accuracies=[None], accuracy_type=None, data=[X])
metrics=[None], metric_type=None, data=[X])
4 changes: 2 additions & 2 deletions daal4py_bench/df_clsf.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,5 +126,5 @@ def df_clsf_predict(X, training_result, n_classes, verbose=False):
bench.print_output(library='daal4py', algorithm='decision_forest_classification',
stages=['training', 'prediction'], params=params,
functions=['df_clsf.fit', 'df_clsf.predict'],
times=[fit_time, predict_time], accuracy_type='accuracy[%]',
accuracies=[train_acc, test_acc], data=[X_train, X_test])
times=[fit_time, predict_time], metric_type='accuracy[%]',
metrics=[train_acc, test_acc], data=[X_train, X_test])
4 changes: 2 additions & 2 deletions daal4py_bench/df_regr.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,5 +123,5 @@ def df_regr_predict(X, training_result):
bench.print_output(library='daal4py', algorithm='decision_forest_regression',
stages=['training', 'prediction'], params=params,
functions=['df_regr.fit', 'df_regr.predict'],
times=[fit_time, predict_time], accuracy_type='rmse',
accuracies=[train_rmse, test_rmse], data=[X_train, X_test])
times=[fit_time, predict_time], metric_type='rmse',
metrics=[train_rmse, test_rmse], data=[X_train, X_test])
2 changes: 1 addition & 1 deletion daal4py_bench/distances.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,5 +43,5 @@ def compute_distances(pairwise_distances, X):

bench.print_output(library='daal4py', algorithm='distances', stages=['computation'],
params=params, functions=[params.metric.capitalize()], times=[time],
accuracy_type=None, accuracies=[None], data=[X],
metric_type=None, metrics=[None], data=[X],
alg_params={'metric': params.metric})
4 changes: 2 additions & 2 deletions daal4py_bench/kmeans.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,5 +87,5 @@ def test_predict(X, X_init):
bench.print_output(library='daal4py', algorithm='kmeans',
stages=['training', 'prediction'],
params=params, functions=['KMeans.fit', 'KMeans.predict'],
times=[fit_time, predict_time], accuracy_type='inertia',
accuracies=[train_inertia, test_inertia], data=[X_train, X_test])
times=[fit_time, predict_time], metric_type='inertia',
metrics=[train_inertia, test_inertia], data=[X_train, X_test])
4 changes: 2 additions & 2 deletions daal4py_bench/linear.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,5 +68,5 @@ def test_predict(Xp, model):
bench.print_output(library='daal4py', algorithm='linear_regression',
stages=['training', 'prediction'],
params=params, functions=['Linear.fit', 'Linear.predict'],
times=[fit_time, predict_time], accuracy_type='rmse',
accuracies=[train_rmse, test_rmse], data=[X_train, X_test])
times=[fit_time, predict_time], metric_type='rmse',
metrics=[train_rmse, test_rmse], data=[X_train, X_test])
4 changes: 2 additions & 2 deletions daal4py_bench/pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ def test_transform(Xp, pca_result, eigenvalues, eigenvectors):
bench.print_output(library='daal4py', algorithm='pca',
stages=['training', 'transformation'],
params=params, functions=['PCA.fit', 'PCA.transform'],
times=[fit_time, transform_time], accuracy_type=None,
accuracies=[None, None], data=[X_train, X_test],
times=[fit_time, transform_time], metric_type=None,
metrics=[None, None], data=[X_train, X_test],
alg_params={'svd_solver': params.svd_solver,
'n_components': params.n_components})
4 changes: 2 additions & 2 deletions daal4py_bench/ridge.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,5 +64,5 @@ def test_predict(Xp, model):
bench.print_output(library='daal4py', algorithm='ridge_regression',
stages=['training', 'prediction'], params=params,
functions=['Ridge.fit', 'Ridge.predict'],
times=[fit_time, predict_time], accuracy_type='rmse',
accuracies=[train_rmse, test_rmse], data=[X_train, X_test])
times=[fit_time, predict_time], metric_type='rmse',
metrics=[train_rmse, test_rmse], data=[X_train, X_test])
41 changes: 26 additions & 15 deletions datasets/loader_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,9 +68,11 @@ def airline(dataset_dir: Path) -> bool:
Airline dataset
http://kt.ijs.si/elena_ikonomovska/data.html

TaskType:binclass
NumberOfFeatures:13
NumberOfInstances:115M
Classification task. n_classes = 2.
airline X train dataset (92055213, 13)
airline y train dataset (92055213, 1)
airline X test dataset (23013804, 13)
airline y test dataset (23013804, 1)
"""
dataset_name = 'airline'
os.makedirs(dataset_dir, exist_ok=True)
Expand Down Expand Up @@ -126,9 +128,12 @@ def airline(dataset_dir: Path) -> bool:
def airline_ohe(dataset_dir: Path) -> bool:
"""
Dataset from szilard benchmarks: https://github.com/szilard/GBM-perf
TaskType:binclass
NumberOfFeatures:700
NumberOfInstances:10100000

Classification task. n_classes = 2.
airline-ohe X train dataset (1000000, 692)
airline-ohe y train dataset (1000000, 1)
airline-ohe X test dataset (100000, 692)
airline-ohe y test dataset (100000, 1)
"""
dataset_name = 'airline-ohe'
os.makedirs(dataset_dir, exist_ok=True)
Expand Down Expand Up @@ -289,9 +294,11 @@ def epsilon(dataset_dir: Path) -> bool:
Epsilon dataset
https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html

TaskType:binclass
NumberOfFeatures:2000
NumberOfInstances:500K
Classification task. n_classes = 2.
epsilon X train dataset (400000, 2000)
epsilon y train dataset (400000, 1)
epsilon X test dataset (100000, 2000)
epsilon y test dataset (100000, 1)
"""
dataset_name = 'epsilon'
os.makedirs(dataset_dir, exist_ok=True)
Expand Down Expand Up @@ -444,9 +451,11 @@ def higgs(dataset_dir: Path) -> bool:
Higgs dataset from UCI machine learning repository
https://archive.ics.uci.edu/ml/datasets/HIGGS

TaskType:binclass
NumberOfFeatures:28
NumberOfInstances:11M
Classification task. n_classes = 2.
higgs X train dataset (8799999, 28)
higgs y train dataset (8799999, 1)
higgs X test dataset (2200000, 28)
higgs y test dataset (2200000, 1)
"""
dataset_name = 'higgs'
os.makedirs(dataset_dir, exist_ok=True)
Expand Down Expand Up @@ -479,9 +488,11 @@ def higgs_one_m(dataset_dir: Path) -> bool:

Only first 1.5M samples is taken

TaskType:binclass
NumberOfFeatures:28
NumberOfInstances:1.5M
Classification task. n_classes = 2.
higgs1m X train dataset (1000000, 28)
higgs1m y train dataset (1000000, 1)
higgs1m X test dataset (500000, 28)
higgs1m y test dataset (500000, 1)
"""
dataset_name = 'higgs1m'
os.makedirs(dataset_dir, exist_ok=True)
Expand Down
Loading