Skip to content

Commit 972efac

Browse files
OnlyDeniko and denis.kulandin authored
Extended output result, new metrics and minor fixes (#81)
* minor fixes * pep8 * random state * size of datasets * apply comments * dbscan eps fix * extend output result * codefactor.io * pep8 * fix ci * fix * whitespace * return to stock functions * remove debug output * apply comments * metrics & metric_type * pep8 * pca * roc_auc details section * pep8 * finally solve roc_auc trouble * add kmeans.iter_ & done metrics in bench * n_iter_ * stay columnwise_score because of xgb * roc_auc_score binary case * add n_sv in svms * apply comments Co-authored-by: denis.kulandin <dkulandi@nnldaal070.inn.intel.com>
1 parent e57be6c commit 972efac

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

48 files changed

+461
-250
lines changed

bench.py

Lines changed: 43 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -338,20 +338,47 @@ def columnwise_score(y, yp, score_func):
338338
return score_func(y, yp)
339339

340340

341-
def accuracy_score(y, yp):
342-
return columnwise_score(y, yp, lambda y1, y2: np.mean(y1 == y2))
341+
def accuracy_score(y_true, y_pred):
342+
return columnwise_score(y_true, y_pred, lambda y1, y2: np.mean(y1 == y2))
343343

344344

345-
def log_loss(y, yp):
345+
def log_loss(y_true, y_pred):
346346
from sklearn.metrics import log_loss as sklearn_log_loss
347-
y = convert_to_numpy(y)
348-
yp = convert_to_numpy(yp)
349-
return sklearn_log_loss(y, yp)
347+
y_true = convert_to_numpy(y_true)
348+
y_pred = convert_to_numpy(y_pred)
349+
return sklearn_log_loss(y_true, y_pred)
350+
350351

352+
def roc_auc_score(y_true, y_pred, multi_class='ovr'):
353+
from sklearn.metrics import roc_auc_score as sklearn_roc_auc
354+
y_true = convert_to_numpy(y_true)
355+
y_pred = convert_to_numpy(y_pred)
356+
if y_pred.shape[1] == 2: # binary case
357+
y_pred = y_pred[:, 1]
358+
return sklearn_roc_auc(y_true, y_pred, multi_class=multi_class)
351359

352-
def rmse_score(y, yp):
360+
361+
def rmse_score(y_true, y_pred):
353362
return columnwise_score(
354-
y, yp, lambda y1, y2: float(np.sqrt(np.mean((y1 - y2)**2))))
363+
y_true, y_pred, lambda y1, y2: float(np.sqrt(np.mean((y1 - y2)**2))))
364+
365+
366+
def r2_score(y_true, y_pred):
367+
from sklearn.metrics import r2_score as sklearn_r2_score
368+
y_true = convert_to_numpy(y_true)
369+
y_pred = convert_to_numpy(y_pred)
370+
return sklearn_r2_score(y_true, y_pred)
371+
372+
373+
def davies_bouldin_score(X, labels):
374+
from sklearn.metrics.cluster import davies_bouldin_score as sklearn_dbs
375+
X = convert_to_numpy(X)
376+
labels = convert_to_numpy(labels)
377+
try:
378+
res = sklearn_dbs(X, labels)
379+
except ValueError as ex:
380+
res = ex
381+
return res
355382

356383

357384
def convert_data(data, dtype, data_order, data_format):
@@ -488,16 +515,21 @@ def gen_basic_dict(library, algorithm, stage, params, data, alg_instance=None,
488515

489516

490517
def print_output(library, algorithm, stages, params, functions,
491-
times, accuracy_type, accuracies, data, alg_instance=None,
518+
times, metric_type, metrics, data, alg_instance=None,
492519
alg_params=None):
493520
if params.output_format == 'json':
494521
output = []
495522
for i in range(len(stages)):
496523
result = gen_basic_dict(library, algorithm, stages[i], params,
497524
data[i], alg_instance, alg_params)
498525
result.update({'time[s]': times[i]})
499-
if accuracy_type is not None:
500-
result.update({f'{accuracy_type}': accuracies[i]})
526+
if metric_type is not None:
527+
if isinstance(metric_type, str):
528+
result.update({f'{metric_type}': metrics[i]})
529+
elif isinstance(metric_type, list):
530+
for ind, val in enumerate(metric_type):
531+
if metrics[ind][i] is not None:
532+
result.update({f'{val}': metrics[ind][i]})
501533
if hasattr(params, 'n_classes'):
502534
result['input_data'].update({'classes': params.n_classes})
503535
if hasattr(params, 'n_clusters'):

configs/blogs/skl_2021_3.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -307,7 +307,7 @@
307307
}
308308
],
309309
"nu": [0.25],
310-
"kernel": ["sigmoid"]
310+
"kernel": ["poly"]
311311
},
312312
{
313313
"algorithm": "svr",

cuml_bench/dbscan.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,5 +48,5 @@
4848

4949
bench.print_output(library='cuml', algorithm='dbscan', stages=['training'],
5050
params=params, functions=['DBSCAN'], times=[time],
51-
accuracies=[acc], accuracy_type='davies_bouldin_score', data=[X],
51+
metrics=[acc], metric_type='davies_bouldin_score', data=[X],
5252
alg_instance=dbscan)

cuml_bench/df_clsf.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,6 @@ def predict(X):
9797
bench.print_output(library='cuml', algorithm='decision_forest_classification',
9898
stages=['training', 'prediction'],
9999
params=params, functions=['df_clsf.fit', 'df_clsf.predict'],
100-
times=[fit_time, predict_time], accuracy_type='accuracy[%]',
101-
accuracies=[train_acc, test_acc], data=[X_train, X_test],
100+
times=[fit_time, predict_time], metric_type='accuracy[%]',
101+
metrics=[train_acc, test_acc], data=[X_train, X_test],
102102
alg_instance=clf)

cuml_bench/df_regr.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,6 @@ def predict(X):
9393
bench.print_output(library='cuml', algorithm='decision_forest_regression',
9494
stages=['training', 'prediction'], params=params,
9595
functions=['df_regr.fit', 'df_regr.predict'],
96-
times=[fit_time, predict_time], accuracy_type='rmse',
97-
accuracies=[train_rmse, test_rmse], data=[X_train, X_test],
96+
times=[fit_time, predict_time], metric_type='rmse',
97+
metrics=[train_rmse, test_rmse], data=[X_train, X_test],
9898
alg_instance=regr)

cuml_bench/elasticnet.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,6 @@
5656
bench.print_output(library='cuml', algorithm='elastic-net',
5757
stages=['training', 'prediction'], params=params,
5858
functions=['ElasticNet.fit', 'ElasticNet.predict'],
59-
times=[fit_time, predict_time], accuracy_type='rmse',
60-
accuracies=[train_rmse, test_rmse], data=[X_train, X_train],
59+
times=[fit_time, predict_time], metric_type='rmse',
60+
metrics=[train_rmse, test_rmse], data=[X_train, X_train],
6161
alg_instance=regr)

cuml_bench/kmeans.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,6 @@ def kmeans_fit(X):
8888
bench.print_output(library='cuml', algorithm='kmeans',
8989
stages=['training', 'prediction'], params=params,
9090
functions=['KMeans.fit', 'KMeans.predict'],
91-
times=[fit_time, predict_time], accuracy_type='davies_bouldin_score',
92-
accuracies=[acc_train, acc_test], data=[X_train, X_test],
91+
times=[fit_time, predict_time], metric_type='davies_bouldin_score',
92+
metrics=[acc_train, acc_test], data=[X_train, X_test],
9393
alg_instance=kmeans)

cuml_bench/knn_clsf.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,13 +68,13 @@
6868
stages=['training', 'prediction'], params=params,
6969
functions=['knn_clsf.fit', 'knn_clsf.predict'],
7070
times=[train_time, predict_time],
71-
accuracies=[train_acc, test_acc], accuracy_type='accuracy[%]',
71+
metrics=[train_acc, test_acc], metric_type='accuracy[%]',
7272
data=[X_train, X_test], alg_instance=knn_clsf)
7373
else:
7474
bench.print_output(library='cuml',
7575
algorithm=knn_clsf.algorithm + '_knn_search',
7676
stages=['training', 'search'], params=params,
7777
functions=['knn_clsf.fit', 'knn_clsf.kneighbors'],
7878
times=[train_time, predict_time],
79-
accuracies=[], accuracy_type=None,
79+
metrics=[], metric_type=None,
8080
data=[X_train, X_test], alg_instance=knn_clsf)

cuml_bench/lasso.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,6 @@
5353
bench.print_output(library='sklearn', algorithm='lasso',
5454
stages=['training', 'prediction'],
5555
params=params, functions=['Lasso.fit', 'Lasso.predict'],
56-
times=[fit_time, predict_time], accuracy_type='rmse',
57-
accuracies=[train_rmse, test_rmse], data=[X_train, X_test],
56+
times=[fit_time, predict_time], metric_type='rmse',
57+
metrics=[train_rmse, test_rmse], data=[X_train, X_test],
5858
alg_instance=regr)

cuml_bench/linear.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,6 @@
5050
bench.print_output(library='cuml', algorithm='linear_regression',
5151
stages=['training', 'prediction'], params=params,
5252
functions=['Linear.fit', 'Linear.predict'],
53-
times=[fit_time, predict_time], accuracy_type='rmse',
54-
accuracies=[train_rmse, test_rmse], data=[X_train, X_test],
53+
times=[fit_time, predict_time], metric_type='rmse',
54+
metrics=[train_rmse, test_rmse], data=[X_train, X_test],
5555
alg_instance=regr)

cuml_bench/log_reg.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,6 @@
6161
bench.print_output(library='cuml', algorithm='logistic_regression',
6262
stages=['training', 'prediction'], params=params,
6363
functions=['LogReg.fit', 'LogReg.predict'],
64-
times=[fit_time, predict_time], accuracy_type='accuracy[%]',
65-
accuracies=[train_acc, test_acc], data=[X_train, X_test],
64+
times=[fit_time, predict_time], metric_type='accuracy[%]',
65+
metrics=[train_acc, test_acc], data=[X_train, X_test],
6666
alg_instance=clf)

cuml_bench/pca.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,6 @@
5151
bench.print_output(library='cuml', algorithm='pca',
5252
stages=['training', 'transformation'],
5353
params=params, functions=['PCA.fit', 'PCA.transform'],
54-
times=[fit_time, transform_time], accuracy_type=None,
55-
accuracies=[None, None], data=[X_train, X_test],
54+
times=[fit_time, transform_time], metric_type=None,
55+
metrics=[None, None], data=[X_train, X_test],
5656
alg_instance=pca)

cuml_bench/ridge.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,6 @@
5252
bench.print_output(library='cuml', algorithm='ridge_regression',
5353
stages=['training', 'prediction'], params=params,
5454
functions=['Ridge.fit', 'Ridge.predict'],
55-
times=[fit_time, predict_time], accuracy_type='rmse',
56-
accuracies=[train_rmse, test_rmse], data=[X_train, X_test],
55+
times=[fit_time, predict_time], metric_type='rmse',
56+
metrics=[train_rmse, test_rmse], data=[X_train, X_test],
5757
alg_instance=regr)

cuml_bench/svm.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -57,14 +57,14 @@
5757

5858
if params.probability:
5959
state_predict = 'predict_proba'
60-
accuracy_type = 'log_loss'
60+
metric_type = 'log_loss'
6161
clf_predict = clf.predict_proba
6262

6363
def metric_call(x, y):
6464
return bench.log_loss(x, y)
6565
else:
6666
state_predict = 'prediction'
67-
accuracy_type = 'accuracy[%]'
67+
metric_type = 'accuracy[%]'
6868
clf_predict = clf.predict
6969

7070
def metric_call(x, y):
@@ -82,6 +82,6 @@ def metric_call(x, y):
8282
bench.print_output(library='cuml', algorithm='svc',
8383
stages=['training', state_predict], params=params,
8484
functions=['SVM.fit', 'SVM.predict'],
85-
times=[fit_time, predict_train_time], accuracy_type=accuracy_type,
86-
accuracies=[train_acc, test_acc], data=[X_train, X_train],
85+
times=[fit_time, predict_train_time], metric_type=metric_type,
86+
metrics=[train_acc, test_acc], data=[X_train, X_train],
8787
alg_instance=clf)

cuml_bench/svr.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,6 @@
6666
bench.print_output(library='cuml', algorithm='svr',
6767
stages=['training', 'prediction'], params=params,
6868
functions=['SVR.fit', 'SVR.predict'],
69-
times=[fit_time, predict_train_time], accuracy_type='rmse',
70-
accuracies=[train_rmse, test_rmse], data=[X_train, X_train],
69+
times=[fit_time, predict_train_time], metric_type='rmse',
70+
metrics=[train_rmse, test_rmse], data=[X_train, X_train],
7171
alg_instance=regr)

cuml_bench/train_test_split.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,5 +44,5 @@
4444

4545
bench.print_output(library='cuml', algorithm='train_test_split',
4646
stages=['training'], params=params,
47-
functions=['train_test_split'], times=[time], accuracies=[None],
48-
accuracy_type=None, data=[X], alg_params=tts_params)
47+
functions=['train_test_split'], times=[time], metrics=[None],
48+
metric_type=None, data=[X], alg_params=tts_params)

daal4py_bench/dbscan.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,4 +51,4 @@ def test_dbscan(X):
5151

5252
bench.print_output(library='daal4py', algorithm='dbscan', stages=['training'],
5353
params=params, functions=['DBSCAN'], times=[time],
54-
accuracies=[None], accuracy_type=None, data=[X])
54+
metrics=[None], metric_type=None, data=[X])

daal4py_bench/df_clsf.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -126,5 +126,5 @@ def df_clsf_predict(X, training_result, n_classes, verbose=False):
126126
bench.print_output(library='daal4py', algorithm='decision_forest_classification',
127127
stages=['training', 'prediction'], params=params,
128128
functions=['df_clsf.fit', 'df_clsf.predict'],
129-
times=[fit_time, predict_time], accuracy_type='accuracy[%]',
130-
accuracies=[train_acc, test_acc], data=[X_train, X_test])
129+
times=[fit_time, predict_time], metric_type='accuracy[%]',
130+
metrics=[train_acc, test_acc], data=[X_train, X_test])

daal4py_bench/df_regr.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -123,5 +123,5 @@ def df_regr_predict(X, training_result):
123123
bench.print_output(library='daal4py', algorithm='decision_forest_regression',
124124
stages=['training', 'prediction'], params=params,
125125
functions=['df_regr.fit', 'df_regr.predict'],
126-
times=[fit_time, predict_time], accuracy_type='rmse',
127-
accuracies=[train_rmse, test_rmse], data=[X_train, X_test])
126+
times=[fit_time, predict_time], metric_type='rmse',
127+
metrics=[train_rmse, test_rmse], data=[X_train, X_test])

daal4py_bench/distances.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,5 +43,5 @@ def compute_distances(pairwise_distances, X):
4343

4444
bench.print_output(library='daal4py', algorithm='distances', stages=['computation'],
4545
params=params, functions=[params.metric.capitalize()], times=[time],
46-
accuracy_type=None, accuracies=[None], data=[X],
46+
metric_type=None, metrics=[None], data=[X],
4747
alg_params={'metric': params.metric})

daal4py_bench/kmeans.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -87,5 +87,5 @@ def test_predict(X, X_init):
8787
bench.print_output(library='daal4py', algorithm='kmeans',
8888
stages=['training', 'prediction'],
8989
params=params, functions=['KMeans.fit', 'KMeans.predict'],
90-
times=[fit_time, predict_time], accuracy_type='inertia',
91-
accuracies=[train_inertia, test_inertia], data=[X_train, X_test])
90+
times=[fit_time, predict_time], metric_type='inertia',
91+
metrics=[train_inertia, test_inertia], data=[X_train, X_test])

daal4py_bench/linear.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,5 +68,5 @@ def test_predict(Xp, model):
6868
bench.print_output(library='daal4py', algorithm='linear_regression',
6969
stages=['training', 'prediction'],
7070
params=params, functions=['Linear.fit', 'Linear.predict'],
71-
times=[fit_time, predict_time], accuracy_type='rmse',
72-
accuracies=[train_rmse, test_rmse], data=[X_train, X_test])
71+
times=[fit_time, predict_time], metric_type='rmse',
72+
metrics=[train_rmse, test_rmse], data=[X_train, X_test])

daal4py_bench/pca.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,7 @@ def test_transform(Xp, pca_result, eigenvalues, eigenvectors):
142142
bench.print_output(library='daal4py', algorithm='pca',
143143
stages=['training', 'transformation'],
144144
params=params, functions=['PCA.fit', 'PCA.transform'],
145-
times=[fit_time, transform_time], accuracy_type=None,
146-
accuracies=[None, None], data=[X_train, X_test],
145+
times=[fit_time, transform_time], metric_type=None,
146+
metrics=[None, None], data=[X_train, X_test],
147147
alg_params={'svd_solver': params.svd_solver,
148148
'n_components': params.n_components})

daal4py_bench/ridge.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,5 +64,5 @@ def test_predict(Xp, model):
6464
bench.print_output(library='daal4py', algorithm='ridge_regression',
6565
stages=['training', 'prediction'], params=params,
6666
functions=['Ridge.fit', 'Ridge.predict'],
67-
times=[fit_time, predict_time], accuracy_type='rmse',
68-
accuracies=[train_rmse, test_rmse], data=[X_train, X_test])
67+
times=[fit_time, predict_time], metric_type='rmse',
68+
metrics=[train_rmse, test_rmse], data=[X_train, X_test])

datasets/loader_classification.py

Lines changed: 26 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -68,9 +68,11 @@ def airline(dataset_dir: Path) -> bool:
6868
Airline dataset
6969
http://kt.ijs.si/elena_ikonomovska/data.html
7070
71-
TaskType:binclass
72-
NumberOfFeatures:13
73-
NumberOfInstances:115M
71+
Classification task. n_classes = 2.
72+
airline X train dataset (92055213, 13)
73+
airline y train dataset (92055213, 1)
74+
airline X test dataset (23013804, 13)
75+
airline y test dataset (23013804, 1)
7476
"""
7577
dataset_name = 'airline'
7678
os.makedirs(dataset_dir, exist_ok=True)
@@ -126,9 +128,12 @@ def airline(dataset_dir: Path) -> bool:
126128
def airline_ohe(dataset_dir: Path) -> bool:
127129
"""
128130
Dataset from szilard benchmarks: https://github.com/szilard/GBM-perf
129-
TaskType:binclass
130-
NumberOfFeatures:700
131-
NumberOfInstances:10100000
131+
132+
Classification task. n_classes = 2.
133+
airline-ohe X train dataset (1000000, 692)
134+
airline-ohe y train dataset (1000000, 1)
135+
airline-ohe X test dataset (100000, 692)
136+
airline-ohe y test dataset (100000, 1)
132137
"""
133138
dataset_name = 'airline-ohe'
134139
os.makedirs(dataset_dir, exist_ok=True)
@@ -289,9 +294,11 @@ def epsilon(dataset_dir: Path) -> bool:
289294
Epsilon dataset
290295
https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html
291296
292-
TaskType:binclass
293-
NumberOfFeatures:2000
294-
NumberOfInstances:500K
297+
Classification task. n_classes = 2.
298+
epsilon X train dataset (400000, 2000)
299+
epsilon y train dataset (400000, 1)
300+
epsilon X test dataset (100000, 2000)
301+
epsilon y test dataset (100000, 1)
295302
"""
296303
dataset_name = 'epsilon'
297304
os.makedirs(dataset_dir, exist_ok=True)
@@ -444,9 +451,11 @@ def higgs(dataset_dir: Path) -> bool:
444451
Higgs dataset from UCI machine learning repository
445452
https://archive.ics.uci.edu/ml/datasets/HIGGS
446453
447-
TaskType:binclass
448-
NumberOfFeatures:28
449-
NumberOfInstances:11M
454+
Classification task. n_classes = 2.
455+
higgs X train dataset (8799999, 28)
456+
higgs y train dataset (8799999, 1)
457+
higgs X test dataset (2200000, 28)
458+
higgs y test dataset (2200000, 1)
450459
"""
451460
dataset_name = 'higgs'
452461
os.makedirs(dataset_dir, exist_ok=True)
@@ -479,9 +488,11 @@ def higgs_one_m(dataset_dir: Path) -> bool:
479488
480489
Only the first 1.5M samples are taken
481490
482-
TaskType:binclass
483-
NumberOfFeatures:28
484-
NumberOfInstances:1.5M
491+
Classification task. n_classes = 2.
492+
higgs1m X train dataset (1000000, 28)
493+
higgs1m y train dataset (1000000, 1)
494+
higgs1m X test dataset (500000, 28)
495+
higgs1m y test dataset (500000, 1)
485496
"""
486497
dataset_name = 'higgs1m'
487498
os.makedirs(dataset_dir, exist_ok=True)

0 commit comments

Comments
 (0)