From 52d54fb6f39265e40474e0c230887e805d1f2145 Mon Sep 17 00:00:00 2001 From: Kirill Petrov Date: Fri, 16 Apr 2021 01:43:34 +0300 Subject: [PATCH 01/14] add support proba svc --- README.md | 8 +- bench.py | 3 + configs/algorithms/svc_proba.json | 230 ++++++++++++++++++++++++++++++ cuml_bench/README.md | 3 +- cuml_bench/svm.py | 36 +++-- datasets/loader.py | 6 +- runner.py | 1 + sklearn_bench/README.md | 11 +- sklearn_bench/requirements.txt | 4 + sklearn_bench/svm.py | 48 ++++--- utils.py | 7 +- 11 files changed, 309 insertions(+), 48 deletions(-) create mode 100755 configs/algorithms/svc_proba.json create mode 100755 sklearn_bench/requirements.txt diff --git a/README.md b/README.md index d2f2ebe34..ad1fc4ab4 100755 --- a/README.md +++ b/README.md @@ -39,7 +39,11 @@ Create a suitable conda environment for each framework to test. Each item in the * [**scikit-learn**](sklearn_bench#how-to-create-conda-environment-for-benchmarking) ```bash -conda create -n bench -c intel python=3.7 scikit-learn scikit-learn-intelex pandas +pip install -r sklearn_bench/requirements.txt +``` +or +```bash +conda create -n bench -c conda-forge scikit-learn scikit-learn-intelex pandas ``` * [**daal4py**](daal4py_bench#how-to-create-conda-environment-for-benchmarking) @@ -109,7 +113,7 @@ The configuration of benchmarks allows you to select the frameworks to run, sele ## Intel(R) Extension for Scikit-learn support -When you run scikit-learn benchmarks on CPU, [Intel(R) Extension for Scikit-learn](https://github.com/intel/scikit-learn-intelex) is used by default. Use the ``--no-intel-optimized`` option to run the benchmarks without the extension. +When you run scikit-learn benchmarks on CPU, [Intel(R) Extension for Scikit-learn](https://github.com/intel/scikit-learn-intelex) is used by default. Use the ``--no-intel-optimized`` option to run the benchmarks without the extension. 
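For example, a minimal sketch of both modes, assuming the runner is invoked as `python runner.py --configs <config>` (the flag name and config path below are illustrative, using one of the SVC probability configs added in this series):

```bash
# CPU run with Intel(R) Extension for Scikit-learn (the default)
python runner.py --configs configs/svm/svc_proba_sklearn.json

# Same benchmark cases with stock scikit-learn
python runner.py --configs configs/svm/svc_proba_sklearn.json --no-intel-optimized
```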
The following benchmarks have a GPU support: * dbscan diff --git a/bench.py b/bench.py index 591db126c..2dd766073 100644 --- a/bench.py +++ b/bench.py @@ -339,6 +339,9 @@ def columnwise_score(y, yp, score_func): def accuracy_score(y, yp): return columnwise_score(y, yp, lambda y1, y2: np.mean(y1 == y2)) +def log_loss(y, yp): + from sklearn.metrics import log_loss + return columnwise_score(y, yp, log_loss(y1 == y2)) def rmse_score(y, yp): return columnwise_score( diff --git a/configs/algorithms/svc_proba.json b/configs/algorithms/svc_proba.json new file mode 100755 index 000000000..7b7ec3c85 --- /dev/null +++ b/configs/algorithms/svc_proba.json @@ -0,0 +1,230 @@ +{ + "common": { + "lib": ["sklearn"], + "data-format": ["pandas"], + "data-order": ["F"], + "dtype": ["float64"] + }, + "cases": [ + { + "algorithm": "svm", + "dataset": [ + { + "source": "csv", + "name": "ijcnn", + "training": + { + "x": "data/ijcnn_x_train.csv", + "y": "data/ijcnn_y_train.csv" + }, + "testing": + { + "x": "data/ijcnn_x_test.csv", + "y": "data/ijcnn_y_test.csv" + } + } + ], + "C": [1000.0], + "kernel": ["linear"], + "probability": [""] + }, + { + "algorithm": "svm", + "dataset": [ + { + "source": "csv", + "name": "a9a", + "training": + { + "x": "data/a9a_x_train.csv", + "y": "data/a9a_y_train.csv" + }, + "testing": + { + "x": "data/a9a_x_test.csv", + "y": "data/a9a_y_test.csv" + } + } + ], + "C": [500.0], + "kernel": ["rbf"], + "probability": [""] + }, + { + "algorithm": "svm", + "dataset": [ + { + "source": "csv", + "name": "gisette", + "training": + { + "x": "data/gisette_x_train.csv", + "y": "data/gisette_y_train.csv" + }, + "testing": + { + "x": "data/gisette_x_test.csv", + "y": "data/gisette_y_test.csv" + } + } + ], + "C": [1.5e-3], + "kernel": ["linear"], + "probability": [""] + }, + { + "algorithm": "svm", + "dataset": [ + { + "source": "csv", + "name": "klaverjas", + "training": + { + "x": "data/klaverjas_x_train.csv", + "y": "data/klaverjas_y_train.csv" + }, + "testing": + { + "x": "data/klaverjas_x_test.csv", + "y": "data/klaverjas_y_test.csv" + } + } + ], + "C": [1.0], + "kernel": ["rbf"], + "probability": [""] + }, + { + "algorithm": "svm", + "dataset": [ + { + "source": "csv", + "name": "connect4", + "training": + { + "x": "data/connect_x_train.csv", + "y": "data/connect_y_train.csv" + }, + "testing": + { + "x": "data/connect_x_test.csv", + "y": "data/connect_y_test.csv" + } + } + ], + "C": [100.0], + "kernel": ["linear"], + "probability": [""] + }, + { + "algorithm": "svm", + "dataset": [ + { + "source": "csv", + "name": "mnist", + "training": + { + "x": "data/mnist_x_train.csv", + "y": "data/mnist_y_train.csv" + }, + "testing": + { + "x": "data/mnist_x_test.csv", + "y": "data/mnist_y_test.csv" + } + } + ], + "C": [50.0], + "kernel": ["rbf"], + "probability": [""] + }, + { + "algorithm": "svm", + "dataset": [ + { + "source": "csv", + "name": "sensit", + "training": + { + "x": "data/sensit_x_train.csv", + "y": "data/sensit_y_train.csv" + }, + "testing": + { + "x": "data/sensit_x_test.csv", + "y": "data/sensit_y_test.csv" + } + } + ], + "C": [500.0], + "kernel": ["linear"], + "probability": [""] + }, + { + "algorithm": "svm", + "dataset": [ + { + "source": "csv", + "name": "skin_segmentation", + "training": + { + "x": "data/skin_segmentation_x_train.csv", + "y": "data/skin_segmentation_y_train.csv" + }, + "testing": + { + "x": "data/skin_segmentation_x_test.csv", + "y": "data/skin_segmentation_y_test.csv" + } + } + ], + "C": [1.0], + "kernel": ["rbf"], + "probability": [""] + }, + { + "algorithm": 
"svm", + "dataset": [ + { + "source": "csv", + "name": "covertype", + "training": + { + "x": "data/covertype_x_train.csv", + "y": "data/covertype_y_train.csv" + }, + "testing": + { + "x": "data/covertype_x_test.csv", + "y": "data/covertype_y_test.csv" + } + } + ], + "C": [100.0], + "kernel": ["rbf"], + "probability": [""] + }, + { + "algorithm": "svm", + "dataset": [ + { + "source": "csv", + "name": "codrnanorm", + "training": + { + "x": "data/codrnanorm_x_train.csv", + "y": "data/codrnanorm_y_train.csv" + }, + "testing": + { + "x": "data/codrnanorm_x_test.csv", + "y": "data/codrnanorm_y_test.csv" + } + } + ], + "C": [1000.0], + "kernel": ["linear"], + "probability": [""] + } + ] +} diff --git a/cuml_bench/README.md b/cuml_bench/README.md index bf6b12ada..e65f11432 100644 --- a/cuml_bench/README.md +++ b/cuml_bench/README.md @@ -137,10 +137,9 @@ You can launch benchmarks for each algorithm separately. The tables below list a | C | float | 0.01 | SVM slack parameter | | kernel | str | linear | *linear* or *rbf*. SVM kernel function | | gamma | float | None | Parameter for kernel="rbf" | -| maxiter | int | 2000 | Maximum iterations for the iterative solver | | max-cache-size | int | 64 | Maximum cache size for SVM. | | tol | float | 1e-16 | Tolerance passed to sklearn.svm.SVC | -| no-shrinking | action | True | Don't use shrinking heuristic | +| probability | action | True | Use probability for SVC | #### train_test_split diff --git a/cuml_bench/svm.py b/cuml_bench/svm.py index 0f24a5ec0..d6e9b12df 100644 --- a/cuml_bench/svm.py +++ b/cuml_bench/svm.py @@ -26,18 +26,17 @@ help='SVM regularization parameter') parser.add_argument('--kernel', choices=('linear', 'rbf'), default='linear', help='SVM kernel function') -parser.add_argument('--maxiter', type=int, default=-1, - help='Maximum iterations for the iterative solver. 
' - '-1 means no limit.') parser.add_argument('--gamma', type=float, default=None, help='Parameter for kernel="rbf"') parser.add_argument('--max-cache-size', type=int, default=8, help='Maximum cache size, in gigabytes, for SVM.') parser.add_argument('--tol', type=float, default=1e-3, help='Tolerance passed to sklearn.svm.SVC') +parser.add_argument('--probability', action='store_true', default=False, + dest='probability', help="Use probability for SVC") + params = bench.parse_args(parser) -# Load data X_train, X_test, y_train, y_test = bench.load_data(params) if params.gamma is None: @@ -48,25 +47,36 @@ params.cache_size_mb = cache_size_bytes / 1024**2 params.n_classes = y_train[y_train.columns[0]].nunique() -# Create our C-SVM classifier clf = SVC(C=params.C, kernel=params.kernel, max_iter=params.maxiter, cache_size=params.cache_size_mb, tol=params.tol, gamma=params.gamma) -# Time fit and predict fit_time, _ = bench.measure_function_time(clf.fit, X_train, y_train, params=params) params.sv_len = clf.support_.shape[0] -predict_time, y_pred = bench.measure_function_time( - clf.predict, X_train, params=params) -train_acc = 100 * bench.accuracy_score(y_pred, y_train) +if params.probability: + state_predict = 'predict_proba' + accuracy_type = 'log_loss' + def metric_call(x, y): return bench.log_loss(x, y) + clf_predict = clf.predict_proba +else: + state_predict = 'prediction' + accuracy_type = 'accuracy[%]' + def metric_call(x, y): return 100 * bench.accuracy_score(x, y) + clf_predict = clf.predict + + +predict_train_time, y_pred = bench.measure_function_time( + clf_predict, X_train, params=params) +train_acc = metric_call(y_train, y_pred) -y_pred = clf.predict(X_test) -test_acc = 100 * bench.accuracy_score(y_pred, y_test) +predict_test_time, y_pred = bench.measure_function_time( + clf_predict, X_test, params=params) +test_acc = metric_call(y_test, y_pred) bench.print_output(library='cuml', algorithm='svc', - stages=['training', 'prediction'], params=params, + stages=['training', state_predict], params=params, functions=['SVM.fit', 'SVM.predict'], - times=[fit_time, predict_time], accuracy_type='accuracy[%]', + times=[fit_time, predict_train_time], accuracy_type=accuracy_type, accuracies=[train_acc, test_acc], data=[X_train, X_train], alg_instance=clf) diff --git a/datasets/loader.py b/datasets/loader.py index 055fd52a6..d6cd97954 100755 --- a/datasets/loader.py +++ b/datasets/loader.py @@ -179,9 +179,9 @@ def connect(dataset_dir=None): http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass.htm Classification task. n_classes = 3. 
- connect X train dataset (196045, 127) - connect y train dataset (196045, 1) - connect X test dataset (49012, 127) + connect X train dataset (60801, 126) + connect y train dataset (60801, 1) + connect X test dataset (49012, 126) connect y test dataset (49012, 1) """ dataset_name = 'connect' diff --git a/runner.py b/runner.py index 58f99588a..a77c29a51 100755 --- a/runner.py +++ b/runner.py @@ -238,6 +238,7 @@ class GenerationArgs: except json.JSONDecodeError as decoding_exception: stderr += f'CASE {case} JSON DECODING ERROR:\n' \ + f'{decoding_exception}\n{stdout}\n' + if stderr != '': is_successful = False logging.warning('Error in benchmark: \n' + stderr) diff --git a/sklearn_bench/README.md b/sklearn_bench/README.md index d3859ab1f..9f6887197 100644 --- a/sklearn_bench/README.md +++ b/sklearn_bench/README.md @@ -3,7 +3,13 @@ If you want to test scikit-learn, then use -`conda create -n bench -c intel python=3.7 scikit-learn daal4py pandas` +```bash +pip install -r sklearn_bench/requirements.txt +``` +or +```bash +conda create -n bench -c conda-forge scikit-learn scikit-learn-intelex pandas +``` ## Algorithms parameters @@ -144,10 +150,9 @@ You can launch benchmarks for each algorithm separately. The tables below list a | C | float | 0.01 | SVM slack parameter | | kernel | str | linear | *linear* or *rbf*. SVM kernel function | | gamma | float | None | Parameter for kernel="rbf" | -| maxiter | int | 2000 | Maximum iterations for the iterative solver | | max-cache-size | int | 64 | Maximum cache size for SVM. | | tol | float | 1e-16 | Tolerance passed to sklearn.svm.SVC | -| no-shrinking | action | True | Don't use shrinking heuristic | +| probability | action | True | Use probability for SVC | #### train_test_split diff --git a/sklearn_bench/requirements.txt b/sklearn_bench/requirements.txt new file mode 100755 index 000000000..d25373e5e --- /dev/null +++ b/sklearn_bench/requirements.txt @@ -0,0 +1,4 @@ +scikit-learn +pandas +scikit-learn-intelex +openpyxl diff --git a/sklearn_bench/svm.py b/sklearn_bench/svm.py index 102a6e62a..914f0a98a 100644 --- a/sklearn_bench/svm.py +++ b/sklearn_bench/svm.py @@ -18,13 +18,12 @@ import bench import numpy as np -from sklearn.metrics import accuracy_score +from sklearn.metrics import accuracy_score, log_loss def main(): from sklearn.svm import SVC - # Load data X_train, X_test, y_train, y_test = bench.load_data(params) if params.gamma is None: @@ -35,26 +34,35 @@ def main(): params.cache_size_mb = cache_size_bytes / 1024**2 params.n_classes = len(np.unique(y_train)) - # Create our C-SVM classifier - clf = SVC(C=params.C, kernel=params.kernel, max_iter=params.maxiter, - cache_size=params.cache_size_mb, tol=params.tol, - shrinking=params.shrinking, gamma=params.gamma) + clf = SVC(C=params.C, kernel=params.kernel, cache_size=params.cache_size_mb, + tol=params.tol, gamma=params.gamma, probability=params.probability) - # Time fit and predict fit_time, _ = bench.measure_function_time(clf.fit, X_train, y_train, params=params) params.sv_len = clf.support_.shape[0] - predict_time, y_pred = bench.measure_function_time( - clf.predict, X_train, params=params) - train_acc = 100 * accuracy_score(y_pred, y_train) + if params.probability: + state_predict = 'predict_proba' + accuracy_type = 'log_loss' + def metric_call(x, y): return log_loss(x, y) + clf_predict = clf.predict_proba + else: + state_predict = 'predict' + accuracy_type = 'accuracy[%]' + def metric_call(x, y): return 100 * accuracy_score(x, y) + clf_predict = clf.predict - y_pred = clf.predict(X_test) - 
test_acc = 100 * accuracy_score(y_pred, y_test) + predict_train_time, y_pred = bench.measure_function_time( + clf_predict, X_train, params=params) + train_acc = metric_call(y_train, y_pred) + + predict_test_time, y_pred = bench.measure_function_time( + clf_predict, X_test, params=params) + test_acc = metric_call(y_test, y_pred) bench.print_output(library='sklearn', algorithm='svc', - stages=['training', 'prediction'], - params=params, functions=['SVM.fit', 'SVM.predict'], - times=[fit_time, predict_time], accuracy_type='accuracy[%]', + stages=['training', state_predict], + params=params, functions=['SVM.fit', f'SVM.{state_predict}'], + times=[fit_time, predict_train_time], accuracy_type=accuracy_type, accuracies=[train_acc, test_acc], data=[X_train, X_train], alg_instance=clf) @@ -64,18 +72,16 @@ def main(): parser.add_argument('-C', dest='C', type=float, default=1.0, help='SVM regularization parameter') - parser.add_argument('--kernel', choices=('linear', 'rbf'), + parser.add_argument('--kernel', choices=('linear', 'rbf', 'poly'), default='linear', help='SVM kernel function') parser.add_argument('--gamma', type=float, default=None, help='Parameter for kernel="rbf"') - parser.add_argument('--maxiter', type=int, default=-1, - help='Maximum iterations for the iterative solver. ' - '-1 means no limit.') parser.add_argument('--max-cache-size', type=int, default=8, help='Maximum cache size, in gigabytes, for SVM.') parser.add_argument('--tol', type=float, default=1e-3, help='Tolerance passed to sklearn.svm.SVC') - parser.add_argument('--no-shrinking', action='store_false', default=True, - dest='shrinking', help="Don't use shrinking heuristic") + parser.add_argument('--probability', action='store_true', default=False, + dest='probability', help="Use probability for SVC") + params = bench.parse_args(parser, loop_types=('fit', 'predict')) bench.run_with_context(params, main) diff --git a/utils.py b/utils.py index a3cfa7a68..6d77d4765 100755 --- a/utils.py +++ b/utils.py @@ -24,10 +24,9 @@ def filter_stderr(text): - # delete 'Intel(R) DAAL usage in sklearn' messages - fake_error_message = 'Intel(R) oneAPI Data Analytics Library solvers ' + \ - 'for sklearn enabled: ' + \ - 'https://intelpython.github.io/daal4py/sklearn.html' + # delete 'Intel(R) Extension for Scikit-learn usage in sklearn' messages + fake_error_message = 'Intel(R) Extension for Scikit-learn* enabled ' + \ + '(https://github.com/intel/scikit-learn-intelex)' while fake_error_message in text: text = text.replace(fake_error_message, '') return text From b081c3c457fd5c39a68d420ddbf42f1a59e80968 Mon Sep 17 00:00:00 2001 From: Kirill Petrov Date: Fri, 16 Apr 2021 01:57:38 +0300 Subject: [PATCH 02/14] extend; fix problems --- README.md | 12 +- bench.py | 4 +- configs/algorithms/svc_proba_cuml.json | 230 ++++++++++++++++++ ...{svc_proba.json => svc_proba_sklearn.json} | 2 +- sklearn_bench/README.md | 2 +- xgboost_bench/README.md | 9 +- xgboost_bench/requirements.txt | 4 + 7 files changed, 255 insertions(+), 8 deletions(-) create mode 100755 configs/algorithms/svc_proba_cuml.json rename configs/algorithms/{svc_proba.json => svc_proba_sklearn.json} (96%) create mode 100755 xgboost_bench/requirements.txt diff --git a/README.md b/README.md index ad1fc4ab4..130902392 100755 --- a/README.md +++ b/README.md @@ -43,25 +43,29 @@ pip install -r sklearn_bench/requirements.txt ``` or ```bash -conda create -n bench -c conda-forge scikit-learn scikit-learn-intelex pandas +conda install -c conda-forge scikit-learn scikit-learn-intelex pandas ``` * 
[**daal4py**](daal4py_bench#how-to-create-conda-environment-for-benchmarking) ```bash -conda create -n bench -c intel python=3.7 scikit-learn daal4py pandas +conda install -c intel python=3.7 scikit-learn daal4py pandas ``` * [**cuml**](cuml_bench#how-to-create-conda-environment-for-benchmarking) ```bash -conda create -n bench -c rapidsai -c conda-forge python=3.7 cuml pandas cudf +conda install -c rapidsai -c conda-forge python=3.7 cuml pandas cudf ``` * [**xgboost**](xgboost_bench#how-to-create-conda-environment-for-benchmarking) ```bash -conda create -n bench -c conda-forge python=3.7 xgboost pandas +pip install -r xgboost_bench/requirements.txt +``` +or +```bash +conda install -c conda-forge xgboost pandas ``` ## Running Python benchmarks with runner script diff --git a/bench.py b/bench.py index 2dd766073..3802c7a7b 100644 --- a/bench.py +++ b/bench.py @@ -339,9 +339,11 @@ def columnwise_score(y, yp, score_func): def accuracy_score(y, yp): return columnwise_score(y, yp, lambda y1, y2: np.mean(y1 == y2)) + def log_loss(y, yp): from sklearn.metrics import log_loss - return columnwise_score(y, yp, log_loss(y1 == y2)) + return columnwise_score(y, yp, lambda y1, y2: log_loss(y1, y2)) + def rmse_score(y, yp): return columnwise_score( diff --git a/configs/algorithms/svc_proba_cuml.json b/configs/algorithms/svc_proba_cuml.json new file mode 100755 index 000000000..eaeb3b79a --- /dev/null +++ b/configs/algorithms/svc_proba_cuml.json @@ -0,0 +1,230 @@ +{ + "common": { + "lib": ["cuml"], + "data-format": ["cudf"], + "data-order": ["F"], + "dtype": ["float64"] + }, + "cases": [ + { + "algorithm": "svm", + "dataset": [ + { + "source": "csv", + "name": "ijcnn", + "training": + { + "x": "data/ijcnn_x_train.csv", + "y": "data/ijcnn_y_train.csv" + }, + "testing": + { + "x": "data/ijcnn_x_test.csv", + "y": "data/ijcnn_y_test.csv" + } + } + ], + "C": [1000.0], + "kernel": ["linear"], + "probability": [""] + }, + { + "algorithm": "svm", + "dataset": [ + { + "source": "csv", + "name": "a9a", + "training": + { + "x": "data/a9a_x_train.csv", + "y": "data/a9a_y_train.csv" + }, + "testing": + { + "x": "data/a9a_x_test.csv", + "y": "data/a9a_y_test.csv" + } + } + ], + "C": [500.0], + "kernel": ["rbf"], + "probability": [""] + }, + { + "algorithm": "svm", + "dataset": [ + { + "source": "csv", + "name": "gisette", + "training": + { + "x": "data/gisette_x_train.csv", + "y": "data/gisette_y_train.csv" + }, + "testing": + { + "x": "data/gisette_x_test.csv", + "y": "data/gisette_y_test.csv" + } + } + ], + "C": [1.5e-3], + "kernel": ["linear"], + "probability": [""] + }, + { + "algorithm": "svm", + "dataset": [ + { + "source": "csv", + "name": "klaverjas", + "training": + { + "x": "data/klaverjas_x_train.csv", + "y": "data/klaverjas_y_train.csv" + }, + "testing": + { + "x": "data/klaverjas_x_test.csv", + "y": "data/klaverjas_y_test.csv" + } + } + ], + "C": [1.0], + "kernel": ["rbf"], + "probability": [""] + }, + { + "algorithm": "svm", + "dataset": [ + { + "source": "csv", + "name": "connect4", + "training": + { + "x": "data/connect_x_train.csv", + "y": "data/connect_y_train.csv" + }, + "testing": + { + "x": "data/connect_x_test.csv", + "y": "data/connect_y_test.csv" + } + } + ], + "C": [100.0], + "kernel": ["linear"], + "probability": [""] + }, + { + "algorithm": "svm", + "dataset": [ + { + "source": "csv", + "name": "mnist", + "training": + { + "x": "data/mnist_x_train.csv", + "y": "data/mnist_y_train.csv" + }, + "testing": + { + "x": "data/mnist_x_test.csv", + "y": "data/mnist_y_test.csv" + } + } + ], + "C": 
[50.0], + "kernel": ["rbf"], + "probability": [""] + }, + { + "algorithm": "svm", + "dataset": [ + { + "source": "csv", + "name": "sensit", + "training": + { + "x": "data/sensit_x_train.csv", + "y": "data/sensit_y_train.csv" + }, + "testing": + { + "x": "data/sensit_x_test.csv", + "y": "data/sensit_y_test.csv" + } + } + ], + "C": [500.0], + "kernel": ["linear"], + "probability": [""] + }, + { + "algorithm": "svm", + "dataset": [ + { + "source": "csv", + "name": "skin_segmentation", + "training": + { + "x": "data/skin_segmentation_x_train.csv", + "y": "data/skin_segmentation_y_train.csv" + }, + "testing": + { + "x": "data/skin_segmentation_x_test.csv", + "y": "data/skin_segmentation_y_test.csv" + } + } + ], + "C": [1.0], + "kernel": ["rbf"], + "probability": [""] + }, + { + "algorithm": "svm", + "dataset": [ + { + "source": "csv", + "name": "covertype", + "training": + { + "x": "data/covertype_x_train.csv", + "y": "data/covertype_y_train.csv" + }, + "testing": + { + "x": "data/covertype_x_test.csv", + "y": "data/covertype_y_test.csv" + } + } + ], + "C": [100.0], + "kernel": ["rbf"], + "probability": [""] + }, + { + "algorithm": "svm", + "dataset": [ + { + "source": "csv", + "name": "codrnanorm", + "training": + { + "x": "data/codrnanorm_x_train.csv", + "y": "data/codrnanorm_y_train.csv" + }, + "testing": + { + "x": "data/codrnanorm_x_test.csv", + "y": "data/codrnanorm_y_test.csv" + } + } + ], + "C": [1000.0], + "kernel": ["linear"], + "probability": [""] + } + ] +} diff --git a/configs/algorithms/svc_proba.json b/configs/algorithms/svc_proba_sklearn.json similarity index 96% rename from configs/algorithms/svc_proba.json rename to configs/algorithms/svc_proba_sklearn.json index 7b7ec3c85..98fa6d3e4 100755 --- a/configs/algorithms/svc_proba.json +++ b/configs/algorithms/svc_proba_sklearn.json @@ -99,7 +99,7 @@ "dataset": [ { "source": "csv", - "name": "connect4", + "name": "connect", "training": { "x": "data/connect_x_train.csv", diff --git a/sklearn_bench/README.md b/sklearn_bench/README.md index 9f6887197..3ec6c2b42 100644 --- a/sklearn_bench/README.md +++ b/sklearn_bench/README.md @@ -8,7 +8,7 @@ pip install -r sklearn_bench/requirements.txt ``` or ```bash -conda create -n bench -c conda-forge scikit-learn scikit-learn-intelex pandas +conda install -c conda-forge scikit-learn scikit-learn-intelex pandas ``` ## Algorithms parameters diff --git a/xgboost_bench/README.md b/xgboost_bench/README.md index e8d93df58..e00feb7cb 100644 --- a/xgboost_bench/README.md +++ b/xgboost_bench/README.md @@ -1,6 +1,13 @@ ## How to create conda environment for benchmarking - conda create -n bench -c conda-forge python=3.7 xgboost pandas +```bash +pip install -r xgboost_bench/requirements.txt +``` +or + +```bash +conda install -c conda-forge xgboost pandas +``` ## Algorithms parameters diff --git a/xgboost_bench/requirements.txt b/xgboost_bench/requirements.txt new file mode 100755 index 000000000..1540ec04f --- /dev/null +++ b/xgboost_bench/requirements.txt @@ -0,0 +1,4 @@ +scikit-learn +pandas +xgboost +openpyxl From c19f11a533563b9554e87fe8dc41a5021a522444 Mon Sep 17 00:00:00 2001 From: Kirill Petrov Date: Fri, 16 Apr 2021 02:01:09 +0300 Subject: [PATCH 03/14] minnor fix --- bench.py | 4 ++-- cuml_bench/svm.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bench.py b/bench.py index 3802c7a7b..7da2328d7 100644 --- a/bench.py +++ b/bench.py @@ -341,8 +341,8 @@ def accuracy_score(y, yp): def log_loss(y, yp): - from sklearn.metrics import log_loss - return columnwise_score(y, yp, lambda 
y1, y2: log_loss(y1, y2)) + from sklearn.metrics import log_loss as sklearn_log_loss + return columnwise_score(y, yp, sklearn_log_loss) def rmse_score(y, yp): diff --git a/cuml_bench/svm.py b/cuml_bench/svm.py index d6e9b12df..9421e6ece 100644 --- a/cuml_bench/svm.py +++ b/cuml_bench/svm.py @@ -24,7 +24,7 @@ parser.add_argument('-C', dest='C', type=float, default=1.0, help='SVM regularization parameter') -parser.add_argument('--kernel', choices=('linear', 'rbf'), +parser.add_argument('--kernel', choices=('linear', 'rbf', 'poly'), default='linear', help='SVM kernel function') parser.add_argument('--gamma', type=float, default=None, help='Parameter for kernel="rbf"') From 8eadd82d0e0d1bb84d78af04412a9b33142d56bb Mon Sep 17 00:00:00 2001 From: Kirill Petrov Date: Fri, 16 Apr 2021 02:12:04 +0300 Subject: [PATCH 04/14] cache size is 2 --- configs/algorithms/svc_proba_cuml.json | 10 ++++++++++ configs/algorithms/svc_proba_sklearn.json | 7 +++++++ cuml_bench/svm.py | 6 ++++-- sklearn_bench/svm.py | 6 ++++-- 4 files changed, 25 insertions(+), 4 deletions(-) diff --git a/configs/algorithms/svc_proba_cuml.json b/configs/algorithms/svc_proba_cuml.json index eaeb3b79a..08ac3dec2 100755 --- a/configs/algorithms/svc_proba_cuml.json +++ b/configs/algorithms/svc_proba_cuml.json @@ -26,6 +26,7 @@ ], "C": [1000.0], "kernel": ["linear"], + "max-cache-size": [2], "probability": [""] }, { @@ -48,6 +49,7 @@ ], "C": [500.0], "kernel": ["rbf"], + "max-cache-size": [2], "probability": [""] }, { @@ -70,6 +72,7 @@ ], "C": [1.5e-3], "kernel": ["linear"], + "max-cache-size": [2], "probability": [""] }, { @@ -92,6 +95,7 @@ ], "C": [1.0], "kernel": ["rbf"], + "max-cache-size": [2], "probability": [""] }, { @@ -114,6 +118,7 @@ ], "C": [100.0], "kernel": ["linear"], + "max-cache-size": [2], "probability": [""] }, { @@ -136,6 +141,7 @@ ], "C": [50.0], "kernel": ["rbf"], + "max-cache-size": [2], "probability": [""] }, { @@ -158,6 +164,7 @@ ], "C": [500.0], "kernel": ["linear"], + "max-cache-size": [2], "probability": [""] }, { @@ -180,6 +187,7 @@ ], "C": [1.0], "kernel": ["rbf"], + "max-cache-size": [2], "probability": [""] }, { @@ -202,6 +210,7 @@ ], "C": [100.0], "kernel": ["rbf"], + "max-cache-size": [2], "probability": [""] }, { @@ -224,6 +233,7 @@ ], "C": [1000.0], "kernel": ["linear"], + "max-cache-size": [2], "probability": [""] } ] diff --git a/configs/algorithms/svc_proba_sklearn.json b/configs/algorithms/svc_proba_sklearn.json index 98fa6d3e4..db9a1bc33 100755 --- a/configs/algorithms/svc_proba_sklearn.json +++ b/configs/algorithms/svc_proba_sklearn.json @@ -92,6 +92,7 @@ ], "C": [1.0], "kernel": ["rbf"], + "max-cache-size": [2], "probability": [""] }, { @@ -114,6 +115,7 @@ ], "C": [100.0], "kernel": ["linear"], + "max-cache-size": [2], "probability": [""] }, { @@ -136,6 +138,7 @@ ], "C": [50.0], "kernel": ["rbf"], + "max-cache-size": [2], "probability": [""] }, { @@ -158,6 +161,7 @@ ], "C": [500.0], "kernel": ["linear"], + "max-cache-size": [2], "probability": [""] }, { @@ -180,6 +184,7 @@ ], "C": [1.0], "kernel": ["rbf"], + "max-cache-size": [2], "probability": [""] }, { @@ -202,6 +207,7 @@ ], "C": [100.0], "kernel": ["rbf"], + "max-cache-size": [2], "probability": [""] }, { @@ -224,6 +230,7 @@ ], "C": [1000.0], "kernel": ["linear"], + "max-cache-size": [2], "probability": [""] } ] diff --git a/cuml_bench/svm.py b/cuml_bench/svm.py index 9421e6ece..e5e633b6b 100644 --- a/cuml_bench/svm.py +++ b/cuml_bench/svm.py @@ -57,12 +57,14 @@ if params.probability: state_predict = 'predict_proba' accuracy_type = 'log_loss' 
- def metric_call(x, y): return bench.log_loss(x, y) + def metric_call(x, y): + return bench.log_loss(x, y) clf_predict = clf.predict_proba else: state_predict = 'prediction' accuracy_type = 'accuracy[%]' - def metric_call(x, y): return 100 * bench.accuracy_score(x, y) + def metric_call(x, y): + return 100 * bench.accuracy_score(x, y) clf_predict = clf.predict diff --git a/sklearn_bench/svm.py b/sklearn_bench/svm.py index 914f0a98a..fad43c6a9 100644 --- a/sklearn_bench/svm.py +++ b/sklearn_bench/svm.py @@ -43,12 +43,14 @@ def main(): if params.probability: state_predict = 'predict_proba' accuracy_type = 'log_loss' - def metric_call(x, y): return log_loss(x, y) + def metric_call(x, y): + return log_loss(x, y) clf_predict = clf.predict_proba else: state_predict = 'predict' accuracy_type = 'accuracy[%]' - def metric_call(x, y): return 100 * accuracy_score(x, y) + def metric_call(x, y): + return 100 * accuracy_score(x, y) clf_predict = clf.predict predict_train_time, y_pred = bench.measure_function_time( From 6fde87519650a1512a3b1905fa60bc8c0949a9d1 Mon Sep 17 00:00:00 2001 From: Kirill Petrov Date: Fri, 16 Apr 2021 02:20:14 +0300 Subject: [PATCH 05/14] add random_state --- cuml_bench/svm.py | 2 +- sklearn_bench/svm.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/cuml_bench/svm.py b/cuml_bench/svm.py index e5e633b6b..a37f8ad1a 100644 --- a/cuml_bench/svm.py +++ b/cuml_bench/svm.py @@ -49,7 +49,7 @@ clf = SVC(C=params.C, kernel=params.kernel, max_iter=params.maxiter, cache_size=params.cache_size_mb, tol=params.tol, - gamma=params.gamma) + gamma=params.gamma, random_state=43) fit_time, _ = bench.measure_function_time(clf.fit, X_train, y_train, params=params) params.sv_len = clf.support_.shape[0] diff --git a/sklearn_bench/svm.py b/sklearn_bench/svm.py index fad43c6a9..20a7d2d7e 100644 --- a/sklearn_bench/svm.py +++ b/sklearn_bench/svm.py @@ -35,7 +35,8 @@ def main(): params.n_classes = len(np.unique(y_train)) clf = SVC(C=params.C, kernel=params.kernel, cache_size=params.cache_size_mb, - tol=params.tol, gamma=params.gamma, probability=params.probability) + tol=params.tol, gamma=params.gamma, probability=params.probability, + random_state=43) fit_time, _ = bench.measure_function_time(clf.fit, X_train, y_train, params=params) params.sv_len = clf.support_.shape[0] From b62efc70313f558317037038ed9ee80e381398af Mon Sep 17 00:00:00 2001 From: Kirill Petrov Date: Fri, 16 Apr 2021 02:25:25 +0300 Subject: [PATCH 06/14] fix pep8 --- cuml_bench/svm.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/cuml_bench/svm.py b/cuml_bench/svm.py index a37f8ad1a..0350c5101 100644 --- a/cuml_bench/svm.py +++ b/cuml_bench/svm.py @@ -57,14 +57,12 @@ if params.probability: state_predict = 'predict_proba' accuracy_type = 'log_loss' - def metric_call(x, y): - return bench.log_loss(x, y) + def metric_call(x, y): return bench.log_loss(x, y) clf_predict = clf.predict_proba else: state_predict = 'prediction' accuracy_type = 'accuracy[%]' - def metric_call(x, y): - return 100 * bench.accuracy_score(x, y) + def metric_call(x, y): return 100 * bench.accuracy_score(x, y) clf_predict = clf.predict From 5b9a6496059ba8f4586cb42643c451485a18d18e Mon Sep 17 00:00:00 2001 From: Kirill Petrov Date: Fri, 16 Apr 2021 02:28:37 +0300 Subject: [PATCH 07/14] fix parameters --- cuml_bench/svm.py | 5 ++--- sklearn_bench/svm.py | 6 ++---- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/cuml_bench/svm.py b/cuml_bench/svm.py index 0350c5101..cc12e010b 100644 --- a/cuml_bench/svm.py +++ 
b/cuml_bench/svm.py @@ -47,9 +47,8 @@ params.cache_size_mb = cache_size_bytes / 1024**2 params.n_classes = y_train[y_train.columns[0]].nunique() -clf = SVC(C=params.C, kernel=params.kernel, max_iter=params.maxiter, - cache_size=params.cache_size_mb, tol=params.tol, - gamma=params.gamma, random_state=43) +clf = SVC(C=params.C, kernel=params.kernel, cache_size=params.cache_size_mb, + tol=params.tol, gamma=params.gamma, random_state=43) fit_time, _ = bench.measure_function_time(clf.fit, X_train, y_train, params=params) params.sv_len = clf.support_.shape[0] diff --git a/sklearn_bench/svm.py b/sklearn_bench/svm.py index 20a7d2d7e..694b21bfe 100644 --- a/sklearn_bench/svm.py +++ b/sklearn_bench/svm.py @@ -44,14 +44,12 @@ def main(): if params.probability: state_predict = 'predict_proba' accuracy_type = 'log_loss' - def metric_call(x, y): - return log_loss(x, y) + def metric_call(x, y): return log_loss(x, y) clf_predict = clf.predict_proba else: state_predict = 'predict' accuracy_type = 'accuracy[%]' - def metric_call(x, y): - return 100 * accuracy_score(x, y) + def metric_call(x, y): return 100 * accuracy_score(x, y) clf_predict = clf.predict predict_train_time, y_pred = bench.measure_function_time( From 6a7efb56ba9153262694b5b47fcef1c59eb73ce0 Mon Sep 17 00:00:00 2001 From: Kirill Petrov Date: Fri, 16 Apr 2021 02:34:48 +0300 Subject: [PATCH 08/14] fix cuml --- cuml_bench/svm.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cuml_bench/svm.py b/cuml_bench/svm.py index cc12e010b..347ebf79b 100644 --- a/cuml_bench/svm.py +++ b/cuml_bench/svm.py @@ -48,7 +48,8 @@ params.n_classes = y_train[y_train.columns[0]].nunique() clf = SVC(C=params.C, kernel=params.kernel, cache_size=params.cache_size_mb, - tol=params.tol, gamma=params.gamma, random_state=43) + tol=params.tol, gamma=params.gamma, + probability=params.probability, random_state=43) fit_time, _ = bench.measure_function_time(clf.fit, X_train, y_train, params=params) params.sv_len = clf.support_.shape[0] From 452366477035d67c3f3388d7a0b6a8bd45d45de4 Mon Sep 17 00:00:00 2001 From: Kirill Petrov Date: Fri, 16 Apr 2021 02:49:15 +0300 Subject: [PATCH 09/14] improve configs; fix warning --- .../{algorithms => svm}/svc_proba_cuml.json | 44 ++++++------------- .../svc_proba_sklearn.json | 41 ++++++----------- datasets/loader.py | 2 +- 3 files changed, 27 insertions(+), 60 deletions(-) rename configs/{algorithms => svm}/svc_proba_cuml.json (83%) rename configs/{algorithms => svm}/svc_proba_sklearn.json (84%) diff --git a/configs/algorithms/svc_proba_cuml.json b/configs/svm/svc_proba_cuml.json similarity index 83% rename from configs/algorithms/svc_proba_cuml.json rename to configs/svm/svc_proba_cuml.json index 08ac3dec2..7d0fb7d58 100755 --- a/configs/algorithms/svc_proba_cuml.json +++ b/configs/svm/svc_proba_cuml.json @@ -3,7 +3,9 @@ "lib": ["cuml"], "data-format": ["cudf"], "data-order": ["F"], - "dtype": ["float64"] + "dtype": ["float64"], + "max-cache-size": [2], + "probability": [""] }, "cases": [ { @@ -25,9 +27,7 @@ } ], "C": [1000.0], - "kernel": ["linear"], - "max-cache-size": [2], - "probability": [""] + "kernel": ["linear"] }, { "algorithm": "svm", @@ -48,9 +48,7 @@ } ], "C": [500.0], - "kernel": ["rbf"], - "max-cache-size": [2], - "probability": [""] + "kernel": ["rbf"] }, { "algorithm": "svm", @@ -71,9 +69,7 @@ } ], "C": [1.5e-3], - "kernel": ["linear"], - "max-cache-size": [2], - "probability": [""] + "kernel": ["linear"] }, { "algorithm": "svm", @@ -94,9 +90,7 @@ } ], "C": [1.0], - "kernel": ["rbf"], - "max-cache-size": 
[2], - "probability": [""] + "kernel": ["rbf"] }, { "algorithm": "svm", @@ -117,9 +111,7 @@ } ], "C": [100.0], - "kernel": ["linear"], - "max-cache-size": [2], - "probability": [""] + "kernel": ["linear"] }, { "algorithm": "svm", @@ -140,9 +132,7 @@ } ], "C": [50.0], - "kernel": ["rbf"], - "max-cache-size": [2], - "probability": [""] + "kernel": ["rbf"] }, { "algorithm": "svm", @@ -163,9 +153,7 @@ } ], "C": [500.0], - "kernel": ["linear"], - "max-cache-size": [2], - "probability": [""] + "kernel": ["linear"] }, { "algorithm": "svm", @@ -186,9 +174,7 @@ } ], "C": [1.0], - "kernel": ["rbf"], - "max-cache-size": [2], - "probability": [""] + "kernel": ["rbf"] }, { "algorithm": "svm", @@ -209,9 +195,7 @@ } ], "C": [100.0], - "kernel": ["rbf"], - "max-cache-size": [2], - "probability": [""] + "kernel": ["rbf"] }, { "algorithm": "svm", @@ -232,9 +216,7 @@ } ], "C": [1000.0], - "kernel": ["linear"], - "max-cache-size": [2], - "probability": [""] + "kernel": ["linear"] } ] } diff --git a/configs/algorithms/svc_proba_sklearn.json b/configs/svm/svc_proba_sklearn.json similarity index 84% rename from configs/algorithms/svc_proba_sklearn.json rename to configs/svm/svc_proba_sklearn.json index db9a1bc33..53c1676cf 100755 --- a/configs/algorithms/svc_proba_sklearn.json +++ b/configs/svm/svc_proba_sklearn.json @@ -3,7 +3,9 @@ "lib": ["sklearn"], "data-format": ["pandas"], "data-order": ["F"], - "dtype": ["float64"] + "dtype": ["float64"], + "max-cache-size": [2], + "probability": [""] }, "cases": [ { @@ -25,8 +27,7 @@ } ], "C": [1000.0], - "kernel": ["linear"], - "probability": [""] + "kernel": ["linear"] }, { "algorithm": "svm", @@ -47,8 +48,7 @@ } ], "C": [500.0], - "kernel": ["rbf"], - "probability": [""] + "kernel": ["rbf"] }, { "algorithm": "svm", @@ -69,8 +69,7 @@ } ], "C": [1.5e-3], - "kernel": ["linear"], - "probability": [""] + "kernel": ["linear"] }, { "algorithm": "svm", @@ -91,9 +90,7 @@ } ], "C": [1.0], - "kernel": ["rbf"], - "max-cache-size": [2], - "probability": [""] + "kernel": ["rbf"] }, { "algorithm": "svm", @@ -114,9 +111,7 @@ } ], "C": [100.0], - "kernel": ["linear"], - "max-cache-size": [2], - "probability": [""] + "kernel": ["linear"] }, { "algorithm": "svm", @@ -137,9 +132,7 @@ } ], "C": [50.0], - "kernel": ["rbf"], - "max-cache-size": [2], - "probability": [""] + "kernel": ["rbf"] }, { "algorithm": "svm", @@ -160,9 +153,7 @@ } ], "C": [500.0], - "kernel": ["linear"], - "max-cache-size": [2], - "probability": [""] + "kernel": ["linear"] }, { "algorithm": "svm", @@ -183,9 +174,7 @@ } ], "C": [1.0], - "kernel": ["rbf"], - "max-cache-size": [2], - "probability": [""] + "kernel": ["rbf"] }, { "algorithm": "svm", @@ -206,9 +195,7 @@ } ], "C": [100.0], - "kernel": ["rbf"], - "max-cache-size": [2], - "probability": [""] + "kernel": ["rbf"] }, { "algorithm": "svm", @@ -229,9 +216,7 @@ } ], "C": [1000.0], - "kernel": ["linear"], - "max-cache-size": [2], - "probability": [""] + "kernel": ["linear"] } ] } diff --git a/datasets/loader.py b/datasets/loader.py index d6cd97954..45690c8ae 100755 --- a/datasets/loader.py +++ b/datasets/loader.py @@ -187,7 +187,7 @@ def connect(dataset_dir=None): dataset_name = 'connect' os.makedirs(dataset_dir, exist_ok=True) - X, y = fetch_openml(name='connect-4', return_X_y=True, + X, y = fetch_openml(name='connect-4', version=1, return_X_y=True, as_frame=False, data_home=dataset_dir) X = pd.DataFrame(X.todense()) y = pd.DataFrame(y) From 2d27ecfb33d702a623cec001b04b52bfc4282cfe Mon Sep 17 00:00:00 2001 From: Kirill Petrov Date: Fri, 16 Apr 2021 02:57:54 +0300 
Subject: [PATCH 10/14] fix cuml --- cuml_bench/svm.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cuml_bench/svm.py b/cuml_bench/svm.py index 347ebf79b..f0d07a2f5 100644 --- a/cuml_bench/svm.py +++ b/cuml_bench/svm.py @@ -48,11 +48,9 @@ params.n_classes = y_train[y_train.columns[0]].nunique() clf = SVC(C=params.C, kernel=params.kernel, cache_size=params.cache_size_mb, - tol=params.tol, gamma=params.gamma, - probability=params.probability, random_state=43) + tol=params.tol, gamma=params.gamma) fit_time, _ = bench.measure_function_time(clf.fit, X_train, y_train, params=params) -params.sv_len = clf.support_.shape[0] if params.probability: state_predict = 'predict_proba' From c109b9d2b3ba803346405143ba34c563839d1686 Mon Sep 17 00:00:00 2001 From: Kirill Petrov Date: Fri, 16 Apr 2021 03:03:12 +0300 Subject: [PATCH 11/14] fix proba parameter --- cuml_bench/svm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuml_bench/svm.py b/cuml_bench/svm.py index f0d07a2f5..5e402acf8 100644 --- a/cuml_bench/svm.py +++ b/cuml_bench/svm.py @@ -48,7 +48,7 @@ params.n_classes = y_train[y_train.columns[0]].nunique() clf = SVC(C=params.C, kernel=params.kernel, cache_size=params.cache_size_mb, - tol=params.tol, gamma=params.gamma) + tol=params.tol, gamma=params.gamma, probability=params.probability) fit_time, _ = bench.measure_function_time(clf.fit, X_train, y_train, params=params) From cfd7f7f94e6e1489f2c477800380cf2335ef0a99 Mon Sep 17 00:00:00 2001 From: Kirill Petrov Date: Fri, 16 Apr 2021 03:12:56 +0300 Subject: [PATCH 12/14] improve README --- README.md | 12 ++++-------- sklearn_bench/README.md | 4 +--- xgboost_bench/README.md | 5 +---- 3 files changed, 6 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 130902392..14944ee8c 100755 --- a/README.md +++ b/README.md @@ -40,31 +40,27 @@ Create a suitable conda environment for each framework to test. 
Each item in the ```bash pip install -r sklearn_bench/requirements.txt -``` -or -```bash +# or conda install -c conda-forge scikit-learn scikit-learn-intelex pandas ``` * [**daal4py**](daal4py_bench#how-to-create-conda-environment-for-benchmarking) ```bash -conda install -c intel python=3.7 scikit-learn daal4py pandas +conda install -c conda-forge scikit-learn daal4py pandas ``` * [**cuml**](cuml_bench#how-to-create-conda-environment-for-benchmarking) ```bash -conda install -c rapidsai -c conda-forge python=3.7 cuml pandas cudf +conda install -c rapidsai -c conda-forge cuml pandas cudf ``` * [**xgboost**](xgboost_bench#how-to-create-conda-environment-for-benchmarking) ```bash pip install -r xgboost_bench/requirements.txt -``` -or -```bash +# or conda install -c conda-forge xgboost pandas ``` diff --git a/sklearn_bench/README.md b/sklearn_bench/README.md index 3ec6c2b42..afa9c7eec 100644 --- a/sklearn_bench/README.md +++ b/sklearn_bench/README.md @@ -5,9 +5,7 @@ If you want to test scikit-learn, then use ```bash pip install -r sklearn_bench/requirements.txt -``` -or -```bash +# or conda install -c conda-forge scikit-learn scikit-learn-intelex pandas ``` diff --git a/xgboost_bench/README.md b/xgboost_bench/README.md index e00feb7cb..2b4e93ec5 100644 --- a/xgboost_bench/README.md +++ b/xgboost_bench/README.md @@ -2,10 +2,7 @@ ```bash pip install -r xgboost_bench/requirements.txt -``` -or - -```bash +# or conda install -c conda-forge xgboost pandas ``` From f7668b957a9fd3aa005f1fafa3d77f3acb1e7f85 Mon Sep 17 00:00:00 2001 From: Kirill Petrov Date: Fri, 16 Apr 2021 04:01:33 +0300 Subject: [PATCH 13/14] fix --- bench.py | 4 +++- configs/svm/svc_proba_cuml.json | 2 +- sklearn_bench/svm.py | 4 ++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/bench.py b/bench.py index 7da2328d7..772762503 100644 --- a/bench.py +++ b/bench.py @@ -342,7 +342,9 @@ def accuracy_score(y, yp): def log_loss(y, yp): from sklearn.metrics import log_loss as sklearn_log_loss - return columnwise_score(y, yp, sklearn_log_loss) + y = convert_to_numpy(y) + yp = convert_to_numpy(yp) + return sklearn_log_loss(y, yp) def rmse_score(y, yp): diff --git a/configs/svm/svc_proba_cuml.json b/configs/svm/svc_proba_cuml.json index 7d0fb7d58..85fe1f0df 100755 --- a/configs/svm/svc_proba_cuml.json +++ b/configs/svm/svc_proba_cuml.json @@ -97,7 +97,7 @@ "dataset": [ { "source": "csv", - "name": "connect4", + "name": "connect", "training": { "x": "data/connect_x_train.csv", diff --git a/sklearn_bench/svm.py b/sklearn_bench/svm.py index 694b21bfe..80fd73b8f 100644 --- a/sklearn_bench/svm.py +++ b/sklearn_bench/svm.py @@ -44,12 +44,12 @@ def main(): if params.probability: state_predict = 'predict_proba' accuracy_type = 'log_loss' - def metric_call(x, y): return log_loss(x, y) + def metric_call(x, y): return bench.log_loss(x, y) clf_predict = clf.predict_proba else: state_predict = 'predict' accuracy_type = 'accuracy[%]' - def metric_call(x, y): return 100 * accuracy_score(x, y) + def metric_call(x, y): return bench.accuracy(x, y) clf_predict = clf.predict predict_train_time, y_pred = bench.measure_function_time( From a8d1f361f008d6bd2a3f6f17bcdc93eb2dc5cc17 Mon Sep 17 00:00:00 2001 From: Kirill Petrov Date: Fri, 16 Apr 2021 04:09:09 +0300 Subject: [PATCH 14/14] fix score --- sklearn_bench/svm.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn_bench/svm.py b/sklearn_bench/svm.py index 80fd73b8f..4fea1e025 100644 --- a/sklearn_bench/svm.py +++ b/sklearn_bench/svm.py @@ -18,7 +18,6 @@ import bench import 
numpy as np -from sklearn.metrics import accuracy_score, log_loss def main(): @@ -49,7 +48,7 @@ def metric_call(x, y): return bench.log_loss(x, y) else: state_predict = 'predict' accuracy_type = 'accuracy[%]' - def metric_call(x, y): return bench.accuracy(x, y) + def metric_call(x, y): return bench.accuracy_score(x, y) clf_predict = clf.predict predict_train_time, y_pred = bench.measure_function_time(