diff --git a/README.md b/README.md index d2f2ebe34..14944ee8c 100755 --- a/README.md +++ b/README.md @@ -39,25 +39,29 @@ Create a suitable conda environment for each framework to test. Each item in the * [**scikit-learn**](sklearn_bench#how-to-create-conda-environment-for-benchmarking) ```bash -conda create -n bench -c intel python=3.7 scikit-learn scikit-learn-intelex pandas +pip install -r sklearn_bench/requirements.txt +# or +conda install -c conda-forge scikit-learn scikit-learn-intelex pandas ``` * [**daal4py**](daal4py_bench#how-to-create-conda-environment-for-benchmarking) ```bash -conda create -n bench -c intel python=3.7 scikit-learn daal4py pandas +conda install -c conda-forge scikit-learn daal4py pandas ``` * [**cuml**](cuml_bench#how-to-create-conda-environment-for-benchmarking) ```bash -conda create -n bench -c rapidsai -c conda-forge python=3.7 cuml pandas cudf +conda install -c rapidsai -c conda-forge cuml pandas cudf ``` * [**xgboost**](xgboost_bench#how-to-create-conda-environment-for-benchmarking) ```bash -conda create -n bench -c conda-forge python=3.7 xgboost pandas +pip install -r xgboost_bench/requirements.txt +# or +conda install -c conda-forge xgboost pandas ``` ## Running Python benchmarks with runner script @@ -109,7 +113,7 @@ The configuration of benchmarks allows you to select the frameworks to run, sele ## Intel(R) Extension for Scikit-learn support -When you run scikit-learn benchmarks on CPU, [Intel(R) Extension for Scikit-learn](https://github.com/intel/scikit-learn-intelex) is used by default. Use the ``--no-intel-optimized`` option to run the benchmarks without the extension. +When you run scikit-learn benchmarks on CPU, [Intel(R) Extension for Scikit-learn](https://github.com/intel/scikit-learn-intelex) is used by default. Use the ``--no-intel-optimized`` option to run the benchmarks without the extension. 
The following benchmarks have a GPU support: * dbscan diff --git a/bench.py b/bench.py index 591db126c..772762503 100644 --- a/bench.py +++ b/bench.py @@ -340,6 +340,13 @@ def accuracy_score(y, yp): return columnwise_score(y, yp, lambda y1, y2: np.mean(y1 == y2)) +def log_loss(y, yp): + from sklearn.metrics import log_loss as sklearn_log_loss + y = convert_to_numpy(y) + yp = convert_to_numpy(yp) + return sklearn_log_loss(y, yp) + + def rmse_score(y, yp): return columnwise_score( y, yp, lambda y1, y2: float(np.sqrt(np.mean((y1 - y2)**2)))) diff --git a/configs/svm/svc_proba_cuml.json b/configs/svm/svc_proba_cuml.json new file mode 100755 index 000000000..85fe1f0df --- /dev/null +++ b/configs/svm/svc_proba_cuml.json @@ -0,0 +1,222 @@ +{ + "common": { + "lib": ["cuml"], + "data-format": ["cudf"], + "data-order": ["F"], + "dtype": ["float64"], + "max-cache-size": [2], + "probability": [""] + }, + "cases": [ + { + "algorithm": "svm", + "dataset": [ + { + "source": "csv", + "name": "ijcnn", + "training": + { + "x": "data/ijcnn_x_train.csv", + "y": "data/ijcnn_y_train.csv" + }, + "testing": + { + "x": "data/ijcnn_x_test.csv", + "y": "data/ijcnn_y_test.csv" + } + } + ], + "C": [1000.0], + "kernel": ["linear"] + }, + { + "algorithm": "svm", + "dataset": [ + { + "source": "csv", + "name": "a9a", + "training": + { + "x": "data/a9a_x_train.csv", + "y": "data/a9a_y_train.csv" + }, + "testing": + { + "x": "data/a9a_x_test.csv", + "y": "data/a9a_y_test.csv" + } + } + ], + "C": [500.0], + "kernel": ["rbf"] + }, + { + "algorithm": "svm", + "dataset": [ + { + "source": "csv", + "name": "gisette", + "training": + { + "x": "data/gisette_x_train.csv", + "y": "data/gisette_y_train.csv" + }, + "testing": + { + "x": "data/gisette_x_test.csv", + "y": "data/gisette_y_test.csv" + } + } + ], + "C": [1.5e-3], + "kernel": ["linear"] + }, + { + "algorithm": "svm", + "dataset": [ + { + "source": "csv", + "name": "klaverjas", + "training": + { + "x": "data/klaverjas_x_train.csv", + "y": 
"data/klaverjas_y_train.csv" + }, + "testing": + { + "x": "data/klaverjas_x_test.csv", + "y": "data/klaverjas_y_test.csv" + } + } + ], + "C": [1.0], + "kernel": ["rbf"] + }, + { + "algorithm": "svm", + "dataset": [ + { + "source": "csv", + "name": "connect", + "training": + { + "x": "data/connect_x_train.csv", + "y": "data/connect_y_train.csv" + }, + "testing": + { + "x": "data/connect_x_test.csv", + "y": "data/connect_y_test.csv" + } + } + ], + "C": [100.0], + "kernel": ["linear"] + }, + { + "algorithm": "svm", + "dataset": [ + { + "source": "csv", + "name": "mnist", + "training": + { + "x": "data/mnist_x_train.csv", + "y": "data/mnist_y_train.csv" + }, + "testing": + { + "x": "data/mnist_x_test.csv", + "y": "data/mnist_y_test.csv" + } + } + ], + "C": [50.0], + "kernel": ["rbf"] + }, + { + "algorithm": "svm", + "dataset": [ + { + "source": "csv", + "name": "sensit", + "training": + { + "x": "data/sensit_x_train.csv", + "y": "data/sensit_y_train.csv" + }, + "testing": + { + "x": "data/sensit_x_test.csv", + "y": "data/sensit_y_test.csv" + } + } + ], + "C": [500.0], + "kernel": ["linear"] + }, + { + "algorithm": "svm", + "dataset": [ + { + "source": "csv", + "name": "skin_segmentation", + "training": + { + "x": "data/skin_segmentation_x_train.csv", + "y": "data/skin_segmentation_y_train.csv" + }, + "testing": + { + "x": "data/skin_segmentation_x_test.csv", + "y": "data/skin_segmentation_y_test.csv" + } + } + ], + "C": [1.0], + "kernel": ["rbf"] + }, + { + "algorithm": "svm", + "dataset": [ + { + "source": "csv", + "name": "covertype", + "training": + { + "x": "data/covertype_x_train.csv", + "y": "data/covertype_y_train.csv" + }, + "testing": + { + "x": "data/covertype_x_test.csv", + "y": "data/covertype_y_test.csv" + } + } + ], + "C": [100.0], + "kernel": ["rbf"] + }, + { + "algorithm": "svm", + "dataset": [ + { + "source": "csv", + "name": "codrnanorm", + "training": + { + "x": "data/codrnanorm_x_train.csv", + "y": "data/codrnanorm_y_train.csv" + }, + "testing": + { 
+ "x": "data/codrnanorm_x_test.csv", + "y": "data/codrnanorm_y_test.csv" + } + } + ], + "C": [1000.0], + "kernel": ["linear"] + } + ] +} diff --git a/configs/svm/svc_proba_sklearn.json b/configs/svm/svc_proba_sklearn.json new file mode 100755 index 000000000..53c1676cf --- /dev/null +++ b/configs/svm/svc_proba_sklearn.json @@ -0,0 +1,222 @@ +{ + "common": { + "lib": ["sklearn"], + "data-format": ["pandas"], + "data-order": ["F"], + "dtype": ["float64"], + "max-cache-size": [2], + "probability": [""] + }, + "cases": [ + { + "algorithm": "svm", + "dataset": [ + { + "source": "csv", + "name": "ijcnn", + "training": + { + "x": "data/ijcnn_x_train.csv", + "y": "data/ijcnn_y_train.csv" + }, + "testing": + { + "x": "data/ijcnn_x_test.csv", + "y": "data/ijcnn_y_test.csv" + } + } + ], + "C": [1000.0], + "kernel": ["linear"] + }, + { + "algorithm": "svm", + "dataset": [ + { + "source": "csv", + "name": "a9a", + "training": + { + "x": "data/a9a_x_train.csv", + "y": "data/a9a_y_train.csv" + }, + "testing": + { + "x": "data/a9a_x_test.csv", + "y": "data/a9a_y_test.csv" + } + } + ], + "C": [500.0], + "kernel": ["rbf"] + }, + { + "algorithm": "svm", + "dataset": [ + { + "source": "csv", + "name": "gisette", + "training": + { + "x": "data/gisette_x_train.csv", + "y": "data/gisette_y_train.csv" + }, + "testing": + { + "x": "data/gisette_x_test.csv", + "y": "data/gisette_y_test.csv" + } + } + ], + "C": [1.5e-3], + "kernel": ["linear"] + }, + { + "algorithm": "svm", + "dataset": [ + { + "source": "csv", + "name": "klaverjas", + "training": + { + "x": "data/klaverjas_x_train.csv", + "y": "data/klaverjas_y_train.csv" + }, + "testing": + { + "x": "data/klaverjas_x_test.csv", + "y": "data/klaverjas_y_test.csv" + } + } + ], + "C": [1.0], + "kernel": ["rbf"] + }, + { + "algorithm": "svm", + "dataset": [ + { + "source": "csv", + "name": "connect", + "training": + { + "x": "data/connect_x_train.csv", + "y": "data/connect_y_train.csv" + }, + "testing": + { + "x": "data/connect_x_test.csv", + 
"y": "data/connect_y_test.csv" + } + } + ], + "C": [100.0], + "kernel": ["linear"] + }, + { + "algorithm": "svm", + "dataset": [ + { + "source": "csv", + "name": "mnist", + "training": + { + "x": "data/mnist_x_train.csv", + "y": "data/mnist_y_train.csv" + }, + "testing": + { + "x": "data/mnist_x_test.csv", + "y": "data/mnist_y_test.csv" + } + } + ], + "C": [50.0], + "kernel": ["rbf"] + }, + { + "algorithm": "svm", + "dataset": [ + { + "source": "csv", + "name": "sensit", + "training": + { + "x": "data/sensit_x_train.csv", + "y": "data/sensit_y_train.csv" + }, + "testing": + { + "x": "data/sensit_x_test.csv", + "y": "data/sensit_y_test.csv" + } + } + ], + "C": [500.0], + "kernel": ["linear"] + }, + { + "algorithm": "svm", + "dataset": [ + { + "source": "csv", + "name": "skin_segmentation", + "training": + { + "x": "data/skin_segmentation_x_train.csv", + "y": "data/skin_segmentation_y_train.csv" + }, + "testing": + { + "x": "data/skin_segmentation_x_test.csv", + "y": "data/skin_segmentation_y_test.csv" + } + } + ], + "C": [1.0], + "kernel": ["rbf"] + }, + { + "algorithm": "svm", + "dataset": [ + { + "source": "csv", + "name": "covertype", + "training": + { + "x": "data/covertype_x_train.csv", + "y": "data/covertype_y_train.csv" + }, + "testing": + { + "x": "data/covertype_x_test.csv", + "y": "data/covertype_y_test.csv" + } + } + ], + "C": [100.0], + "kernel": ["rbf"] + }, + { + "algorithm": "svm", + "dataset": [ + { + "source": "csv", + "name": "codrnanorm", + "training": + { + "x": "data/codrnanorm_x_train.csv", + "y": "data/codrnanorm_y_train.csv" + }, + "testing": + { + "x": "data/codrnanorm_x_test.csv", + "y": "data/codrnanorm_y_test.csv" + } + } + ], + "C": [1000.0], + "kernel": ["linear"] + } + ] +} diff --git a/cuml_bench/README.md b/cuml_bench/README.md index bf6b12ada..e65f11432 100644 --- a/cuml_bench/README.md +++ b/cuml_bench/README.md @@ -137,10 +137,9 @@ You can launch benchmarks for each algorithm separately. 
The tables below list a | C | float | 0.01 | SVM slack parameter | | kernel | str | linear | *linear* or *rbf*. SVM kernel function | | gamma | float | None | Parameter for kernel="rbf" | -| maxiter | int | 2000 | Maximum iterations for the iterative solver | | max-cache-size | int | 64 | Maximum cache size for SVM. | | tol | float | 1e-16 | Tolerance passed to sklearn.svm.SVC | -| no-shrinking | action | True | Don't use shrinking heuristic | +| probability | action | False | Use probability for SVC | #### train_test_split diff --git a/cuml_bench/svm.py b/cuml_bench/svm.py index 0f24a5ec0..5e402acf8 100644 --- a/cuml_bench/svm.py +++ b/cuml_bench/svm.py @@ -24,20 +24,19 @@ parser.add_argument('-C', dest='C', type=float, default=1.0, help='SVM regularization parameter') -parser.add_argument('--kernel', choices=('linear', 'rbf'), +parser.add_argument('--kernel', choices=('linear', 'rbf', 'poly'), default='linear', help='SVM kernel function') -parser.add_argument('--maxiter', type=int, default=-1, - help='Maximum iterations for the iterative solver. 
' - '-1 means no limit.') parser.add_argument('--gamma', type=float, default=None, help='Parameter for kernel="rbf"') parser.add_argument('--max-cache-size', type=int, default=8, help='Maximum cache size, in gigabytes, for SVM.') parser.add_argument('--tol', type=float, default=1e-3, help='Tolerance passed to sklearn.svm.SVC') +parser.add_argument('--probability', action='store_true', default=False, + dest='probability', help="Use probability for SVC") + params = bench.parse_args(parser) -# Load data X_train, X_test, y_train, y_test = bench.load_data(params) if params.gamma is None: @@ -48,25 +47,34 @@ params.cache_size_mb = cache_size_bytes / 1024**2 params.n_classes = y_train[y_train.columns[0]].nunique() -# Create our C-SVM classifier -clf = SVC(C=params.C, kernel=params.kernel, max_iter=params.maxiter, - cache_size=params.cache_size_mb, tol=params.tol, - gamma=params.gamma) +clf = SVC(C=params.C, kernel=params.kernel, cache_size=params.cache_size_mb, + tol=params.tol, gamma=params.gamma, probability=params.probability) -# Time fit and predict fit_time, _ = bench.measure_function_time(clf.fit, X_train, y_train, params=params) -params.sv_len = clf.support_.shape[0] -predict_time, y_pred = bench.measure_function_time( - clf.predict, X_train, params=params) -train_acc = 100 * bench.accuracy_score(y_pred, y_train) +if params.probability: + state_predict = 'predict_proba' + accuracy_type = 'log_loss' + def metric_call(x, y): return bench.log_loss(x, y) + clf_predict = clf.predict_proba +else: + state_predict = 'prediction' + accuracy_type = 'accuracy[%]' + def metric_call(x, y): return 100 * bench.accuracy_score(x, y) + clf_predict = clf.predict + + +predict_train_time, y_pred = bench.measure_function_time( + clf_predict, X_train, params=params) +train_acc = metric_call(y_train, y_pred) -y_pred = clf.predict(X_test) -test_acc = 100 * bench.accuracy_score(y_pred, y_test) +predict_test_time, y_pred = bench.measure_function_time( + clf_predict, X_test, params=params) 
+test_acc = metric_call(y_test, y_pred) bench.print_output(library='cuml', algorithm='svc', - stages=['training', 'prediction'], params=params, + stages=['training', state_predict], params=params, functions=['SVM.fit', 'SVM.predict'], - times=[fit_time, predict_time], accuracy_type='accuracy[%]', + times=[fit_time, predict_train_time], accuracy_type=accuracy_type, accuracies=[train_acc, test_acc], data=[X_train, X_train], alg_instance=clf) diff --git a/datasets/loader.py b/datasets/loader.py index 055fd52a6..45690c8ae 100755 --- a/datasets/loader.py +++ b/datasets/loader.py @@ -179,15 +179,15 @@ def connect(dataset_dir=None): http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass.htm Classification task. n_classes = 3. - connect X train dataset (196045, 127) - connect y train dataset (196045, 1) - connect X test dataset (49012, 127) + connect X train dataset (60801, 126) + connect y train dataset (60801, 1) + connect X test dataset (49012, 126) connect y test dataset (49012, 1) """ dataset_name = 'connect' os.makedirs(dataset_dir, exist_ok=True) - X, y = fetch_openml(name='connect-4', return_X_y=True, + X, y = fetch_openml(name='connect-4', version=1, return_X_y=True, as_frame=False, data_home=dataset_dir) X = pd.DataFrame(X.todense()) y = pd.DataFrame(y) diff --git a/runner.py b/runner.py index 58f99588a..a77c29a51 100755 --- a/runner.py +++ b/runner.py @@ -238,6 +238,7 @@ class GenerationArgs: except json.JSONDecodeError as decoding_exception: stderr += f'CASE {case} JSON DECODING ERROR:\n' \ + f'{decoding_exception}\n{stdout}\n' + if stderr != '': is_successful = False logging.warning('Error in benchmark: \n' + stderr) diff --git a/sklearn_bench/README.md b/sklearn_bench/README.md index d3859ab1f..afa9c7eec 100644 --- a/sklearn_bench/README.md +++ b/sklearn_bench/README.md @@ -3,7 +3,11 @@ If you want to test scikit-learn, then use -`conda create -n bench -c intel python=3.7 scikit-learn daal4py pandas` +```bash +pip install -r 
sklearn_bench/requirements.txt +# or +conda install -c conda-forge scikit-learn scikit-learn-intelex pandas +``` ## Algorithms parameters @@ -144,10 +148,9 @@ You can launch benchmarks for each algorithm separately. The tables below list a | C | float | 0.01 | SVM slack parameter | | kernel | str | linear | *linear* or *rbf*. SVM kernel function | | gamma | float | None | Parameter for kernel="rbf" | -| maxiter | int | 2000 | Maximum iterations for the iterative solver | | max-cache-size | int | 64 | Maximum cache size for SVM. | | tol | float | 1e-16 | Tolerance passed to sklearn.svm.SVC | -| no-shrinking | action | True | Don't use shrinking heuristic | +| probability | action | False | Use probability for SVC | #### train_test_split diff --git a/sklearn_bench/requirements.txt b/sklearn_bench/requirements.txt new file mode 100755 index 000000000..d25373e5e --- /dev/null +++ b/sklearn_bench/requirements.txt @@ -0,0 +1,4 @@ +scikit-learn +pandas +scikit-learn-intelex +openpyxl diff --git a/sklearn_bench/svm.py b/sklearn_bench/svm.py index 102a6e62a..4fea1e025 100644 --- a/sklearn_bench/svm.py +++ b/sklearn_bench/svm.py @@ -18,13 +18,11 @@ import bench import numpy as np -from sklearn.metrics import accuracy_score def main(): from sklearn.svm import SVC - # Load data X_train, X_test, y_train, y_test = bench.load_data(params) if params.gamma is None: @@ -35,26 +33,36 @@ def main(): params.cache_size_mb = cache_size_bytes / 1024**2 params.n_classes = len(np.unique(y_train)) - # Create our C-SVM classifier - clf = SVC(C=params.C, kernel=params.kernel, max_iter=params.maxiter, - cache_size=params.cache_size_mb, tol=params.tol, - shrinking=params.shrinking, gamma=params.gamma) + clf = SVC(C=params.C, kernel=params.kernel, cache_size=params.cache_size_mb, + tol=params.tol, gamma=params.gamma, probability=params.probability, + random_state=43) - # Time fit and predict fit_time, _ = bench.measure_function_time(clf.fit, X_train, y_train, params=params) params.sv_len = 
clf.support_.shape[0] - predict_time, y_pred = bench.measure_function_time( - clf.predict, X_train, params=params) - train_acc = 100 * accuracy_score(y_pred, y_train) + if params.probability: + state_predict = 'predict_proba' + accuracy_type = 'log_loss' + def metric_call(x, y): return bench.log_loss(x, y) + clf_predict = clf.predict_proba + else: + state_predict = 'predict' + accuracy_type = 'accuracy[%]' + def metric_call(x, y): return 100 * bench.accuracy_score(x, y) + clf_predict = clf.predict - y_pred = clf.predict(X_test) - test_acc = 100 * accuracy_score(y_pred, y_test) + predict_train_time, y_pred = bench.measure_function_time( + clf_predict, X_train, params=params) + train_acc = metric_call(y_train, y_pred) + + predict_test_time, y_pred = bench.measure_function_time( + clf_predict, X_test, params=params) + test_acc = metric_call(y_test, y_pred) bench.print_output(library='sklearn', algorithm='svc', - stages=['training', 'prediction'], - params=params, functions=['SVM.fit', 'SVM.predict'], - times=[fit_time, predict_time], accuracy_type='accuracy[%]', + stages=['training', state_predict], + params=params, functions=['SVM.fit', f'SVM.{state_predict}'], + times=[fit_time, predict_train_time], accuracy_type=accuracy_type, accuracies=[train_acc, test_acc], data=[X_train, X_train], alg_instance=clf) @@ -64,18 +72,16 @@ def main(): parser.add_argument('-C', dest='C', type=float, default=1.0, help='SVM regularization parameter') - parser.add_argument('--kernel', choices=('linear', 'rbf'), + parser.add_argument('--kernel', choices=('linear', 'rbf', 'poly'), default='linear', help='SVM kernel function') parser.add_argument('--gamma', type=float, default=None, help='Parameter for kernel="rbf"') - parser.add_argument('--maxiter', type=int, default=-1, - help='Maximum iterations for the iterative solver. 
' - '-1 means no limit.') parser.add_argument('--max-cache-size', type=int, default=8, help='Maximum cache size, in gigabytes, for SVM.') parser.add_argument('--tol', type=float, default=1e-3, help='Tolerance passed to sklearn.svm.SVC') - parser.add_argument('--no-shrinking', action='store_false', default=True, - dest='shrinking', help="Don't use shrinking heuristic") + parser.add_argument('--probability', action='store_true', default=False, + dest='probability', help="Use probability for SVC") + params = bench.parse_args(parser, loop_types=('fit', 'predict')) bench.run_with_context(params, main) diff --git a/utils.py b/utils.py index a3cfa7a68..6d77d4765 100755 --- a/utils.py +++ b/utils.py @@ -24,10 +24,9 @@ def filter_stderr(text): - # delete 'Intel(R) DAAL usage in sklearn' messages - fake_error_message = 'Intel(R) oneAPI Data Analytics Library solvers ' + \ - 'for sklearn enabled: ' + \ - 'https://intelpython.github.io/daal4py/sklearn.html' + # delete 'Intel(R) Extension for Scikit-learn usage in sklearn' messages + fake_error_message = 'Intel(R) Extension for Scikit-learn* enabled ' + \ + '(https://github.com/intel/scikit-learn-intelex)' while fake_error_message in text: text = text.replace(fake_error_message, '') return text diff --git a/xgboost_bench/README.md b/xgboost_bench/README.md index e8d93df58..2b4e93ec5 100644 --- a/xgboost_bench/README.md +++ b/xgboost_bench/README.md @@ -1,6 +1,10 @@ ## How to create conda environment for benchmarking - conda create -n bench -c conda-forge python=3.7 xgboost pandas +```bash +pip install -r xgboost_bench/requirements.txt +# or +conda install -c conda-forge xgboost pandas +``` ## Algorithms parameters diff --git a/xgboost_bench/requirements.txt b/xgboost_bench/requirements.txt new file mode 100755 index 000000000..1540ec04f --- /dev/null +++ b/xgboost_bench/requirements.txt @@ -0,0 +1,4 @@ +scikit-learn +pandas +xgboost +openpyxl