From 902bec134ee7db7620eab38b91e8fc7ab283a741 Mon Sep 17 00:00:00 2001 From: "alexander.makaryev" Date: Sun, 17 May 2020 18:34:08 +0300 Subject: [PATCH 1/4] adding parameters for device context and patching of Scikit-Learn --- sklearn/bench.py | 36 ++++++++++++ sklearn/dbscan.py | 81 +++++++++++++++------------ sklearn/kmeans.py | 112 +++++++++++++++++++------------------ sklearn/linear.py | 87 +++++++++++++++-------------- sklearn/log_reg.py | 134 +++++++++++++++++++++++---------------------- 5 files changed, 258 insertions(+), 192 deletions(-) diff --git a/sklearn/bench.py b/sklearn/bench.py index cb1de6aa5..3b00d2276 100644 --- a/sklearn/bench.py +++ b/sklearn/bench.py @@ -153,6 +153,12 @@ def parse_args(parser, size=None, loop_types=(), help='Seed to pass as random_state') parser.add_argument('--dataset-name', type=str, default=None, help='Dataset name') + parser.add_argument('--device', type=str, default='None', + choices=('None', 'host', 'cpu', 'gpu'), + help='Execution context device, "None" to run without context.') + parser.add_argument('--patch_sklearn', type=str, default='None', + choices=('None', 'True', 'False'), + help='True for patch, False for unpatch, "None" to leave as is.') for data in ['X', 'y']: for stage in ['train', 'test']: @@ -607,3 +613,33 @@ def import_fptype_getter(): except: from daal4py.sklearn.utils import getFPType return getFPType + + +def patch_sklearn(): + parser = argparse.ArgumentParser() + parser.add_argument('--patch_sklearn', type=str, default='None', + choices=('None', 'True', 'False'), + help='True for patch, False for unpatch, "None" to leave as is.') + args, _ = parser.parse_known_args() + + if args.patch_sklearn is not None and args.patch_sklearn != 'None': + from daal4py.sklearn import patch_sklearn, unpatch_sklearn + if bool(args.patch_sklearn): + patch_sklearn() + else: + unpatch_sklearn() + + +def run_with_context(function): + parser = argparse.ArgumentParser() + parser.add_argument('--device', type=str, default='None', + choices=('None', 'host', 'cpu', 'gpu'), + help='Execution context device, "None" to run without context.') + args, _ = parser.parse_known_args() + + if args.device is not None and args.device != 'None': + from daal4py.oneapi import sycl_context + with sycl_context(args.device): + function() + else: + function() diff --git a/sklearn/dbscan.py b/sklearn/dbscan.py index 262086b61..a0270b8ba 100644 --- a/sklearn/dbscan.py +++ b/sklearn/dbscan.py @@ -2,39 +2,48 @@ # # SPDX-License-Identifier: MIT -import argparse -from bench import measure_function_time, parse_args, load_data, print_output -from sklearn.cluster import DBSCAN - -parser = argparse.ArgumentParser(description='scikit-learn DBSCAN benchmark') -parser.add_argument('-e', '--eps', '--epsilon', type=float, default=10., - help='Radius of neighborhood of a point') -parser.add_argument('-m', '--min-samples', default=5, type=int, - help='The minimum number of samples required in a ' - 'neighborhood to consider a point a core point') -params = parse_args(parser, n_jobs_supported=True) - -# Load generated data -X, _, _, _ = load_data(params, add_dtype=True) - -# Create our clustering object -dbscan = DBSCAN(eps=params.eps, n_jobs=params.n_jobs, - min_samples=params.min_samples, metric='euclidean', - algorithm='auto') - -# N.B. algorithm='auto' will select DAAL's brute force method when running -# daal4py-patched scikit-learn, and probably 'kdtree' when running unpatched -# scikit-learn. 
- -columns = ('batch', 'arch', 'prefix', 'function', 'threads', 'dtype', 'size', - 'n_clusters', 'time') - -# Time fit -time, _ = measure_function_time(dbscan.fit, X, params=params) -labels = dbscan.labels_ -params.n_clusters = len(set(labels)) - (1 if -1 in labels else 0) - -print_output(library='sklearn', algorithm='dbscan', stages=['training'], - columns=columns, params=params, functions=['DBSCAN'], - times=[time], accuracies=[None], accuracy_type=None, data=[X], - alg_instance=dbscan) +from bench import (measure_function_time, parse_args, load_data, print_output, + run_with_context, patch_sklearn) + + +def main(): + import argparse + from sklearn.cluster import DBSCAN + + parser = argparse.ArgumentParser(description='scikit-learn DBSCAN benchmark') + parser.add_argument('-e', '--eps', '--epsilon', type=float, default=10., + help='Radius of neighborhood of a point') + parser.add_argument('-m', '--min-samples', default=5, type=int, + help='The minimum number of samples required in a ' + 'neighborhood to consider a point a core point') + params = parse_args(parser, n_jobs_supported=True) + + # Load generated data + X, _, _, _ = load_data(params, add_dtype=True) + + # Create our clustering object + dbscan = DBSCAN(eps=params.eps, n_jobs=params.n_jobs, + min_samples=params.min_samples, metric='euclidean', + algorithm='auto') + + # N.B. algorithm='auto' will select DAAL's brute force method when running + # daal4py-patched scikit-learn, and probably 'kdtree' when running unpatched + # scikit-learn. + + columns = ('batch', 'arch', 'prefix', 'function', 'patch_sklearn', 'device', 'threads', 'dtype', 'size', + 'n_clusters', 'time') + + # Time fit + time, _ = measure_function_time(dbscan.fit, X, params=params) + labels = dbscan.labels_ + params.n_clusters = len(set(labels)) - (1 if -1 in labels else 0) + + print_output(library='sklearn', algorithm='dbscan', stages=['training'], + columns=columns, params=params, functions=['DBSCAN'], + times=[time], accuracies=[None], accuracy_type=None, data=[X], + alg_instance=dbscan) + + +if __name__ == "__main__": + patch_sklearn() + run_with_context(main) diff --git a/sklearn/kmeans.py b/sklearn/kmeans.py index da000f353..5802c74cb 100644 --- a/sklearn/kmeans.py +++ b/sklearn/kmeans.py @@ -2,64 +2,72 @@ # # SPDX-License-Identifier: MIT -import argparse -from bench import ( - parse_args, measure_function_time, load_data, print_output -) -import numpy as np -from sklearn.cluster import KMeans - -parser = argparse.ArgumentParser(description='scikit-learn K-means benchmark') -parser.add_argument('-i', '--filei', '--fileI', '--init', - type=str, help='Initial clusters') -parser.add_argument('-t', '--tol', type=float, default=0., - help='Absolute threshold') -parser.add_argument('--maxiter', type=int, default=100, - help='Maximum number of iterations') -parser.add_argument('--n-clusters', type=int, help='Number of clusters') -params = parse_args(parser) - -# Load and convert generated data -X_train, X_test, _, _ = load_data(params) - -# Load initial centroids from specified path -if params.filei is not None: - X_init = np.load(params.filei).astype(params.dtype) - params.n_clusters = X_init.shape[0] -# or choose random centroids from training data -else: - np.random.seed(params.seed) - centroids_idx = np.random.randint(0, X_train.shape[0], - size=params.n_clusters) - if hasattr(X_train, "iloc"): - X_init = X_train.iloc[centroids_idx].values - else: - X_init = X_train[centroids_idx] +from bench import (measure_function_time, parse_args, load_data, print_output, + 
run_with_context, patch_sklearn) + +def main(): + import argparse + import numpy as np + from sklearn.cluster import KMeans -def fit_kmeans(X): global X_init, params - alg = KMeans(n_clusters=params.n_clusters, n_jobs=params.n_jobs, - tol=params.tol, max_iter=params.maxiter, n_init=1, init=X_init) - alg.fit(X) - return alg + parser = argparse.ArgumentParser(description='scikit-learn K-means benchmark') + parser.add_argument('-i', '--filei', '--fileI', '--init', + type=str, help='Initial clusters') + parser.add_argument('-t', '--tol', type=float, default=0., + help='Absolute threshold') + parser.add_argument('--maxiter', type=int, default=100, + help='Maximum number of iterations') + parser.add_argument('--n-clusters', type=int, help='Number of clusters') + params = parse_args(parser) + + # Load and convert generated data + X_train, X_test, _, _ = load_data(params) + + # Load initial centroids from specified path + if params.filei is not None: + X_init = np.load(params.filei).astype(params.dtype) + params.n_clusters = X_init.shape[0] + # or choose random centroids from training data + else: + np.random.seed(params.seed) + centroids_idx = np.random.randint(0, X_train.shape[0], + size=params.n_clusters) + if hasattr(X_train, "iloc"): + X_init = X_train.iloc[centroids_idx].values + else: + X_init = X_train[centroids_idx] + + + def fit_kmeans(X): + global X_init, params + alg = KMeans(n_clusters=params.n_clusters, n_jobs=params.n_jobs, + tol=params.tol, max_iter=params.maxiter, n_init=1, init=X_init) + alg.fit(X) + return alg + + + columns = ('batch', 'arch', 'prefix', 'function', 'patch_sklearn', 'device', 'threads', 'dtype', 'size', + 'n_clusters', 'time') + # Time fit + fit_time, kmeans = measure_function_time(fit_kmeans, X_train, params=params) + train_inertia = float(kmeans.inertia_) -columns = ('batch', 'arch', 'prefix', 'function', 'threads', 'dtype', 'size', - 'n_clusters', 'time') + # Time predict + predict_time, _ = measure_function_time(kmeans.predict, X_test, params=params) + test_inertia = float(kmeans.inertia_) -# Time fit -fit_time, kmeans = measure_function_time(fit_kmeans, X_train, params=params) -train_inertia = float(kmeans.inertia_) -# Time predict -predict_time, _ = measure_function_time(kmeans.predict, X_test, params=params) -test_inertia = float(kmeans.inertia_) + print_output(library='sklearn', algorithm='kmeans', + stages=['training', 'prediction'], columns=columns, + params=params, functions=['KMeans.fit', 'KMeans.predict'], + times=[fit_time, predict_time], accuracy_type='inertia', + accuracies=[train_inertia, test_inertia], data=[X_train, X_test], + alg_instance=kmeans) -print_output(library='sklearn', algorithm='kmeans', - stages=['training', 'prediction'], columns=columns, - params=params, functions=['KMeans.fit', 'KMeans.predict'], - times=[fit_time, predict_time], accuracy_type='inertia', - accuracies=[train_inertia, test_inertia], data=[X_train, X_test], - alg_instance=kmeans) +if __name__ == "__main__": + patch_sklearn() + run_with_context(main) diff --git a/sklearn/linear.py b/sklearn/linear.py index 25601ba83..8b9d5d1e1 100644 --- a/sklearn/linear.py +++ b/sklearn/linear.py @@ -2,43 +2,50 @@ # # SPDX-License-Identifier: MIT -import argparse -from bench import ( - parse_args, measure_function_time, load_data, print_output, rmse_score -) -from sklearn.linear_model import LinearRegression - -parser = argparse.ArgumentParser(description='scikit-learn linear regression ' - 'benchmark') -parser.add_argument('--no-fit-intercept', dest='fit_intercept', default=True, 
- action='store_false', - help="Don't fit intercept (assume data already centered)") -params = parse_args(parser, size=(1000000, 50)) - -# Load data -X_train, X_test, y_train, y_test = load_data( - params, generated_data=['X_train', 'y_train']) - -# Create our regression object -regr = LinearRegression(fit_intercept=params.fit_intercept, - n_jobs=params.n_jobs) - -columns = ('batch', 'arch', 'prefix', 'function', 'threads', 'dtype', 'size', - 'time') - -# Time fit -fit_time, _ = measure_function_time(regr.fit, X_train, y_train, params=params) - -# Time predict -predict_time, yp = measure_function_time(regr.predict, X_test, params=params) - -test_rmse = rmse_score(yp, y_test) -yp = regr.predict(X_train) -train_rmse = rmse_score(yp, y_train) - -print_output(library='sklearn', algorithm='linear_regression', - stages=['training', 'prediction'], columns=columns, - params=params, functions=['Linear.fit', 'Linear.predict'], - times=[fit_time, predict_time], accuracy_type='rmse', - accuracies=[train_rmse, test_rmse], data=[X_train, X_test], - alg_instance=regr) +from bench import (measure_function_time, parse_args, load_data, print_output, rmse_score, + run_with_context, patch_sklearn) + + +def main(): + import argparse + from sklearn.linear_model import LinearRegression + + parser = argparse.ArgumentParser(description='scikit-learn linear regression ' + 'benchmark') + parser.add_argument('--no-fit-intercept', dest='fit_intercept', default=True, + action='store_false', + help="Don't fit intercept (assume data already centered)") + params = parse_args(parser, size=(1000000, 50)) + + # Load data + X_train, X_test, y_train, y_test = load_data( + params, generated_data=['X_train', 'y_train']) + + # Create our regression object + regr = LinearRegression(fit_intercept=params.fit_intercept, + n_jobs=params.n_jobs) + + columns = ('batch', 'arch', 'prefix', 'function', 'patch_sklearn', 'device', 'threads', 'dtype', 'size', + 'time') + + # Time fit + fit_time, _ = measure_function_time(regr.fit, X_train, y_train, params=params) + + # Time predict + predict_time, yp = measure_function_time(regr.predict, X_test, params=params) + + test_rmse = rmse_score(yp, y_test) + yp = regr.predict(X_train) + train_rmse = rmse_score(yp, y_train) + + print_output(library='sklearn', algorithm='linear_regression', + stages=['training', 'prediction'], columns=columns, + params=params, functions=['Linear.fit', 'Linear.predict'], + times=[fit_time, predict_time], accuracy_type='rmse', + accuracies=[train_rmse, test_rmse], data=[X_train, X_test], + alg_instance=regr) + + +if __name__ == "__main__": + patch_sklearn() + run_with_context(main) diff --git a/sklearn/log_reg.py b/sklearn/log_reg.py index eb7933df4..96c198c82 100644 --- a/sklearn/log_reg.py +++ b/sklearn/log_reg.py @@ -2,77 +2,83 @@ # # SPDX-License-Identifier: MIT -import argparse -from bench import ( - parse_args, measure_function_time, load_data, print_output -) -import numpy as np -from sklearn.linear_model import LogisticRegression -from sklearn.metrics import accuracy_score +from bench import (measure_function_time, parse_args, load_data, print_output, + run_with_context, patch_sklearn) -parser = argparse.ArgumentParser(description='scikit-learn logistic ' - 'regression benchmark') -parser.add_argument('--no-fit-intercept', dest='fit_intercept', - action='store_false', default=True, - help="Don't fit intercept") -parser.add_argument('--multiclass', default='auto', - choices=('auto', 'ovr', 'multinomial'), - help='How to treat multi class data. 
' - '"auto" picks "ovr" for binary classification, and ' - '"multinomial" otherwise.') -parser.add_argument('--solver', default='lbfgs', - choices=('lbfgs', 'newton-cg', 'saga'), - help='Solver to use.') -parser.add_argument('--maxiter', type=int, default=100, - help='Maximum iterations for the iterative solver') -parser.add_argument('-C', dest='C', type=float, default=1.0, - help='Regularization parameter') -parser.add_argument('--tol', type=float, default=None, - help='Tolerance for solver. If solver == "newton-cg", ' - 'then the default is 1e-3. Otherwise, the default ' - 'is 1e-10.') -params = parse_args(parser, loop_types=('fit', 'predict')) -# Load generated data -X_train, X_test, y_train, y_test = load_data(params) +def main(): + import argparse + import numpy as np + from sklearn.linear_model import LogisticRegression + from sklearn.metrics import accuracy_score -params.n_classes = len(np.unique(y_train)) + parser = argparse.ArgumentParser(description='scikit-learn logistic ' + 'regression benchmark') + parser.add_argument('--no-fit-intercept', dest='fit_intercept', + action='store_false', default=True, + help="Don't fit intercept") + parser.add_argument('--multiclass', default='auto', + choices=('auto', 'ovr', 'multinomial'), + help='How to treat multi class data. ' + '"auto" picks "ovr" for binary classification, and ' + '"multinomial" otherwise.') + parser.add_argument('--solver', default='lbfgs', + choices=('lbfgs', 'newton-cg', 'saga'), + help='Solver to use.') + parser.add_argument('--maxiter', type=int, default=100, + help='Maximum iterations for the iterative solver') + parser.add_argument('-C', dest='C', type=float, default=1.0, + help='Regularization parameter') + parser.add_argument('--tol', type=float, default=None, + help='Tolerance for solver. If solver == "newton-cg", ' + 'then the default is 1e-3. 
Otherwise, the default ' + 'is 1e-10.') + params = parse_args(parser, loop_types=('fit', 'predict')) -if params.multiclass == 'auto': - params.multiclass = 'ovr' if params.n_classes == 2 else 'multinomial' + # Load generated data + X_train, X_test, y_train, y_test = load_data(params) -if not params.tol: - params.tol = 1e-3 if params.solver == 'newton-cg' else 1e-10 + params.n_classes = len(np.unique(y_train)) -# Create our classifier object -clf = LogisticRegression(penalty='l2', C=params.C, n_jobs=params.n_jobs, - fit_intercept=params.fit_intercept, - verbose=params.verbose, - tol=params.tol, max_iter=params.maxiter, - solver=params.solver, multi_class=params.multiclass) + if params.multiclass == 'auto': + params.multiclass = 'ovr' if params.n_classes == 2 else 'multinomial' -columns = ('batch', 'arch', 'prefix', 'function', 'threads', 'dtype', 'size', - 'solver', 'C', 'multiclass', 'n_classes', 'accuracy', 'time') + if not params.tol: + params.tol = 1e-3 if params.solver == 'newton-cg' else 1e-10 -# Time fit and predict -fit_time, _ = measure_function_time(clf.fit, X_train, y_train, params=params) -y_pred = clf.predict(X_train) -train_acc = 100 * accuracy_score(y_pred, y_train) + # Create our classifier object + clf = LogisticRegression(penalty='l2', C=params.C, n_jobs=params.n_jobs, + fit_intercept=params.fit_intercept, + verbose=params.verbose, + tol=params.tol, max_iter=params.maxiter, + solver=params.solver, multi_class=params.multiclass) -predict_time, y_pred = measure_function_time( - clf.predict, X_test, params=params) -test_acc = 100 * accuracy_score(y_pred, y_test) + columns = ('batch', 'arch', 'prefix', 'function', 'patch_sklearn', 'device', 'threads', 'dtype', 'size', + 'solver', 'C', 'multiclass', 'n_classes', 'accuracy', 'time') -print_output(library='sklearn', algorithm='logistic_regression', - stages=['training', 'prediction'], columns=columns, - params=params, functions=['LogReg.fit', 'LogReg.predict'], - times=[fit_time, predict_time], accuracy_type='accuracy[%]', - accuracies=[train_acc, test_acc], data=[X_train, X_test], - alg_instance=clf) -if params.verbose: - print() - print(f'@ Number of iterations: {clf.n_iter_}') - print('@ fit coefficients:') - print(f'@ {clf.coef_.tolist()}') - print('@ fit intercept:') - print(f'@ {clf.intercept_.tolist()}') + # Time fit and predict + fit_time, _ = measure_function_time(clf.fit, X_train, y_train, params=params) + y_pred = clf.predict(X_train) + train_acc = 100 * accuracy_score(y_pred, y_train) + + predict_time, y_pred = measure_function_time( + clf.predict, X_test, params=params) + test_acc = 100 * accuracy_score(y_pred, y_test) + + print_output(library='sklearn', algorithm='logistic_regression', + stages=['training', 'prediction'], columns=columns, + params=params, functions=['LogReg.fit', 'LogReg.predict'], + times=[fit_time, predict_time], accuracy_type='accuracy[%]', + accuracies=[train_acc, test_acc], data=[X_train, X_test], + alg_instance=clf) + if params.verbose: + print() + print(f'@ Number of iterations: {clf.n_iter_}') + print('@ fit coefficients:') + print(f'@ {clf.coef_.tolist()}') + print('@ fit intercept:') + print(f'@ {clf.intercept_.tolist()}') + +if __name__ == "__main__": + patch_sklearn() + run_with_context(main) From 607c832cc9d3e8cb22e9052e272875a1dee008ff Mon Sep 17 00:00:00 2001 From: "alexander.makaryev" Date: Mon, 18 May 2020 02:00:19 +0300 Subject: [PATCH 2/4] fix checking of patch_sklearn option --- sklearn/bench.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git 
a/sklearn/bench.py b/sklearn/bench.py index 3b00d2276..4dfb184e4 100644 --- a/sklearn/bench.py +++ b/sklearn/bench.py @@ -624,10 +624,13 @@ def patch_sklearn(): if args.patch_sklearn is not None and args.patch_sklearn != 'None': from daal4py.sklearn import patch_sklearn, unpatch_sklearn - if bool(args.patch_sklearn): + if args.patch_sklearn == "True": patch_sklearn() - else: + elif args.patch_sklearn == "False": unpatch_sklearn() + else: + raise ValueError('Parameter "patch_sklearn" must be ' + '"None", "True" or "False", got {}.'.format(args.patch_sklearn)) def run_with_context(function): From 56cccee90cdb6dd5758ad6906229dac9cdab6b78 Mon Sep 17 00:00:00 2001 From: Michael Smirnov Date: Fri, 13 Nov 2020 10:36:47 +0300 Subject: [PATCH 3/4] merge continuation --- sklearn/kmeans.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sklearn/kmeans.py b/sklearn/kmeans.py index c5726cc95..0c143e089 100644 --- a/sklearn/kmeans.py +++ b/sklearn/kmeans.py @@ -1,3 +1,7 @@ +# Copyright (C) 2018-2020 Intel Corporation +# +# SPDX-License-Identifier: MIT + from bench import (measure_function_time, parse_args, load_data, print_output, run_with_context, patch_sklearn) From 8d3833030af5f85cdb0f54cfd2db4bdd2fffea1c Mon Sep 17 00:00:00 2001 From: Michael Smirnov Date: Tue, 8 Dec 2020 12:00:11 +0000 Subject: [PATCH 4/4] kmeans benchmark fix --- sklearn/kmeans.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/kmeans.py b/sklearn/kmeans.py index 0c143e089..e058cd332 100644 --- a/sklearn/kmeans.py +++ b/sklearn/kmeans.py @@ -11,6 +11,8 @@ def main(): from sklearn.cluster import KMeans from sklearn.metrics.cluster import davies_bouldin_score + global X_init, params + parser = argparse.ArgumentParser(description='scikit-learn K-means benchmark') parser.add_argument('-i', '--filei', '--fileI', '--init', type=str, help='Initial clusters')
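
With all four patches applied, every benchmark resolves the two new options before its main() runs. The following is a minimal standalone sketch of the resulting bench.py helpers — condensed from patch 1/4 as corrected by patch 2/4, not the verbatim benchmark code. It assumes daal4py and its oneapi extension are installed, and it omits the defensive ValueError branch from patch 2/4, which argparse's choices= makes unreachable anyway.

    # Sketch: how --patch_sklearn and --device are consumed (assumes daal4py).
    import argparse

    def patch_sklearn():
        # Each helper re-parses sys.argv with parse_known_args(), so it can
        # ignore whatever benchmark-specific flags the caller also defines.
        parser = argparse.ArgumentParser()
        parser.add_argument('--patch_sklearn', type=str, default='None',
                            choices=('None', 'True', 'False'))
        args, _ = parser.parse_known_args()

        if args.patch_sklearn != 'None':
            from daal4py.sklearn import patch_sklearn, unpatch_sklearn
            # Compare strings, not bool(args.patch_sklearn): bool('False')
            # is True, which is exactly the bug patch 2/4 fixes.
            if args.patch_sklearn == 'True':
                patch_sklearn()
            else:
                unpatch_sklearn()

    def run_with_context(function):
        # Run the benchmark inside a SYCL device context when --device is set,
        # otherwise call it directly.
        parser = argparse.ArgumentParser()
        parser.add_argument('--device', type=str, default='None',
                            choices=('None', 'host', 'cpu', 'gpu'))
        args, _ = parser.parse_known_args()

        if args.device != 'None':
            from daal4py.oneapi import sycl_context
            with sycl_context(args.device):
                function()
        else:
            function()

A typical invocation of a converted benchmark (flags exactly as defined in the patches; data-generation arguments elided) would be:

    python sklearn/dbscan.py --patch_sklearn True --device gpu

The refactor's ordering appears deliberate: patch_sklearn() runs before main() executes its deferred "from sklearn.cluster import DBSCAN", because a from-import performed at module scope would bind the original estimator class before patching could replace it. The same move into main() also explains patch 4/4's one-line kmeans fix: fit_kmeans() declares "global X_init, params", so once those names became locals of main() in patch 1/4, the nested function could no longer see them; declaring them global in main() as well routes the assignments back to module scope where fit_kmeans() looks them up.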