diff --git a/sklearn/bench.py b/sklearn/bench.py
index baf64aae3..91979059e 100755
--- a/sklearn/bench.py
+++ b/sklearn/bench.py
@@ -164,6 +164,12 @@ def parse_args(parser, size=None, loop_types=(),
                         help='Seed to pass as random_state')
     parser.add_argument('--dataset-name', type=str, default=None,
                         help='Dataset name')
+    parser.add_argument('--device', type=str, default='None',
+                        choices=('None', 'host', 'cpu', 'gpu'),
+                        help='Execution context device, "None" to run without context.')
+    parser.add_argument('--patch_sklearn', type=str, default='None',
+                        choices=('None', 'True', 'False'),
+                        help='True for patch, False for unpatch, "None" to leave as is.')
 
     for data in ['X', 'y']:
         for stage in ['train', 'test']:
@@ -618,3 +624,36 @@ def import_fptype_getter():
     except:
         from daal4py.sklearn.utils import getFPType
     return getFPType
+
+
+def patch_sklearn():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--patch_sklearn', type=str, default='None',
+                        choices=('None', 'True', 'False'),
+                        help='True for patch, False for unpatch, "None" to leave as is.')
+    args, _ = parser.parse_known_args()
+
+    if args.patch_sklearn is not None and args.patch_sklearn != 'None':
+        from daal4py.sklearn import patch_sklearn, unpatch_sklearn
+        if args.patch_sklearn == "True":
+            patch_sklearn()
+        elif args.patch_sklearn == "False":
+            unpatch_sklearn()
+        else:
+            raise ValueError('Parameter "patch_sklearn" must be '
+                             '"None", "True" or "False", got {}.'.format(args.patch_sklearn))
+
+
+def run_with_context(function):
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--device', type=str, default='None',
+                        choices=('None', 'host', 'cpu', 'gpu'),
+                        help='Execution context device, "None" to run without context.')
+    args, _ = parser.parse_known_args()
+
+    if args.device is not None and args.device != 'None':
+        from daal4py.oneapi import sycl_context
+        with sycl_context(args.device):
+            function()
+    else:
+        function()
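Note: the four benchmark scripts below (dbscan, kmeans, linear, log_reg) all adopt the same wrapper around these two helpers. A minimal sketch of the calling convention follows; the body of main() is hypothetical. The scikit-learn imports move inside main() presumably so that patch_sklearn() runs before the estimator classes are imported, and the names each benchmark binds therefore refer to the daal4py-patched implementations.

    # Sketch of the pattern shared by the benchmarks below; the helpers
    # are the ones defined in bench.py above, main()'s body is hypothetical.
    from bench import patch_sklearn, run_with_context


    def main():
        # Import estimators only after patch_sklearn() has run in __main__,
        # so the imported names pick up the (possibly) patched classes.
        from sklearn.cluster import DBSCAN
        ...


    if __name__ == "__main__":
        patch_sklearn()         # reads --patch_sklearn from sys.argv
        run_with_context(main)  # reads --device; may wrap main() in sycl_context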
diff --git a/sklearn/dbscan.py b/sklearn/dbscan.py
index f1c3839b4..441c64e32 100644
--- a/sklearn/dbscan.py
+++ b/sklearn/dbscan.py
@@ -2,42 +2,50 @@
 #
 # SPDX-License-Identifier: MIT
 
-import argparse
-from bench import measure_function_time, parse_args, load_data, print_output
-from sklearn.cluster import DBSCAN
-from sklearn.metrics.cluster import davies_bouldin_score
-
-parser = argparse.ArgumentParser(description='scikit-learn DBSCAN benchmark')
-parser.add_argument('-e', '--eps', '--epsilon', type=float, default=10.,
-                    help='Radius of neighborhood of a point')
-parser.add_argument('-m', '--min-samples', default=5, type=int,
-                    help='The minimum number of samples required in a '
-                    'neighborhood to consider a point a core point')
-params = parse_args(parser, n_jobs_supported=True)
-
-# Load generated data
-X, _, _, _ = load_data(params, add_dtype=True)
-
-# Create our clustering object
-dbscan = DBSCAN(eps=params.eps, n_jobs=params.n_jobs,
-                min_samples=params.min_samples, metric='euclidean',
-                algorithm='auto')
-
-# N.B. algorithm='auto' will select DAAL's brute force method when running
-# daal4py-patched scikit-learn, and probably 'kdtree' when running unpatched
-# scikit-learn.
-
-columns = ('batch', 'arch', 'prefix', 'function', 'threads', 'dtype', 'size',
-           'n_clusters', 'time')
-
-# Time fit
-time, _ = measure_function_time(dbscan.fit, X, params=params)
-labels = dbscan.labels_
-
-params.n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
-acc = davies_bouldin_score(X, labels)
-
-print_output(library='sklearn', algorithm='dbscan', stages=['training'],
-             columns=columns, params=params, functions=['DBSCAN'],
-             times=[time], accuracies=[acc], accuracy_type='davies_bouldin_score', data=[X],
-             alg_instance=dbscan)
+from bench import (measure_function_time, parse_args, load_data, print_output,
+                   run_with_context, patch_sklearn)
+
+def main():
+    import argparse
+    from sklearn.cluster import DBSCAN
+    from sklearn.metrics.cluster import davies_bouldin_score
+
+    parser = argparse.ArgumentParser(description='scikit-learn DBSCAN benchmark')
+    parser.add_argument('-e', '--eps', '--epsilon', type=float, default=10.,
+                        help='Radius of neighborhood of a point')
+    parser.add_argument('-m', '--min-samples', default=5, type=int,
+                        help='The minimum number of samples required in a '
+                        'neighborhood to consider a point a core point')
+    params = parse_args(parser, n_jobs_supported=True)
+
+    # Load generated data
+    X, _, _, _ = load_data(params, add_dtype=True)
+
+    # Create our clustering object
+    dbscan = DBSCAN(eps=params.eps, n_jobs=params.n_jobs,
+                    min_samples=params.min_samples, metric='euclidean',
+                    algorithm='auto')
+
+    # N.B. algorithm='auto' will select DAAL's brute force method when running
+    # daal4py-patched scikit-learn, and probably 'kdtree' when running unpatched
+    # scikit-learn.
+
+    columns = ('batch', 'arch', 'prefix', 'function', 'threads', 'dtype', 'size',
+               'n_clusters', 'time')
+
+    # Time fit
+    time, _ = measure_function_time(dbscan.fit, X, params=params)
+    labels = dbscan.labels_
+
+    params.n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
+    acc = davies_bouldin_score(X, labels)
+
+    print_output(library='sklearn', algorithm='dbscan', stages=['training'],
+                 columns=columns, params=params, functions=['DBSCAN'],
+                 times=[time], accuracies=[acc], accuracy_type='davies_bouldin_score', data=[X],
+                 alg_instance=dbscan)
+
+
+if __name__ == "__main__":
+    patch_sklearn()
+    run_with_context(main)
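Note on the cluster count above: scikit-learn's DBSCAN labels noise points -1, so the number of clusters is the number of distinct labels minus one whenever noise is present. A self-contained check of the idiom, with made-up labels:

    labels = [0, 0, 1, -1, 2, -1]  # -1 marks noise, not a cluster
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    assert n_clusters == 3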
diff --git a/sklearn/kmeans.py b/sklearn/kmeans.py
index 7ee1617c0..e058cd332 100644
--- a/sklearn/kmeans.py
+++ b/sklearn/kmeans.py
@@ -1,71 +1,78 @@
-# Copyright (C) 2017-2020 Intel Corporation
+# Copyright (C) 2018-2020 Intel Corporation
 #
 # SPDX-License-Identifier: MIT
 
-import argparse
-from bench import (
-    parse_args, measure_function_time, load_data, print_output
-)
-import numpy as np
-from sklearn.cluster import KMeans
-from sklearn.metrics.cluster import davies_bouldin_score
-
-parser = argparse.ArgumentParser(description='scikit-learn K-means benchmark')
-parser.add_argument('-i', '--filei', '--fileI', '--init',
-                    type=str, help='Initial clusters')
-parser.add_argument('-t', '--tol', type=float, default=0.,
-                    help='Absolute threshold')
-parser.add_argument('--maxiter', type=int, default=100,
-                    help='Maximum number of iterations')
-parser.add_argument('--n-clusters', type=int, help='Number of clusters')
-params = parse_args(parser)
-
-# Load and convert generated data
-X_train, X_test, _, _ = load_data(params)
-
-if params.filei == 'k-means++':
-    X_init = 'k-means++'
-# Load initial centroids from specified path
-elif params.filei is not None:
-    X_init = np.load(params.filei).astype(params.dtype)
-    params.n_clusters = X_init.shape[0]
-# or choose random centroids from training data
-else:
-    np.random.seed(params.seed)
-    centroids_idx = np.random.randint(0, X_train.shape[0],
-                                      size=params.n_clusters)
-    if hasattr(X_train, "iloc"):
-        X_init = X_train.iloc[centroids_idx].values
-    else:
-        X_init = X_train[centroids_idx]
-
-
-def fit_kmeans(X):
-    global X_init, params
-    alg = KMeans(n_clusters=params.n_clusters, tol=params.tol,
-                 max_iter=params.maxiter, init=X_init, n_init=1)
-    alg.fit(X)
-    return alg
-
-
-columns = ('batch', 'arch', 'prefix', 'function', 'threads', 'dtype', 'size',
-           'n_clusters', 'time')
-
-# Time fit
-fit_time, kmeans = measure_function_time(fit_kmeans, X_train, params=params)
-
-train_predict = kmeans.predict(X_train)
-acc_train = davies_bouldin_score(X_train, train_predict)
-
-# Time predict
-predict_time, test_predict = measure_function_time(
-    kmeans.predict, X_test, params=params)
-
-acc_test = davies_bouldin_score(X_test, test_predict)
-
-print_output(library='sklearn', algorithm='kmeans',
-             stages=['training', 'prediction'], columns=columns,
-             params=params, functions=['KMeans.fit', 'KMeans.predict'],
-             times=[fit_time, predict_time], accuracy_type='davies_bouldin_score',
-             accuracies=[acc_train, acc_test], data=[X_train, X_test],
-             alg_instance=kmeans)
+from bench import (measure_function_time, parse_args, load_data, print_output,
+                   run_with_context, patch_sklearn)
+
+
+def main():
+    import argparse
+    import numpy as np
+    from sklearn.cluster import KMeans
+    from sklearn.metrics.cluster import davies_bouldin_score
+
+    parser = argparse.ArgumentParser(description='scikit-learn K-means benchmark')
+    parser.add_argument('-i', '--filei', '--fileI', '--init',
+                        type=str, help='Initial clusters')
+    parser.add_argument('-t', '--tol', type=float, default=0.,
+                        help='Absolute threshold')
+    parser.add_argument('--maxiter', type=int, default=100,
+                        help='Maximum number of iterations')
+    parser.add_argument('--n-clusters', type=int, help='Number of clusters')
+    params = parse_args(parser)
+
+    # Load and convert generated data
+    X_train, X_test, _, _ = load_data(params)
+
+    if params.filei == 'k-means++':
+        X_init = 'k-means++'
+    # Load initial centroids from specified path
+    elif params.filei is not None:
+        X_init = np.load(params.filei).astype(params.dtype)
+        params.n_clusters = X_init.shape[0]
+    # or choose random centroids from training data
+    else:
+        np.random.seed(params.seed)
+        centroids_idx = np.random.randint(0, X_train.shape[0],
+                                          size=params.n_clusters)
+        if hasattr(X_train, "iloc"):
+            X_init = X_train.iloc[centroids_idx].values
+        else:
+            X_init = X_train[centroids_idx]
+
+
+    def fit_kmeans(X):
+        # X_init and params are read from the enclosing main() scope; a
+        # 'global' statement here would look them up in (empty) module scope
+        alg = KMeans(n_clusters=params.n_clusters, tol=params.tol,
+                     max_iter=params.maxiter, init=X_init, n_init=1)
+        alg.fit(X)
+        return alg
+
+
+    columns = ('batch', 'arch', 'prefix', 'function', 'threads', 'dtype', 'size',
+               'n_clusters', 'time')
+
+    # Time fit
+    fit_time, kmeans = measure_function_time(fit_kmeans, X_train, params=params)
+
+    train_predict = kmeans.predict(X_train)
+    acc_train = davies_bouldin_score(X_train, train_predict)
+
+    # Time predict
+    predict_time, test_predict = measure_function_time(
+        kmeans.predict, X_test, params=params)
+
+    acc_test = davies_bouldin_score(X_test, test_predict)
+
+    print_output(library='sklearn', algorithm='kmeans',
+                 stages=['training', 'prediction'], columns=columns,
+                 params=params, functions=['KMeans.fit', 'KMeans.predict'],
+                 times=[fit_time, predict_time], accuracy_type='davies_bouldin_score',
+                 accuracies=[acc_train, acc_test], data=[X_train, X_test],
+                 alg_instance=kmeans)
+
+
+if __name__ == "__main__":
+    patch_sklearn()
+    run_with_context(main)
== "__main__": + patch_sklearn() + run_with_context(main) diff --git a/sklearn/linear.py b/sklearn/linear.py index 9da3353e4..9017cf60a 100644 --- a/sklearn/linear.py +++ b/sklearn/linear.py @@ -2,43 +2,50 @@ # # SPDX-License-Identifier: MIT -import argparse -from bench import ( - parse_args, measure_function_time, load_data, print_output, rmse_score -) -from sklearn.linear_model import LinearRegression - -parser = argparse.ArgumentParser(description='scikit-learn linear regression ' - 'benchmark') -parser.add_argument('--no-fit-intercept', dest='fit_intercept', default=True, - action='store_false', - help="Don't fit intercept (assume data already centered)") -params = parse_args(parser, size=(1000000, 50)) - -# Load data -X_train, X_test, y_train, y_test = load_data( - params, generated_data=['X_train', 'y_train']) - -# Create our regression object -regr = LinearRegression(fit_intercept=params.fit_intercept, - n_jobs=params.n_jobs, copy_X=False) - -columns = ('batch', 'arch', 'prefix', 'function', 'threads', 'dtype', 'size', - 'time') - -# Time fit -fit_time, _ = measure_function_time(regr.fit, X_train, y_train, params=params) - -# Time predict -predict_time, yp = measure_function_time(regr.predict, X_test, params=params) - -test_rmse = rmse_score(yp, y_test) -yp = regr.predict(X_train) -train_rmse = rmse_score(yp, y_train) - -print_output(library='sklearn', algorithm='linear_regression', - stages=['training', 'prediction'], columns=columns, - params=params, functions=['Linear.fit', 'Linear.predict'], - times=[fit_time, predict_time], accuracy_type='rmse', - accuracies=[train_rmse, test_rmse], data=[X_train, X_test], - alg_instance=regr) +from bench import (measure_function_time, parse_args, load_data, print_output, rmse_score, + run_with_context, patch_sklearn) + + +def main(): + import argparse + from sklearn.linear_model import LinearRegression + + parser = argparse.ArgumentParser(description='scikit-learn linear regression ' + 'benchmark') + parser.add_argument('--no-fit-intercept', dest='fit_intercept', default=True, + action='store_false', + help="Don't fit intercept (assume data already centered)") + params = parse_args(parser, size=(1000000, 50)) + + # Load data + X_train, X_test, y_train, y_test = load_data( + params, generated_data=['X_train', 'y_train']) + + # Create our regression object + regr = LinearRegression(fit_intercept=params.fit_intercept, + n_jobs=params.n_jobs, copy_X=False) + + columns = ('batch', 'arch', 'prefix', 'function', 'threads', 'dtype', 'size', + 'time') + + # Time fit + fit_time, _ = measure_function_time(regr.fit, X_train, y_train, params=params) + + # Time predict + predict_time, yp = measure_function_time(regr.predict, X_test, params=params) + + test_rmse = rmse_score(yp, y_test) + yp = regr.predict(X_train) + train_rmse = rmse_score(yp, y_train) + + print_output(library='sklearn', algorithm='linear_regression', + stages=['training', 'prediction'], columns=columns, + params=params, functions=['Linear.fit', 'Linear.predict'], + times=[fit_time, predict_time], accuracy_type='rmse', + accuracies=[train_rmse, test_rmse], data=[X_train, X_test], + alg_instance=regr) + + +if __name__ == "__main__": + patch_sklearn() + run_with_context(main) diff --git a/sklearn/log_reg.py b/sklearn/log_reg.py index 576a82ec4..d3e213ce3 100644 --- a/sklearn/log_reg.py +++ b/sklearn/log_reg.py @@ -2,77 +2,83 @@ # # SPDX-License-Identifier: MIT -import argparse -from bench import ( - parse_args, measure_function_time, load_data, print_output -) -import numpy as np -from 
diff --git a/sklearn/log_reg.py b/sklearn/log_reg.py
index 576a82ec4..d3e213ce3 100644
--- a/sklearn/log_reg.py
+++ b/sklearn/log_reg.py
@@ -2,77 +2,83 @@
 #
 # SPDX-License-Identifier: MIT
 
-import argparse
-from bench import (
-    parse_args, measure_function_time, load_data, print_output
-)
-import numpy as np
-from sklearn.linear_model import LogisticRegression
-from sklearn.metrics import accuracy_score
-
-parser = argparse.ArgumentParser(description='scikit-learn logistic '
-                                 'regression benchmark')
-parser.add_argument('--no-fit-intercept', dest='fit_intercept',
-                    action='store_false', default=True,
-                    help="Don't fit intercept")
-parser.add_argument('--multiclass', default='auto',
-                    choices=('auto', 'ovr', 'multinomial'),
-                    help='How to treat multi class data. '
-                    '"auto" picks "ovr" for binary classification, and '
-                    '"multinomial" otherwise.')
-parser.add_argument('--solver', default='lbfgs',
-                    choices=('lbfgs', 'newton-cg', 'saga'),
-                    help='Solver to use.')
-parser.add_argument('--maxiter', type=int, default=100,
-                    help='Maximum iterations for the iterative solver')
-parser.add_argument('-C', dest='C', type=float, default=1.0,
-                    help='Regularization parameter')
-parser.add_argument('--tol', type=float, default=None,
-                    help='Tolerance for solver. If solver == "newton-cg", '
-                    'then the default is 1e-3. Otherwise, the default '
-                    'is 1e-10.')
-params = parse_args(parser, loop_types=('fit', 'predict'))
-
-# Load generated data
-X_train, X_test, y_train, y_test = load_data(params)
-
-params.n_classes = len(np.unique(y_train))
-
-if params.multiclass == 'auto':
-    params.multiclass = 'ovr' if params.n_classes == 2 else 'multinomial'
-
-if not params.tol:
-    params.tol = 1e-3 if params.solver == 'newton-cg' else 1e-10
-
-# Create our classifier object
-clf = LogisticRegression(penalty='l2', C=params.C, n_jobs=params.n_jobs,
-                         fit_intercept=params.fit_intercept,
-                         verbose=params.verbose,
-                         tol=params.tol, max_iter=params.maxiter,
-                         solver=params.solver, multi_class=params.multiclass)
-
-columns = ('batch', 'arch', 'prefix', 'function', 'threads', 'dtype', 'size',
-           'solver', 'C', 'multiclass', 'n_classes', 'accuracy', 'time')
-
-# Time fit and predict
-fit_time, _ = measure_function_time(clf.fit, X_train, y_train, params=params)
-
-y_pred = clf.predict(X_train)
-train_acc = 100 * accuracy_score(y_pred, y_train)
-
-predict_time, y_pred = measure_function_time(
-    clf.predict, X_test, params=params)
-test_acc = 100 * accuracy_score(y_pred, y_test)
-
-print_output(library='sklearn', algorithm='logistic_regression',
-             stages=['training', 'prediction'], columns=columns,
-             params=params, functions=['LogReg.fit', 'LogReg.predict'],
-             times=[fit_time, predict_time], accuracy_type='accuracy[%]',
-             accuracies=[train_acc, test_acc], data=[X_train, X_test],
-             alg_instance=clf)
-if params.verbose:
-    print()
-    print(f'@ Number of iterations: {clf.n_iter_}')
-    print('@ fit coefficients:')
-    print(f'@ {clf.coef_.tolist()}')
-    print('@ fit intercept:')
-    print(f'@ {clf.intercept_.tolist()}')
+from bench import (measure_function_time, parse_args, load_data, print_output,
+                   run_with_context, patch_sklearn)
+
+
+def main():
+    import argparse
+    import numpy as np
+    from sklearn.linear_model import LogisticRegression
+    from sklearn.metrics import accuracy_score
+
+    parser = argparse.ArgumentParser(description='scikit-learn logistic '
+                                     'regression benchmark')
+    parser.add_argument('--no-fit-intercept', dest='fit_intercept',
+                        action='store_false', default=True,
+                        help="Don't fit intercept")
+    parser.add_argument('--multiclass', default='auto',
+                        choices=('auto', 'ovr', 'multinomial'),
+                        help='How to treat multi class data. '
+                        '"auto" picks "ovr" for binary classification, and '
+                        '"multinomial" otherwise.')
+    parser.add_argument('--solver', default='lbfgs',
+                        choices=('lbfgs', 'newton-cg', 'saga'),
+                        help='Solver to use.')
+    parser.add_argument('--maxiter', type=int, default=100,
+                        help='Maximum iterations for the iterative solver')
+    parser.add_argument('-C', dest='C', type=float, default=1.0,
+                        help='Regularization parameter')
+    parser.add_argument('--tol', type=float, default=None,
+                        help='Tolerance for solver. If solver == "newton-cg", '
+                        'then the default is 1e-3. Otherwise, the default '
+                        'is 1e-10.')
+    params = parse_args(parser, loop_types=('fit', 'predict'))
+
+    # Load generated data
+    X_train, X_test, y_train, y_test = load_data(params)
+
+    params.n_classes = len(np.unique(y_train))
+
+    if params.multiclass == 'auto':
+        params.multiclass = 'ovr' if params.n_classes == 2 else 'multinomial'
+
+    if not params.tol:
+        params.tol = 1e-3 if params.solver == 'newton-cg' else 1e-10
+
+    # Create our classifier object
+    clf = LogisticRegression(penalty='l2', C=params.C, n_jobs=params.n_jobs,
+                             fit_intercept=params.fit_intercept,
+                             verbose=params.verbose,
+                             tol=params.tol, max_iter=params.maxiter,
+                             solver=params.solver, multi_class=params.multiclass)
+
+    columns = ('batch', 'arch', 'prefix', 'function', 'threads', 'dtype', 'size',
+               'solver', 'C', 'multiclass', 'n_classes', 'accuracy', 'time')
+
+    # Time fit and predict
+    fit_time, _ = measure_function_time(clf.fit, X_train, y_train, params=params)
+
+    y_pred = clf.predict(X_train)
+    train_acc = 100 * accuracy_score(y_pred, y_train)
+
+    predict_time, y_pred = measure_function_time(
+        clf.predict, X_test, params=params)
+    test_acc = 100 * accuracy_score(y_pred, y_test)
+
+    print_output(library='sklearn', algorithm='logistic_regression',
+                 stages=['training', 'prediction'], columns=columns,
+                 params=params, functions=['LogReg.fit', 'LogReg.predict'],
+                 times=[fit_time, predict_time], accuracy_type='accuracy[%]',
+                 accuracies=[train_acc, test_acc], data=[X_train, X_test],
+                 alg_instance=clf)
+    if params.verbose:
+        print()
+        print(f'@ Number of iterations: {clf.n_iter_}')
+        print('@ fit coefficients:')
+        print(f'@ {clf.coef_.tolist()}')
+        print('@ fit intercept:')
+        print(f'@ {clf.intercept_.tolist()}')
+
+
+if __name__ == "__main__":
+    patch_sklearn()
+    run_with_context(main)
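Note on the flag handling: patch_sklearn() and run_with_context() each build a throwaway parser and call parse_known_args(), which tolerates every benchmark-specific option, while the first bench.py hunk also registers --device and --patch_sklearn on the shared parser so that the strict parse inside parse_args() accepts them. A self-contained sketch of the parse_known_args() behaviour the helpers rely on (the argv values here are illustrative):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--device', type=str, default='None',
                        choices=('None', 'host', 'cpu', 'gpu'))
    # Unrecognized options are returned instead of raising an error, so the
    # helper can run before the benchmark's own parser has even been built.
    args, unknown = parser.parse_known_args(['--device', 'gpu', '--eps', '15'])
    assert args.device == 'gpu'
    assert unknown == ['--eps', '15']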