diff --git a/bench.py b/bench.py index e33998603..620818e97 100644 --- a/bench.py +++ b/bench.py @@ -176,6 +176,10 @@ def parse_args(parser, size=None, loop_types=(), parser.add_argument('--no-intel-optimized', default=False, action='store_true', help='Use no intel optimized version. ' 'Now avalible for scikit-learn benchmarks'), + parser.add_argument('--device', default='None', type=str, + choices=('host', 'cpu', 'gpu', 'None'), + help='Execution context device') + for data in ['X', 'y']: for stage in ['train', 'test']: parser.add_argument(f'--file-{data}-{stage}', @@ -197,6 +201,14 @@ def parse_args(parser, size=None, loop_types=(), except ImportError: print('Failed to import daal4py.sklearn.patch_sklearn.' 'Use stock version scikit-learn', file=sys.stderr) + params.device = 'None' + else: + if params.device != 'None': + print('Device context is not supported for stock scikit-learn. ' + 'Please remove the --no-intel-optimized option to use the ' + f'--device={params.device} parameter. Fallback to --device=None.', + file=sys.stderr) + params.device = 'None' # disable finiteness check (default) if not params.check_finiteness: @@ -492,3 +504,12 @@ def print_output(library, algorithm, stages, params, functions, del result['algorithm_parameters']['handle'] output.append(result) print(json.dumps(output, indent=4)) + + +def run_with_context(params, function): + if params.device != 'None': + from daal4py.oneapi import sycl_context + with sycl_context(params.device): + function() + else: + function() diff --git a/configs/skl_with_context_config.json b/configs/skl_with_context_config.json new file mode 100644 index 000000000..023850c38 --- /dev/null +++ b/configs/skl_with_context_config.json @@ -0,0 +1,77 @@ +{ + "common": { + "lib": ["sklearn"], + "data-format": ["pandas"], + "data-order": ["F"], + "dtype": ["float64"], + "device": ["host", "cpu", "gpu", "None"] + }, + "cases": [ + { + "algorithm": "kmeans", + "dataset": [ + { + "source": "synthetic", + "type": "blobs", + "n_clusters":
10, + "n_features": 50, + "training": { + "n_samples": 1000000 + } + } + ], + "n-clusters": [10] + }, + { + "algorithm": "dbscan", + "dataset": [ + { + "source": "synthetic", + "type": "blobs", + "n_clusters": 10, + "n_features": 50, + "training": { + "n_samples": 10000 + } + } + ] + }, + { + "algorithm": "linear", + "dataset": [ + { + "source": "synthetic", + "type": "regression", + "n_features": 50, + "training": { + "n_samples": 1000000 + } + } + ] + }, + { + "algorithm": "log_reg", + "solver":["lbfgs", "newton-cg"], + "dataset": [ + { + "source": "synthetic", + "type": "classification", + "n_classes": 2, + "n_features": 100, + "training": { + "n_samples": 100000 + } + }, + { + "source": "synthetic", + "type": "classification", + "n_classes": 5, + "n_features": 100, + "training": { + "n_samples": 100000 + } + } + ] + } + ] +} diff --git a/sklearn_bench/dbscan.py b/sklearn_bench/dbscan.py index 542da102f..d0d3e0edd 100644 --- a/sklearn_bench/dbscan.py +++ b/sklearn_bench/dbscan.py @@ -19,38 +19,43 @@ import os sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import bench -from sklearn.metrics.cluster import davies_bouldin_score -parser = argparse.ArgumentParser(description='scikit-learn DBSCAN benchmark') -parser.add_argument('-e', '--eps', '--epsilon', type=float, default=10., - help='Radius of neighborhood of a point') -parser.add_argument('-m', '--min-samples', default=5, type=int, - help='The minimum number of samples required in a ' - 'neighborhood to consider a point a core point') -params = bench.parse_args(parser) -from sklearn.cluster import DBSCAN +def main(): + from sklearn.cluster import DBSCAN + from sklearn.metrics.cluster import davies_bouldin_score -# Load generated data -X, _, _, _ = bench.load_data(params, add_dtype=True) + # Load generated data + X, _, _, _ = bench.load_data(params, add_dtype=True) -# Create our clustering object -dbscan = DBSCAN(eps=params.eps, n_jobs=params.n_jobs, - min_samples=params.min_samples, 
metric='euclidean', - algorithm='auto') + # Create our clustering object + dbscan = DBSCAN(eps=params.eps, n_jobs=params.n_jobs, + min_samples=params.min_samples, metric='euclidean', + algorithm='auto') -# N.B. algorithm='auto' will select DAAL's brute force method when running -# daal4py-patched scikit-learn, and probably 'kdtree' when running unpatched -# scikit-learn. + # N.B. algorithm='auto' will select oneAPI Data Analytics Library (oneDAL) + # brute force method when running daal4py-patched scikit-learn, and probably + # 'kdtree' when running unpatched scikit-learn. -# Time fit -time, _ = bench.measure_function_time(dbscan.fit, X, params=params) -labels = dbscan.labels_ + # Time fit + time, _ = bench.measure_function_time(dbscan.fit, X, params=params) + labels = dbscan.labels_ -params.n_clusters = len(set(labels)) - (1 if -1 in labels else 0) -acc = davies_bouldin_score(X, labels) + params.n_clusters = len(set(labels)) - (1 if -1 in labels else 0) + acc = davies_bouldin_score(X, labels) -bench.print_output(library='sklearn', algorithm='dbscan', stages=['training'], - params=params, functions=['DBSCAN'], times=[time], accuracies=[acc], - accuracy_type='davies_bouldin_score', data=[X], - alg_instance=dbscan) + bench.print_output(library='sklearn', algorithm='dbscan', stages=['training'], + params=params, functions=['DBSCAN'], times=[time], + accuracies=[acc], accuracy_type='davies_bouldin_score', + data=[X], alg_instance=dbscan) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='scikit-learn DBSCAN benchmark') + parser.add_argument('-e', '--eps', '--epsilon', type=float, default=10., + help='Radius of neighborhood of a point') + parser.add_argument('-m', '--min-samples', default=5, type=int, + help='The minimum number of samples required in a ' + 'neighborhood to consider a point a core point') + params = bench.parse_args(parser) + bench.run_with_context(params, main) diff --git a/sklearn_bench/kmeans.py b/sklearn_bench/kmeans.py 
index a6e1a5f1e..1cf03ff0f 100644 --- a/sklearn_bench/kmeans.py +++ b/sklearn_bench/kmeans.py @@ -20,63 +20,67 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import bench import numpy as np -from sklearn.metrics.cluster import davies_bouldin_score -parser = argparse.ArgumentParser(description='scikit-learn K-means benchmark') -parser.add_argument('-i', '--filei', '--fileI', '--init', - type=str, help='Initial clusters') -parser.add_argument('-t', '--tol', type=float, default=0., - help='Absolute threshold') -parser.add_argument('--maxiter', type=int, default=100, - help='Maximum number of iterations') -parser.add_argument('--n-clusters', type=int, help='Number of clusters') -params = bench.parse_args(parser) -from sklearn.cluster import KMeans +def main(): + from sklearn.cluster import KMeans + from sklearn.metrics.cluster import davies_bouldin_score -# Load and convert generated data -X_train, X_test, _, _ = bench.load_data(params) + # Load and convert generated data + X_train, X_test, _, _ = bench.load_data(params) -if params.filei == 'k-means++': - X_init = 'k-means++' -# Load initial centroids from specified path -elif params.filei is not None: - X_init = np.load(params.filei).astype(params.dtype) - params.n_clusters = X_init.shape[0] -# or choose random centroids from training data -else: - np.random.seed(params.seed) - centroids_idx = np.random.randint(0, X_train.shape[0], - size=params.n_clusters) - if hasattr(X_train, "iloc"): - X_init = X_train.iloc[centroids_idx].values + if params.filei == 'k-means++': + X_init = 'k-means++' + # Load initial centroids from specified path + elif params.filei is not None: + X_init = np.load(params.filei).astype(params.dtype) + params.n_clusters = X_init.shape[0] + # or choose random centroids from training data else: - X_init = X_train[centroids_idx] + np.random.seed(params.seed) + centroids_idx = np.random.randint(0, X_train.shape[0], + size=params.n_clusters) + if hasattr(X_train, 
"iloc"): + X_init = X_train.iloc[centroids_idx].values + else: + X_init = X_train[centroids_idx] + def fit_kmeans(X, X_init): + alg = KMeans(n_clusters=params.n_clusters, tol=params.tol, + max_iter=params.maxiter, init=X_init, n_init=1) + alg.fit(X) + return alg -def fit_kmeans(X): - global X_init, params - alg = KMeans(n_clusters=params.n_clusters, tol=params.tol, - max_iter=params.maxiter, init=X_init, n_init=1) - alg.fit(X) - return alg + # Time fit + fit_time, kmeans = bench.measure_function_time(fit_kmeans, X_train, + X_init, params=params) + train_predict = kmeans.predict(X_train) + acc_train = davies_bouldin_score(X_train, train_predict) -# Time fit -fit_time, kmeans = bench.measure_function_time(fit_kmeans, X_train, params=params) + # Time predict + predict_time, test_predict = bench.measure_function_time( + kmeans.predict, X_test, params=params) -train_predict = kmeans.predict(X_train) -acc_train = davies_bouldin_score(X_train, train_predict) + acc_test = davies_bouldin_score(X_test, test_predict) -# Time predict -predict_time, test_predict = bench.measure_function_time( - kmeans.predict, X_test, params=params) + bench.print_output(library='sklearn', algorithm='kmeans', + stages=['training', 'prediction'], + params=params, functions=['KMeans.fit', 'KMeans.predict'], + times=[fit_time, predict_time], + accuracy_type='davies_bouldin_score', + accuracies=[acc_train, acc_test], data=[X_train, X_test], + alg_instance=kmeans) -acc_test = davies_bouldin_score(X_test, test_predict) -bench.print_output(library='sklearn', algorithm='kmeans', - stages=['training', 'prediction'], - params=params, functions=['KMeans.fit', 'KMeans.predict'], - times=[fit_time, predict_time], accuracy_type='davies_bouldin_score', - accuracies=[acc_train, acc_test], data=[X_train, X_test], - alg_instance=kmeans) +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='scikit-learn K-means benchmark') + parser.add_argument('-i', '--filei', '--fileI', '--init', + 
type=str, help='Initial clusters') + parser.add_argument('-t', '--tol', type=float, default=0., + help='Absolute threshold') + parser.add_argument('--maxiter', type=int, default=100, + help='Maximum number of iterations') + parser.add_argument('--n-clusters', type=int, help='Number of clusters') + params = bench.parse_args(parser) + bench.run_with_context(params, main) diff --git a/sklearn_bench/linear.py b/sklearn_bench/linear.py index e059d96ef..4ddf2007f 100644 --- a/sklearn_bench/linear.py +++ b/sklearn_bench/linear.py @@ -21,36 +21,41 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import bench -parser = argparse.ArgumentParser(description='scikit-learn linear regression ' - 'benchmark') -parser.add_argument('--no-fit-intercept', dest='fit_intercept', default=True, - action='store_false', - help="Don't fit intercept (assume data already centered)") -params = bench.parse_args(parser) - -from sklearn.linear_model import LinearRegression - -# Load data -X_train, X_test, y_train, y_test = bench.load_data( - params, generated_data=['X_train', 'y_train']) - -# Create our regression object -regr = LinearRegression(fit_intercept=params.fit_intercept, - n_jobs=params.n_jobs, copy_X=False) - -# Time fit -fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train, params=params) - -# Time predict -predict_time, yp = bench.measure_function_time(regr.predict, X_test, params=params) - -test_rmse = bench.rmse_score(yp, y_test) -yp = regr.predict(X_train) -train_rmse = bench.rmse_score(yp, y_train) - -bench.print_output(library='sklearn', algorithm='linear_regression', - stages=['training', 'prediction'], - params=params, functions=['Linear.fit', 'Linear.predict'], - times=[fit_time, predict_time], accuracy_type='rmse', - accuracies=[train_rmse, test_rmse], data=[X_train, X_test], - alg_instance=regr) + +def main(): + from sklearn.linear_model import LinearRegression + + # Load data + X_train, X_test, y_train, y_test = 
bench.load_data( + params, generated_data=['X_train', 'y_train']) + + # Create our regression object + regr = LinearRegression(fit_intercept=params.fit_intercept, + n_jobs=params.n_jobs, copy_X=False) + + # Time fit + fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train, params=params) + + # Time predict + predict_time, yp = bench.measure_function_time(regr.predict, X_test, params=params) + + test_rmse = bench.rmse_score(yp, y_test) + yp = regr.predict(X_train) + train_rmse = bench.rmse_score(yp, y_train) + + bench.print_output(library='sklearn', algorithm='linear_regression', + stages=['training', 'prediction'], + params=params, functions=['Linear.fit', 'Linear.predict'], + times=[fit_time, predict_time], accuracy_type='rmse', + accuracies=[train_rmse, test_rmse], data=[X_train, X_test], + alg_instance=regr) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='scikit-learn linear regression ' + 'benchmark') + parser.add_argument('--no-fit-intercept', dest='fit_intercept', default=True, + action='store_false', + help="Don't fit intercept (assume data already centered)") + params = bench.parse_args(parser) + bench.run_with_context(params, main) diff --git a/sklearn_bench/log_reg.py b/sklearn_bench/log_reg.py index 073fa549f..bcf455b56 100644 --- a/sklearn_bench/log_reg.py +++ b/sklearn_bench/log_reg.py @@ -20,63 +20,68 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import bench import numpy as np -from sklearn.metrics import accuracy_score -parser = argparse.ArgumentParser(description='scikit-learn logistic ' - 'regression benchmark') -parser.add_argument('--no-fit-intercept', dest='fit_intercept', - action='store_false', default=True, - help="Don't fit intercept") -parser.add_argument('--multiclass', default='auto', - choices=('auto', 'ovr', 'multinomial'), - help='How to treat multi class data. 
' - '"auto" picks "ovr" for binary classification, and ' - '"multinomial" otherwise.') -parser.add_argument('--solver', default='lbfgs', - choices=('lbfgs', 'newton-cg', 'saga'), - help='Solver to use.') -parser.add_argument('--maxiter', type=int, default=100, - help='Maximum iterations for the iterative solver') -parser.add_argument('-C', dest='C', type=float, default=1.0, - help='Regularization parameter') -parser.add_argument('--tol', type=float, default=None, - help='Tolerance for solver. If solver == "newton-cg", ' - 'then the default is 1e-3. Otherwise, the default ' - 'is 1e-10.') -params = bench.parse_args(parser, loop_types=('fit', 'predict')) -from sklearn.linear_model import LogisticRegression +def main(): + from sklearn.linear_model import LogisticRegression + from sklearn.metrics import accuracy_score -# Load generated data -X_train, X_test, y_train, y_test = bench.load_data(params) + # Load generated data + X_train, X_test, y_train, y_test = bench.load_data(params) -params.n_classes = len(np.unique(y_train)) + params.n_classes = len(np.unique(y_train)) -if params.multiclass == 'auto': - params.multiclass = 'ovr' if params.n_classes == 2 else 'multinomial' + if params.multiclass == 'auto': + params.multiclass = 'ovr' if params.n_classes == 2 else 'multinomial' -if not params.tol: - params.tol = 1e-3 if params.solver == 'newton-cg' else 1e-10 + if not params.tol: + params.tol = 1e-3 if params.solver == 'newton-cg' else 1e-10 -# Create our classifier object -clf = LogisticRegression(penalty='l2', C=params.C, n_jobs=params.n_jobs, - fit_intercept=params.fit_intercept, - verbose=params.verbose, - tol=params.tol, max_iter=params.maxiter, - solver=params.solver, multi_class=params.multiclass) -# Time fit and predict -fit_time, _ = bench.measure_function_time(clf.fit, X_train, y_train, params=params) + # Create our classifier object + clf = LogisticRegression(penalty='l2', C=params.C, n_jobs=params.n_jobs, + fit_intercept=params.fit_intercept, + 
verbose=params.verbose, + tol=params.tol, max_iter=params.maxiter, + solver=params.solver, multi_class=params.multiclass) + # Time fit and predict + fit_time, _ = bench.measure_function_time(clf.fit, X_train, y_train, params=params) -y_pred = clf.predict(X_train) -train_acc = 100 * accuracy_score(y_pred, y_train) + y_pred = clf.predict(X_train) + train_acc = 100 * accuracy_score(y_pred, y_train) -predict_time, y_pred = bench.measure_function_time( - clf.predict, X_test, params=params) -test_acc = 100 * accuracy_score(y_pred, y_test) + predict_time, y_pred = bench.measure_function_time( + clf.predict, X_test, params=params) + test_acc = 100 * accuracy_score(y_pred, y_test) -bench.print_output(library='sklearn', algorithm='logistic_regression', - stages=['training', 'prediction'], params=params, - functions=['LogReg.fit', 'LogReg.predict'], - times=[fit_time, predict_time], accuracy_type='accuracy[%]', - accuracies=[train_acc, test_acc], data=[X_train, X_test], - alg_instance=clf) + bench.print_output(library='sklearn', algorithm='logistic_regression', + stages=['training', 'prediction'], params=params, + functions=['LogReg.fit', 'LogReg.predict'], + times=[fit_time, predict_time], accuracy_type='accuracy[%]', + accuracies=[train_acc, test_acc], data=[X_train, X_test], + alg_instance=clf) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='scikit-learn logistic ' + 'regression benchmark') + parser.add_argument('--no-fit-intercept', dest='fit_intercept', + action='store_false', default=True, + help="Don't fit intercept") + parser.add_argument('--multiclass', default='auto', + choices=('auto', 'ovr', 'multinomial'), + help='How to treat multi class data. 
' + '"auto" picks "ovr" for binary classification, and ' + '"multinomial" otherwise.') + parser.add_argument('--solver', default='lbfgs', + choices=('lbfgs', 'newton-cg', 'saga'), + help='Solver to use.') + parser.add_argument('--maxiter', type=int, default=100, + help='Maximum iterations for the iterative solver') + parser.add_argument('-C', dest='C', type=float, default=1.0, + help='Regularization parameter') + parser.add_argument('--tol', type=float, default=None, + help='Tolerance for solver. If solver == "newton-cg", ' + 'then the default is 1e-3. Otherwise, the default ' + 'is 1e-10.') + params = bench.parse_args(parser, loop_types=('fit', 'predict')) + bench.run_with_context(params, main)