From 1c6eefd911a12c59ba20d2ad5c622ae991102b8c Mon Sep 17 00:00:00 2001 From: Vladislav Nazarov Date: Wed, 17 Mar 2021 05:01:55 +0300 Subject: [PATCH 1/7] Initial impl --- bench.py | 16 ++++- configs/skl_with_context_config.json | 69 ++++++++++++++++++ runner.py | 3 + sklearn_bench/dbscan.py | 71 ++++++++++--------- sklearn_bench/kmeans.py | 100 ++++++++++++++------------- 5 files changed, 177 insertions(+), 82 deletions(-) create mode 100644 configs/skl_with_context_config.json diff --git a/bench.py b/bench.py index e33998603..ae45e7c62 100644 --- a/bench.py +++ b/bench.py @@ -175,7 +175,11 @@ def parse_args(parser, size=None, loop_types=(), help='Dataset name') parser.add_argument('--no-intel-optimized', default=False, action='store_true', help='Use no intel optimized version. ' - 'Now avalible for scikit-learn benchmarks'), + 'Now avalible for scikit-learn benchmarks') + parser.add_argument('--device', default=None, type=str, + choices=("host", "cpu", "gpu"), + help='Execution context device') + for data in ['X', 'y']: for stage in ['train', 'test']: parser.add_argument(f'--file-{data}-{stage}', @@ -197,6 +201,8 @@ def parse_args(parser, size=None, loop_types=(), except ImportError: print('Failed to import daal4py.sklearn.patch_sklearn.' 'Use stock version scikit-learn', file=sys.stderr) + else: + params.device = None # disable finiteness check (default) if not params.check_finiteness: @@ -492,3 +498,11 @@ def print_output(library, algorithm, stages, params, functions, del result['algorithm_parameters']['handle'] output.append(result) print(json.dumps(output, indent=4)) + +def run_with_context(params, function): + if params.device is not None: + from daal4py.oneapi import sycl_context + with sycl_context(params.device): + function() + else: + function() diff --git a/configs/skl_with_context_config.json b/configs/skl_with_context_config.json new file mode 100644 index 000000000..0212e155c --- /dev/null +++ b/configs/skl_with_context_config.json @@ -0,0 +1,69 @@ +{ + "common": { + "lib": ["sklearn"], + "data-format": ["pandas"], + "data-order": ["F"], + "dtype": ["float64"], + "device": ["host", "cpu", "gpu"] + }, + "cases": [ + { + "algorithm": "kmeans", + "dataset": [ + { + "source": "synthetic", + "type": "blobs", + "n_clusters": 1000, + "n_features": 20, + "training": { + "n_samples": 1000000 + } + } + ], + "time-method": ["box_filter"], + "time-limit": [50], + "n-clusters": [1000], + "maxiter": [50], + "tol": [0.0] + }, + { + "algorithm": "kmeans", + "dataset": [ + { + "source": "synthetic", + "type": "blobs", + "n_clusters": 5, + "n_features": 50, + "training": { + "n_samples": 10000000 + } + } + ], + "time-method": ["box_filter"], + "time-limit": [50], + "n-clusters": [5], + "maxiter": [50], + "init": ["k-means++"], + "tol": [0.0] + }, + { + "algorithm": "kmeans", + "dataset": [ + { + "source": "synthetic", + "type": "blobs", + "n_clusters": 20, + "n_features": 50, + "training": { + "n_samples": 3000000 + } + } + ], + "time-method": ["box_filter"], + "time-limit": [50], + "n-clusters": [20], + "maxiter": [50], + "tol": [0.0] + } + ] +} \ No newline at end of file diff --git a/runner.py b/runner.py index df9dde749..63c5ed95c 100755 --- a/runner.py +++ b/runner.py @@ -70,6 +70,9 @@ def generate_cases(params): parser.add_argument('--report', default=False, action='store_true', help='Create an Excel report based on benchmarks results. ' 'Need "openpyxl" library') + parser.add_argument('--device', default=None, type=str, + choices=("host", "cpu", "gpu"), + help='Execution context device') args = parser.parse_args() env = os.environ.copy() diff --git a/sklearn_bench/dbscan.py b/sklearn_bench/dbscan.py index 542da102f..5731243c3 100644 --- a/sklearn_bench/dbscan.py +++ b/sklearn_bench/dbscan.py @@ -19,38 +19,41 @@ import os sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import bench -from sklearn.metrics.cluster import davies_bouldin_score -parser = argparse.ArgumentParser(description='scikit-learn DBSCAN benchmark') -parser.add_argument('-e', '--eps', '--epsilon', type=float, default=10., - help='Radius of neighborhood of a point') -parser.add_argument('-m', '--min-samples', default=5, type=int, - help='The minimum number of samples required in a ' - 'neighborhood to consider a point a core point') -params = bench.parse_args(parser) - -from sklearn.cluster import DBSCAN - -# Load generated data -X, _, _, _ = bench.load_data(params, add_dtype=True) - -# Create our clustering object -dbscan = DBSCAN(eps=params.eps, n_jobs=params.n_jobs, - min_samples=params.min_samples, metric='euclidean', - algorithm='auto') - -# N.B. algorithm='auto' will select DAAL's brute force method when running -# daal4py-patched scikit-learn, and probably 'kdtree' when running unpatched -# scikit-learn. - -# Time fit -time, _ = bench.measure_function_time(dbscan.fit, X, params=params) -labels = dbscan.labels_ - -params.n_clusters = len(set(labels)) - (1 if -1 in labels else 0) -acc = davies_bouldin_score(X, labels) - -bench.print_output(library='sklearn', algorithm='dbscan', stages=['training'], - params=params, functions=['DBSCAN'], times=[time], accuracies=[acc], - accuracy_type='davies_bouldin_score', data=[X], - alg_instance=dbscan) +def main(): + from sklearn.cluster import DBSCAN + from sklearn.metrics.cluster import davies_bouldin_score + + # Load generated data + X, _, _, _ = bench.load_data(params, add_dtype=True) + + # Create our clustering object + dbscan = DBSCAN(eps=params.eps, n_jobs=params.n_jobs, + min_samples=params.min_samples, metric='euclidean', + algorithm='auto') + + # N.B. algorithm='auto' will select DAAL's brute force method when running + # daal4py-patched scikit-learn, and probably 'kdtree' when running unpatched + # scikit-learn. + + # Time fit + time, _ = bench.measure_function_time(dbscan.fit, X, params=params) + labels = dbscan.labels_ + + params.n_clusters = len(set(labels)) - (1 if -1 in labels else 0) + acc = davies_bouldin_score(X, labels) + + bench.print_output(library='sklearn', algorithm='dbscan', stages=['training'], + params=params, functions=['DBSCAN'], times=[time], accuracies=[acc], + accuracy_type='davies_bouldin_score', data=[X], + alg_instance=dbscan) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='scikit-learn DBSCAN benchmark') + parser.add_argument('-e', '--eps', '--epsilon', type=float, default=10., + help='Radius of neighborhood of a point') + parser.add_argument('-m', '--min-samples', default=5, type=int, + help='The minimum number of samples required in a ' + 'neighborhood to consider a point a core point') + params = bench.parse_args(parser) + bench.run_with_context(params, main) diff --git a/sklearn_bench/kmeans.py b/sklearn_bench/kmeans.py index a6e1a5f1e..94fc46804 100644 --- a/sklearn_bench/kmeans.py +++ b/sklearn_bench/kmeans.py @@ -20,63 +20,69 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import bench import numpy as np -from sklearn.metrics.cluster import davies_bouldin_score -parser = argparse.ArgumentParser(description='scikit-learn K-means benchmark') -parser.add_argument('-i', '--filei', '--fileI', '--init', - type=str, help='Initial clusters') -parser.add_argument('-t', '--tol', type=float, default=0., - help='Absolute threshold') -parser.add_argument('--maxiter', type=int, default=100, - help='Maximum number of iterations') -parser.add_argument('--n-clusters', type=int, help='Number of clusters') -params = bench.parse_args(parser) +def main(): + from sklearn.cluster import KMeans + from sklearn.metrics.cluster import davies_bouldin_score -from sklearn.cluster import KMeans + global X_init, params -# Load and convert generated data -X_train, X_test, _, _ = bench.load_data(params) + # Load and convert generated data + X_train, X_test, _, _ = bench.load_data(params) -if params.filei == 'k-means++': - X_init = 'k-means++' -# Load initial centroids from specified path -elif params.filei is not None: - X_init = np.load(params.filei).astype(params.dtype) - params.n_clusters = X_init.shape[0] -# or choose random centroids from training data -else: - np.random.seed(params.seed) - centroids_idx = np.random.randint(0, X_train.shape[0], - size=params.n_clusters) - if hasattr(X_train, "iloc"): - X_init = X_train.iloc[centroids_idx].values + if params.filei == 'k-means++': + X_init = 'k-means++' + # Load initial centroids from specified path + elif params.filei is not None: + X_init = np.load(params.filei).astype(params.dtype) + params.n_clusters = X_init.shape[0] + # or choose random centroids from training data else: - X_init = X_train[centroids_idx] + np.random.seed(params.seed) + centroids_idx = np.random.randint(0, X_train.shape[0], + size=params.n_clusters) + if hasattr(X_train, "iloc"): + X_init = X_train.iloc[centroids_idx].values + else: + X_init = X_train[centroids_idx] -def fit_kmeans(X): - global X_init, params - alg = KMeans(n_clusters=params.n_clusters, tol=params.tol, - max_iter=params.maxiter, init=X_init, n_init=1) - alg.fit(X) - return alg + def fit_kmeans(X): + global X_init, params + alg = KMeans(n_clusters=params.n_clusters, tol=params.tol, + max_iter=params.maxiter, init=X_init, n_init=1) + alg.fit(X) + return alg + + # Time fit + fit_time, kmeans = bench.measure_function_time(fit_kmeans, X_train, params=params) -# Time fit -fit_time, kmeans = bench.measure_function_time(fit_kmeans, X_train, params=params) + train_predict = kmeans.predict(X_train) + acc_train = davies_bouldin_score(X_train, train_predict) -train_predict = kmeans.predict(X_train) -acc_train = davies_bouldin_score(X_train, train_predict) + # Time predict + predict_time, test_predict = bench.measure_function_time( + kmeans.predict, X_test, params=params) -# Time predict -predict_time, test_predict = bench.measure_function_time( - kmeans.predict, X_test, params=params) + acc_test = davies_bouldin_score(X_test, test_predict) -acc_test = davies_bouldin_score(X_test, test_predict) + bench.print_output(library='sklearn', algorithm='kmeans', + stages=['training', 'prediction'], + params=params, functions=['KMeans.fit', 'KMeans.predict'], + times=[fit_time, predict_time], accuracy_type='davies_bouldin_score', + accuracies=[acc_train, acc_test], data=[X_train, X_test], + alg_instance=kmeans) -bench.print_output(library='sklearn', algorithm='kmeans', - stages=['training', 'prediction'], - params=params, functions=['KMeans.fit', 'KMeans.predict'], - times=[fit_time, predict_time], accuracy_type='davies_bouldin_score', - accuracies=[acc_train, acc_test], data=[X_train, X_test], - alg_instance=kmeans) +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='scikit-learn K-means benchmark') + parser.add_argument('-i', '--filei', '--fileI', '--init', + type=str, help='Initial clusters') + parser.add_argument('-t', '--tol', type=float, default=0., + help='Absolute threshold') + parser.add_argument('--maxiter', type=int, default=100, + help='Maximum number of iterations') + parser.add_argument('--n-clusters', type=int, help='Number of clusters') + params = bench.parse_args(parser) + bench.run_with_context(params, main) + From 24c24f24831a0787202a651a982a520bc6924b45 Mon Sep 17 00:00:00 2001 From: Vladislav Nazarov Date: Wed, 17 Mar 2021 12:29:53 +0300 Subject: [PATCH 2/7] Add another algs --- configs/skl_with_context_config.json | 98 ++++++++++++++++++++++++- sklearn_bench/kmeans.py | 1 + sklearn_bench/linear.py | 55 +++++++------- sklearn_bench/log_reg.py | 103 ++++++++++++++------------- 4 files changed, 180 insertions(+), 77 deletions(-) diff --git a/configs/skl_with_context_config.json b/configs/skl_with_context_config.json index 0212e155c..ee46eb033 100644 --- a/configs/skl_with_context_config.json +++ b/configs/skl_with_context_config.json @@ -64,6 +64,102 @@ "n-clusters": [20], "maxiter": [50], "tol": [0.0] + }, + { + "algorithm": "dbscan", + "dataset": [ + { + "source": "synthetic", + "type": "blobs", + "n_clusters": 50, + "n_features": 3, + "training": { + "n_samples": 500000 + } + }, + { + "source": "synthetic", + "type": "blobs", + "n_clusters": 50, + "n_features": 10, + "training": { + "n_samples": 500000 + } + }, + { + "source": "synthetic", + "type": "blobs", + "n_clusters": 100, + "n_features": 50, + "training": { + "n_samples": 500000 + } + } + ] + }, + { + "algorithm": "linear", + "dataset": [ + { + "source": "synthetic", + "type": "regression", + "n_features": 20, + "training": { + "n_samples": 10000000 + } + }, + { + "source": "synthetic", + "type": "regression", + "n_features": 100, + "training": { + "n_samples": 2000000 + } + } + ] + }, + { + "algorithm": "log_reg", + "dataset": [ + { + "source": "synthetic", + "type": "classification", + "n_classes": 2, + "n_features": 20, + "training": { + "n_samples": 10000000 + } + }, + { + "source": "synthetic", + "type": "classification", + "n_classes": 2, + "n_features": 100, + "training": { + "n_samples": 2000000 + } + }, + { + "source": "synthetic", + "type": "classification", + "n_classes": 5, + "n_features": 20, + "training": { + "n_samples": 10000000 + } + }, + { + "source": "synthetic", + "type": "classification", + "n_classes": 5, + "n_features": 100, + "training": { + "n_samples": 2000000 + } + } + ], + "maxiter": [100], + "tol": [0] } ] -} \ No newline at end of file +} diff --git a/sklearn_bench/kmeans.py b/sklearn_bench/kmeans.py index 94fc46804..faff8b783 100644 --- a/sklearn_bench/kmeans.py +++ b/sklearn_bench/kmeans.py @@ -83,6 +83,7 @@ def fit_kmeans(X): parser.add_argument('--maxiter', type=int, default=100, help='Maximum number of iterations') parser.add_argument('--n-clusters', type=int, help='Number of clusters') + global X_init, params params = bench.parse_args(parser) bench.run_with_context(params, main) diff --git a/sklearn_bench/linear.py b/sklearn_bench/linear.py index e059d96ef..b3a35dc81 100644 --- a/sklearn_bench/linear.py +++ b/sklearn_bench/linear.py @@ -21,36 +21,39 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import bench -parser = argparse.ArgumentParser(description='scikit-learn linear regression ' - 'benchmark') -parser.add_argument('--no-fit-intercept', dest='fit_intercept', default=True, - action='store_false', - help="Don't fit intercept (assume data already centered)") -params = bench.parse_args(parser) +def main(): + from sklearn.linear_model import LinearRegression -from sklearn.linear_model import LinearRegression + # Load data + X_train, X_test, y_train, y_test = bench.load_data( + params, generated_data=['X_train', 'y_train']) -# Load data -X_train, X_test, y_train, y_test = bench.load_data( - params, generated_data=['X_train', 'y_train']) + # Create our regression object + regr = LinearRegression(fit_intercept=params.fit_intercept, + n_jobs=params.n_jobs, copy_X=False) -# Create our regression object -regr = LinearRegression(fit_intercept=params.fit_intercept, - n_jobs=params.n_jobs, copy_X=False) + # Time fit + fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train, params=params) -# Time fit -fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train, params=params) + # Time predict + predict_time, yp = bench.measure_function_time(regr.predict, X_test, params=params) -# Time predict -predict_time, yp = bench.measure_function_time(regr.predict, X_test, params=params) + test_rmse = bench.rmse_score(yp, y_test) + yp = regr.predict(X_train) + train_rmse = bench.rmse_score(yp, y_train) -test_rmse = bench.rmse_score(yp, y_test) -yp = regr.predict(X_train) -train_rmse = bench.rmse_score(yp, y_train) + bench.print_output(library='sklearn', algorithm='linear_regression', + stages=['training', 'prediction'], + params=params, functions=['Linear.fit', 'Linear.predict'], + times=[fit_time, predict_time], accuracy_type='rmse', + accuracies=[train_rmse, test_rmse], data=[X_train, X_test], + alg_instance=regr) -bench.print_output(library='sklearn', algorithm='linear_regression', - stages=['training', 'prediction'], - params=params, functions=['Linear.fit', 'Linear.predict'], - times=[fit_time, predict_time], accuracy_type='rmse', - accuracies=[train_rmse, test_rmse], data=[X_train, X_test], - alg_instance=regr) +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='scikit-learn linear regression ' + 'benchmark') + parser.add_argument('--no-fit-intercept', dest='fit_intercept', default=True, + action='store_false', + help="Don't fit intercept (assume data already centered)") + params = bench.parse_args(parser) + bench.run_with_context(params, main) diff --git a/sklearn_bench/log_reg.py b/sklearn_bench/log_reg.py index 073fa549f..8673edfbd 100644 --- a/sklearn_bench/log_reg.py +++ b/sklearn_bench/log_reg.py @@ -20,63 +20,66 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import bench import numpy as np -from sklearn.metrics import accuracy_score -parser = argparse.ArgumentParser(description='scikit-learn logistic ' - 'regression benchmark') -parser.add_argument('--no-fit-intercept', dest='fit_intercept', - action='store_false', default=True, - help="Don't fit intercept") -parser.add_argument('--multiclass', default='auto', - choices=('auto', 'ovr', 'multinomial'), - help='How to treat multi class data. ' - '"auto" picks "ovr" for binary classification, and ' - '"multinomial" otherwise.') -parser.add_argument('--solver', default='lbfgs', - choices=('lbfgs', 'newton-cg', 'saga'), - help='Solver to use.') -parser.add_argument('--maxiter', type=int, default=100, - help='Maximum iterations for the iterative solver') -parser.add_argument('-C', dest='C', type=float, default=1.0, - help='Regularization parameter') -parser.add_argument('--tol', type=float, default=None, - help='Tolerance for solver. If solver == "newton-cg", ' - 'then the default is 1e-3. Otherwise, the default ' - 'is 1e-10.') -params = bench.parse_args(parser, loop_types=('fit', 'predict')) +def main(): + from sklearn.linear_model import LogisticRegression + from sklearn.metrics import accuracy_score -from sklearn.linear_model import LogisticRegression + # Load generated data + X_train, X_test, y_train, y_test = bench.load_data(params) -# Load generated data -X_train, X_test, y_train, y_test = bench.load_data(params) + params.n_classes = len(np.unique(y_train)) -params.n_classes = len(np.unique(y_train)) + if params.multiclass == 'auto': + params.multiclass = 'ovr' if params.n_classes == 2 else 'multinomial' -if params.multiclass == 'auto': - params.multiclass = 'ovr' if params.n_classes == 2 else 'multinomial' + if not params.tol: + params.tol = 1e-3 if params.solver == 'newton-cg' else 1e-10 -if not params.tol: - params.tol = 1e-3 if params.solver == 'newton-cg' else 1e-10 + # Create our classifier object + clf = LogisticRegression(penalty='l2', C=params.C, n_jobs=params.n_jobs, + fit_intercept=params.fit_intercept, + verbose=params.verbose, + tol=params.tol, max_iter=params.maxiter, + solver=params.solver, multi_class=params.multiclass) + # Time fit and predict + fit_time, _ = bench.measure_function_time(clf.fit, X_train, y_train, params=params) -# Create our classifier object -clf = LogisticRegression(penalty='l2', C=params.C, n_jobs=params.n_jobs, - fit_intercept=params.fit_intercept, - verbose=params.verbose, - tol=params.tol, max_iter=params.maxiter, - solver=params.solver, multi_class=params.multiclass) -# Time fit and predict -fit_time, _ = bench.measure_function_time(clf.fit, X_train, y_train, params=params) + y_pred = clf.predict(X_train) + train_acc = 100 * accuracy_score(y_pred, y_train) -y_pred = clf.predict(X_train) -train_acc = 100 * accuracy_score(y_pred, y_train) + predict_time, y_pred = bench.measure_function_time( + clf.predict, X_test, params=params) + test_acc = 100 * accuracy_score(y_pred, y_test) -predict_time, y_pred = bench.measure_function_time( - clf.predict, X_test, params=params) -test_acc = 100 * accuracy_score(y_pred, y_test) + bench.print_output(library='sklearn', algorithm='logistic_regression', + stages=['training', 'prediction'], params=params, + functions=['LogReg.fit', 'LogReg.predict'], + times=[fit_time, predict_time], accuracy_type='accuracy[%]', + accuracies=[train_acc, test_acc], data=[X_train, X_test], + alg_instance=clf) -bench.print_output(library='sklearn', algorithm='logistic_regression', - stages=['training', 'prediction'], params=params, - functions=['LogReg.fit', 'LogReg.predict'], - times=[fit_time, predict_time], accuracy_type='accuracy[%]', - accuracies=[train_acc, test_acc], data=[X_train, X_test], - alg_instance=clf) +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='scikit-learn logistic ' + 'regression benchmark') + parser.add_argument('--no-fit-intercept', dest='fit_intercept', + action='store_false', default=True, + help="Don't fit intercept") + parser.add_argument('--multiclass', default='auto', + choices=('auto', 'ovr', 'multinomial'), + help='How to treat multi class data. ' + '"auto" picks "ovr" for binary classification, and ' + '"multinomial" otherwise.') + parser.add_argument('--solver', default='lbfgs', + choices=('lbfgs', 'newton-cg', 'saga'), + help='Solver to use.') + parser.add_argument('--maxiter', type=int, default=100, + help='Maximum iterations for the iterative solver') + parser.add_argument('-C', dest='C', type=float, default=1.0, + help='Regularization parameter') + parser.add_argument('--tol', type=float, default=None, + help='Tolerance for solver. If solver == "newton-cg", ' + 'then the default is 1e-3. Otherwise, the default ' + 'is 1e-10.') + params = bench.parse_args(parser, loop_types=('fit', 'predict')) + bench.run_with_context(params, main) From 3da61ba6054a8f36f45190565babfef7b5927544 Mon Sep 17 00:00:00 2001 From: Vladislav Nazarov Date: Wed, 17 Mar 2021 12:50:41 +0300 Subject: [PATCH 3/7] Fix kmeans warning --- sklearn_bench/kmeans.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/sklearn_bench/kmeans.py b/sklearn_bench/kmeans.py index faff8b783..2368aafb4 100644 --- a/sklearn_bench/kmeans.py +++ b/sklearn_bench/kmeans.py @@ -25,8 +25,6 @@ def main(): from sklearn.cluster import KMeans from sklearn.metrics.cluster import davies_bouldin_score - global X_init, params - # Load and convert generated data X_train, X_test, _, _ = bench.load_data(params) @@ -47,16 +45,14 @@ def main(): X_init = X_train[centroids_idx] - def fit_kmeans(X): - global X_init, params + def fit_kmeans(X, X_init): alg = KMeans(n_clusters=params.n_clusters, tol=params.tol, max_iter=params.maxiter, init=X_init, n_init=1) alg.fit(X) return alg - # Time fit - fit_time, kmeans = bench.measure_function_time(fit_kmeans, X_train, params=params) + fit_time, kmeans = bench.measure_function_time(fit_kmeans, X_train, X_init, params=params) train_predict = kmeans.predict(X_train) acc_train = davies_bouldin_score(X_train, train_predict) @@ -83,7 +79,6 @@ def fit_kmeans(X): parser.add_argument('--maxiter', type=int, default=100, help='Maximum number of iterations') parser.add_argument('--n-clusters', type=int, help='Number of clusters') - global X_init, params params = bench.parse_args(parser) bench.run_with_context(params, main) From f75263e10b45287c0e348ea94f81f73a9351ddfa Mon Sep 17 00:00:00 2001 From: Vladislav Nazarov Date: Wed, 17 Mar 2021 19:20:03 +0300 Subject: [PATCH 4/7] Update config --- configs/skl_with_context_config.json | 110 +++------------------------ 1 file changed, 11 insertions(+), 99 deletions(-) diff --git a/configs/skl_with_context_config.json b/configs/skl_with_context_config.json index ee46eb033..22e9b2b04 100644 --- a/configs/skl_with_context_config.json +++ b/configs/skl_with_context_config.json @@ -13,57 +13,14 @@ { "source": "synthetic", "type": "blobs", - "n_clusters": 1000, - "n_features": 20, - "training": { - "n_samples": 1000000 - } - } - ], - "time-method": ["box_filter"], - "time-limit": [50], - "n-clusters": [1000], - "maxiter": [50], - "tol": [0.0] - }, - { - "algorithm": "kmeans", - "dataset": [ - { - "source": "synthetic", - "type": "blobs", - "n_clusters": 5, + "n_clusters": 10, "n_features": 50, "training": { - "n_samples": 10000000 - } - } - ], - "time-method": ["box_filter"], - "time-limit": [50], - "n-clusters": [5], - "maxiter": [50], - "init": ["k-means++"], - "tol": [0.0] - }, - { - "algorithm": "kmeans", - "dataset": [ - { - "source": "synthetic", - "type": "blobs", - "n_clusters": 20, - "n_features": 50, - "training": { - "n_samples": 3000000 + "n_samples": 1000000 } } ], - "time-method": ["box_filter"], - "time-limit": [50], - "n-clusters": [20], - "maxiter": [50], - "tol": [0.0] + "n-clusters": [10] }, { "algorithm": "dbscan", @@ -71,28 +28,10 @@ { "source": "synthetic", "type": "blobs", - "n_clusters": 50, - "n_features": 3, - "training": { - "n_samples": 500000 - } - }, - { - "source": "synthetic", - "type": "blobs", - "n_clusters": 50, - "n_features": 10, - "training": { - "n_samples": 500000 - } - }, - { - "source": "synthetic", - "type": "blobs", - "n_clusters": 100, + "n_clusters": 10, "n_features": 50, "training": { - "n_samples": 500000 + "n_samples": 10000 } } ] @@ -103,49 +42,24 @@ { "source": "synthetic", "type": "regression", - "n_features": 20, - "training": { - "n_samples": 10000000 - } - }, - { - "source": "synthetic", - "type": "regression", - "n_features": 100, + "n_features": 50, "training": { - "n_samples": 2000000 + "n_samples": 1000000 } } ] }, { "algorithm": "log_reg", + "solver":["lbfgs", "newton-cg"], "dataset": [ - { - "source": "synthetic", - "type": "classification", - "n_classes": 2, - "n_features": 20, - "training": { - "n_samples": 10000000 - } - }, { "source": "synthetic", "type": "classification", "n_classes": 2, "n_features": 100, "training": { - "n_samples": 2000000 - } - }, - { - "source": "synthetic", - "type": "classification", - "n_classes": 5, - "n_features": 20, - "training": { - "n_samples": 10000000 + "n_samples": 100000 } }, { @@ -154,12 +68,10 @@ "n_classes": 5, "n_features": 100, "training": { - "n_samples": 2000000 + "n_samples": 100000 } } - ], - "maxiter": [100], - "tol": [0] + ] } ] } From 9653d96e0c56807aa265af303ec29d2fd79eed1f Mon Sep 17 00:00:00 2001 From: Vladislav Nazarov Date: Fri, 19 Mar 2021 01:24:38 +0300 Subject: [PATCH 5/7] Apply comments and fix pep --- bench.py | 15 ++++++++++----- runner.py | 3 --- sklearn_bench/dbscan.py | 14 ++++++++------ sklearn_bench/kmeans.py | 22 ++++++++++++---------- sklearn_bench/linear.py | 14 ++++++++------ sklearn_bench/log_reg.py | 30 ++++++++++++++++-------------- 6 files changed, 54 insertions(+), 44 deletions(-) diff --git a/bench.py b/bench.py index ae45e7c62..53a8e7bef 100644 --- a/bench.py +++ b/bench.py @@ -176,10 +176,10 @@ def parse_args(parser, size=None, loop_types=(), parser.add_argument('--no-intel-optimized', default=False, action='store_true', help='Use no intel optimized version. ' 'Now avalible for scikit-learn benchmarks') - parser.add_argument('--device', default=None, type=str, - choices=("host", "cpu", "gpu"), + parser.add_argument('--device', default='host', type=str, + choices=('host', 'cpu', 'gpu'), help='Execution context device') - + for data in ['X', 'y']: for stage in ['train', 'test']: parser.add_argument(f'--file-{data}-{stage}', @@ -201,8 +201,12 @@ def parse_args(parser, size=None, loop_types=(), except ImportError: print('Failed to import daal4py.sklearn.patch_sklearn.' 'Use stock version scikit-learn', file=sys.stderr) + params.device = 'host' else: - params.device = None + if params.device != 'host': + print('Device context not supported without intel optimized version', + file=sys.stderr) + params.device = 'host' # disable finiteness check (default) if not params.check_finiteness: @@ -499,8 +503,9 @@ def print_output(library, algorithm, stages, params, functions, output.append(result) print(json.dumps(output, indent=4)) + def run_with_context(params, function): - if params.device is not None: + if params.device != 'host': from daal4py.oneapi import sycl_context with sycl_context(params.device): function() diff --git a/runner.py b/runner.py index 63c5ed95c..df9dde749 100755 --- a/runner.py +++ b/runner.py @@ -70,9 +70,6 @@ def generate_cases(params): parser.add_argument('--report', default=False, action='store_true', help='Create an Excel report based on benchmarks results. ' 'Need "openpyxl" library') - parser.add_argument('--device', default=None, type=str, - choices=("host", "cpu", "gpu"), - help='Execution context device') args = parser.parse_args() env = os.environ.copy() diff --git a/sklearn_bench/dbscan.py b/sklearn_bench/dbscan.py index 5731243c3..d0d3e0edd 100644 --- a/sklearn_bench/dbscan.py +++ b/sklearn_bench/dbscan.py @@ -20,6 +20,7 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import bench + def main(): from sklearn.cluster import DBSCAN from sklearn.metrics.cluster import davies_bouldin_score @@ -32,9 +33,9 @@ def main(): min_samples=params.min_samples, metric='euclidean', algorithm='auto') - # N.B. algorithm='auto' will select DAAL's brute force method when running - # daal4py-patched scikit-learn, and probably 'kdtree' when running unpatched - # scikit-learn. + # N.B. algorithm='auto' will select oneAPI Data Analytics Library (oneDAL) + # brute force method when running daal4py-patched scikit-learn, and probably + # 'kdtree' when running unpatched scikit-learn. # Time fit time, _ = bench.measure_function_time(dbscan.fit, X, params=params) @@ -44,9 +45,10 @@ def main(): acc = davies_bouldin_score(X, labels) bench.print_output(library='sklearn', algorithm='dbscan', stages=['training'], - params=params, functions=['DBSCAN'], times=[time], accuracies=[acc], - accuracy_type='davies_bouldin_score', data=[X], - alg_instance=dbscan) + params=params, functions=['DBSCAN'], times=[time], + accuracies=[acc], accuracy_type='davies_bouldin_score', + data=[X], alg_instance=dbscan) + if __name__ == "__main__": parser = argparse.ArgumentParser(description='scikit-learn DBSCAN benchmark') diff --git a/sklearn_bench/kmeans.py b/sklearn_bench/kmeans.py index 2368aafb4..1cf03ff0f 100644 --- a/sklearn_bench/kmeans.py +++ b/sklearn_bench/kmeans.py @@ -21,6 +21,7 @@ import bench import numpy as np + def main(): from sklearn.cluster import KMeans from sklearn.metrics.cluster import davies_bouldin_score @@ -38,21 +39,21 @@ def main(): else: np.random.seed(params.seed) centroids_idx = np.random.randint(0, X_train.shape[0], - size=params.n_clusters) + size=params.n_clusters) if hasattr(X_train, "iloc"): X_init = X_train.iloc[centroids_idx].values else: X_init = X_train[centroids_idx] - def fit_kmeans(X, X_init): alg = KMeans(n_clusters=params.n_clusters, tol=params.tol, - max_iter=params.maxiter, init=X_init, n_init=1) + max_iter=params.maxiter, init=X_init, n_init=1) alg.fit(X) return alg # Time fit - fit_time, kmeans = bench.measure_function_time(fit_kmeans, X_train, X_init, params=params) + fit_time, kmeans = bench.measure_function_time(fit_kmeans, X_train, + X_init, params=params) train_predict = kmeans.predict(X_train) acc_train = davies_bouldin_score(X_train, train_predict) @@ -64,11 +65,13 @@ def fit_kmeans(X, X_init): acc_test = davies_bouldin_score(X_test, test_predict) bench.print_output(library='sklearn', algorithm='kmeans', - stages=['training', 'prediction'], - params=params, functions=['KMeans.fit', 'KMeans.predict'], - times=[fit_time, predict_time], accuracy_type='davies_bouldin_score', - accuracies=[acc_train, acc_test], data=[X_train, X_test], - alg_instance=kmeans) + stages=['training', 'prediction'], + params=params, functions=['KMeans.fit', 'KMeans.predict'], + times=[fit_time, predict_time], + accuracy_type='davies_bouldin_score', + accuracies=[acc_train, acc_test], data=[X_train, X_test], + alg_instance=kmeans) + if __name__ == "__main__": parser = argparse.ArgumentParser(description='scikit-learn K-means benchmark') @@ -81,4 +84,3 @@ def fit_kmeans(X, X_init): parser.add_argument('--n-clusters', type=int, help='Number of clusters') params = bench.parse_args(parser) bench.run_with_context(params, main) - diff --git a/sklearn_bench/linear.py b/sklearn_bench/linear.py index b3a35dc81..4ddf2007f 100644 --- a/sklearn_bench/linear.py +++ b/sklearn_bench/linear.py @@ -21,6 +21,7 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import bench + def main(): from sklearn.linear_model import LinearRegression @@ -43,15 +44,16 @@ def main(): train_rmse = bench.rmse_score(yp, y_train) bench.print_output(library='sklearn', algorithm='linear_regression', - stages=['training', 'prediction'], - params=params, functions=['Linear.fit', 'Linear.predict'], - times=[fit_time, predict_time], accuracy_type='rmse', - accuracies=[train_rmse, test_rmse], data=[X_train, X_test], - alg_instance=regr) + stages=['training', 'prediction'], + params=params, functions=['Linear.fit', 'Linear.predict'], + times=[fit_time, predict_time], accuracy_type='rmse', + accuracies=[train_rmse, test_rmse], data=[X_train, X_test], + alg_instance=regr) + if __name__ == "__main__": parser = argparse.ArgumentParser(description='scikit-learn linear regression ' - 'benchmark') + 'benchmark') parser.add_argument('--no-fit-intercept', dest='fit_intercept', default=True, action='store_false', help="Don't fit intercept (assume data already centered)") diff --git a/sklearn_bench/log_reg.py b/sklearn_bench/log_reg.py index 8673edfbd..bcf455b56 100644 --- a/sklearn_bench/log_reg.py +++ b/sklearn_bench/log_reg.py @@ -21,6 +21,7 @@ import bench import numpy as np + def main(): from sklearn.linear_model import LogisticRegression from sklearn.metrics import accuracy_score @@ -38,10 +39,10 @@ def main(): # Create our classifier object clf = LogisticRegression(penalty='l2', C=params.C, n_jobs=params.n_jobs, - fit_intercept=params.fit_intercept, - verbose=params.verbose, - tol=params.tol, max_iter=params.maxiter, - solver=params.solver, multi_class=params.multiclass) + fit_intercept=params.fit_intercept, + verbose=params.verbose, + tol=params.tol, max_iter=params.maxiter, + solver=params.solver, multi_class=params.multiclass) # Time fit and predict fit_time, _ = bench.measure_function_time(clf.fit, X_train, y_train, params=params) @@ -53,23 +54,24 @@ def main(): test_acc = 100 * accuracy_score(y_pred, y_test) bench.print_output(library='sklearn', algorithm='logistic_regression', - stages=['training', 'prediction'], params=params, - functions=['LogReg.fit', 'LogReg.predict'], - times=[fit_time, predict_time], accuracy_type='accuracy[%]', - accuracies=[train_acc, test_acc], data=[X_train, X_test], - alg_instance=clf) + stages=['training', 'prediction'], params=params, + functions=['LogReg.fit', 'LogReg.predict'], + times=[fit_time, predict_time], accuracy_type='accuracy[%]', + accuracies=[train_acc, test_acc], data=[X_train, X_test], + alg_instance=clf) + if __name__ == "__main__": parser = argparse.ArgumentParser(description='scikit-learn logistic ' - 'regression benchmark') + 'regression benchmark') parser.add_argument('--no-fit-intercept', dest='fit_intercept', action='store_false', default=True, help="Don't fit intercept") parser.add_argument('--multiclass', default='auto', choices=('auto', 'ovr', 'multinomial'), help='How to treat multi class data. ' - '"auto" picks "ovr" for binary classification, and ' - '"multinomial" otherwise.') + '"auto" picks "ovr" for binary classification, and ' + '"multinomial" otherwise.') parser.add_argument('--solver', default='lbfgs', choices=('lbfgs', 'newton-cg', 'saga'), help='Solver to use.') @@ -79,7 +81,7 @@ def main(): help='Regularization parameter') parser.add_argument('--tol', type=float, default=None, help='Tolerance for solver. If solver == "newton-cg", ' - 'then the default is 1e-3. Otherwise, the default ' - 'is 1e-10.') + 'then the default is 1e-3. Otherwise, the default ' + 'is 1e-10.') params = bench.parse_args(parser, loop_types=('fit', 'predict')) bench.run_with_context(params, main) From 25b09b471ffb26d47312236dc2c88878e4d29321 Mon Sep 17 00:00:00 2001 From: Vladislav Nazarov Date: Tue, 23 Mar 2021 02:16:27 +0300 Subject: [PATCH 6/7] Fix device args --- bench.py | 18 ++++++++++-------- configs/skl_with_context_config.json | 2 +- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/bench.py b/bench.py index 53a8e7bef..a9723672e 100644 --- a/bench.py +++ b/bench.py @@ -175,9 +175,9 @@ def parse_args(parser, size=None, loop_types=(), help='Dataset name') parser.add_argument('--no-intel-optimized', default=False, action='store_true', help='Use no intel optimized version. ' - 'Now avalible for scikit-learn benchmarks') - parser.add_argument('--device', default='host', type=str, - choices=('host', 'cpu', 'gpu'), + 'Now avalible for scikit-learn benchmarks'), + parser.add_argument('--device', default='None', type=str, + choices=('host', 'cpu', 'gpu', 'None'), help='Execution context device') for data in ['X', 'y']: @@ -201,12 +201,14 @@ def parse_args(parser, size=None, loop_types=(), except ImportError: print('Failed to import daal4py.sklearn.patch_sklearn.' 'Use stock version scikit-learn', file=sys.stderr) - params.device = 'host' + params.device = 'None' else: - if params.device != 'host': - print('Device context not supported without intel optimized version', + if params.device != 'None': + print(f'Device context is not supported for stock scikit-learn.' + 'Please use --no-intel-optimized=False with' + '--device={params.device} parameter. Fallback to --device=None.', file=sys.stderr) - params.device = 'host' + params.device = 'None' # disable finiteness check (default) if not params.check_finiteness: @@ -505,7 +507,7 @@ def print_output(library, algorithm, stages, params, functions, def run_with_context(params, function): - if params.device != 'host': + if params.device != 'None': from daal4py.oneapi import sycl_context with sycl_context(params.device): function() diff --git a/configs/skl_with_context_config.json b/configs/skl_with_context_config.json index 22e9b2b04..023850c38 100644 --- a/configs/skl_with_context_config.json +++ b/configs/skl_with_context_config.json @@ -4,7 +4,7 @@ "data-format": ["pandas"], "data-order": ["F"], "dtype": ["float64"], - "device": ["host", "cpu", "gpu"] + "device": ["host", "cpu", "gpu", "None"] }, "cases": [ { From 3f9c35dff0fa515fc2b9536d5a854cb600a43d05 Mon Sep 17 00:00:00 2001 From: Vladislav Nazarov Date: Tue, 23 Mar 2021 22:45:45 +0300 Subject: [PATCH 7/7] Fix pep8 --- bench.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bench.py b/bench.py index a9723672e..620818e97 100644 --- a/bench.py +++ b/bench.py @@ -204,9 +204,9 @@ def parse_args(parser, size=None, loop_types=(), params.device = 'None' else: if params.device != 'None': - print(f'Device context is not supported for stock scikit-learn.' + print('Device context is not supported for stock scikit-learn.' 'Please use --no-intel-optimized=False with' - '--device={params.device} parameter. Fallback to --device=None.', + f'--device={params.device} parameter. Fallback to --device=None.', file=sys.stderr) params.device = 'None'