Commit b847226

Author: vlad-nazarov

Add device context parameter (#57)

* Initial implementation
* Add more algorithms
* Fix kmeans warning
* Update config
* Apply review comments and fix PEP 8
* Fix device args
* Fix PEP 8

1 parent 50d4d6e commit b847226

File tree: 6 files changed, +274 / -157 lines

bench.py

Lines changed: 21 additions & 0 deletions
@@ -176,6 +176,10 @@ def parse_args(parser, size=None, loop_types=(),
     parser.add_argument('--no-intel-optimized', default=False, action='store_true',
                         help='Use no intel optimized version. '
                              'Now available for scikit-learn benchmarks'),
+    parser.add_argument('--device', default='None', type=str,
+                        choices=('host', 'cpu', 'gpu', 'None'),
+                        help='Execution context device')
+
     for data in ['X', 'y']:
         for stage in ['train', 'test']:
             parser.add_argument(f'--file-{data}-{stage}',
@@ -197,6 +201,14 @@ def parse_args(parser, size=None, loop_types=(),
         except ImportError:
             print('Failed to import daal4py.sklearn.patch_sklearn. '
                   'Using stock version of scikit-learn', file=sys.stderr)
+            params.device = 'None'
+    else:
+        if params.device != 'None':
+            print('Device context is not supported for stock scikit-learn. '
+                  'Please use --no-intel-optimized=False with the '
+                  f'--device={params.device} parameter. Falling back to --device=None.',
+                  file=sys.stderr)
+            params.device = 'None'
 
     # disable finiteness check (default)
     if not params.check_finiteness:
@@ -492,3 +504,12 @@ def print_output(library, algorithm, stages, params, functions,
             del result['algorithm_parameters']['handle']
         output.append(result)
     print(json.dumps(output, indent=4))
+
+
+def run_with_context(params, function):
+    if params.device != 'None':
+        from daal4py.oneapi import sycl_context
+        with sycl_context(params.device):
+            function()
+    else:
+        function()
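
Taken together, the new --device flag and the run_with_context helper let each benchmark run its whole body inside an optional SYCL device context. A minimal sketch of the intended call pattern (the main name here is a placeholder; each benchmark script changed below defines its own):

import argparse
import bench

def main():
    # benchmark body: load data, fit the estimator, print results
    pass

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='example benchmark')
    params = bench.parse_args(parser)
    # Executes main() inside daal4py's sycl_context when --device is
    # 'host', 'cpu' or 'gpu', and directly when --device is 'None',
    # e.g.: python sklearn_bench/kmeans.py --device gpu
    bench.run_with_context(params, main)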

configs/skl_with_context_config.json

Lines changed: 77 additions & 0 deletions
@@ -0,0 +1,77 @@
+{
+    "common": {
+        "lib": ["sklearn"],
+        "data-format": ["pandas"],
+        "data-order": ["F"],
+        "dtype": ["float64"],
+        "device": ["host", "cpu", "gpu", "None"]
+    },
+    "cases": [
+        {
+            "algorithm": "kmeans",
+            "dataset": [
+                {
+                    "source": "synthetic",
+                    "type": "blobs",
+                    "n_clusters": 10,
+                    "n_features": 50,
+                    "training": {
+                        "n_samples": 1000000
+                    }
+                }
+            ],
+            "n-clusters": [10]
+        },
+        {
+            "algorithm": "dbscan",
+            "dataset": [
+                {
+                    "source": "synthetic",
+                    "type": "blobs",
+                    "n_clusters": 10,
+                    "n_features": 50,
+                    "training": {
+                        "n_samples": 10000
+                    }
+                }
+            ]
+        },
+        {
+            "algorithm": "linear",
+            "dataset": [
+                {
+                    "source": "synthetic",
+                    "type": "regression",
+                    "n_features": 50,
+                    "training": {
+                        "n_samples": 1000000
+                    }
+                }
+            ]
+        },
+        {
+            "algorithm": "log_reg",
+            "solver": ["lbfgs", "newton-cg"],
+            "dataset": [
+                {
+                    "source": "synthetic",
+                    "type": "classification",
+                    "n_classes": 2,
+                    "n_features": 100,
+                    "training": {
+                        "n_samples": 100000
+                    }
+                },
+                {
+                    "source": "synthetic",
+                    "type": "classification",
+                    "n_classes": 5,
+                    "n_features": 100,
+                    "training": {
+                        "n_samples": 100000
+                    }
+                }
+            ]
+        }
+    ]
+}
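
The "device" list under "common" acts as one more axis of the benchmark grid: every case is expected to run once per listed device. A rough sketch of that expansion, assuming the runner takes the Cartesian product of list-valued parameters (the loop below is illustrative only, not the runner's actual internals):

import itertools
import json

with open('configs/skl_with_context_config.json') as f:
    config = json.load(f)

for case in config['cases']:
    grid = {**config['common'], 'algorithm': [case['algorithm']]}
    keys = list(grid)
    for combo in itertools.product(*(grid[k] for k in keys)):
        run = dict(zip(keys, combo))
        # each combination maps to one benchmark invocation, e.g.
        # python sklearn_bench/kmeans.py --device gpu ...
        print(run['algorithm'], '--device', run['device'])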

sklearn_bench/dbscan.py

Lines changed: 32 additions & 27 deletions
@@ -19,38 +19,43 @@
 import os
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 import bench
-from sklearn.metrics.cluster import davies_bouldin_score
 
-parser = argparse.ArgumentParser(description='scikit-learn DBSCAN benchmark')
-parser.add_argument('-e', '--eps', '--epsilon', type=float, default=10.,
-                    help='Radius of neighborhood of a point')
-parser.add_argument('-m', '--min-samples', default=5, type=int,
-                    help='The minimum number of samples required in a '
-                         'neighborhood to consider a point a core point')
-params = bench.parse_args(parser)
 
-from sklearn.cluster import DBSCAN
+def main():
+    from sklearn.cluster import DBSCAN
+    from sklearn.metrics.cluster import davies_bouldin_score
 
-# Load generated data
-X, _, _, _ = bench.load_data(params, add_dtype=True)
+    # Load generated data
+    X, _, _, _ = bench.load_data(params, add_dtype=True)
 
-# Create our clustering object
-dbscan = DBSCAN(eps=params.eps, n_jobs=params.n_jobs,
-                min_samples=params.min_samples, metric='euclidean',
-                algorithm='auto')
+    # Create our clustering object
+    dbscan = DBSCAN(eps=params.eps, n_jobs=params.n_jobs,
+                    min_samples=params.min_samples, metric='euclidean',
+                    algorithm='auto')
 
-# N.B. algorithm='auto' will select DAAL's brute force method when running
-# daal4py-patched scikit-learn, and probably 'kdtree' when running unpatched
-# scikit-learn.
+    # N.B. algorithm='auto' will select oneAPI Data Analytics Library (oneDAL)
+    # brute force method when running daal4py-patched scikit-learn, and probably
+    # 'kdtree' when running unpatched scikit-learn.
 
-# Time fit
-time, _ = bench.measure_function_time(dbscan.fit, X, params=params)
-labels = dbscan.labels_
+    # Time fit
+    time, _ = bench.measure_function_time(dbscan.fit, X, params=params)
+    labels = dbscan.labels_
 
-params.n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
-acc = davies_bouldin_score(X, labels)
+    params.n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
+    acc = davies_bouldin_score(X, labels)
 
-bench.print_output(library='sklearn', algorithm='dbscan', stages=['training'],
-                   params=params, functions=['DBSCAN'], times=[time], accuracies=[acc],
-                   accuracy_type='davies_bouldin_score', data=[X],
-                   alg_instance=dbscan)
+    bench.print_output(library='sklearn', algorithm='dbscan', stages=['training'],
+                       params=params, functions=['DBSCAN'], times=[time],
+                       accuracies=[acc], accuracy_type='davies_bouldin_score',
+                       data=[X], alg_instance=dbscan)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='scikit-learn DBSCAN benchmark')
+    parser.add_argument('-e', '--eps', '--epsilon', type=float, default=10.,
+                        help='Radius of neighborhood of a point')
+    parser.add_argument('-m', '--min-samples', default=5, type=int,
+                        help='The minimum number of samples required in a '
+                             'neighborhood to consider a point a core point')
+    params = bench.parse_args(parser)
+    bench.run_with_context(params, main)

sklearn_bench/kmeans.py

Lines changed: 51 additions & 47 deletions
@@ -20,63 +20,67 @@
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 import bench
 import numpy as np
-from sklearn.metrics.cluster import davies_bouldin_score
 
-parser = argparse.ArgumentParser(description='scikit-learn K-means benchmark')
-parser.add_argument('-i', '--filei', '--fileI', '--init',
-                    type=str, help='Initial clusters')
-parser.add_argument('-t', '--tol', type=float, default=0.,
-                    help='Absolute threshold')
-parser.add_argument('--maxiter', type=int, default=100,
-                    help='Maximum number of iterations')
-parser.add_argument('--n-clusters', type=int, help='Number of clusters')
-params = bench.parse_args(parser)
 
-from sklearn.cluster import KMeans
+def main():
+    from sklearn.cluster import KMeans
+    from sklearn.metrics.cluster import davies_bouldin_score
 
-# Load and convert generated data
-X_train, X_test, _, _ = bench.load_data(params)
+    # Load and convert generated data
+    X_train, X_test, _, _ = bench.load_data(params)
 
-if params.filei == 'k-means++':
-    X_init = 'k-means++'
-# Load initial centroids from specified path
-elif params.filei is not None:
-    X_init = np.load(params.filei).astype(params.dtype)
-    params.n_clusters = X_init.shape[0]
-# or choose random centroids from training data
-else:
-    np.random.seed(params.seed)
-    centroids_idx = np.random.randint(0, X_train.shape[0],
-                                      size=params.n_clusters)
-    if hasattr(X_train, "iloc"):
-        X_init = X_train.iloc[centroids_idx].values
+    if params.filei == 'k-means++':
+        X_init = 'k-means++'
+    # Load initial centroids from specified path
+    elif params.filei is not None:
+        X_init = np.load(params.filei).astype(params.dtype)
+        params.n_clusters = X_init.shape[0]
+    # or choose random centroids from training data
     else:
-        X_init = X_train[centroids_idx]
+        np.random.seed(params.seed)
+        centroids_idx = np.random.randint(0, X_train.shape[0],
+                                          size=params.n_clusters)
+        if hasattr(X_train, "iloc"):
+            X_init = X_train.iloc[centroids_idx].values
+        else:
+            X_init = X_train[centroids_idx]
 
+    def fit_kmeans(X, X_init):
+        alg = KMeans(n_clusters=params.n_clusters, tol=params.tol,
+                     max_iter=params.maxiter, init=X_init, n_init=1)
+        alg.fit(X)
+        return alg
 
-def fit_kmeans(X):
-    global X_init, params
-    alg = KMeans(n_clusters=params.n_clusters, tol=params.tol,
-                 max_iter=params.maxiter, init=X_init, n_init=1)
-    alg.fit(X)
-    return alg
+    # Time fit
+    fit_time, kmeans = bench.measure_function_time(fit_kmeans, X_train,
+                                                   X_init, params=params)
 
+    train_predict = kmeans.predict(X_train)
+    acc_train = davies_bouldin_score(X_train, train_predict)
 
-# Time fit
-fit_time, kmeans = bench.measure_function_time(fit_kmeans, X_train, params=params)
+    # Time predict
+    predict_time, test_predict = bench.measure_function_time(
+        kmeans.predict, X_test, params=params)
 
-train_predict = kmeans.predict(X_train)
-acc_train = davies_bouldin_score(X_train, train_predict)
+    acc_test = davies_bouldin_score(X_test, test_predict)
 
-# Time predict
-predict_time, test_predict = bench.measure_function_time(
-    kmeans.predict, X_test, params=params)
+    bench.print_output(library='sklearn', algorithm='kmeans',
+                       stages=['training', 'prediction'],
+                       params=params, functions=['KMeans.fit', 'KMeans.predict'],
+                       times=[fit_time, predict_time],
+                       accuracy_type='davies_bouldin_score',
+                       accuracies=[acc_train, acc_test], data=[X_train, X_test],
+                       alg_instance=kmeans)
 
-acc_test = davies_bouldin_score(X_test, test_predict)
 
-bench.print_output(library='sklearn', algorithm='kmeans',
-                   stages=['training', 'prediction'],
-                   params=params, functions=['KMeans.fit', 'KMeans.predict'],
-                   times=[fit_time, predict_time], accuracy_type='davies_bouldin_score',
-                   accuracies=[acc_train, acc_test], data=[X_train, X_test],
-                   alg_instance=kmeans)
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='scikit-learn K-means benchmark')
+    parser.add_argument('-i', '--filei', '--fileI', '--init',
+                        type=str, help='Initial clusters')
+    parser.add_argument('-t', '--tol', type=float, default=0.,
+                        help='Absolute threshold')
+    parser.add_argument('--maxiter', type=int, default=100,
+                        help='Maximum number of iterations')
+    parser.add_argument('--n-clusters', type=int, help='Number of clusters')
+    params = bench.parse_args(parser)
+    bench.run_with_context(params, main)
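
One detail worth noting in this refactor: fit_kmeans previously reached for global X_init, which breaks once the body moves inside main(); the new version instead threads X_init through bench.measure_function_time, which forwards positional arguments to the measured function. A simplified stand-in for that forwarding contract (the real helper in bench.py is more elaborate, e.g. it supports repeated runs):

import timeit

def measure_function_time(func, *args, params=None):
    # time a single call and return (elapsed_seconds, result),
    # forwarding *args to the measured function
    t0 = timeit.default_timer()
    result = func(*args)
    return timeit.default_timer() - t0, result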

sklearn_bench/linear.py

Lines changed: 38 additions & 33 deletions
@@ -21,36 +21,41 @@
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 import bench
 
-parser = argparse.ArgumentParser(description='scikit-learn linear regression '
-                                             'benchmark')
-parser.add_argument('--no-fit-intercept', dest='fit_intercept', default=True,
-                    action='store_false',
-                    help="Don't fit intercept (assume data already centered)")
-params = bench.parse_args(parser)
-
-from sklearn.linear_model import LinearRegression
-
-# Load data
-X_train, X_test, y_train, y_test = bench.load_data(
-    params, generated_data=['X_train', 'y_train'])
-
-# Create our regression object
-regr = LinearRegression(fit_intercept=params.fit_intercept,
-                        n_jobs=params.n_jobs, copy_X=False)
-
-# Time fit
-fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train, params=params)
-
-# Time predict
-predict_time, yp = bench.measure_function_time(regr.predict, X_test, params=params)
-
-test_rmse = bench.rmse_score(yp, y_test)
-yp = regr.predict(X_train)
-train_rmse = bench.rmse_score(yp, y_train)
-
-bench.print_output(library='sklearn', algorithm='linear_regression',
-                   stages=['training', 'prediction'],
-                   params=params, functions=['Linear.fit', 'Linear.predict'],
-                   times=[fit_time, predict_time], accuracy_type='rmse',
-                   accuracies=[train_rmse, test_rmse], data=[X_train, X_test],
-                   alg_instance=regr)
+
+def main():
+    from sklearn.linear_model import LinearRegression
+
+    # Load data
+    X_train, X_test, y_train, y_test = bench.load_data(
+        params, generated_data=['X_train', 'y_train'])
+
+    # Create our regression object
+    regr = LinearRegression(fit_intercept=params.fit_intercept,
+                            n_jobs=params.n_jobs, copy_X=False)
+
+    # Time fit
+    fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train, params=params)
+
+    # Time predict
+    predict_time, yp = bench.measure_function_time(regr.predict, X_test, params=params)
+
+    test_rmse = bench.rmse_score(yp, y_test)
+    yp = regr.predict(X_train)
+    train_rmse = bench.rmse_score(yp, y_train)
+
+    bench.print_output(library='sklearn', algorithm='linear_regression',
+                       stages=['training', 'prediction'],
+                       params=params, functions=['Linear.fit', 'Linear.predict'],
+                       times=[fit_time, predict_time], accuracy_type='rmse',
+                       accuracies=[train_rmse, test_rmse], data=[X_train, X_test],
+                       alg_instance=regr)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='scikit-learn linear regression '
+                                                 'benchmark')
+    parser.add_argument('--no-fit-intercept', dest='fit_intercept', default=True,
+                        action='store_false',
+                        help="Don't fit intercept (assume data already centered)")
+    params = bench.parse_args(parser)
+    bench.run_with_context(params, main)
