From 62f87c381468bc99979da1b6f7d75354294e781c Mon Sep 17 00:00:00 2001 From: Igor Rukhovich Date: Mon, 22 Mar 2021 20:22:33 +0300 Subject: [PATCH 01/31] Applied mypy + flake8 for all files --- bench.py | 4 ++-- cuml_bench/dbscan.py | 15 ++++++++------- cuml_bench/df_clsf.py | 13 +++++++------ cuml_bench/df_regr.py | 13 +++++++------ cuml_bench/elasticnet.py | 13 +++++++------ cuml_bench/kmeans.py | 22 ++++++++++++---------- cuml_bench/knn_clsf.py | 14 ++++++++------ cuml_bench/lasso.py | 13 +++++++------ cuml_bench/linear.py | 13 +++++++------ cuml_bench/log_reg.py | 13 +++++++------ cuml_bench/pca.py | 13 +++++++------ cuml_bench/ridge.py | 13 +++++++------ cuml_bench/svm.py | 13 +++++++------ cuml_bench/train_test_split.py | 13 +++++++------ daal4py_bench/dbscan.py | 15 ++++++++------- daal4py_bench/df_clsf.py | 18 +++++++++--------- daal4py_bench/df_regr.py | 14 +++++++------- daal4py_bench/distances.py | 14 +++++++------- daal4py_bench/kmeans.py | 17 +++++++++-------- daal4py_bench/linear.py | 15 ++++++++------- daal4py_bench/pca.py | 16 ++++++++-------- daal4py_bench/ridge.py | 15 ++++++++------- datasets/load_datasets.py | 16 ++++++---------- datasets/loader.py | 11 ++++------- datasets/make_datasets.py | 7 +++---- modelbuilders_bench/lgbm_mb.py | 22 +++++++++++----------- modelbuilders_bench/utils.py | 4 ++-- modelbuilders_bench/xgb_mb.py | 18 +++++++++--------- report_generator/report_generator.py | 20 +++++++++++--------- runner.py | 26 +++++++++++++++++--------- sklearn_bench/dbscan.py | 13 +++++++------ sklearn_bench/df_clsf.py | 17 +++++++++-------- sklearn_bench/df_regr.py | 13 +++++++------ sklearn_bench/distances.py | 13 +++++++------ sklearn_bench/elasticnet.py | 11 ++++++----- sklearn_bench/kmeans.py | 18 ++++++++++-------- sklearn_bench/knn_clsf.py | 16 +++++++++------- sklearn_bench/lasso.py | 11 ++++++----- sklearn_bench/linear.py | 12 ++++++------ sklearn_bench/log_reg.py | 15 ++++++++------- sklearn_bench/pca.py | 11 ++++++----- sklearn_bench/ridge.py | 11 ++++++----- sklearn_bench/svm.py | 16 ++++++++-------- sklearn_bench/train_test_split.py | 17 +++++++++-------- utils.py | 14 +++++++------- xgboost_bench/gbt.py | 15 +++++++-------- 46 files changed, 345 insertions(+), 311 deletions(-) diff --git a/bench.py b/bench.py index e33998603..d919c56ad 100644 --- a/bench.py +++ b/bench.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== import argparse import numpy as np diff --git a/cuml_bench/dbscan.py b/cuml_bench/dbscan.py index 07122ed07..64c1cf5bc 100644 --- a/cuml_bench/dbscan.py +++ b/cuml_bench/dbscan.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,16 +12,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -import sys -import os import argparse -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -import bench - from cuml import DBSCAN +import os from sklearn.metrics.cluster import davies_bouldin_score +import sys + +import bench +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + parser = argparse.ArgumentParser(description='cuML DBSCAN benchmark') parser.add_argument('-e', '--eps', '--epsilon', type=float, default=10., diff --git a/cuml_bench/df_clsf.py b/cuml_bench/df_clsf.py index cf7878df8..c8d857faa 100755 --- a/cuml_bench/df_clsf.py +++ b/cuml_bench/df_clsf.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,15 +12,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -import sys -import os import argparse -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -import bench import cuml from cuml.ensemble import RandomForestClassifier +import os +import sys + +import bench +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser(description='cuml random forest ' 'classification benchmark') diff --git a/cuml_bench/df_regr.py b/cuml_bench/df_regr.py index f01f31220..eb7329fbf 100644 --- a/cuml_bench/df_regr.py +++ b/cuml_bench/df_regr.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,15 +12,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -import sys -import os import argparse -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from cuml.ensemble import RandomForestRegressor +import os +import sys + import bench +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from cuml.ensemble import RandomForestRegressor parser = argparse.ArgumentParser(description='cuml random forest ' 'regression benchmark') diff --git a/cuml_bench/elasticnet.py b/cuml_bench/elasticnet.py index 10fea7855..8755a51b7 100755 --- a/cuml_bench/elasticnet.py +++ b/cuml_bench/elasticnet.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,15 +12,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -import sys -import os import argparse -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from cuml.linear_model import ElasticNet +import os +import sys + import bench +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from cuml.linear_model import ElasticNet parser = argparse.ArgumentParser(description='scikit-learn elastic-net regression ' 'benchmark') diff --git a/cuml_bench/kmeans.py b/cuml_bench/kmeans.py index 07951860d..6c647477f 100644 --- a/cuml_bench/kmeans.py +++ b/cuml_bench/kmeans.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,18 +12,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -import sys -import os import argparse -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -import bench - -import numpy as np from cuml import KMeans -import warnings +import numpy as np +import os from sklearn.metrics.cluster import davies_bouldin_score +import sys +import warnings + +import bench +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + warnings.filterwarnings('ignore', category=FutureWarning) parser = argparse.ArgumentParser(description='cuML K-means benchmark') @@ -46,7 +47,8 @@ # Load initial centroids from specified path elif params.filei is not None: X_init = np.load(params.filei).astype(params.dtype) - params.n_clusters = X_init.shape[0] + if isinstance(X_init, np.ndarray): + params.n_clusters = X_init.shape[0] # or choose random centroids from training data else: np.random.seed(params.seed) diff --git a/cuml_bench/knn_clsf.py b/cuml_bench/knn_clsf.py index c59e00537..aa0fb5a8f 100755 --- a/cuml_bench/knn_clsf.py +++ b/cuml_bench/knn_clsf.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,14 +12,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -import sys -import os import argparse -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -import bench from cuml.neighbors import KNeighborsClassifier +import os +import sys + +import bench +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser( description='cuML kNN classifier benchmark') @@ -77,4 +78,5 @@ stages=['training', 'search'], params=params, functions=['knn_clsf.fit', 'knn_clsf.kneighbors'], times=[train_time, predict_time], + accuracies=[], accuracy_type=None, data=[X_train, X_test], alg_instance=knn_clsf) diff --git a/cuml_bench/lasso.py b/cuml_bench/lasso.py index 4cbd7c602..f8b827763 100755 --- a/cuml_bench/lasso.py +++ b/cuml_bench/lasso.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,15 +12,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -import sys -import os import argparse -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from cuml.linear_model import Lasso +import os +import sys + import bench +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from cuml.linear_model import Lasso parser = argparse.ArgumentParser(description='scikit-learn lasso regression ' 'benchmark') diff --git a/cuml_bench/linear.py b/cuml_bench/linear.py index 17160ce1e..72f92b4d6 100644 --- a/cuml_bench/linear.py +++ b/cuml_bench/linear.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,14 +12,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -import sys -import os import argparse -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -import bench from cuml import LinearRegression +import os +import sys + +import bench +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser(description='cuML linear regression ' 'benchmark') diff --git a/cuml_bench/log_reg.py b/cuml_bench/log_reg.py index 288362fdc..5077a1a8e 100644 --- a/cuml_bench/log_reg.py +++ b/cuml_bench/log_reg.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,14 +12,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -import sys -import os import argparse -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -import bench from cuml import LogisticRegression +import os +import sys + +import bench +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser(description='cuML logistic ' 'regression benchmark') diff --git a/cuml_bench/pca.py b/cuml_bench/pca.py index 93c03b13c..b8f08f739 100644 --- a/cuml_bench/pca.py +++ b/cuml_bench/pca.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,15 +12,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -import sys -import os import argparse -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from cuml import PCA +import os +import sys + import bench +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from cuml import PCA parser = argparse.ArgumentParser(description='cuML PCA benchmark') parser.add_argument('--svd-solver', type=str, default='full', diff --git a/cuml_bench/ridge.py b/cuml_bench/ridge.py index e2e901be7..7586f9358 100644 --- a/cuml_bench/ridge.py +++ b/cuml_bench/ridge.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,14 +12,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -import sys -import os import argparse -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -import bench from cuml import Ridge +import os +import sys + +import bench +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser(description='cuML ridge regression ' 'benchmark') diff --git a/cuml_bench/svm.py b/cuml_bench/svm.py index 76794cee4..802fb27b5 100644 --- a/cuml_bench/svm.py +++ b/cuml_bench/svm.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,14 +12,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -import sys -import os import argparse -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -import bench from cuml.svm import SVC +import os +import sys + +import bench +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser(description='cuML SVM benchmark') diff --git a/cuml_bench/train_test_split.py b/cuml_bench/train_test_split.py index 6001635f2..94f652527 100644 --- a/cuml_bench/train_test_split.py +++ b/cuml_bench/train_test_split.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,14 +12,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -import sys -import os import argparse -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -import bench from cuml import train_test_split +import os +import sys + +import bench +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser( description='cuml train_test_split benchmark') diff --git a/daal4py_bench/dbscan.py b/daal4py_bench/dbscan.py index b70786770..2dfacd8ff 100644 --- a/daal4py_bench/dbscan.py +++ b/daal4py_bench/dbscan.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,16 +12,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -import sys -import os import argparse -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -import bench - from daal4py import dbscan from daal4py.sklearn._utils import getFPType +import os +import sys + +import bench +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + parser = argparse.ArgumentParser(description='daal4py DBSCAN clustering ' 'benchmark') diff --git a/daal4py_bench/df_clsf.py b/daal4py_bench/df_clsf.py index 26c4c1e5d..a5f6b55b7 100644 --- a/daal4py_bench/df_clsf.py +++ b/daal4py_bench/df_clsf.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,21 +12,21 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -import sys -import os import argparse -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -import bench - -import numpy as np -from sklearn.metrics import accuracy_score from daal4py import ( decision_forest_classification_training, decision_forest_classification_prediction, engines_mt2203 ) from daal4py.sklearn._utils import getFPType +import numpy as np +import os +from sklearn.metrics import accuracy_score +import sys + +import bench +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) def df_clsf_fit(X, y, n_classes, n_trees=100, seed=12345, diff --git a/daal4py_bench/df_regr.py b/daal4py_bench/df_regr.py index a1dfe0550..dbb71895b 100644 --- a/daal4py_bench/df_regr.py +++ b/daal4py_bench/df_regr.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,20 +12,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -import sys -import os import argparse -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -import bench - from daal4py import ( decision_forest_regression_training, decision_forest_regression_prediction, engines_mt2203 ) from daal4py.sklearn._utils import getFPType +import os +import sys + +import bench +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) def df_regr_fit(X, y, n_trees=100, seed=12345, n_features_per_node=0, diff --git a/daal4py_bench/distances.py b/daal4py_bench/distances.py index 84f26abd6..ad243fd25 100644 --- a/daal4py_bench/distances.py +++ b/daal4py_bench/distances.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,16 +12,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -import sys -import os import argparse -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -import bench - import daal4py from daal4py.sklearn._utils import getFPType +import os +import sys + +import bench +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) def compute_distances(pairwise_distances, X): diff --git a/daal4py_bench/kmeans.py b/daal4py_bench/kmeans.py index 1f4fba729..3fd883a8e 100644 --- a/daal4py_bench/kmeans.py +++ b/daal4py_bench/kmeans.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,17 +12,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -import sys -import os import argparse -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -import bench - -import numpy as np from daal4py import kmeans from daal4py.sklearn._utils import getFPType +import numpy as np +import os +import sys + +import bench +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + parser = argparse.ArgumentParser(description='daal4py K-Means clustering ' 'benchmark') diff --git a/daal4py_bench/linear.py b/daal4py_bench/linear.py index ac9458946..7fb7f8244 100644 --- a/daal4py_bench/linear.py +++ b/daal4py_bench/linear.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,16 +12,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -import sys -import os import argparse -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -import bench - from daal4py import linear_regression_training, linear_regression_prediction from daal4py.sklearn._utils import getFPType +import os +import sys + +import bench +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + parser = argparse.ArgumentParser(description='daal4py linear regression ' 'benchmark') diff --git a/daal4py_bench/pca.py b/daal4py_bench/pca.py index c15d745fb..0c00b5d71 100644 --- a/daal4py_bench/pca.py +++ b/daal4py_bench/pca.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,18 +12,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -import sys -import os import argparse -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -import bench -import numpy as np from daal4py import pca, pca_transform, normalization_zscore from daal4py.sklearn._utils import getFPType - +import numpy as np +import os from sklearn.utils.extmath import svd_flip +import sys + +import bench +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser(description='daal4py PCA benchmark') diff --git a/daal4py_bench/ridge.py b/daal4py_bench/ridge.py index 06217613a..8758ec30f 100644 --- a/daal4py_bench/ridge.py +++ b/daal4py_bench/ridge.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,16 +12,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -import sys -import os import argparse -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -import bench -import numpy as np from daal4py import ridge_regression_training, ridge_regression_prediction from daal4py.sklearn._utils import getFPType +import numpy as np +import os +import sys + +import bench +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser(description='daal4py ridge regression ' diff --git a/datasets/load_datasets.py b/datasets/load_datasets.py index 4a920dc60..85e6cb7bf 100755 --- a/datasets/load_datasets.py +++ b/datasets/load_datasets.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,12 +12,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -import os -import sys import argparse import logging +import os +import sys from .loader import (a9a, gisette, ijcnn, skin_segmentation, klaverjas, connect, mnist, sensit, @@ -41,7 +41,7 @@ def try_load_dataset(dataset_name, output_directory): if dataset_name in dataset_loaders.keys(): try: return dataset_loaders[dataset_name](output_directory) - except: + except BaseException: logging.warning("Internal error loading dataset") return False else: @@ -70,10 +70,6 @@ def try_load_dataset(dataset_name, output_directory): if args.datasets is not None: for val in dataset_loaders.values(): val(root_dir) - elif len(args.datasets) == 0: + else: logging.warning( 'Warning: Enumerate dataset(s) which should be downloaded') - else: - for key, val in dataset_loaders.items(): - if key in args.datasets: - val(root_dir) diff --git a/datasets/loader.py b/datasets/loader.py index f3cd83b68..22b077089 100755 --- a/datasets/loader.py +++ b/datasets/loader.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,16 +12,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -import os import logging - -import pandas as pd import numpy as np - +import os +import pandas as pd from urllib.request import urlretrieve - from sklearn.datasets import fetch_openml from sklearn.model_selection import train_test_split diff --git a/datasets/make_datasets.py b/datasets/make_datasets.py index 4e1c19dda..4b3629dd1 100644 --- a/datasets/make_datasets.py +++ b/datasets/make_datasets.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,14 +12,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== import argparse -import sys - import numpy as np from sklearn.datasets import make_classification, make_regression, make_blobs from sklearn.utils import check_random_state +import sys def gen_blobs(args): diff --git a/modelbuilders_bench/lgbm_mb.py b/modelbuilders_bench/lgbm_mb.py index e6b260ed3..9ffe3992b 100644 --- a/modelbuilders_bench/lgbm_mb.py +++ b/modelbuilders_bench/lgbm_mb.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,19 +12,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -import sys -import os import argparse -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -import bench -import utils - import daal4py import lightgbm as lgbm import numpy as np -from os import environ +import os +import sys + +import bench +import utils +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + parser = argparse.ArgumentParser( description='lightgbm gbt + model transform + daal predict benchmark') @@ -89,8 +89,8 @@ if params.threads != -1: lgbm_params.update({'nthread': params.threads}) -if 'OMP_NUM_THREADS' in environ.keys(): - lgbm_params['nthread'] = int(environ['OMP_NUM_THREADS']) +if 'OMP_NUM_THREADS' in os.environ.keys(): + lgbm_params['nthread'] = int(os.environ['OMP_NUM_THREADS']) if params.objective.startswith('reg'): task = 'regression' diff --git a/modelbuilders_bench/utils.py b/modelbuilders_bench/utils.py index be7125e1d..84da884e2 100644 --- a/modelbuilders_bench/utils.py +++ b/modelbuilders_bench/utils.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== import json import numpy as np diff --git a/modelbuilders_bench/xgb_mb.py b/modelbuilders_bench/xgb_mb.py index 16530989d..8bd1ccf3f 100644 --- a/modelbuilders_bench/xgb_mb.py +++ b/modelbuilders_bench/xgb_mb.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,18 +12,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -import sys -import os import argparse -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -import bench import daal4py import numpy as np -from os import environ +import os import xgboost as xgb +import sys + +import bench import utils +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser( description='xgboost gbt + model transform + daal predict benchmark') @@ -107,8 +107,8 @@ if params.threads != -1: xgb_params.update({'nthread': params.threads}) -if 'OMP_NUM_THREADS' in environ.keys(): - xgb_params['nthread'] = int(environ['OMP_NUM_THREADS']) +if 'OMP_NUM_THREADS' in os.environ.keys(): + xgb_params['nthread'] = int(os.environ['OMP_NUM_THREADS']) if params.objective.startswith('reg'): task = 'regression' diff --git a/report_generator/report_generator.py b/report_generator/report_generator.py index 8c42e35d1..9db76d34f 100755 --- a/report_generator/report_generator.py +++ b/report_generator/report_generator.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,14 +12,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -import openpyxl import argparse -import json +import datetime import hashlib +import json +import openpyxl from string import ascii_uppercase -import datetime +from typing import List, Any def get_property(entry, prop): @@ -88,7 +89,7 @@ def create_list(res_entry, props_list): for prop in props_list: try: val = get_property(res_entry, prop) - except: + except BaseException: val = '' line.append(val) return line @@ -147,7 +148,7 @@ def create_list(res_entry, props_list): while {} in all_res_entries: all_res_entries.remove({}) -diff_combinations = [] +diff_combinations: List[Any] = [] for i, res_entry in enumerate(all_res_entries): already_exist = False for diff_comb in diff_combinations: @@ -158,7 +159,7 @@ def create_list(res_entry, props_list): diff_comb = res_entry.copy() diff_combinations.append(diff_comb) -align_combinations = [] +align_combinations: List[Any] = [] for i, res_entry in enumerate(all_res_entries): already_exist = False for align_comb in align_combinations: @@ -205,7 +206,8 @@ def create_list(res_entry, props_list): for i, res_entry in enumerate(all_res_entries): if res_entry['stage'] not in stages_splitter[stage_key]: continue - x, y = None, None + x: int + y: int for j, align_comb in enumerate(stage_align_combinations): if result_entries_are_comparable(res_entry, align_comb, gen_config): y = j diff --git a/runner.py b/runner.py index df9dde749..1a79ff1ba 100755 --- a/runner.py +++ b/runner.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ 
-12,20 +12,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== import argparse -import os -import sys import json -import socket import logging +import os import pathlib +import socket +import sys import datasets.make_datasets as make_datasets -import utils - from datasets.load_datasets import try_load_dataset +import utils def generate_cases(params): @@ -50,7 +49,6 @@ def generate_cases(params): if __name__ == '__main__': - parser = argparse.ArgumentParser() parser.add_argument('--configs', metavar='ConfigPath', type=str, default='configs/config_example.json', @@ -139,7 +137,17 @@ def generate_cases(params): elif dataset['source'] == 'synthetic': class GenerationArgs: - pass + classes: int + clusters: int + features: int + filex: str + filextest: str + filey: str + fileytest: str + samples: int + seed: int + test_samples: int + type: str gen_args = GenerationArgs() paths = '' diff --git a/sklearn_bench/dbscan.py b/sklearn_bench/dbscan.py index 542da102f..76464cffd 100644 --- a/sklearn_bench/dbscan.py +++ b/sklearn_bench/dbscan.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,14 +12,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== import argparse -import sys import os -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -import bench +from sklearn.cluster import DBSCAN from sklearn.metrics.cluster import davies_bouldin_score +import sys + +import bench +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser(description='scikit-learn DBSCAN benchmark') parser.add_argument('-e', '--eps', '--epsilon', type=float, default=10., @@ -29,7 +31,6 @@ 'neighborhood to consider a point a core point') params = bench.parse_args(parser) -from sklearn.cluster import DBSCAN # Load generated data X, _, _, _ = bench.load_data(params, add_dtype=True) diff --git a/sklearn_bench/df_clsf.py b/sklearn_bench/df_clsf.py index 454fd4f4a..07aa3424e 100644 --- a/sklearn_bench/df_clsf.py +++ b/sklearn_bench/df_clsf.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,16 +12,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -import sys -import os import argparse -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -import bench - import numpy as np +import os +from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import accuracy_score +import sys + +import bench +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + parser = argparse.ArgumentParser(description='scikit-learn random forest ' 'classification benchmark') @@ -46,7 +48,6 @@ params = bench.parse_args(parser) -from sklearn.ensemble import RandomForestClassifier # Load and convert data X_train, X_test, y_train, y_test = bench.load_data(params) diff --git a/sklearn_bench/df_regr.py b/sklearn_bench/df_regr.py index 73fc82738..15ccdb617 100644 --- a/sklearn_bench/df_regr.py +++ b/sklearn_bench/df_regr.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,13 +12,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -import sys -import os import argparse -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import os +from sklearn.ensemble import RandomForestRegressor +import sys + import bench +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser(description='scikit-learn random forest ' 'regression benchmark') @@ -44,7 +46,6 @@ params = bench.parse_args(parser) -from sklearn.ensemble import RandomForestRegressor # Load and convert data X_train, X_test, y_train, y_test = bench.load_data(params) diff --git a/sklearn_bench/distances.py b/sklearn_bench/distances.py index b73d49255..0330a4ef5 100644 --- a/sklearn_bench/distances.py +++ b/sklearn_bench/distances.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,13 +12,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -import sys -import os import argparse -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import os +from sklearn.metrics.pairwise import pairwise_distances +import sys + import bench +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser(description='scikit-learn pairwise distances ' 'benchmark') @@ -27,7 +29,6 @@ help='Metric to test for pairwise distances') params = bench.parse_args(parser) -from sklearn.metrics.pairwise import pairwise_distances # Load data X, _, _, _ = bench.load_data(params, generated_data=['X_train'], add_dtype=True) diff --git a/sklearn_bench/elasticnet.py b/sklearn_bench/elasticnet.py index f0ac87973..e23af0ec6 100755 --- a/sklearn_bench/elasticnet.py +++ b/sklearn_bench/elasticnet.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,13 +12,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== import argparse -import sys import os -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from sklearn.linear_model import ElasticNet +import sys + import bench +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser(description='scikit-learn elastic-net regression ' 'benchmark') @@ -35,7 +37,6 @@ help='Tolerance for solver.') params = bench.parse_args(parser) -from sklearn.linear_model import ElasticNet # Load data X_train, X_test, y_train, y_test = bench.load_data(params) diff --git a/sklearn_bench/kmeans.py b/sklearn_bench/kmeans.py index a6e1a5f1e..a5ee27419 100644 --- a/sklearn_bench/kmeans.py +++ b/sklearn_bench/kmeans.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,15 +12,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== import argparse -import sys -import os -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -import bench import numpy as np +import os +from sklearn.cluster import KMeans from sklearn.metrics.cluster import davies_bouldin_score +import sys + +import bench +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser(description='scikit-learn K-means benchmark') parser.add_argument('-i', '--filei', '--fileI', '--init', @@ -32,7 +34,6 @@ parser.add_argument('--n-clusters', type=int, help='Number of clusters') params = bench.parse_args(parser) -from sklearn.cluster import KMeans # Load and convert generated data X_train, X_test, _, _ = bench.load_data(params) @@ -42,7 +43,8 @@ # Load initial centroids from specified path elif params.filei is not None: X_init = np.load(params.filei).astype(params.dtype) - params.n_clusters = X_init.shape[0] + if isinstance(X_init, np.ndarray): + params.n_clusters = X_init.shape[0] # or choose random centroids from training data else: np.random.seed(params.seed) diff --git a/sklearn_bench/knn_clsf.py b/sklearn_bench/knn_clsf.py index f30983a0b..d144e911b 100755 --- a/sklearn_bench/knn_clsf.py +++ b/sklearn_bench/knn_clsf.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,15 +12,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== import argparse -import sys -import os -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -import bench import numpy as np +import os from sklearn.metrics import accuracy_score +from sklearn.neighbors import KNeighborsClassifier +import sys + +import bench +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser( description='scikit-learn kNN classifier benchmark') @@ -39,7 +41,6 @@ help='Distance metric to use') params = bench.parse_args(parser) -from sklearn.neighbors import KNeighborsClassifier # Load generated data X_train, X_test, y_train, y_test = bench.load_data(params) @@ -81,4 +82,5 @@ stages=['training', 'search'], params=params, functions=['knn_clsf.fit', 'knn_clsf.kneighbors'], times=[train_time, predict_time], + accuracies=[], accuracy_type=None, data=[X_train, X_test], alg_instance=knn_clsf) diff --git a/sklearn_bench/lasso.py b/sklearn_bench/lasso.py index 7b2792909..71c689050 100755 --- a/sklearn_bench/lasso.py +++ b/sklearn_bench/lasso.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,13 +12,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== import argparse -import sys import os -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from sklearn.linear_model import Lasso +import sys + import bench +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser(description='scikit-learn lasso regression ' 'benchmark') @@ -33,7 +35,6 @@ help='Tolerance for solver.') params = bench.parse_args(parser) -from sklearn.linear_model import Lasso # Load data X_train, X_test, y_train, y_test = bench.load_data(params) diff --git a/sklearn_bench/linear.py b/sklearn_bench/linear.py index e059d96ef..7d8b1c7a1 100644 --- a/sklearn_bench/linear.py +++ b/sklearn_bench/linear.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,14 +12,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== import argparse - -import sys import os -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from sklearn.linear_model import LinearRegression +import sys + import bench +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser(description='scikit-learn linear regression ' 'benchmark') @@ -28,7 +29,6 @@ help="Don't fit intercept (assume data already centered)") params = bench.parse_args(parser) -from sklearn.linear_model import LinearRegression # Load data X_train, X_test, y_train, y_test = bench.load_data( diff --git a/sklearn_bench/log_reg.py b/sklearn_bench/log_reg.py index 073fa549f..9d1cc9031 100644 --- a/sklearn_bench/log_reg.py +++ b/sklearn_bench/log_reg.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,15 +12,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== import argparse -import sys -import os -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -import bench import numpy as np +import os +from sklearn.linear_model import LogisticRegression from sklearn.metrics import accuracy_score +import sys + +import bench +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser(description='scikit-learn logistic ' 'regression benchmark') @@ -45,7 +47,6 @@ 'is 1e-10.') params = bench.parse_args(parser, loop_types=('fit', 'predict')) -from sklearn.linear_model import LogisticRegression # Load generated data X_train, X_test, y_train, y_test = bench.load_data(params) diff --git a/sklearn_bench/pca.py b/sklearn_bench/pca.py index 6d015e3d9..d181d94da 100644 --- a/sklearn_bench/pca.py +++ b/sklearn_bench/pca.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,13 +12,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== import argparse -import sys import os -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from sklearn.decomposition import PCA +import sys + import bench +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser(description='scikit-learn PCA benchmark') parser.add_argument('--svd-solver', type=str, choices=['full'], @@ -29,7 +31,6 @@ help='Perform whitening') params = bench.parse_args(parser) -from sklearn.decomposition import PCA # Load random data X_train, X_test, _, _ = bench.load_data(params, generated_data=['X_train']) diff --git a/sklearn_bench/ridge.py b/sklearn_bench/ridge.py index dce3b7c53..7e250a7b5 100644 --- a/sklearn_bench/ridge.py +++ b/sklearn_bench/ridge.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,13 +12,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== import argparse -import sys import os -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from sklearn.linear_model import Ridge +import sys + import bench +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser(description='scikit-learn ridge regression ' 'benchmark') @@ -31,7 +33,6 @@ help='Regularization strength') params = bench.parse_args(parser) -from sklearn.linear_model import Ridge # Load data X_train, X_test, y_train, y_test = bench.load_data(params, diff --git a/sklearn_bench/svm.py b/sklearn_bench/svm.py index a8672cefb..c476ab618 100644 --- a/sklearn_bench/svm.py +++ b/sklearn_bench/svm.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,16 +12,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== import argparse - -import sys -import os -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -import bench import numpy as np +import os from sklearn.metrics import accuracy_score +from sklearn.svm import SVC +import sys + +import bench +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser(description='scikit-learn SVM benchmark') @@ -42,7 +43,6 @@ dest='shrinking', help="Don't use shrinking heuristic") params = bench.parse_args(parser, loop_types=('fit', 'predict')) -from sklearn.svm import SVC # Load data X_train, X_test, y_train, y_test = bench.load_data(params) diff --git a/sklearn_bench/train_test_split.py b/sklearn_bench/train_test_split.py index 4f61d7bac..ce36c5f83 100644 --- a/sklearn_bench/train_test_split.py +++ b/sklearn_bench/train_test_split.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,14 +12,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== import argparse - -import sys import os -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from sklearn.model_selection import train_test_split +import sys +from typing import Iterable + import bench +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser( description='scikit-learn train_test_split benchmark') @@ -39,11 +41,11 @@ '(only for IDP scikit-learn)') params = bench.parse_args(parser) -from sklearn.model_selection import train_test_split # Load generated data X, y, _, _ = bench.load_data(params) +data_args: Iterable if params.include_y: data_args = (X, y) else: @@ -59,8 +61,7 @@ if params.rng is not None: tts_params['rng'] = params.rng -time, _ = bench.measure_function_time(train_test_split, *data_args, **tts_params, - params=params) +time, _ = bench.measure_function_time(train_test_split, *data_args, params=params, **tts_params) bench.print_output(library='sklearn', algorithm='train_test_split', stages=['training'], params=params, diff --git a/utils.py b/utils.py index a3ad7f7e8..04ee52d84 100755 --- a/utils.py +++ b/utils.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,15 +12,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -import os -import sys -import subprocess -import multiprocessing -import logging import json +import logging +import multiprocessing +import os import platform +import subprocess +import sys def filter_stderr(text): diff --git a/xgboost_bench/gbt.py b/xgboost_bench/gbt.py index f82bec2bd..390778f1d 100644 --- a/xgboost_bench/gbt.py +++ b/xgboost_bench/gbt.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,17 +12,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -import sys -import os import argparse -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -import bench - import numpy as np -import xgboost as xgb import os +import sys +import xgboost as xgb + +import bench +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) def convert_probs_to_classes(y_prob): From 132d73f948977bd7708b091f4fd84cbaf1a9c167 Mon Sep 17 00:00:00 2001 From: Igor Rukhovich Date: Mon, 22 Mar 2021 20:47:14 +0300 Subject: [PATCH 02/31] Sorted imports with ISort --- bench.py | 7 ++++--- cuml_bench/dbscan.py | 5 +++-- cuml_bench/df_clsf.py | 5 +++-- cuml_bench/df_regr.py | 3 ++- cuml_bench/elasticnet.py | 3 ++- cuml_bench/kmeans.py | 7 ++++--- cuml_bench/knn_clsf.py | 3 ++- cuml_bench/lasso.py | 3 ++- cuml_bench/linear.py | 3 ++- cuml_bench/log_reg.py | 3 ++- cuml_bench/pca.py | 3 ++- cuml_bench/ridge.py | 3 ++- cuml_bench/svm.py | 3 ++- cuml_bench/train_test_split.py | 3 ++- daal4py_bench/dbscan.py | 5 +++-- daal4py_bench/df_clsf.py | 13 ++++++------- daal4py_bench/df_regr.py | 10 ++++------ daal4py_bench/distances.py | 5 +++-- daal4py_bench/kmeans.py | 7 ++++--- daal4py_bench/linear.py | 5 +++-- daal4py_bench/pca.py | 9 +++++---- daal4py_bench/ridge.py | 7 ++++--- datasets/load_datasets.py | 5 ++--- datasets/loader.py | 5 +++-- modelbuilders_bench/lgbm_mb.py | 8 +++++--- modelbuilders_bench/utils.py | 1 + modelbuilders_bench/xgb_mb.py | 8 +++++--- report_generator/report_generator.py | 5 +++-- runner.py | 2 +- sklearn_bench/dbscan.py | 5 +++-- sklearn_bench/df_clsf.py | 7 ++++--- sklearn_bench/df_regr.py | 3 ++- sklearn_bench/distances.py | 3 ++- sklearn_bench/elasticnet.py | 3 ++- sklearn_bench/kmeans.py | 7 ++++--- sklearn_bench/knn_clsf.py | 7 ++++--- sklearn_bench/lasso.py | 3 ++- sklearn_bench/linear.py | 3 ++- sklearn_bench/log_reg.py | 7 ++++--- sklearn_bench/pca.py | 3 ++- sklearn_bench/ridge.py | 3 ++- sklearn_bench/svm.py | 7 ++++--- sklearn_bench/train_test_split.py | 6 ++++-- xgboost_bench/gbt.py | 5 +++-- 44 files changed, 130 insertions(+), 91 deletions(-) diff --git a/bench.py b/bench.py index d919c56ad..527cac394 100644 --- a/bench.py +++ b/bench.py @@ -15,11 +15,12 @@ # =============================================================================== import argparse -import numpy as np -import sklearn -import timeit import json import sys +import timeit + +import numpy as np +import sklearn def get_dtype(data): diff --git a/cuml_bench/dbscan.py b/cuml_bench/dbscan.py index 64c1cf5bc..bbfd6c04c 100644 --- a/cuml_bench/dbscan.py +++ b/cuml_bench/dbscan.py @@ -15,12 +15,13 @@ # =============================================================================== import argparse -from cuml import DBSCAN import os -from sklearn.metrics.cluster import davies_bouldin_score import sys import bench +from cuml import DBSCAN +from sklearn.metrics.cluster import davies_bouldin_score + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) diff --git a/cuml_bench/df_clsf.py b/cuml_bench/df_clsf.py index c8d857faa..f6b4842ab 100755 --- a/cuml_bench/df_clsf.py +++ b/cuml_bench/df_clsf.py @@ -15,12 +15,13 @@ # =============================================================================== import argparse -import cuml -from cuml.ensemble import RandomForestClassifier import os import sys import bench +import cuml +from cuml.ensemble import 
RandomForestClassifier + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser(description='cuml random forest ' diff --git a/cuml_bench/df_regr.py b/cuml_bench/df_regr.py index eb7329fbf..36bf29885 100644 --- a/cuml_bench/df_regr.py +++ b/cuml_bench/df_regr.py @@ -15,11 +15,12 @@ # =============================================================================== import argparse -from cuml.ensemble import RandomForestRegressor import os import sys import bench +from cuml.ensemble import RandomForestRegressor + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) diff --git a/cuml_bench/elasticnet.py b/cuml_bench/elasticnet.py index 8755a51b7..246c4280e 100755 --- a/cuml_bench/elasticnet.py +++ b/cuml_bench/elasticnet.py @@ -15,11 +15,12 @@ # =============================================================================== import argparse -from cuml.linear_model import ElasticNet import os import sys import bench +from cuml.linear_model import ElasticNet + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) diff --git a/cuml_bench/kmeans.py b/cuml_bench/kmeans.py index 6c647477f..9479ea37d 100644 --- a/cuml_bench/kmeans.py +++ b/cuml_bench/kmeans.py @@ -15,14 +15,15 @@ # =============================================================================== import argparse -from cuml import KMeans -import numpy as np import os -from sklearn.metrics.cluster import davies_bouldin_score import sys import warnings import bench +import numpy as np +from cuml import KMeans +from sklearn.metrics.cluster import davies_bouldin_score + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) diff --git a/cuml_bench/knn_clsf.py b/cuml_bench/knn_clsf.py index aa0fb5a8f..01e92e9b0 100755 --- a/cuml_bench/knn_clsf.py +++ b/cuml_bench/knn_clsf.py @@ -15,11 +15,12 @@ # =============================================================================== import argparse -from cuml.neighbors import KNeighborsClassifier import os import sys import bench +from cuml.neighbors import KNeighborsClassifier + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser( diff --git a/cuml_bench/lasso.py b/cuml_bench/lasso.py index f8b827763..bbdf44507 100755 --- a/cuml_bench/lasso.py +++ b/cuml_bench/lasso.py @@ -15,11 +15,12 @@ # =============================================================================== import argparse -from cuml.linear_model import Lasso import os import sys import bench +from cuml.linear_model import Lasso + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) diff --git a/cuml_bench/linear.py b/cuml_bench/linear.py index 72f92b4d6..ad981111b 100644 --- a/cuml_bench/linear.py +++ b/cuml_bench/linear.py @@ -15,11 +15,12 @@ # =============================================================================== import argparse -from cuml import LinearRegression import os import sys import bench +from cuml import LinearRegression + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser(description='cuML linear regression ' diff --git a/cuml_bench/log_reg.py b/cuml_bench/log_reg.py index 5077a1a8e..5f0f24a09 100644 --- a/cuml_bench/log_reg.py +++ b/cuml_bench/log_reg.py @@ -15,11 +15,12 @@ # =============================================================================== import argparse -from cuml import LogisticRegression import os import sys import bench +from cuml 
import LogisticRegression + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser(description='cuML logistic ' diff --git a/cuml_bench/pca.py b/cuml_bench/pca.py index b8f08f739..dcdef1a53 100644 --- a/cuml_bench/pca.py +++ b/cuml_bench/pca.py @@ -15,11 +15,12 @@ # =============================================================================== import argparse -from cuml import PCA import os import sys import bench +from cuml import PCA + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) diff --git a/cuml_bench/ridge.py b/cuml_bench/ridge.py index 7586f9358..14b1d586d 100644 --- a/cuml_bench/ridge.py +++ b/cuml_bench/ridge.py @@ -15,11 +15,12 @@ # =============================================================================== import argparse -from cuml import Ridge import os import sys import bench +from cuml import Ridge + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser(description='cuML ridge regression ' diff --git a/cuml_bench/svm.py b/cuml_bench/svm.py index 802fb27b5..e6cbcebc0 100644 --- a/cuml_bench/svm.py +++ b/cuml_bench/svm.py @@ -15,11 +15,12 @@ # =============================================================================== import argparse -from cuml.svm import SVC import os import sys import bench +from cuml.svm import SVC + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser(description='cuML SVM benchmark') diff --git a/cuml_bench/train_test_split.py b/cuml_bench/train_test_split.py index 94f652527..45c7470e9 100644 --- a/cuml_bench/train_test_split.py +++ b/cuml_bench/train_test_split.py @@ -15,11 +15,12 @@ # =============================================================================== import argparse -from cuml import train_test_split import os import sys import bench +from cuml import train_test_split + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser( diff --git a/daal4py_bench/dbscan.py b/daal4py_bench/dbscan.py index 2dfacd8ff..b41bf21ba 100644 --- a/daal4py_bench/dbscan.py +++ b/daal4py_bench/dbscan.py @@ -15,12 +15,13 @@ # =============================================================================== import argparse -from daal4py import dbscan -from daal4py.sklearn._utils import getFPType import os import sys import bench +from daal4py import dbscan +from daal4py.sklearn._utils import getFPType + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) diff --git a/daal4py_bench/df_clsf.py b/daal4py_bench/df_clsf.py index a5f6b55b7..55c171504 100644 --- a/daal4py_bench/df_clsf.py +++ b/daal4py_bench/df_clsf.py @@ -15,17 +15,16 @@ # =============================================================================== import argparse -from daal4py import ( - decision_forest_classification_training, - decision_forest_classification_prediction, engines_mt2203 -) -from daal4py.sklearn._utils import getFPType -import numpy as np import os -from sklearn.metrics import accuracy_score import sys import bench +import numpy as np +from daal4py import (decision_forest_classification_prediction, + decision_forest_classification_training, engines_mt2203) +from daal4py.sklearn._utils import getFPType +from sklearn.metrics import accuracy_score + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) diff --git a/daal4py_bench/df_regr.py b/daal4py_bench/df_regr.py index 
dbb71895b..9ab9c8965 100644 --- a/daal4py_bench/df_regr.py +++ b/daal4py_bench/df_regr.py @@ -15,16 +15,14 @@ # =============================================================================== import argparse -from daal4py import ( - decision_forest_regression_training, - decision_forest_regression_prediction, - engines_mt2203 -) -from daal4py.sklearn._utils import getFPType import os import sys import bench +from daal4py import (decision_forest_regression_prediction, + decision_forest_regression_training, engines_mt2203) +from daal4py.sklearn._utils import getFPType + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) diff --git a/daal4py_bench/distances.py b/daal4py_bench/distances.py index ad243fd25..aef6837b1 100644 --- a/daal4py_bench/distances.py +++ b/daal4py_bench/distances.py @@ -15,12 +15,13 @@ # =============================================================================== import argparse -import daal4py -from daal4py.sklearn._utils import getFPType import os import sys import bench +import daal4py +from daal4py.sklearn._utils import getFPType + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) diff --git a/daal4py_bench/kmeans.py b/daal4py_bench/kmeans.py index 3fd883a8e..469d5b0c9 100644 --- a/daal4py_bench/kmeans.py +++ b/daal4py_bench/kmeans.py @@ -15,13 +15,14 @@ # =============================================================================== import argparse -from daal4py import kmeans -from daal4py.sklearn._utils import getFPType -import numpy as np import os import sys import bench +import numpy as np +from daal4py import kmeans +from daal4py.sklearn._utils import getFPType + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) diff --git a/daal4py_bench/linear.py b/daal4py_bench/linear.py index 7fb7f8244..7e5325da2 100644 --- a/daal4py_bench/linear.py +++ b/daal4py_bench/linear.py @@ -15,12 +15,13 @@ # =============================================================================== import argparse -from daal4py import linear_regression_training, linear_regression_prediction -from daal4py.sklearn._utils import getFPType import os import sys import bench +from daal4py import linear_regression_prediction, linear_regression_training +from daal4py.sklearn._utils import getFPType + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) diff --git a/daal4py_bench/pca.py b/daal4py_bench/pca.py index 0c00b5d71..274b98c60 100644 --- a/daal4py_bench/pca.py +++ b/daal4py_bench/pca.py @@ -15,14 +15,15 @@ # =============================================================================== import argparse -from daal4py import pca, pca_transform, normalization_zscore -from daal4py.sklearn._utils import getFPType -import numpy as np import os -from sklearn.utils.extmath import svd_flip import sys import bench +import numpy as np +from daal4py import normalization_zscore, pca, pca_transform +from daal4py.sklearn._utils import getFPType +from sklearn.utils.extmath import svd_flip + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) diff --git a/daal4py_bench/ridge.py b/daal4py_bench/ridge.py index 8758ec30f..61cc2151c 100644 --- a/daal4py_bench/ridge.py +++ b/daal4py_bench/ridge.py @@ -15,13 +15,14 @@ # =============================================================================== import argparse -from daal4py import ridge_regression_training, ridge_regression_prediction -from daal4py.sklearn._utils import getFPType -import numpy as np import os import sys import bench 
+import numpy as np +from daal4py import ridge_regression_prediction, ridge_regression_training +from daal4py.sklearn._utils import getFPType + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) diff --git a/datasets/load_datasets.py b/datasets/load_datasets.py index 85e6cb7bf..a6e79dc2b 100755 --- a/datasets/load_datasets.py +++ b/datasets/load_datasets.py @@ -19,9 +19,8 @@ import os import sys -from .loader import (a9a, gisette, ijcnn, skin_segmentation, - klaverjas, connect, mnist, sensit, - covertype, codrnanorm) +from .loader import (a9a, codrnanorm, connect, covertype, gisette, ijcnn, + klaverjas, mnist, sensit, skin_segmentation) dataset_loaders = { "a9a": a9a, diff --git a/datasets/loader.py b/datasets/loader.py index 22b077089..055fd52a6 100755 --- a/datasets/loader.py +++ b/datasets/loader.py @@ -15,10 +15,11 @@ # =============================================================================== import logging -import numpy as np import os -import pandas as pd from urllib.request import urlretrieve + +import numpy as np +import pandas as pd from sklearn.datasets import fetch_openml from sklearn.model_selection import train_test_split diff --git a/modelbuilders_bench/lgbm_mb.py b/modelbuilders_bench/lgbm_mb.py index 9ffe3992b..674d2f5bf 100644 --- a/modelbuilders_bench/lgbm_mb.py +++ b/modelbuilders_bench/lgbm_mb.py @@ -15,14 +15,16 @@ # =============================================================================== import argparse -import daal4py -import lightgbm as lgbm -import numpy as np import os import sys import bench +import daal4py +import lightgbm as lgbm +import numpy as np + import utils + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) diff --git a/modelbuilders_bench/utils.py b/modelbuilders_bench/utils.py index 84da884e2..d66adaac8 100644 --- a/modelbuilders_bench/utils.py +++ b/modelbuilders_bench/utils.py @@ -15,6 +15,7 @@ # =============================================================================== import json + import numpy as np diff --git a/modelbuilders_bench/xgb_mb.py b/modelbuilders_bench/xgb_mb.py index 8bd1ccf3f..5a25765de 100644 --- a/modelbuilders_bench/xgb_mb.py +++ b/modelbuilders_bench/xgb_mb.py @@ -15,14 +15,16 @@ # =============================================================================== import argparse -import daal4py -import numpy as np import os -import xgboost as xgb import sys import bench +import daal4py +import numpy as np +import xgboost as xgb + import utils + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser( diff --git a/report_generator/report_generator.py b/report_generator/report_generator.py index 9db76d34f..0bf441517 100755 --- a/report_generator/report_generator.py +++ b/report_generator/report_generator.py @@ -18,9 +18,10 @@ import datetime import hashlib import json -import openpyxl from string import ascii_uppercase -from typing import List, Any +from typing import Any, List + +import openpyxl def get_property(entry, prop): diff --git a/runner.py b/runner.py index 1a79ff1ba..58f99588a 100755 --- a/runner.py +++ b/runner.py @@ -23,8 +23,8 @@ import sys import datasets.make_datasets as make_datasets -from datasets.load_datasets import try_load_dataset import utils +from datasets.load_datasets import try_load_dataset def generate_cases(params): diff --git a/sklearn_bench/dbscan.py b/sklearn_bench/dbscan.py index 76464cffd..acedcb31f 100644 --- a/sklearn_bench/dbscan.py +++ b/sklearn_bench/dbscan.py @@ -16,11 
+16,12 @@ import argparse import os -from sklearn.cluster import DBSCAN -from sklearn.metrics.cluster import davies_bouldin_score import sys import bench +from sklearn.cluster import DBSCAN +from sklearn.metrics.cluster import davies_bouldin_score + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser(description='scikit-learn DBSCAN benchmark') diff --git a/sklearn_bench/df_clsf.py b/sklearn_bench/df_clsf.py index 07aa3424e..d83f4cfc3 100644 --- a/sklearn_bench/df_clsf.py +++ b/sklearn_bench/df_clsf.py @@ -15,13 +15,14 @@ # =============================================================================== import argparse -import numpy as np import os -from sklearn.ensemble import RandomForestClassifier -from sklearn.metrics import accuracy_score import sys import bench +import numpy as np +from sklearn.ensemble import RandomForestClassifier +from sklearn.metrics import accuracy_score + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) diff --git a/sklearn_bench/df_regr.py b/sklearn_bench/df_regr.py index 15ccdb617..f37e97800 100644 --- a/sklearn_bench/df_regr.py +++ b/sklearn_bench/df_regr.py @@ -16,10 +16,11 @@ import argparse import os -from sklearn.ensemble import RandomForestRegressor import sys import bench +from sklearn.ensemble import RandomForestRegressor + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser(description='scikit-learn random forest ' diff --git a/sklearn_bench/distances.py b/sklearn_bench/distances.py index 0330a4ef5..dd1f143f1 100644 --- a/sklearn_bench/distances.py +++ b/sklearn_bench/distances.py @@ -16,10 +16,11 @@ import argparse import os -from sklearn.metrics.pairwise import pairwise_distances import sys import bench +from sklearn.metrics.pairwise import pairwise_distances + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser(description='scikit-learn pairwise distances ' diff --git a/sklearn_bench/elasticnet.py b/sklearn_bench/elasticnet.py index e23af0ec6..aa144ea1c 100755 --- a/sklearn_bench/elasticnet.py +++ b/sklearn_bench/elasticnet.py @@ -16,10 +16,11 @@ import argparse import os -from sklearn.linear_model import ElasticNet import sys import bench +from sklearn.linear_model import ElasticNet + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser(description='scikit-learn elastic-net regression ' diff --git a/sklearn_bench/kmeans.py b/sklearn_bench/kmeans.py index a5ee27419..6200a48cb 100644 --- a/sklearn_bench/kmeans.py +++ b/sklearn_bench/kmeans.py @@ -15,13 +15,14 @@ # =============================================================================== import argparse -import numpy as np import os -from sklearn.cluster import KMeans -from sklearn.metrics.cluster import davies_bouldin_score import sys import bench +import numpy as np +from sklearn.cluster import KMeans +from sklearn.metrics.cluster import davies_bouldin_score + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser(description='scikit-learn K-means benchmark') diff --git a/sklearn_bench/knn_clsf.py b/sklearn_bench/knn_clsf.py index d144e911b..5e9751f57 100755 --- a/sklearn_bench/knn_clsf.py +++ b/sklearn_bench/knn_clsf.py @@ -15,13 +15,14 @@ # =============================================================================== import argparse -import numpy as np import os -from 
sklearn.metrics import accuracy_score -from sklearn.neighbors import KNeighborsClassifier import sys import bench +import numpy as np +from sklearn.metrics import accuracy_score +from sklearn.neighbors import KNeighborsClassifier + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser( diff --git a/sklearn_bench/lasso.py b/sklearn_bench/lasso.py index 71c689050..85b0c05d0 100755 --- a/sklearn_bench/lasso.py +++ b/sklearn_bench/lasso.py @@ -16,10 +16,11 @@ import argparse import os -from sklearn.linear_model import Lasso import sys import bench +from sklearn.linear_model import Lasso + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser(description='scikit-learn lasso regression ' diff --git a/sklearn_bench/linear.py b/sklearn_bench/linear.py index 7d8b1c7a1..0c913afac 100644 --- a/sklearn_bench/linear.py +++ b/sklearn_bench/linear.py @@ -16,10 +16,11 @@ import argparse import os -from sklearn.linear_model import LinearRegression import sys import bench +from sklearn.linear_model import LinearRegression + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser(description='scikit-learn linear regression ' diff --git a/sklearn_bench/log_reg.py b/sklearn_bench/log_reg.py index 9d1cc9031..eb104e24e 100644 --- a/sklearn_bench/log_reg.py +++ b/sklearn_bench/log_reg.py @@ -15,13 +15,14 @@ # =============================================================================== import argparse -import numpy as np import os -from sklearn.linear_model import LogisticRegression -from sklearn.metrics import accuracy_score import sys import bench +import numpy as np +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import accuracy_score + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser(description='scikit-learn logistic ' diff --git a/sklearn_bench/pca.py b/sklearn_bench/pca.py index d181d94da..773c8c191 100644 --- a/sklearn_bench/pca.py +++ b/sklearn_bench/pca.py @@ -16,10 +16,11 @@ import argparse import os -from sklearn.decomposition import PCA import sys import bench +from sklearn.decomposition import PCA + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser(description='scikit-learn PCA benchmark') diff --git a/sklearn_bench/ridge.py b/sklearn_bench/ridge.py index 7e250a7b5..8ebe572c8 100644 --- a/sklearn_bench/ridge.py +++ b/sklearn_bench/ridge.py @@ -16,10 +16,11 @@ import argparse import os -from sklearn.linear_model import Ridge import sys import bench +from sklearn.linear_model import Ridge + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser(description='scikit-learn ridge regression ' diff --git a/sklearn_bench/svm.py b/sklearn_bench/svm.py index c476ab618..2474621c9 100644 --- a/sklearn_bench/svm.py +++ b/sklearn_bench/svm.py @@ -15,13 +15,14 @@ # =============================================================================== import argparse -import numpy as np import os -from sklearn.metrics import accuracy_score -from sklearn.svm import SVC import sys import bench +import numpy as np +from sklearn.metrics import accuracy_score +from sklearn.svm import SVC + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser(description='scikit-learn SVM benchmark') diff --git 
a/sklearn_bench/train_test_split.py b/sklearn_bench/train_test_split.py index ce36c5f83..18cd7e8d5 100644 --- a/sklearn_bench/train_test_split.py +++ b/sklearn_bench/train_test_split.py @@ -16,11 +16,12 @@ import argparse import os -from sklearn.model_selection import train_test_split import sys from typing import Iterable import bench +from sklearn.model_selection import train_test_split + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser( @@ -61,7 +62,8 @@ if params.rng is not None: tts_params['rng'] = params.rng -time, _ = bench.measure_function_time(train_test_split, *data_args, params=params, **tts_params) +time, _ = bench.measure_function_time( + train_test_split, *data_args, params=params, **tts_params) bench.print_output(library='sklearn', algorithm='train_test_split', stages=['training'], params=params, diff --git a/xgboost_bench/gbt.py b/xgboost_bench/gbt.py index 390778f1d..6360500d3 100644 --- a/xgboost_bench/gbt.py +++ b/xgboost_bench/gbt.py @@ -15,12 +15,13 @@ # =============================================================================== import argparse -import numpy as np import os import sys -import xgboost as xgb import bench +import numpy as np +import xgboost as xgb + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) From 4aa48983ffe548b15c2151202807400d708a6c92 Mon Sep 17 00:00:00 2001 From: Igor Rukhovich Date: Mon, 22 Mar 2021 21:42:10 +0300 Subject: [PATCH 03/31] Moved env change to runner --- cuml_bench/dbscan.py | 4 ---- cuml_bench/df_clsf.py | 3 --- cuml_bench/df_regr.py | 4 ---- cuml_bench/elasticnet.py | 4 ---- cuml_bench/kmeans.py | 4 ---- cuml_bench/knn_clsf.py | 3 --- cuml_bench/lasso.py | 4 ---- cuml_bench/linear.py | 3 --- cuml_bench/log_reg.py | 3 --- cuml_bench/pca.py | 4 ---- cuml_bench/ridge.py | 3 --- cuml_bench/svm.py | 3 --- cuml_bench/train_test_split.py | 3 --- daal4py_bench/dbscan.py | 4 ---- daal4py_bench/df_clsf.py | 4 ---- daal4py_bench/df_regr.py | 4 ---- daal4py_bench/distances.py | 4 ---- daal4py_bench/kmeans.py | 4 ---- daal4py_bench/linear.py | 4 ---- daal4py_bench/pca.py | 4 ---- daal4py_bench/ridge.py | 5 ----- modelbuilders_bench/lgbm_mb.py | 3 --- modelbuilders_bench/xgb_mb.py | 2 -- sklearn_bench/dbscan.py | 3 --- sklearn_bench/df_clsf.py | 4 ---- sklearn_bench/df_regr.py | 3 --- sklearn_bench/distances.py | 3 --- sklearn_bench/elasticnet.py | 3 --- sklearn_bench/kmeans.py | 3 --- sklearn_bench/knn_clsf.py | 3 --- sklearn_bench/lasso.py | 3 --- sklearn_bench/linear.py | 3 --- sklearn_bench/log_reg.py | 3 --- sklearn_bench/pca.py | 3 --- sklearn_bench/ridge.py | 3 --- sklearn_bench/svm.py | 4 ---- sklearn_bench/train_test_split.py | 3 --- utils.py | 4 ++++ xgboost_bench/gbt.py | 3 --- 39 files changed, 4 insertions(+), 130 deletions(-) diff --git a/cuml_bench/dbscan.py b/cuml_bench/dbscan.py index bbfd6c04c..663a2fc10 100644 --- a/cuml_bench/dbscan.py +++ b/cuml_bench/dbscan.py @@ -15,15 +15,11 @@ # =============================================================================== import argparse -import os -import sys import bench from cuml import DBSCAN from sklearn.metrics.cluster import davies_bouldin_score -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - parser = argparse.ArgumentParser(description='cuML DBSCAN benchmark') parser.add_argument('-e', '--eps', '--epsilon', type=float, default=10., diff --git a/cuml_bench/df_clsf.py b/cuml_bench/df_clsf.py index f6b4842ab..60cef76e3 100755 --- a/cuml_bench/df_clsf.py 
+++ b/cuml_bench/df_clsf.py @@ -15,14 +15,11 @@ # =============================================================================== import argparse -import os -import sys import bench import cuml from cuml.ensemble import RandomForestClassifier -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser(description='cuml random forest ' 'classification benchmark') diff --git a/cuml_bench/df_regr.py b/cuml_bench/df_regr.py index 36bf29885..cc46a583c 100644 --- a/cuml_bench/df_regr.py +++ b/cuml_bench/df_regr.py @@ -15,14 +15,10 @@ # =============================================================================== import argparse -import os -import sys import bench from cuml.ensemble import RandomForestRegressor -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - parser = argparse.ArgumentParser(description='cuml random forest ' 'regression benchmark') diff --git a/cuml_bench/elasticnet.py b/cuml_bench/elasticnet.py index 246c4280e..bd0684a09 100755 --- a/cuml_bench/elasticnet.py +++ b/cuml_bench/elasticnet.py @@ -15,14 +15,10 @@ # =============================================================================== import argparse -import os -import sys import bench from cuml.linear_model import ElasticNet -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - parser = argparse.ArgumentParser(description='scikit-learn elastic-net regression ' 'benchmark') diff --git a/cuml_bench/kmeans.py b/cuml_bench/kmeans.py index 9479ea37d..04b284f93 100644 --- a/cuml_bench/kmeans.py +++ b/cuml_bench/kmeans.py @@ -15,8 +15,6 @@ # =============================================================================== import argparse -import os -import sys import warnings import bench @@ -24,8 +22,6 @@ from cuml import KMeans from sklearn.metrics.cluster import davies_bouldin_score -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - warnings.filterwarnings('ignore', category=FutureWarning) parser = argparse.ArgumentParser(description='cuML K-means benchmark') diff --git a/cuml_bench/knn_clsf.py b/cuml_bench/knn_clsf.py index 01e92e9b0..0460346bf 100755 --- a/cuml_bench/knn_clsf.py +++ b/cuml_bench/knn_clsf.py @@ -15,13 +15,10 @@ # =============================================================================== import argparse -import os -import sys import bench from cuml.neighbors import KNeighborsClassifier -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser( description='cuML kNN classifier benchmark') diff --git a/cuml_bench/lasso.py b/cuml_bench/lasso.py index bbdf44507..373ea3f19 100755 --- a/cuml_bench/lasso.py +++ b/cuml_bench/lasso.py @@ -15,14 +15,10 @@ # =============================================================================== import argparse -import os -import sys import bench from cuml.linear_model import Lasso -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - parser = argparse.ArgumentParser(description='scikit-learn lasso regression ' 'benchmark') diff --git a/cuml_bench/linear.py b/cuml_bench/linear.py index ad981111b..f80434917 100644 --- a/cuml_bench/linear.py +++ b/cuml_bench/linear.py @@ -15,13 +15,10 @@ # =============================================================================== import argparse -import os -import sys import bench from cuml import LinearRegression -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = 
argparse.ArgumentParser(description='cuML linear regression ' 'benchmark') diff --git a/cuml_bench/log_reg.py b/cuml_bench/log_reg.py index 5f0f24a09..f8e143d9b 100644 --- a/cuml_bench/log_reg.py +++ b/cuml_bench/log_reg.py @@ -15,13 +15,10 @@ # =============================================================================== import argparse -import os -import sys import bench from cuml import LogisticRegression -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser(description='cuML logistic ' 'regression benchmark') diff --git a/cuml_bench/pca.py b/cuml_bench/pca.py index dcdef1a53..c43f569ae 100644 --- a/cuml_bench/pca.py +++ b/cuml_bench/pca.py @@ -15,14 +15,10 @@ # =============================================================================== import argparse -import os -import sys import bench from cuml import PCA -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - parser = argparse.ArgumentParser(description='cuML PCA benchmark') parser.add_argument('--svd-solver', type=str, default='full', diff --git a/cuml_bench/ridge.py b/cuml_bench/ridge.py index 14b1d586d..6c1696ae7 100644 --- a/cuml_bench/ridge.py +++ b/cuml_bench/ridge.py @@ -15,13 +15,10 @@ # =============================================================================== import argparse -import os -import sys import bench from cuml import Ridge -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser(description='cuML ridge regression ' 'benchmark') diff --git a/cuml_bench/svm.py b/cuml_bench/svm.py index e6cbcebc0..0f24a5ec0 100644 --- a/cuml_bench/svm.py +++ b/cuml_bench/svm.py @@ -15,13 +15,10 @@ # =============================================================================== import argparse -import os -import sys import bench from cuml.svm import SVC -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser(description='cuML SVM benchmark') diff --git a/cuml_bench/train_test_split.py b/cuml_bench/train_test_split.py index 45c7470e9..d9e74ad94 100644 --- a/cuml_bench/train_test_split.py +++ b/cuml_bench/train_test_split.py @@ -15,13 +15,10 @@ # =============================================================================== import argparse -import os -import sys import bench from cuml import train_test_split -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser( description='cuml train_test_split benchmark') diff --git a/daal4py_bench/dbscan.py b/daal4py_bench/dbscan.py index b41bf21ba..64a19d813 100644 --- a/daal4py_bench/dbscan.py +++ b/daal4py_bench/dbscan.py @@ -15,15 +15,11 @@ # =============================================================================== import argparse -import os -import sys import bench from daal4py import dbscan from daal4py.sklearn._utils import getFPType -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - parser = argparse.ArgumentParser(description='daal4py DBSCAN clustering ' 'benchmark') diff --git a/daal4py_bench/df_clsf.py b/daal4py_bench/df_clsf.py index 55c171504..0c149291f 100644 --- a/daal4py_bench/df_clsf.py +++ b/daal4py_bench/df_clsf.py @@ -15,8 +15,6 @@ # =============================================================================== import argparse -import os -import sys import bench import numpy as np @@ -25,8 +23,6 @@ from daal4py.sklearn._utils import getFPType from sklearn.metrics import accuracy_score 
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - def df_clsf_fit(X, y, n_classes, n_trees=100, seed=12345, n_features_per_node=0, max_depth=0, min_impurity=0, diff --git a/daal4py_bench/df_regr.py b/daal4py_bench/df_regr.py index 9ab9c8965..628e159fd 100644 --- a/daal4py_bench/df_regr.py +++ b/daal4py_bench/df_regr.py @@ -15,16 +15,12 @@ # =============================================================================== import argparse -import os -import sys import bench from daal4py import (decision_forest_regression_prediction, decision_forest_regression_training, engines_mt2203) from daal4py.sklearn._utils import getFPType -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - def df_regr_fit(X, y, n_trees=100, seed=12345, n_features_per_node=0, max_depth=0, min_impurity=0, bootstrap=True): diff --git a/daal4py_bench/distances.py b/daal4py_bench/distances.py index aef6837b1..ed6896024 100644 --- a/daal4py_bench/distances.py +++ b/daal4py_bench/distances.py @@ -15,15 +15,11 @@ # =============================================================================== import argparse -import os -import sys import bench import daal4py from daal4py.sklearn._utils import getFPType -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - def compute_distances(pairwise_distances, X): algorithm = pairwise_distances(fptype=getFPType(X)) diff --git a/daal4py_bench/kmeans.py b/daal4py_bench/kmeans.py index 469d5b0c9..7e15617ce 100644 --- a/daal4py_bench/kmeans.py +++ b/daal4py_bench/kmeans.py @@ -15,16 +15,12 @@ # =============================================================================== import argparse -import os -import sys import bench import numpy as np from daal4py import kmeans from daal4py.sklearn._utils import getFPType -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - parser = argparse.ArgumentParser(description='daal4py K-Means clustering ' 'benchmark') diff --git a/daal4py_bench/linear.py b/daal4py_bench/linear.py index 7e5325da2..0b62a42a5 100644 --- a/daal4py_bench/linear.py +++ b/daal4py_bench/linear.py @@ -15,15 +15,11 @@ # =============================================================================== import argparse -import os -import sys import bench from daal4py import linear_regression_prediction, linear_regression_training from daal4py.sklearn._utils import getFPType -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - parser = argparse.ArgumentParser(description='daal4py linear regression ' 'benchmark') diff --git a/daal4py_bench/pca.py b/daal4py_bench/pca.py index 274b98c60..81161d1bd 100644 --- a/daal4py_bench/pca.py +++ b/daal4py_bench/pca.py @@ -15,8 +15,6 @@ # =============================================================================== import argparse -import os -import sys import bench import numpy as np @@ -24,8 +22,6 @@ from daal4py.sklearn._utils import getFPType from sklearn.utils.extmath import svd_flip -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - parser = argparse.ArgumentParser(description='daal4py PCA benchmark') parser.add_argument('--svd-solver', type=str, diff --git a/daal4py_bench/ridge.py b/daal4py_bench/ridge.py index 61cc2151c..1a04edf78 100644 --- a/daal4py_bench/ridge.py +++ b/daal4py_bench/ridge.py @@ -15,17 +15,12 @@ # =============================================================================== import argparse -import os -import sys import bench import numpy as np from daal4py import 
ridge_regression_prediction, ridge_regression_training from daal4py.sklearn._utils import getFPType -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - - parser = argparse.ArgumentParser(description='daal4py ridge regression ' 'benchmark') parser.add_argument('--no-fit-intercept', dest='fit_intercept', default=True, diff --git a/modelbuilders_bench/lgbm_mb.py b/modelbuilders_bench/lgbm_mb.py index 674d2f5bf..5523748c1 100644 --- a/modelbuilders_bench/lgbm_mb.py +++ b/modelbuilders_bench/lgbm_mb.py @@ -16,7 +16,6 @@ import argparse import os -import sys import bench import daal4py @@ -25,8 +24,6 @@ import utils -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - parser = argparse.ArgumentParser( description='lightgbm gbt + model transform + daal predict benchmark') diff --git a/modelbuilders_bench/xgb_mb.py b/modelbuilders_bench/xgb_mb.py index 5a25765de..1274014fe 100644 --- a/modelbuilders_bench/xgb_mb.py +++ b/modelbuilders_bench/xgb_mb.py @@ -16,7 +16,6 @@ import argparse import os -import sys import bench import daal4py @@ -25,7 +24,6 @@ import utils -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser( description='xgboost gbt + model transform + daal predict benchmark') diff --git a/sklearn_bench/dbscan.py b/sklearn_bench/dbscan.py index acedcb31f..8d2dcf9ab 100644 --- a/sklearn_bench/dbscan.py +++ b/sklearn_bench/dbscan.py @@ -15,14 +15,11 @@ # =============================================================================== import argparse -import os -import sys import bench from sklearn.cluster import DBSCAN from sklearn.metrics.cluster import davies_bouldin_score -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser(description='scikit-learn DBSCAN benchmark') parser.add_argument('-e', '--eps', '--epsilon', type=float, default=10., diff --git a/sklearn_bench/df_clsf.py b/sklearn_bench/df_clsf.py index d83f4cfc3..82b54128e 100644 --- a/sklearn_bench/df_clsf.py +++ b/sklearn_bench/df_clsf.py @@ -15,16 +15,12 @@ # =============================================================================== import argparse -import os -import sys import bench import numpy as np from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import accuracy_score -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - parser = argparse.ArgumentParser(description='scikit-learn random forest ' 'classification benchmark') diff --git a/sklearn_bench/df_regr.py b/sklearn_bench/df_regr.py index f37e97800..7967c9133 100644 --- a/sklearn_bench/df_regr.py +++ b/sklearn_bench/df_regr.py @@ -15,13 +15,10 @@ # =============================================================================== import argparse -import os -import sys import bench from sklearn.ensemble import RandomForestRegressor -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser(description='scikit-learn random forest ' 'regression benchmark') diff --git a/sklearn_bench/distances.py b/sklearn_bench/distances.py index dd1f143f1..6cb70f9bb 100644 --- a/sklearn_bench/distances.py +++ b/sklearn_bench/distances.py @@ -15,13 +15,10 @@ # =============================================================================== import argparse -import os -import sys import bench from sklearn.metrics.pairwise import pairwise_distances -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 
parser = argparse.ArgumentParser(description='scikit-learn pairwise distances ' 'benchmark') diff --git a/sklearn_bench/elasticnet.py b/sklearn_bench/elasticnet.py index aa144ea1c..b54b5c644 100755 --- a/sklearn_bench/elasticnet.py +++ b/sklearn_bench/elasticnet.py @@ -15,13 +15,10 @@ # =============================================================================== import argparse -import os -import sys import bench from sklearn.linear_model import ElasticNet -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser(description='scikit-learn elastic-net regression ' 'benchmark') diff --git a/sklearn_bench/kmeans.py b/sklearn_bench/kmeans.py index 6200a48cb..665014cd8 100644 --- a/sklearn_bench/kmeans.py +++ b/sklearn_bench/kmeans.py @@ -15,15 +15,12 @@ # =============================================================================== import argparse -import os -import sys import bench import numpy as np from sklearn.cluster import KMeans from sklearn.metrics.cluster import davies_bouldin_score -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser(description='scikit-learn K-means benchmark') parser.add_argument('-i', '--filei', '--fileI', '--init', diff --git a/sklearn_bench/knn_clsf.py b/sklearn_bench/knn_clsf.py index 5e9751f57..025f91fe9 100755 --- a/sklearn_bench/knn_clsf.py +++ b/sklearn_bench/knn_clsf.py @@ -15,15 +15,12 @@ # =============================================================================== import argparse -import os -import sys import bench import numpy as np from sklearn.metrics import accuracy_score from sklearn.neighbors import KNeighborsClassifier -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser( description='scikit-learn kNN classifier benchmark') diff --git a/sklearn_bench/lasso.py b/sklearn_bench/lasso.py index 85b0c05d0..173fa7b59 100755 --- a/sklearn_bench/lasso.py +++ b/sklearn_bench/lasso.py @@ -15,13 +15,10 @@ # =============================================================================== import argparse -import os -import sys import bench from sklearn.linear_model import Lasso -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser(description='scikit-learn lasso regression ' 'benchmark') diff --git a/sklearn_bench/linear.py b/sklearn_bench/linear.py index 0c913afac..767a8ff6f 100644 --- a/sklearn_bench/linear.py +++ b/sklearn_bench/linear.py @@ -15,13 +15,10 @@ # =============================================================================== import argparse -import os -import sys import bench from sklearn.linear_model import LinearRegression -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser(description='scikit-learn linear regression ' 'benchmark') diff --git a/sklearn_bench/log_reg.py b/sklearn_bench/log_reg.py index eb104e24e..b7eead27a 100644 --- a/sklearn_bench/log_reg.py +++ b/sklearn_bench/log_reg.py @@ -15,15 +15,12 @@ # =============================================================================== import argparse -import os -import sys import bench import numpy as np from sklearn.linear_model import LogisticRegression from sklearn.metrics import accuracy_score -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser(description='scikit-learn logistic ' 'regression benchmark') diff --git a/sklearn_bench/pca.py 
b/sklearn_bench/pca.py index 773c8c191..2fd7be90f 100644 --- a/sklearn_bench/pca.py +++ b/sklearn_bench/pca.py @@ -15,13 +15,10 @@ # =============================================================================== import argparse -import os -import sys import bench from sklearn.decomposition import PCA -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser(description='scikit-learn PCA benchmark') parser.add_argument('--svd-solver', type=str, choices=['full'], diff --git a/sklearn_bench/ridge.py b/sklearn_bench/ridge.py index 8ebe572c8..2e1cfc102 100644 --- a/sklearn_bench/ridge.py +++ b/sklearn_bench/ridge.py @@ -15,13 +15,10 @@ # =============================================================================== import argparse -import os -import sys import bench from sklearn.linear_model import Ridge -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser(description='scikit-learn ridge regression ' 'benchmark') diff --git a/sklearn_bench/svm.py b/sklearn_bench/svm.py index 2474621c9..5fc8af178 100644 --- a/sklearn_bench/svm.py +++ b/sklearn_bench/svm.py @@ -15,16 +15,12 @@ # =============================================================================== import argparse -import os -import sys import bench import numpy as np from sklearn.metrics import accuracy_score from sklearn.svm import SVC -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - parser = argparse.ArgumentParser(description='scikit-learn SVM benchmark') parser.add_argument('-C', dest='C', type=float, default=1.0, diff --git a/sklearn_bench/train_test_split.py b/sklearn_bench/train_test_split.py index 18cd7e8d5..6e9764570 100644 --- a/sklearn_bench/train_test_split.py +++ b/sklearn_bench/train_test_split.py @@ -15,14 +15,11 @@ # =============================================================================== import argparse -import os -import sys from typing import Iterable import bench from sklearn.model_selection import train_test_split -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) parser = argparse.ArgumentParser( description='scikit-learn train_test_split benchmark') diff --git a/utils.py b/utils.py index 04ee52d84..a3cfa7a68 100755 --- a/utils.py +++ b/utils.py @@ -59,6 +59,10 @@ def is_exists_files(files): def read_output_from_command(command, env=os.environ.copy()): + if "PYTHONPATH" in env: + env["PYTHONPATH"] += ":" + os.path.dirname(os.path.abspath(__file__)) + else: + env["PYTHONPATH"] = os.path.dirname(os.path.abspath(__file__)) res = subprocess.run(command.split(' '), stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf-8', env=env) return res.stdout[:-1], res.stderr[:-1] diff --git a/xgboost_bench/gbt.py b/xgboost_bench/gbt.py index 6360500d3..c903e6008 100644 --- a/xgboost_bench/gbt.py +++ b/xgboost_bench/gbt.py @@ -16,14 +16,11 @@ import argparse import os -import sys import bench import numpy as np import xgboost as xgb -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - def convert_probs_to_classes(y_prob): return np.array([np.argmax(y_prob[i]) for i in range(y_prob.shape[0])]) From 5a8db33d97ea443ce7922048138b16df7fcb1e87 Mon Sep 17 00:00:00 2001 From: Igor Rukhovich Date: Tue, 23 Mar 2021 00:46:39 +0300 Subject: [PATCH 04/31] fixed all mypy errors and added mypy check to CI --- azure-pipelines.yml | 15 ++++++++++++++- cuml_bench/__init__.py | 0 cuml_bench/df_clsf.py | 4 +++- cuml_bench/df_regr.py | 4 +++- 
cuml_bench/kmeans.py | 2 +- daal4py_bench/__init__.py | 0 daal4py_bench/kmeans.py | 2 +- modelbuilders_bench/__init__.py | 0 modelbuilders_bench/lgbm_mb.py | 3 +-- modelbuilders_bench/{utils.py => mb_utils.py} | 0 modelbuilders_bench/xgb_mb.py | 3 +-- sklearn_bench/dbscan.py | 4 ++-- sklearn_bench/df_clsf.py | 4 ++-- sklearn_bench/df_regr.py | 4 ++-- sklearn_bench/distances.py | 4 ++-- sklearn_bench/elasticnet.py | 4 ++-- sklearn_bench/kmeans.py | 6 +++--- sklearn_bench/knn_clsf.py | 4 ++-- sklearn_bench/lasso.py | 4 ++-- sklearn_bench/linear.py | 4 ++-- sklearn_bench/log_reg.py | 4 ++-- sklearn_bench/pca.py | 4 ++-- sklearn_bench/ridge.py | 4 ++-- sklearn_bench/svm.py | 3 ++- sklearn_bench/train_test_split.py | 4 ++-- xgboost_bench/__init__.py | 0 26 files changed, 53 insertions(+), 37 deletions(-) create mode 100755 cuml_bench/__init__.py create mode 100755 daal4py_bench/__init__.py create mode 100755 modelbuilders_bench/__init__.py rename modelbuilders_bench/{utils.py => mb_utils.py} (100%) create mode 100755 xgboost_bench/__init__.py diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 5d1000a15..5dd5d45dd 100755 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -63,5 +63,18 @@ jobs: - script: | python -m pip install --upgrade pip setuptools pip install flake8 - flake8 --ignore=E265,E722,E402 --max-line-length=90 --count + flake8 --ignore=E265 --max-line-length=90 --count displayName: 'PEP 8 check' +- job: Mypy + pool: + vmImage: 'ubuntu-20.04' + steps: + - task: UsePythonVersion@0 + inputs: + versionSpec: '3.7' + addToPath: true + - script: | + python -m pip install --upgrade pip setuptools + pip install mypy sklearn-stub data-science-types + mypy . --ignore-missing-imports + displayName: 'mypy check' diff --git a/cuml_bench/__init__.py b/cuml_bench/__init__.py new file mode 100755 index 000000000..e69de29bb diff --git a/cuml_bench/df_clsf.py b/cuml_bench/df_clsf.py index 60cef76e3..429cdf741 100755 --- a/cuml_bench/df_clsf.py +++ b/cuml_bench/df_clsf.py @@ -15,12 +15,12 @@ # =============================================================================== import argparse +from typing import Any import bench import cuml from cuml.ensemble import RandomForestClassifier - parser = argparse.ArgumentParser(description='cuml random forest ' 'classification benchmark') @@ -62,6 +62,7 @@ params.split_algorithm = 1 params.n_classes = y_train[y_train.columns[0]].nunique() +clf: Any def fit(X, y): @@ -79,6 +80,7 @@ def fit(X, y): def predict(X): + global clf prediction_args = {'predict_model': 'GPU'} if int(cuml.__version__.split('.')[1]) <= 14: prediction_args.update({'num_classes': params.n_classes}) diff --git a/cuml_bench/df_regr.py b/cuml_bench/df_regr.py index cc46a583c..410a189c8 100644 --- a/cuml_bench/df_regr.py +++ b/cuml_bench/df_regr.py @@ -15,11 +15,11 @@ # =============================================================================== import argparse +from typing import Any import bench from cuml.ensemble import RandomForestRegressor - parser = argparse.ArgumentParser(description='cuml random forest ' 'regression benchmark') @@ -59,6 +59,7 @@ params.split_algorithm = 0 else: params.split_algorithm = 1 +regr: Any # Create our random forest regressor @@ -77,6 +78,7 @@ def fit(X, y): def predict(X): + global regr return regr.predict(X, predict_model='GPU') diff --git a/cuml_bench/kmeans.py b/cuml_bench/kmeans.py index 04b284f93..90f19ce82 100644 --- a/cuml_bench/kmeans.py +++ b/cuml_bench/kmeans.py @@ -49,7 +49,7 @@ # or choose random centroids from training data else: 
np.random.seed(params.seed) - centroids_idx = np.random.randint(0, X_train.shape[0], + centroids_idx = np.random.randint(low=0, high=X_train.shape[0], size=params.n_clusters) if hasattr(X_train, "iloc"): X_init = X_train.iloc[centroids_idx].to_pandas().values diff --git a/daal4py_bench/__init__.py b/daal4py_bench/__init__.py new file mode 100755 index 000000000..e69de29bb diff --git a/daal4py_bench/kmeans.py b/daal4py_bench/kmeans.py index 7e15617ce..c9d3fb6b7 100644 --- a/daal4py_bench/kmeans.py +++ b/daal4py_bench/kmeans.py @@ -43,7 +43,7 @@ # or choose random centroids from training data else: np.random.seed(params.seed) - centroids_idx = np.random.randint(0, X_train.shape[0], + centroids_idx = np.random.randint(low=0, high=X_train.shape[0], size=params.n_clusters) if hasattr(X_train, "iloc"): X_init = X_train.iloc[centroids_idx].values diff --git a/modelbuilders_bench/__init__.py b/modelbuilders_bench/__init__.py new file mode 100755 index 000000000..e69de29bb diff --git a/modelbuilders_bench/lgbm_mb.py b/modelbuilders_bench/lgbm_mb.py index 5523748c1..2b4c29616 100644 --- a/modelbuilders_bench/lgbm_mb.py +++ b/modelbuilders_bench/lgbm_mb.py @@ -22,8 +22,7 @@ import lightgbm as lgbm import numpy as np -import utils - +import modelbuilders_bench.mb_utils as utils parser = argparse.ArgumentParser( description='lightgbm gbt + model transform + daal predict benchmark') diff --git a/modelbuilders_bench/utils.py b/modelbuilders_bench/mb_utils.py similarity index 100% rename from modelbuilders_bench/utils.py rename to modelbuilders_bench/mb_utils.py diff --git a/modelbuilders_bench/xgb_mb.py b/modelbuilders_bench/xgb_mb.py index 1274014fe..d70e326d6 100644 --- a/modelbuilders_bench/xgb_mb.py +++ b/modelbuilders_bench/xgb_mb.py @@ -22,8 +22,7 @@ import numpy as np import xgboost as xgb -import utils - +import modelbuilders_bench.mb_utils as utils parser = argparse.ArgumentParser( description='xgboost gbt + model transform + daal predict benchmark') diff --git a/sklearn_bench/dbscan.py b/sklearn_bench/dbscan.py index 8d2dcf9ab..39efb16bb 100644 --- a/sklearn_bench/dbscan.py +++ b/sklearn_bench/dbscan.py @@ -17,10 +17,8 @@ import argparse import bench -from sklearn.cluster import DBSCAN from sklearn.metrics.cluster import davies_bouldin_score - parser = argparse.ArgumentParser(description='scikit-learn DBSCAN benchmark') parser.add_argument('-e', '--eps', '--epsilon', type=float, default=10., help='Radius of neighborhood of a point') @@ -29,6 +27,8 @@ 'neighborhood to consider a point a core point') params = bench.parse_args(parser) +if not params.no_intel_optimized: + from sklearn.cluster import DBSCAN # Load generated data X, _, _, _ = bench.load_data(params, add_dtype=True) diff --git a/sklearn_bench/df_clsf.py b/sklearn_bench/df_clsf.py index 82b54128e..d3351dbc4 100644 --- a/sklearn_bench/df_clsf.py +++ b/sklearn_bench/df_clsf.py @@ -18,10 +18,8 @@ import bench import numpy as np -from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import accuracy_score - parser = argparse.ArgumentParser(description='scikit-learn random forest ' 'classification benchmark') @@ -45,6 +43,8 @@ params = bench.parse_args(parser) +if not params.no_intel_optimized: + from sklearn.ensemble import RandomForestClassifier # Load and convert data X_train, X_test, y_train, y_test = bench.load_data(params) diff --git a/sklearn_bench/df_regr.py b/sklearn_bench/df_regr.py index 7967c9133..a9f29743a 100644 --- a/sklearn_bench/df_regr.py +++ b/sklearn_bench/df_regr.py @@ -17,8 +17,6 @@ import argparse 
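The sklearn_bench hunks in this commit move each scikit-learn estimator import below bench.parse_args() and guard it with the --no-intel-optimized flag. Presumably this is so that daal4py's patch_sklearn(), which bench.parse_args() applies when the flag is absent (see the bench.py hunk later in this series), is already in effect before the estimator class is imported. A minimal sketch of the pattern, using the DBSCAN benchmark as the assumed example; the conditional import mirrors the diff as written, and the unpatched branch is not shown in these hunks:

import argparse

import bench

parser = argparse.ArgumentParser(description='scikit-learn DBSCAN benchmark')
params = bench.parse_args(parser)  # may apply daal4py.sklearn.patch_sklearn() internally
if not params.no_intel_optimized:
    # Import only after the potential patching so the patched class is picked up.
    from sklearn.cluster import DBSCAN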
import bench -from sklearn.ensemble import RandomForestRegressor - parser = argparse.ArgumentParser(description='scikit-learn random forest ' 'regression benchmark') @@ -44,6 +42,8 @@ params = bench.parse_args(parser) +if not params.no_intel_optimized: + from sklearn.ensemble import RandomForestRegressor # Load and convert data X_train, X_test, y_train, y_test = bench.load_data(params) diff --git a/sklearn_bench/distances.py b/sklearn_bench/distances.py index 6cb70f9bb..d676a2b36 100644 --- a/sklearn_bench/distances.py +++ b/sklearn_bench/distances.py @@ -17,8 +17,6 @@ import argparse import bench -from sklearn.metrics.pairwise import pairwise_distances - parser = argparse.ArgumentParser(description='scikit-learn pairwise distances ' 'benchmark') @@ -27,6 +25,8 @@ help='Metric to test for pairwise distances') params = bench.parse_args(parser) +if not params.no_intel_optimized: + from sklearn.metrics.pairwise import pairwise_distances # Load data X, _, _, _ = bench.load_data(params, generated_data=['X_train'], add_dtype=True) diff --git a/sklearn_bench/elasticnet.py b/sklearn_bench/elasticnet.py index b54b5c644..b3f5ff2f5 100755 --- a/sklearn_bench/elasticnet.py +++ b/sklearn_bench/elasticnet.py @@ -17,8 +17,6 @@ import argparse import bench -from sklearn.linear_model import ElasticNet - parser = argparse.ArgumentParser(description='scikit-learn elastic-net regression ' 'benchmark') @@ -35,6 +33,8 @@ help='Tolerance for solver.') params = bench.parse_args(parser) +if not params.no_intel_optimized: + from sklearn.linear_model import ElasticNet # Load data X_train, X_test, y_train, y_test = bench.load_data(params) diff --git a/sklearn_bench/kmeans.py b/sklearn_bench/kmeans.py index 665014cd8..80252dcac 100644 --- a/sklearn_bench/kmeans.py +++ b/sklearn_bench/kmeans.py @@ -18,10 +18,8 @@ import bench import numpy as np -from sklearn.cluster import KMeans from sklearn.metrics.cluster import davies_bouldin_score - parser = argparse.ArgumentParser(description='scikit-learn K-means benchmark') parser.add_argument('-i', '--filei', '--fileI', '--init', type=str, help='Initial clusters') @@ -32,6 +30,8 @@ parser.add_argument('--n-clusters', type=int, help='Number of clusters') params = bench.parse_args(parser) +if not params.no_intel_optimized: + from sklearn.cluster import KMeans # Load and convert generated data X_train, X_test, _, _ = bench.load_data(params) @@ -46,7 +46,7 @@ # or choose random centroids from training data else: np.random.seed(params.seed) - centroids_idx = np.random.randint(0, X_train.shape[0], + centroids_idx = np.random.randint(low=0, high=X_train.shape[0], size=params.n_clusters) if hasattr(X_train, "iloc"): X_init = X_train.iloc[centroids_idx].values diff --git a/sklearn_bench/knn_clsf.py b/sklearn_bench/knn_clsf.py index 025f91fe9..749a6ae82 100755 --- a/sklearn_bench/knn_clsf.py +++ b/sklearn_bench/knn_clsf.py @@ -19,8 +19,6 @@ import bench import numpy as np from sklearn.metrics import accuracy_score -from sklearn.neighbors import KNeighborsClassifier - parser = argparse.ArgumentParser( description='scikit-learn kNN classifier benchmark') @@ -39,6 +37,8 @@ help='Distance metric to use') params = bench.parse_args(parser) +if not params.no_intel_optimized: + from sklearn.neighbors import KNeighborsClassifier # Load generated data X_train, X_test, y_train, y_test = bench.load_data(params) diff --git a/sklearn_bench/lasso.py b/sklearn_bench/lasso.py index 173fa7b59..51fd09181 100755 --- a/sklearn_bench/lasso.py +++ b/sklearn_bench/lasso.py @@ -17,8 +17,6 @@ import argparse 
import bench -from sklearn.linear_model import Lasso - parser = argparse.ArgumentParser(description='scikit-learn lasso regression ' 'benchmark') @@ -33,6 +31,8 @@ help='Tolerance for solver.') params = bench.parse_args(parser) +if not params.no_intel_optimized: + from sklearn.linear_model import Lasso # Load data X_train, X_test, y_train, y_test = bench.load_data(params) diff --git a/sklearn_bench/linear.py b/sklearn_bench/linear.py index 767a8ff6f..1c0fb6f9f 100644 --- a/sklearn_bench/linear.py +++ b/sklearn_bench/linear.py @@ -17,8 +17,6 @@ import argparse import bench -from sklearn.linear_model import LinearRegression - parser = argparse.ArgumentParser(description='scikit-learn linear regression ' 'benchmark') @@ -27,6 +25,8 @@ help="Don't fit intercept (assume data already centered)") params = bench.parse_args(parser) +if not params.no_intel_optimized: + from sklearn.linear_model import LinearRegression # Load data X_train, X_test, y_train, y_test = bench.load_data( diff --git a/sklearn_bench/log_reg.py b/sklearn_bench/log_reg.py index b7eead27a..7f3f80eb8 100644 --- a/sklearn_bench/log_reg.py +++ b/sklearn_bench/log_reg.py @@ -18,10 +18,8 @@ import bench import numpy as np -from sklearn.linear_model import LogisticRegression from sklearn.metrics import accuracy_score - parser = argparse.ArgumentParser(description='scikit-learn logistic ' 'regression benchmark') parser.add_argument('--no-fit-intercept', dest='fit_intercept', @@ -45,6 +43,8 @@ 'is 1e-10.') params = bench.parse_args(parser, loop_types=('fit', 'predict')) +if not params.no_intel_optimized: + from sklearn.linear_model import LogisticRegression # Load generated data X_train, X_test, y_train, y_test = bench.load_data(params) diff --git a/sklearn_bench/pca.py b/sklearn_bench/pca.py index 2fd7be90f..3eb15465f 100644 --- a/sklearn_bench/pca.py +++ b/sklearn_bench/pca.py @@ -17,8 +17,6 @@ import argparse import bench -from sklearn.decomposition import PCA - parser = argparse.ArgumentParser(description='scikit-learn PCA benchmark') parser.add_argument('--svd-solver', type=str, choices=['full'], @@ -29,6 +27,8 @@ help='Perform whitening') params = bench.parse_args(parser) +if not params.no_intel_optimized: + from sklearn.decomposition import PCA # Load random data X_train, X_test, _, _ = bench.load_data(params, generated_data=['X_train']) diff --git a/sklearn_bench/ridge.py b/sklearn_bench/ridge.py index 2e1cfc102..229fb29dc 100644 --- a/sklearn_bench/ridge.py +++ b/sklearn_bench/ridge.py @@ -17,8 +17,6 @@ import argparse import bench -from sklearn.linear_model import Ridge - parser = argparse.ArgumentParser(description='scikit-learn ridge regression ' 'benchmark') @@ -31,6 +29,8 @@ help='Regularization strength') params = bench.parse_args(parser) +if not params.no_intel_optimized: + from sklearn.linear_model import Ridge # Load data X_train, X_test, y_train, y_test = bench.load_data(params, diff --git a/sklearn_bench/svm.py b/sklearn_bench/svm.py index 5fc8af178..0ea8611c6 100644 --- a/sklearn_bench/svm.py +++ b/sklearn_bench/svm.py @@ -19,7 +19,6 @@ import bench import numpy as np from sklearn.metrics import accuracy_score -from sklearn.svm import SVC parser = argparse.ArgumentParser(description='scikit-learn SVM benchmark') @@ -40,6 +39,8 @@ dest='shrinking', help="Don't use shrinking heuristic") params = bench.parse_args(parser, loop_types=('fit', 'predict')) +if not params.no_intel_optimized: + from sklearn.svm import SVC # Load data X_train, X_test, y_train, y_test = bench.load_data(params) diff --git 
a/sklearn_bench/train_test_split.py b/sklearn_bench/train_test_split.py index 6e9764570..5ecaa157e 100644 --- a/sklearn_bench/train_test_split.py +++ b/sklearn_bench/train_test_split.py @@ -18,8 +18,6 @@ from typing import Iterable import bench -from sklearn.model_selection import train_test_split - parser = argparse.ArgumentParser( description='scikit-learn train_test_split benchmark') @@ -39,6 +37,8 @@ '(only for IDP scikit-learn)') params = bench.parse_args(parser) +if not params.no_intel_optimized: + from sklearn.model_selection import train_test_split # Load generated data X, y, _, _ = bench.load_data(params) diff --git a/xgboost_bench/__init__.py b/xgboost_bench/__init__.py new file mode 100755 index 000000000..e69de29bb From 5594efdece608b005b3cf86033c8d4901adcc92d Mon Sep 17 00:00:00 2001 From: Igor Rukhovich Date: Tue, 23 Mar 2021 01:35:42 +0300 Subject: [PATCH 05/31] Yet another mypy fixes --- azure-pipelines.yml | 2 +- cuml_bench/kmeans.py | 5 +++-- daal4py_bench/kmeans.py | 5 +++-- sklearn_bench/kmeans.py | 4 +++- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 5dd5d45dd..eaef6f6e1 100755 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -75,6 +75,6 @@ jobs: addToPath: true - script: | python -m pip install --upgrade pip setuptools - pip install mypy sklearn-stub data-science-types + pip install mypy data-science-types mypy . --ignore-missing-imports displayName: 'mypy check' diff --git a/cuml_bench/kmeans.py b/cuml_bench/kmeans.py index 90f19ce82..d0192ba4d 100644 --- a/cuml_bench/kmeans.py +++ b/cuml_bench/kmeans.py @@ -16,13 +16,13 @@ import argparse import warnings +from typing import Any import bench import numpy as np from cuml import KMeans from sklearn.metrics.cluster import davies_bouldin_score - warnings.filterwarnings('ignore', category=FutureWarning) parser = argparse.ArgumentParser(description='cuML K-means benchmark') parser.add_argument('-i', '--filei', '--fileI', '--init', @@ -39,11 +39,12 @@ # Load and convert generated data X_train, X_test, _, _ = bench.load_data(params) +X_init: Any if params.filei == 'k-means++': X_init = 'k-means++' # Load initial centroids from specified path elif params.filei is not None: - X_init = np.load(params.filei).astype(params.dtype) + X_init = {k: v.astype(params.dtype) for k, v in np.load(params.filei).items()} if isinstance(X_init, np.ndarray): params.n_clusters = X_init.shape[0] # or choose random centroids from training data diff --git a/daal4py_bench/kmeans.py b/daal4py_bench/kmeans.py index c9d3fb6b7..9a224713f 100644 --- a/daal4py_bench/kmeans.py +++ b/daal4py_bench/kmeans.py @@ -15,13 +15,13 @@ # =============================================================================== import argparse +from typing import Any import bench import numpy as np from daal4py import kmeans from daal4py.sklearn._utils import getFPType - parser = argparse.ArgumentParser(description='daal4py K-Means clustering ' 'benchmark') parser.add_argument('-i', '--filei', '--fileI', '--init', @@ -36,9 +36,10 @@ # Load generated data X_train, X_test, _, _ = bench.load_data(params, add_dtype=True) +X_init: Any # Load initial centroids from specified path if params.filei is not None: - X_init = np.load(params.filei).astype(params.dtype) + X_init = {k: v.astype(params.dtype) for k, v in np.load(params.filei).items()} params.n_clusters = X_init.shape[0] # or choose random centroids from training data else: diff --git a/sklearn_bench/kmeans.py b/sklearn_bench/kmeans.py index 
80252dcac..fadd94714 100644 --- a/sklearn_bench/kmeans.py +++ b/sklearn_bench/kmeans.py @@ -15,6 +15,7 @@ # =============================================================================== import argparse +from typing import Any import bench import numpy as np @@ -36,11 +37,12 @@ # Load and convert generated data X_train, X_test, _, _ = bench.load_data(params) +X_init: Any if params.filei == 'k-means++': X_init = 'k-means++' # Load initial centroids from specified path elif params.filei is not None: - X_init = np.load(params.filei).astype(params.dtype) + X_init = {k: v.astype(params.dtype) for k, v in np.load(params.filei).items()} if isinstance(X_init, np.ndarray): params.n_clusters = X_init.shape[0] # or choose random centroids from training data From 35b55b8a99c5c0b669d36573b6e4b02fcbf4de5a Mon Sep 17 00:00:00 2001 From: Igor Rukhovich Date: Tue, 23 Mar 2021 23:30:46 +0300 Subject: [PATCH 06/31] Small runner refactoring --- datasets/load_datasets.py | 4 +-- runner.py | 68 ++++++++++++++------------------------- utils.py | 17 +++++----- xgboost_bench/gbt.py | 3 +- 4 files changed, 37 insertions(+), 55 deletions(-) diff --git a/datasets/load_datasets.py b/datasets/load_datasets.py index a6e79dc2b..6c0af5a01 100755 --- a/datasets/load_datasets.py +++ b/datasets/load_datasets.py @@ -37,7 +37,7 @@ def try_load_dataset(dataset_name, output_directory): - if dataset_name in dataset_loaders.keys(): + if dataset_name in dataset_loaders: try: return dataset_loaders[dataset_name](output_directory) except BaseException: @@ -60,7 +60,7 @@ def try_load_dataset(dataset_name, output_directory): args = parser.parse_args() if args.list: - for key in dataset_loaders.keys(): + for key in dataset_loaders: print(key) sys.exit(0) diff --git a/runner.py b/runner.py index 58f99588a..678365b74 100755 --- a/runner.py +++ b/runner.py @@ -18,13 +18,11 @@ import json import logging import os -import pathlib import socket import sys import datasets.make_datasets as make_datasets import utils -from datasets.load_datasets import try_load_dataset def generate_cases(params): @@ -54,7 +52,7 @@ def generate_cases(params): default='configs/config_example.json', help='Path to configuration files') parser.add_argument('--dummy-run', default=False, action='store_true', - help='Run configuration parser and datasets generation' + help='Run configuration parser and datasets generation ' 'without benchmarks running') parser.add_argument('--no-intel-optimized', default=False, action='store_true', help='Use no intel optimized version. ' @@ -69,7 +67,6 @@ def generate_cases(params): help='Create an Excel report based on benchmarks results. 
' 'Need "openpyxl" library') args = parser.parse_args() - env = os.environ.copy() logging.basicConfig( stream=sys.stdout, format='%(levelname)s: %(message)s', level=args.verbose) @@ -90,8 +87,6 @@ def generate_cases(params): with open(config_name, 'r') as config_file: config = json.load(config_file) - if 'omp_env' not in config.keys(): - config['omp_env'] = [] # get parameters that are common for all cases common_params = config['common'] for params_set in config['cases']: @@ -107,34 +102,21 @@ def generate_cases(params): for dataset in params_set['dataset']: if dataset['source'] in ['csv', 'npy']: - train_data = dataset["training"] - file_train_data_x = train_data["x"] - paths = f'--file-X-train {file_train_data_x}' - if 'y' in dataset['training'].keys(): - file_train_data_y = train_data["y"] - paths += f' --file-y-train {file_train_data_y}' - if 'testing' in dataset.keys(): - test_data = dataset["testing"] - file_test_data_x = test_data["x"] - paths += f' --file-X-test {file_test_data_x}' - if 'y' in dataset['testing'].keys(): - file_test_data_y = test_data["y"] - paths += f' --file-y-test {file_test_data_y}' - if 'name' in dataset.keys(): - dataset_name = dataset['name'] - else: - dataset_name = 'unknown' - - if not utils.is_exists_files([file_train_data_x]): - directory_dataset = pathlib.Path(file_train_data_x).parent - if not try_load_dataset(dataset_name=dataset_name, - output_directory=directory_dataset): - logging.warning(f'Dataset {dataset_name} ' - 'could not be loaded. \n' - 'Check the correct name or expand ' - 'the download in the folder dataset.') - continue - + dataset_name = dataset['name'] if 'name' in dataset else 'unknown' + if 'training' not in dataset or not utils.find_the_dataset( + dataset_name, dataset['training']["x"]): + logging.warning( + f'Dataset {dataset_name} could not be loaded. \n' + 'Check the correct name or expand the download in ' + 'the folder dataset.') + continue + paths = '--file-X-train ' + dataset['training']["x"] + if 'y' in dataset['training']: + paths += ' --file-y-train ' + dataset['training']["y"] + if 'testing' in dataset: + paths += ' --file-X-test ' + dataset["testing"]["x"] + if 'y' in dataset['testing']: + paths += ' --file-y-test ' + dataset["testing"]["y"] elif dataset['source'] == 'synthetic': class GenerationArgs: classes: int @@ -151,7 +133,7 @@ class GenerationArgs: gen_args = GenerationArgs() paths = '' - if 'seed' in params_set.keys(): + if 'seed' in params_set: gen_args.seed = params_set['seed'] else: gen_args.seed = 777 @@ -161,10 +143,10 @@ class GenerationArgs: gen_args.type = dataset['type'] gen_args.samples = dataset['training']['n_samples'] gen_args.features = dataset['n_features'] - if 'n_classes' in dataset.keys(): + if 'n_classes' in dataset: gen_args.classes = dataset['n_classes'] cls_num_for_file = f'-{dataset["n_classes"]}' - elif 'n_clusters' in dataset.keys(): + elif 'n_clusters' in dataset: gen_args.clusters = dataset['n_clusters'] cls_num_for_file = f'-{dataset["n_clusters"]}' else: @@ -179,7 +161,7 @@ class GenerationArgs: gen_args.filey = f'{file_prefix}y-train{file_postfix}' paths += f' --file-y-train {gen_args.filey}' - if 'testing' in dataset.keys(): + if 'testing' in dataset: gen_args.test_samples = dataset['testing']['n_samples'] gen_args.filextest = f'{file_prefix}X-test{file_postfix}' paths += f' --file-X-test {gen_args.filextest}' @@ -204,21 +186,21 @@ class GenerationArgs: logging.warning('Unknown dataset source. 
Only synthetics datasets ' 'and csv/npy files are supported now') - omp_env = utils.get_omp_env() no_intel_optimize = \ '--no-intel-optimized ' if args.no_intel_optimized else '' for lib in libs: env = os.environ.copy() - if lib == 'xgboost': + if lib == 'xgboost' and 'omp_env' in config: + omp_env = utils.get_omp_env() for var in config['omp_env']: - env[var] = omp_env[var] + if var in omp_env: + env[var] = omp_env[var] for i, case in enumerate(cases): command = f'python {lib}_bench/{algorithm}.py ' \ + no_intel_optimize \ + f'--arch {hostname} {case} {paths} ' \ + f'--dataset-name {dataset_name}' - while ' ' in command: - command = command.replace(' ', ' ') + command = ' '.join(command.split()) logging.info(command) if not args.dummy_run: case = f'{lib},{algorithm} ' + case diff --git a/utils.py b/utils.py index a3cfa7a68..d7db2e814 100755 --- a/utils.py +++ b/utils.py @@ -18,19 +18,20 @@ import logging import multiprocessing import os +import pathlib import platform import subprocess import sys +from datasets.load_datasets import try_load_dataset + def filter_stderr(text): # delete 'Intel(R) DAAL usage in sklearn' messages fake_error_message = 'Intel(R) oneAPI Data Analytics Library solvers ' + \ 'for sklearn enabled: ' + \ 'https://intelpython.github.io/daal4py/sklearn.html' - while fake_error_message in text: - text = text.replace(fake_error_message, '') - return text + return ''.join(text.split(fake_error_message)) def filter_stdout(text): @@ -51,9 +52,10 @@ def filter_stdout(text): return filtered, extra -def is_exists_files(files): - for f in files: - if not os.path.isfile(f): +def find_the_dataset(name: str, fullpath: str) -> bool: + if not os.path.isfile(fullpath): + if not try_load_dataset(dataset_name=name, + output_directory=pathlib.Path(fullpath).parent): return False return True @@ -89,11 +91,10 @@ def get_omp_env(): cpu_count = multiprocessing.cpu_count() omp_num_threads = str(cpu_count // 2) if _is_ht_enabled() else str(cpu_count) - omp_env = { + return { 'OMP_PLACES': f'{{0}}:{cpu_count}:1', 'OMP_NUM_THREADS': omp_num_threads } - return omp_env def get_hw_parameters(): diff --git a/xgboost_bench/gbt.py b/xgboost_bench/gbt.py index c903e6008..b19f1ef90 100644 --- a/xgboost_bench/gbt.py +++ b/xgboost_bench/gbt.py @@ -34,8 +34,7 @@ def convert_xgb_predictions(y_pred, objective): return y_pred -parser = argparse.ArgumentParser(description='xgboost gradient boosted trees ' - 'benchmark') +parser = argparse.ArgumentParser(description='xgboost gradient boosted trees benchmark') parser.add_argument('--n-estimators', type=int, default=100, help='Number of gradient boosted trees') From 56de8f7278f17da9d4360169e78f909d9428a1a6 Mon Sep 17 00:00:00 2001 From: Igor Rukhovich Date: Mon, 29 Mar 2021 12:08:14 +0300 Subject: [PATCH 07/31] First attempt of adding nvidia datasets --- bench.py | 13 +- configs/cuml_config.json | 1 - configs/lgbm_mb_cpu_config.json | 1 - configs/xgb_cpu_config.json | 190 +++++----- configs/xgb_cpu_nvda_config.json | 155 ++++++++ configs/xgb_gpu_config.json | 1 - configs/xgb_mb_cpu_config.json | 1 - datasets/load_datasets.py | 39 +- datasets/loader.py | 615 ++++++++++++++++++++++--------- runner.py | 41 +-- utils.py | 152 ++++---- xgboost_bench/gbt.py | 4 - 12 files changed, 795 insertions(+), 418 deletions(-) create mode 100644 configs/xgb_cpu_nvda_config.json diff --git a/bench.py b/bench.py index 527cac394..3a0c7e598 100644 --- a/bench.py +++ b/bench.py @@ -16,6 +16,7 @@ import argparse import json +import logging import sys import timeit @@ -196,8 +197,8 @@ 
def parse_args(parser, size=None, loop_types=(), from daal4py.sklearn import patch_sklearn patch_sklearn() except ImportError: - print('Failed to import daal4py.sklearn.patch_sklearn.' - 'Use stock version scikit-learn', file=sys.stderr) + logging.info('Failed to import daal4py.sklearn.patch_sklearn.' + 'Use stock version scikit-learn', file=sys.stderr) # disable finiteness check (default) if not params.check_finiteness: @@ -206,7 +207,7 @@ def parse_args(parser, size=None, loop_types=(), # Ask DAAL what it thinks about this number of threads num_threads = prepare_daal_threads(num_threads=params.threads) if params.verbose: - print(f'@ DAAL gave us {num_threads} threads') + logging.info(f'@ DAAL gave us {num_threads} threads') n_jobs = None if n_jobs_supported: @@ -222,7 +223,7 @@ def parse_args(parser, size=None, loop_types=(), # Very verbose output if params.verbose: - print(f'@ params = {params.__dict__}') + logging.info(f'@ params = {params.__dict__}') return params @@ -237,8 +238,8 @@ def set_daal_num_threads(num_threads): if num_threads: daal4py.daalinit(nthreads=num_threads) except ImportError: - print('@ Package "daal4py" was not found. Number of threads ' - 'is being ignored') + logging.info('@ Package "daal4py" was not found. Number of threads ' + 'is being ignored') def prepare_daal_threads(num_threads=-1): diff --git a/configs/cuml_config.json b/configs/cuml_config.json index 01ec8333b..70361023e 100755 --- a/configs/cuml_config.json +++ b/configs/cuml_config.json @@ -1,5 +1,4 @@ { - "omp_env": ["OMP_NUM_THREADS"], "common": { "lib": ["cuml"], "data-format": ["cudf"], diff --git a/configs/lgbm_mb_cpu_config.json b/configs/lgbm_mb_cpu_config.json index e8a2111da..3f1e12e5f 100755 --- a/configs/lgbm_mb_cpu_config.json +++ b/configs/lgbm_mb_cpu_config.json @@ -1,5 +1,4 @@ { - "omp_env": ["OMP_NUM_THREADS", "OMP_PLACES"], "common": { "lib": ["modelbuilders"], "data-format": ["pandas"], diff --git a/configs/xgb_cpu_config.json b/configs/xgb_cpu_config.json index ecc0da15b..56ab27929 100644 --- a/configs/xgb_cpu_config.json +++ b/configs/xgb_cpu_config.json @@ -1,163 +1,153 @@ { - "omp_env": ["OMP_NUM_THREADS", "OMP_PLACES"], "common": { - "lib": ["xgboost"], - "data-format": ["pandas"], - "data-order": ["F"], - "dtype": ["float32"], - "count-dmatrix": [""] + "lib": "xgboost", + "data-format": "pandas", + "data-order": "F", + "dtype": "float32", + "count-dmatrix":"", + "algorithm": "gbt", + "tree-method": "hist", + "num-threads": 56 }, "cases": [ { - "algorithm": "gbt", "dataset": [ { - "source": "csv", - "name": "plasticc", + "source": "csv", + "name": "plasticc", "training": { - "x": "data/plasticc_x_train.csv", - "y": "data/plasticc_y_train.csv" + "x": "data/plasticc_x_train.csv", + "y": "data/plasticc_y_train.csv" }, "testing": { - "x": "data/plasticc_x_test.csv", - "y": "data/plasticc_y_test.csv" + "x": "data/plasticc_x_test.csv", + "y": "data/plasticc_y_test.csv" } } ], - "n-estimators": [60], - "objective": ["multi:softprob"], - "tree-method": ["hist"], - "max-depth": [7], - "subsample": [0.7], - "colsample-bytree": [0.7] + "n-estimators": 60, + "objective": "multi:softprob", + "max-depth": 7, + "subsample": 0.7, + "colsample-bytree": 0.7 }, { - "algorithm": "gbt", "dataset": [ { - "source": "csv", - "name": "santander", + "source": "csv", + "name": "santander", "training": { - "x": "data/santander_x_train.csv", - "y": "data/santander_y_train.csv" + "x": "data/santander_x_train.csv", + "y": "data/santander_y_train.csv" } } ], - "n-estimators": [10000], - "objective": 
["binary:logistic"], - "tree-method": ["hist"], - "max-depth": [1], - "subsample": [0.5], - "eta": [0.1], - "colsample-bytree": [0.05], - "single-precision-histogram": [""] + "n-estimators": 10000, + "objective": "binary:logistic", + "max-depth": 1, + "subsample": 0.5, + "eta": 0.1, + "colsample-bytree": 0.05, + "single-precision-histogram": "" }, { - "algorithm": "gbt", "dataset": [ { - "source": "csv", - "name": "mortgage1Q", + "source": "csv", + "name": "mortgage1Q", "training": { - "x": "data/mortgage_x.csv", - "y": "data/mortgage_y.csv" + "x": "data/mortgage_x.csv", + "y": "data/mortgage_y.csv" } } ], - "n-estimators": [100], - "objective": ["reg:squarederror"], - "tree-method": ["hist"], - "max-depth": [8], - "scale-pos-weight": [2], - "learning-rate": [0.1], - "subsample": [1], - "reg-alpha": [0.9], - "reg-lambda": [1], - "min-child-weight": [0], - "max-leaves": [256] + "n-estimators": 100, + "objective": "reg:squarederror", + "max-depth": 8, + "scale-pos-weight": 2, + "learning-rate": 0.1, + "subsample": 1, + "reg-alpha": 0.9, + "reg-lambda": 1, + "min-child-weight": 0, + "max-leaves": 256 }, { - "algorithm": "gbt", "dataset": [ { - "source": "csv", - "name": "airline-ohe", + "source": "csv", + "name": "airline-ohe", "training": { - "x": "data/airline-ohe_x_train.csv", - "y": "data/airline-ohe_y_train.csv" + "x": "data/airline-ohe_x_train.csv", + "y": "data/airline-ohe_y_train.csv" } } ], - "reg-alpha": [0.9], - "max-bin": [256], - "scale-pos-weight": [2], - "learning-rate": [0.1], - "subsample": [1], - "reg-lambda": [1], - "min-child-weight": [0], - "max-depth": [8], - "max-leaves": [256], - "n-estimators": [1000], - "objective": ["binary:logistic"], - "tree-method": ["hist"] + "reg-alpha": 0.9, + "max-bin": 256, + "scale-pos-weight": 2, + "learning-rate": 0.1, + "subsample": 1, + "reg-lambda": 1, + "min-child-weight": 0, + "max-depth": 8, + "max-leaves": 256, + "n-estimators": 1000, + "objective": "binary:logistic" }, { - "algorithm": "gbt", "dataset": [ { - "source": "csv", - "name": "higgs1m", + "source": "csv", + "name": "higgs1m", "training": { - "x": "data/higgs1m_x_train.csv", - "y": "data/higgs1m_y_train.csv" + "x": "data/higgs1m_x_train.csv", + "y": "data/higgs1m_y_train.csv" } } ], - "reg-alpha": [0.9], - "max-bin": [256], - "scale-pos-weight": [2], - "learning-rate": [0.1], - "subsample": [1], - "reg-lambda": [1], - "min-child-weight": [0], - "max-depth": [8], - "max-leaves": [256], - "n-estimators": [1000], - "objective": ["binary:logistic"], - "tree-method": ["hist"], - "enable-experimental-json-serialization": ["False"], - "inplace-predict": [""] + "reg-alpha": 0.9, + "max-bin": 256, + "scale-pos-weight": 2, + "learning-rate": 0.1, + "subsample": 1, + "reg-lambda": 1, + "min-child-weight": 0, + "max-depth": 8, + "max-leaves": 256, + "n-estimators": 1000, + "objective": "binary:logistic", + "enable-experimental-json-serialization": "False", + "inplace-predict": "" }, { - "algorithm": "gbt", "dataset": [ { - "source": "csv", - "name": "msrank", + "source": "csv", + "name": "msrank", "training": { - "x": "data/mlsr_x_train.csv", - "y": "data/mlsr_y_train.csv" + "x": "data/mlsr_x_train.csv", + "y": "data/mlsr_y_train.csv" } } ], - "max-bin": [256], - "learning-rate": [0.3], - "subsample": [1], - "reg-lambda": [2], - "min-child-weight": [1], - "min-split-loss": [0.1], - "max-depth": [8], - "n-estimators": [200], - "objective": ["multi:softprob"], - "tree-method": ["hist"], - "single-precision-histogram": [""] + "max-bin": 256, + "learning-rate": 0.3, + "subsample": 1, + 
"reg-lambda": 2, + "min-child-weight": 1, + "min-split-loss": 0.1, + "max-depth": 8, + "n-estimators": 200, + "objective": "multi:softprob", + "single-precision-histogram": "" } ] } diff --git a/configs/xgb_cpu_nvda_config.json b/configs/xgb_cpu_nvda_config.json new file mode 100644 index 000000000..3efad4440 --- /dev/null +++ b/configs/xgb_cpu_nvda_config.json @@ -0,0 +1,155 @@ +{ + "common": { + "lib": ["xgboost"], + "data-format": "pandas", + "data-order": "F", + "dtype": "float32", + "algorithm": "gbt", + "tree-method": "hist", + "max-depth": 8, + "learning-rate":0.1, + "reg-lambda": 1, + "max-leaves": 256, + "num-threads": 48 + }, + "cases": [ + { + "objective": "binary:logistic", + "scale-pos-weight": 1, + "dataset": [ + { + "source": "npy", + "name": "airline", + "training": + { + "x": "data/airline_x_train.npy", + "y": "data/airline_y_train.npy" + }, + "testing": + { + "x": "data/airline_x_test.npy", + "y": "data/airline_y_test.npy" + } + } + ] + }, + { + "objective": "binary:logistic", + "scale-pos-weight": 1, + "dataset": [ + { + "source": "npy", + "name": "bosch", + "training": + { + "x": "data/bosch_x_train.npy", + "y": "data/bosch_y_train.npy" + }, + "testing": + { + "x": "data/bosch_x_test.npy", + "y": "data/bosch_y_test.npy" + } + } + ] + }, + { + "objective": "multi:softmax", + "dataset": [ + { + "source": "npy", + "name": "covtype", + "training": + { + "x": "data/covtype_x_train.npy", + "y": "data/covtype_y_train.npy" + }, + "testing": + { + "x": "data/covtype_x_test.npy", + "y": "data/covtype_y_test.npy" + } + } + ] + }, + { + "objective": "binary:logistic", + "scale-pos-weight": 1, + "dataset": [ + { + "source": "npy", + "name": "epsilon", + "training": + { + "x": "data/epsilon_x_train.npy", + "y": "data/epsilon_y_train.npy" + }, + "testing": + { + "x": "data/epsilon_x_test.npy", + "y": "data/epsilon_y_test.npy" + } + } + ] + }, + { + "objective": "binary:logistic", + "scale-pos-weight": 1, + "dataset": [ + { + "source": "npy", + "name": "fraud", + "training": + { + "x": "data/fraud_x_train.npy", + "y": "data/fraud_y_train.npy" + }, + "testing": + { + "x": "data/fraud_x_test.npy", + "y": "data/fraud_y_test.npy" + } + } + ] + }, + { + "objective": "binary:logistic", + "scale-pos-weight": 1, + "dataset": [ + { + "source": "npy", + "name": "higgs", + "training": + { + "x": "data/higgs_x_train.npy", + "y": "data/higgs_y_train.npy" + }, + "testing": + { + "x": "data/higgs_x_test.npy", + "y": "data/higgs_y_test.npy" + } + } + ] + }, + { + "objective": "reg:squarederror", + "dataset": [ + { + "source": "npy", + "name": "year", + "training": + { + "x": "data/year_x_train.npy", + "y": "data/year_y_train.npy" + }, + "testing": + { + "x": "data/year_x_test.npy", + "y": "data/year_y_test.npy" + } + } + ] + } + ] +} diff --git a/configs/xgb_gpu_config.json b/configs/xgb_gpu_config.json index 44d9aec45..7fa81e828 100644 --- a/configs/xgb_gpu_config.json +++ b/configs/xgb_gpu_config.json @@ -1,5 +1,4 @@ { - "omp_env": ["OMP_NUM_THREADS", "OMP_PLACES"], "common": { "lib": ["xgboost"], "data-format": ["cudf"], diff --git a/configs/xgb_mb_cpu_config.json b/configs/xgb_mb_cpu_config.json index 0c8128aef..eefc97fed 100755 --- a/configs/xgb_mb_cpu_config.json +++ b/configs/xgb_mb_cpu_config.json @@ -1,5 +1,4 @@ { - "omp_env": ["OMP_NUM_THREADS", "OMP_PLACES"], "common": { "lib": ["modelbuilders"], "data-format": ["pandas"], diff --git a/datasets/load_datasets.py b/datasets/load_datasets.py index 6c0af5a01..86c22c9e3 100755 --- a/datasets/load_datasets.py +++ b/datasets/load_datasets.py @@ 
-18,25 +18,44 @@ import logging import os import sys +from pathlib import Path +from typing import Callable, Dict -from .loader import (a9a, codrnanorm, connect, covertype, gisette, ijcnn, - klaverjas, mnist, sensit, skin_segmentation) +from .loader import (a_nine_a, airline, airline_ohe, bosch, codrnanorm, + connect, covertype, covtype, epsilon, fraud, gisette, + higgs, higgs_one_m, ijcnn, klaverjas, mnist, + mortgage_first_q, msrank, plasticc, santander, sensit, + skin_segmentation, year) -dataset_loaders = { - "a9a": a9a, + +dataset_loaders: Dict[str, Callable[[Path], bool]] = { + "a9a": a_nine_a, + "airline": airline, + "airline-ohe": airline_ohe, + "bosch": bosch, + "codrnanorm": codrnanorm, + "connect": connect, + "covertype": covertype, + "covtype": covtype, + "epsilon": epsilon, + "fraud": fraud, "gisette": gisette, + "higgs": higgs, + "higgs1m": higgs_one_m, "ijcnn": ijcnn, - "skin_segmentation": skin_segmentation, "klaverjas": klaverjas, - "connect": connect, "mnist": mnist, + "mortgage1Q": mortgage_first_q, + "msrank": msrank, + "plasticc": plasticc, + "santander": santander, "sensit": sensit, - "covertype": covertype, - "codrnanorm": codrnanorm, + "skin_segmentation": skin_segmentation, + "year": year, } -def try_load_dataset(dataset_name, output_directory): +def try_load_dataset(dataset_name: str, output_directory: Path) -> bool: if dataset_name in dataset_loaders: try: return dataset_loaders[dataset_name](output_directory) @@ -64,7 +83,7 @@ def try_load_dataset(dataset_name, output_directory): print(key) sys.exit(0) - root_dir = os.environ['DATASETSROOT'] + root_dir = Path(os.environ['DATASETSROOT']) if args.datasets is not None: for val in dataset_loaders.values(): diff --git a/datasets/loader.py b/datasets/loader.py index 055fd52a6..50bc2a5be 100755 --- a/datasets/loader.py +++ b/datasets/loader.py @@ -16,15 +16,37 @@ import logging import os +from pathlib import Path +from typing import Any from urllib.request import urlretrieve import numpy as np import pandas as pd -from sklearn.datasets import fetch_openml +import tqdm +from sklearn.datasets import fetch_covtype, fetch_openml, load_svmlight_file from sklearn.model_selection import train_test_split +pbar: tqdm.tqdm = None -def a9a(dataset_dir=None): + +def _show_progress(block_num: int, block_size: int, total_size: int) -> None: + global pbar + if pbar is None: + pbar = tqdm.tqdm(total=total_size / 1024, unit='kB') + + downloaded = block_num * block_size + if downloaded < total_size: + pbar.update(block_size / 1024) + else: + pbar.close() + pbar = None + + +def _retrieve(url: str, filename: str) -> None: + urlretrieve(url, filename, reporthook=_show_progress) + + +def a_nine_a(dataset_dir: Path) -> bool: """ Author: Ronny Kohavi","Barry Becker libSVM","AAD group @@ -61,30 +83,111 @@ def a9a(dataset_dir=None): return True -def ijcnn(dataset_dir=None): +def airline(dataset_dir: Path) -> bool: + dataset_name = 'airline' + os.makedirs(dataset_dir, exist_ok=True) + + url = 'http://kt.ijs.si/elena_ikonomovska/datasets/airline/airline_14col.data.bz2' + local_url = os.path.join(dataset_dir, os.path.basename(url)) + if not os.path.isfile(local_url): + logging.info(f'Started loading {dataset_name}') + _retrieve(url, local_url) + logging.info(f'{dataset_name} is loaded, started parsing...') + + cols = [ + "Year", "Month", "DayofMonth", "DayofWeek", "CRSDepTime", + "CRSArrTime", "UniqueCarrier", "FlightNum", "ActualElapsedTime", + "Origin", "Dest", "Distance", "Diverted", "ArrDelay" + ] + + # load the data as int16 + dtype = 
np.int16 + + dtype_columns = { + "Year": dtype, "Month": dtype, "DayofMonth": dtype, "DayofWeek": dtype, + "CRSDepTime": dtype, "CRSArrTime": dtype, "FlightNum": dtype, + "ActualElapsedTime": dtype, "Distance": + dtype, + "Diverted": dtype, "ArrDelay": dtype, + } + + df: Any = pd.read_csv(local_url, names=cols, dtype=dtype_columns) + + # Encode categoricals as numeric + for col in df.select_dtypes(['object']).columns: + df[col] = df[col].astype("category").cat.codes + + # Turn into binary classification problem + df["ArrDelayBinary"] = 1 * (df["ArrDelay"] > 0) + + X = df[df.columns.difference(["ArrDelay", "ArrDelayBinary"]) + ].to_numpy(dtype=np.float32) + y = df["ArrDelayBinary"].to_numpy(dtype=np.float32) + del df + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77, + test_size=0.2, + ) + for data, name in zip((X_train, X_test, y_train, y_test), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset {dataset_name} is ready.') + return True + + +def airline_ohe(dataset_dir: Path) -> bool: + return False + + +def bosch(dataset_dir: Path) -> bool: + dataset_name = 'bosch' + os.makedirs(dataset_dir, exist_ok=True) + + filename = "train_numeric.csv.zip" + local_url = os.path.join(dataset_dir, filename) + + if not os.path.isfile(local_url): + logging.info(f'Started loading {dataset_name}') + os.system( + "kaggle competitions download -c bosch-production-line-performance -f " + + filename + " -p " + str(dataset_dir)) + logging.info(f'{dataset_name} is loaded, started parsing...') + X = pd.read_csv(local_url, index_col=0, compression='zip', dtype=np.float32) + y = X.iloc[:, -1].to_numpy(dtype=np.float32) + X.drop(X.columns[-1], axis=1, inplace=True) + X_np = X.to_numpy(dtype=np.float32) + X_train, X_test, y_train, y_test = train_test_split(X_np, y, random_state=77, + test_size=0.2, + ) + for data, name in zip((X_train, X_test, y_train, y_test), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset {dataset_name} is ready.') + return True + + +def codrnanorm(dataset_dir: Path) -> bool: """ - Author: Danil Prokhorov. - libSVM,AAD group - Cite: Danil Prokhorov. IJCNN 2001 neural network competition. - Slide presentation in IJCNN'01, - Ford Research Laboratory, 2001. http://www.geocities.com/ijcnn/nnc_ijcnn01.pdf. + Abstract: Detection of non-coding RNAs on the basis of predicted secondary + structure formation free energy change. + Author: Andrew V Uzilov,Joshua M Keegan,David H Mathews. + Source: [original](http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets) Classification task. n_classes = 2. 
- ijcnn X train dataset (153344, 22) - ijcnn y train dataset (153344, 1) - ijcnn X test dataset (38337, 22) - ijcnn y test dataset (38337, 1) + codrnanorm X train dataset (390852, 8) + codrnanorm y train dataset (390852, 1) + codrnanorm X test dataset (97713, 8) + codrnanorm y test dataset (97713, 1) """ - dataset_name = 'ijcnn' + dataset_name = 'codrnanorm' os.makedirs(dataset_dir, exist_ok=True) - X, y = fetch_openml(name='ijcnn', return_X_y=True, + X, y = fetch_openml(name='codrnaNorm', return_X_y=True, as_frame=False, data_home=dataset_dir) X = pd.DataFrame(X.todense()) y = pd.DataFrame(y) - y[y == -1] = 0 - logging.info(f'{dataset_name} dataset is downloaded') logging.info('reading CSV file...') @@ -99,33 +202,32 @@ def ijcnn(dataset_dir=None): return True -def skin_segmentation(dataset_dir=None): +def connect(dataset_dir: Path) -> bool: """ - Abstract: - The Skin Segmentation dataset is constructed over B, G, R color space. - Skin and Nonskin dataset is generated using skin textures from - face images of diversity of age, gender, and race people. - Author: Rajen Bhatt, Abhinav Dhall, rajen.bhatt '@' gmail.com, IIT Delhi. + Source: + UC Irvine Machine Learning Repository + http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass.htm - Classification task. n_classes = 2. - skin_segmentation X train dataset (196045, 3) - skin_segmentation y train dataset (196045, 1) - skin_segmentation X test dataset (49012, 3) - skin_segmentation y test dataset (49012, 1) + Classification task. n_classes = 3. + connect X train dataset (196045, 127) + connect y train dataset (196045, 1) + connect X test dataset (49012, 127) + connect y test dataset (49012, 1) """ - dataset_name = 'skin_segmentation' + dataset_name = 'connect' os.makedirs(dataset_dir, exist_ok=True) - X, y = fetch_openml(name='skin-segmentation', - return_X_y=True, as_frame=True, data_home=dataset_dir) + X, y = fetch_openml(name='connect-4', return_X_y=True, + as_frame=False, data_home=dataset_dir) + X = pd.DataFrame(X.todense()) + y = pd.DataFrame(y) y = y.astype(int) - y[y == 2] = 0 logging.info(f'{dataset_name} dataset is downloaded') logging.info('reading CSV file...') x_train, x_test, y_train, y_test = train_test_split( - X, y, test_size=0.2, random_state=42) + X, y, test_size=0.1, random_state=42) for data, name in zip((x_train, x_test, y_train, y_test), ('x_train', 'x_test', 'y_train', 'y_test')): filename = f'{dataset_name}_{name}.csv' @@ -135,34 +237,31 @@ def skin_segmentation(dataset_dir=None): return True -def klaverjas(dataset_dir=None): +def covertype(dataset_dir: Path) -> bool: """ - Abstract: - Klaverjas is an example of the Jack-Nine card games, - which are characterized as trick-taking games where the the Jack - and nine of the trump suit are the highest-ranking trumps, and - the tens and aces of other suits are the most valuable cards - of these suits. It is played by four players in two teams. + Abstract: This is the original version of the famous + covertype dataset in ARFF format. + Author: Jock A. Blackard, Dr. Denis J. Dean, Dr. Charles W. Anderson + Source: [original](https://archive.ics.uci.edu/ml/datasets/covertype) - Task Information: - Classification task. n_classes = 2. - klaverjas X train dataset (196045, 3) - klaverjas y train dataset (196045, 1) - klaverjas X test dataset (49012, 3) - klaverjas y test dataset (49012, 1) + Classification task. n_classes = 7. 
+ covertype X train dataset (390852, 54) + covertype y train dataset (390852, 1) + covertype X test dataset (97713, 54) + covertype y test dataset (97713, 1) """ - dataset_name = 'klaverjas' + dataset_name = 'covertype' os.makedirs(dataset_dir, exist_ok=True) - X, y = fetch_openml(name='Klaverjas2018', return_X_y=True, + X, y = fetch_openml(name='covertype', version=3, return_X_y=True, as_frame=True, data_home=dataset_dir) + y = y.astype(int) - y = y.cat.codes logging.info(f'{dataset_name} dataset is downloaded') logging.info('reading CSV file...') x_train, x_test, y_train, y_test = train_test_split( - X, y, train_size=0.2, random_state=42) + X, y, test_size=0.2, random_state=42) for data, name in zip((x_train, x_test, y_train, y_test), ('x_train', 'x_test', 'y_train', 'y_test')): filename = f'{dataset_name}_{name}.csv' @@ -172,32 +271,221 @@ def klaverjas(dataset_dir=None): return True -def connect(dataset_dir=None): +def covtype(dataset_dir: Path) -> bool: + dataset_name = 'covtype' + os.makedirs(dataset_dir, exist_ok=True) + + logging.info(f'Started loading {dataset_name}') + X, y = fetch_covtype(return_X_y=True) # pylint: disable=unexpected-keyword-arg + logging.info(f'{dataset_name} is loaded, started parsing...') + + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77, + test_size=0.2, + ) + for data, name in zip((X_train, X_test, y_train, y_test), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset {dataset_name} is ready.') + return True + + +def epsilon(dataset_dir: Path) -> bool: + dataset_name = 'epsilon' + os.makedirs(dataset_dir, exist_ok=True) + + url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ + '/epsilon_normalized.bz2' + url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ + '/epsilon_normalized.t.bz2' + local_url_train = os.path.join(dataset_dir, os.path.basename(url_train)) + local_url_test = os.path.join(dataset_dir, os.path.basename(url_test)) + + if not os.path.isfile(local_url_train): + logging.info(f'Started loading {dataset_name}, train') + _retrieve(url_train, local_url_train) + if not os.path.isfile(local_url_test): + logging.info(f'Started loading {dataset_name}, test') + _retrieve(url_test, local_url_test) + logging.info(f'{dataset_name} is loaded, started parsing...') + X_train, y_train = load_svmlight_file(local_url_train, + dtype=np.float32) + X_test, y_test = load_svmlight_file(local_url_test, + dtype=np.float32) + X_train = X_train.toarray() + X_test = X_test.toarray() + y_train[y_train <= 0] = 0 + y_test[y_test <= 0] = 0 + + for data, name in zip((X_train, X_test, y_train, y_test), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset {dataset_name} is ready.') + return True + + +def fraud(dataset_dir: Path) -> bool: + dataset_name = 'fraud' + os.makedirs(dataset_dir, exist_ok=True) + + filename = "creditcard.csv" + local_url = os.path.join(dataset_dir, filename) + + if not os.path.isfile(local_url): + logging.info(f'Started loading {dataset_name}') + os.system("kaggle datasets download mlg-ulb/creditcardfraud -f" + + filename + " -p " + str(dataset_dir)) + logging.info(f'{dataset_name} is loaded, started parsing...') + + df = pd.read_csv(local_url + ".zip", dtype=np.float32) + X = df[[col for col in df.columns if 
col.startswith('V')]].to_numpy(dtype=np.float32) + y = df['Class'].to_numpy(dtype=np.float32) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77, + test_size=0.2, + ) + for data, name in zip((X_train, X_test, y_train, y_test), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset {dataset_name} is ready.') + return True + + +def gisette(dataset_dir: Path) -> bool: """ - Source: - UC Irvine Machine Learning Repository - http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass.htm + GISETTE is a handwritten digit recognition problem. + The problem is to separate the highly confusable digits '4' and '9'. + This dataset is one of five datasets of the NIPS 2003 feature selection challenge. - Classification task. n_classes = 3. - connect X train dataset (196045, 127) - connect y train dataset (196045, 1) - connect X test dataset (49012, 127) - connect y test dataset (49012, 1) + Classification task. n_classes = 2. + gisette X train dataset (6000, 5000) + gisette y train dataset (6000, 1) + gisette X test dataset (1000, 5000) + gisette y test dataset (1000, 1) """ - dataset_name = 'connect' + dataset_name = 'gisette' os.makedirs(dataset_dir, exist_ok=True) - X, y = fetch_openml(name='connect-4', return_X_y=True, + cache_dir = os.path.join(dataset_dir, '_gisette') + os.makedirs(cache_dir, exist_ok=True) + + domen_hhtp = 'http://archive.ics.uci.edu/ml/machine-learning-databases/' + + gisette_train_data_url = domen_hhtp + '/gisette/GISETTE/gisette_train.data' + filename_train_data = os.path.join(cache_dir, 'gisette_train.data') + if not os.path.exists(filename_train_data): + _retrieve(gisette_train_data_url, filename_train_data) + + gisette_train_labels_url = domen_hhtp + '/gisette/GISETTE/gisette_train.labels' + filename_train_labels = os.path.join(cache_dir, 'gisette_train.labels') + if not os.path.exists(filename_train_labels): + _retrieve(gisette_train_labels_url, filename_train_labels) + + gisette_test_data_url = domen_hhtp + '/gisette/GISETTE/gisette_valid.data' + filename_test_data = os.path.join(cache_dir, 'gisette_valid.data') + if not os.path.exists(filename_test_data): + _retrieve(gisette_test_data_url, filename_test_data) + + gisette_test_labels_url = domen_hhtp + '/gisette/gisette_valid.labels' + filename_test_labels = os.path.join(cache_dir, 'gisette_valid.labels') + if not os.path.exists(filename_test_labels): + _retrieve(gisette_test_labels_url, filename_test_labels) + + logging.info('gisette dataset is downloaded') + logging.info('reading CSV file...') + + num_cols = 5000 + + df_train = pd.read_csv(filename_train_data, header=None) + df_labels = pd.read_csv(filename_train_labels, header=None) + num_train = 6000 + x_train_arr = df_train.iloc[:num_train].values + x_train = pd.DataFrame(np.array([np.fromstring( + elem[0], dtype=int, count=num_cols, sep=' ') for elem in x_train_arr])) + y_train_arr = df_labels.iloc[:num_train].values + y_train = pd.DataFrame((y_train_arr > 0).astype(int)) + + num_train = 1000 + df_test = pd.read_csv(filename_test_data, header=None) + df_labels = pd.read_csv(filename_test_labels, header=None) + x_test_arr = df_test.iloc[:num_train].values + x_test = pd.DataFrame(np.array( + [np.fromstring( + elem[0], + dtype=int, count=num_cols, sep=' ') + for elem in x_test_arr])) + y_test_arr = df_labels.iloc[:num_train].values + y_test = pd.DataFrame((y_test_arr > 0).astype(int)) + + for data, name in zip((x_train, x_test, 
y_train, y_test), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.csv' + data.to_csv(os.path.join(dataset_dir, filename), + header=False, index=False) + + logging.info('dataset gisette ready.') + return True + + +def higgs(dataset_dir: Path) -> bool: + dataset_name = 'higgs' + os.makedirs(dataset_dir, exist_ok=True) + + url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz' + local_url = os.path.join(dataset_dir, os.path.basename(url)) + if not os.path.isfile(local_url): + logging.info(f'Started loading {dataset_name}') + _retrieve(url, local_url) + logging.info(f'{dataset_name} is loaded, started parsing...') + + higgs = pd.read_csv(local_url) + X = higgs.iloc[:, 1:].to_numpy(dtype=np.float32) + y = higgs.iloc[:, 0].to_numpy(dtype=np.float32) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77, + test_size=0.2, + ) + for data, name in zip((X_train, X_test, y_train, y_test), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset {dataset_name} is ready.') + return True + + +def higgs_one_m(dataset_dir: Path) -> bool: + return False + + +def ijcnn(dataset_dir: Path) -> bool: + """ + Author: Danil Prokhorov. + libSVM,AAD group + Cite: Danil Prokhorov. IJCNN 2001 neural network competition. + Slide presentation in IJCNN'01, + Ford Research Laboratory, 2001. http://www.geocities.com/ijcnn/nnc_ijcnn01.pdf. + + Classification task. n_classes = 2. + ijcnn X train dataset (153344, 22) + ijcnn y train dataset (153344, 1) + ijcnn X test dataset (38337, 22) + ijcnn y test dataset (38337, 1) + """ + dataset_name = 'ijcnn' + os.makedirs(dataset_dir, exist_ok=True) + + X, y = fetch_openml(name='ijcnn', return_X_y=True, as_frame=False, data_home=dataset_dir) X = pd.DataFrame(X.todense()) y = pd.DataFrame(y) - y = y.astype(int) + + y[y == -1] = 0 logging.info(f'{dataset_name} dataset is downloaded') logging.info('reading CSV file...') x_train, x_test, y_train, y_test = train_test_split( - X, y, test_size=0.1, random_state=42) + X, y, test_size=0.2, random_state=42) for data, name in zip((x_train, x_test, y_train, y_test), ('x_train', 'x_test', 'y_train', 'y_test')): filename = f'{dataset_name}_{name}.csv' @@ -207,7 +495,44 @@ def connect(dataset_dir=None): return True -def mnist(dataset_dir=None): +def klaverjas(dataset_dir: Path) -> bool: + """ + Abstract: + Klaverjas is an example of the Jack-Nine card games, + which are characterized as trick-taking games where the the Jack + and nine of the trump suit are the highest-ranking trumps, and + the tens and aces of other suits are the most valuable cards + of these suits. It is played by four players in two teams. + + Task Information: + Classification task. n_classes = 2. 
+ klaverjas X train dataset (196045, 3) + klaverjas y train dataset (196045, 1) + klaverjas X test dataset (49012, 3) + klaverjas y test dataset (49012, 1) + """ + dataset_name = 'klaverjas' + os.makedirs(dataset_dir, exist_ok=True) + + X, y = fetch_openml(name='Klaverjas2018', return_X_y=True, + as_frame=True, data_home=dataset_dir) + + y = y.cat.codes + logging.info(f'{dataset_name} dataset is downloaded') + logging.info('reading CSV file...') + + x_train, x_test, y_train, y_test = train_test_split( + X, y, train_size=0.2, random_state=42) + for data, name in zip((x_train, x_test, y_train, y_test), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.csv' + data.to_csv(os.path.join(dataset_dir, filename), + header=False, index=False) + logging.info(f'dataset {dataset_name} ready.') + return True + + +def mnist(dataset_dir: Path) -> bool: """ Abstract: The MNIST database of handwritten digits with 784 features. @@ -246,7 +571,23 @@ def mnist(dataset_dir=None): return True -def sensit(dataset_dir=None): +def mortgage_first_q(dataset_dir: Path) -> bool: + return False + + +def msrank(dataset_dir: Path) -> bool: + return False + + +def plasticc(dataset_dir: Path) -> bool: + return False + + +def santander(dataset_dir: Path) -> bool: + return False + + +def sensit(dataset_dir: Path) -> bool: """ Abstract: Vehicle classification in distributed sensor networks. Author: M. Duarte, Y. H. Hu @@ -281,60 +622,27 @@ def sensit(dataset_dir=None): return True -def covertype(dataset_dir=None): +def skin_segmentation(dataset_dir: Path) -> bool: """ - Abstract: This is the original version of the famous - covertype dataset in ARFF format. - Author: Jock A. Blackard, Dr. Denis J. Dean, Dr. Charles W. Anderson - Source: [original](https://archive.ics.uci.edu/ml/datasets/covertype) - - Classification task. n_classes = 7. - covertype X train dataset (390852, 54) - covertype y train dataset (390852, 1) - covertype X test dataset (97713, 54) - covertype y test dataset (97713, 1) - """ - dataset_name = 'covertype' - os.makedirs(dataset_dir, exist_ok=True) - - X, y = fetch_openml(name='covertype', version=3, return_X_y=True, - as_frame=True, data_home=dataset_dir) - y = y.astype(int) - - logging.info(f'{dataset_name} dataset is downloaded') - logging.info('reading CSV file...') - - x_train, x_test, y_train, y_test = train_test_split( - X, y, test_size=0.2, random_state=42) - for data, name in zip((x_train, x_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.csv' - data.to_csv(os.path.join(dataset_dir, filename), - header=False, index=False) - logging.info(f'dataset {dataset_name} ready.') - return True - - -def codrnanorm(dataset_dir=None): - """ - Abstract: Detection of non-coding RNAs on the basis of predicted secondary - structure formation free energy change. - Author: Andrew V Uzilov,Joshua M Keegan,David H Mathews. - Source: [original](http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets) + Abstract: + The Skin Segmentation dataset is constructed over B, G, R color space. + Skin and Nonskin dataset is generated using skin textures from + face images of diversity of age, gender, and race people. + Author: Rajen Bhatt, Abhinav Dhall, rajen.bhatt '@' gmail.com, IIT Delhi. Classification task. n_classes = 2. 
- codrnanorm X train dataset (390852, 8) - codrnanorm y train dataset (390852, 1) - codrnanorm X test dataset (97713, 8) - codrnanorm y test dataset (97713, 1) + skin_segmentation X train dataset (196045, 3) + skin_segmentation y train dataset (196045, 1) + skin_segmentation X test dataset (49012, 3) + skin_segmentation y test dataset (49012, 1) """ - dataset_name = 'codrnanorm' + dataset_name = 'skin_segmentation' os.makedirs(dataset_dir, exist_ok=True) - X, y = fetch_openml(name='codrnaNorm', return_X_y=True, - as_frame=False, data_home=dataset_dir) - X = pd.DataFrame(X.todense()) - y = pd.DataFrame(y) + X, y = fetch_openml(name='skin-segmentation', + return_X_y=True, as_frame=True, data_home=dataset_dir) + y = y.astype(int) + y[y == 2] = 0 logging.info(f'{dataset_name} dataset is downloaded') logging.info('reading CSV file...') @@ -350,74 +658,29 @@ def codrnanorm(dataset_dir=None): return True -def gisette(dataset_dir=None): - """ - GISETTE is a handwritten digit recognition problem. - The problem is to separate the highly confusable digits '4' and '9'. - This dataset is one of five datasets of the NIPS 2003 feature selection challenge. - - Classification task. n_classes = 2. - gisette X train dataset (6000, 5000) - gisette y train dataset (6000, 1) - gisette X test dataset (1000, 5000) - gisette y test dataset (1000, 1) - """ - dataset_name = 'gisette' +def year(dataset_dir: Path) -> bool: + dataset_name = 'year' os.makedirs(dataset_dir, exist_ok=True) - cache_dir = os.path.join(dataset_dir, '_gisette') - os.makedirs(cache_dir, exist_ok=True) - - domen_hhtp = 'http://archive.ics.uci.edu/ml/machine-learning-databases/' - - gisette_train_data_url = domen_hhtp + '/gisette/GISETTE/gisette_train.data' - filename_train_data = os.path.join(cache_dir, 'gisette_train.data') - if not os.path.exists(filename_train_data): - urlretrieve(gisette_train_data_url, filename_train_data) - - gisette_train_labels_url = domen_hhtp + '/gisette/GISETTE/gisette_train.labels' - filename_train_labels = os.path.join(cache_dir, 'gisette_train.labels') - if not os.path.exists(filename_train_labels): - urlretrieve(gisette_train_labels_url, filename_train_labels) - - gisette_test_data_url = domen_hhtp + '/gisette/GISETTE/gisette_valid.data' - filename_test_data = os.path.join(cache_dir, 'gisette_valid.data') - if not os.path.exists(filename_test_data): - urlretrieve(gisette_test_data_url, filename_test_data) - - gisette_test_labels_url = domen_hhtp + '/gisette/gisette_valid.labels' - filename_test_labels = os.path.join(cache_dir, 'gisette_valid.labels') - if not os.path.exists(filename_test_labels): - urlretrieve(gisette_test_labels_url, filename_test_labels) - - logging.info('gisette dataset is downloaded') - logging.info('reading CSV file...') + url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00203/YearPredictionMSD.txt' \ + '.zip' + local_url = os.path.join(dataset_dir, os.path.basename(url)) + if not os.path.isfile(local_url): + logging.info(f'Started loading {dataset_name}') + _retrieve(url, local_url) + logging.info(f'{dataset_name} is loaded, started parsing...') - num_cols = 5000 + year = pd.read_csv(local_url, header=None) + X = year.iloc[:, 1:].to_numpy(dtype=np.float32) + y = year.iloc[:, 0].to_numpy(dtype=np.float32) - df_train = pd.read_csv(filename_train_data, header=None) - df_labels = pd.read_csv(filename_train_labels, header=None) - num_train = 6000 - x_train = df_train.iloc[:num_train].values - x_train = pd.DataFrame(np.array([np.fromstring( - elem[0], dtype=int, count=num_cols, 
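[Editorial note] Several classification loaders above normalize the raw target to a {0, 1} encoding before saving: ijcnn rewrites -1 as 0, skin_segmentation rewrites 2 as 0, and klaverjas takes the categorical codes. A tiny sketch of the shared pattern; the motivation suggested here (downstream binary objectives and accuracy metrics expecting zero-based labels) is an assumption, not stated in the patch:

import numpy as np
import pandas as pd

y = pd.Series([-1, 1, -1, 1])   # ijcnn-style labels as fetched
y[y == -1] = 0                  # same in-place remap as in the loader
assert set(np.unique(y)) == {0, 1}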
sep=' ') for elem in x_train])) - y_train = df_labels.iloc[:num_train].values - y_train = pd.DataFrame((y_train > 0).astype(int)) + X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, + train_size=463715, + test_size=51630) - num_train = 1000 - df_test = pd.read_csv(filename_test_data, header=None) - df_labels = pd.read_csv(filename_test_labels, header=None) - x_test = df_test.iloc[:num_train].values - x_test = pd.DataFrame(np.array( - [np.fromstring(elem[0], dtype=int, count=num_cols, sep=' ') for elem in x_test])) - y_test = df_labels.iloc[:num_train].values - y_test = pd.DataFrame((y_test > 0).astype(int)) - - for data, name in zip((x_train, x_test, y_train, y_test), + for data, name in zip((X_train, X_test, y_train, y_test), ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.csv' - data.to_csv(os.path.join(dataset_dir, filename), - header=False, index=False) - - logging.info('dataset gisette ready.') + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset {dataset_name} is ready.') return True diff --git a/runner.py b/runner.py index 678365b74..cdd7c0d9a 100755 --- a/runner.py +++ b/runner.py @@ -25,27 +25,6 @@ import utils -def generate_cases(params): - ''' - Generate cases for benchmarking by iterating of - parameters values - ''' - global cases - if len(params) == 0: - return cases - prev_length = len(cases) - param_name = list(params.keys())[0] - n_param_values = len(params[param_name]) - cases = cases * n_param_values - dashes = '-' if len(param_name) == 1 else '--' - for i in range(n_param_values): - for j in range(prev_length): - cases[prev_length * i + j] += f' {dashes}{param_name} ' \ - + f'{params[param_name][i]}' - del params[param_name] - generate_cases(params) - - if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--configs', metavar='ConfigPath', type=str, @@ -90,21 +69,22 @@ def generate_cases(params): # get parameters that are common for all cases common_params = config['common'] for params_set in config['cases']: - cases = [''] params = common_params.copy() params.update(params_set.copy()) algorithm = params['algorithm'] libs = params['lib'] del params['dataset'], params['algorithm'], params['lib'] - generate_cases(params) + cases = utils.generate_cases(params) logging.info(f'{algorithm} algorithm: {len(libs) * len(cases)} case(s),' f' {len(params_set["dataset"])} dataset(s)\n') for dataset in params_set['dataset']: if dataset['source'] in ['csv', 'npy']: dataset_name = dataset['name'] if 'name' in dataset else 'unknown' - if 'training' not in dataset or not utils.find_the_dataset( - dataset_name, dataset['training']["x"]): + if 'training' not in dataset or \ + 'x' not in dataset['training'] or \ + not utils.find_the_dataset(dataset_name, + dataset['training']['x']): logging.warning( f'Dataset {dataset_name} could not be loaded. 
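[Editorial note] The runner now delegates case expansion to utils.generate_cases (the helper itself is added to utils.py later in this patch). A usage sketch of what the call site above expects back, with made-up parameter values:

import utils

params = {'n-estimators': [50, 100], 'max-depth': 8}
cases = utils.generate_cases(params)
# One CLI suffix per combination of list values; scalar parameters are appended
# to every case:
#   [' --n-estimators 50 --max-depth 8', ' --n-estimators 100 --max-depth 8']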
\n' 'Check the correct name or expand the download in ' @@ -189,12 +169,6 @@ class GenerationArgs: no_intel_optimize = \ '--no-intel-optimized ' if args.no_intel_optimized else '' for lib in libs: - env = os.environ.copy() - if lib == 'xgboost' and 'omp_env' in config: - omp_env = utils.get_omp_env() - for var in config['omp_env']: - if var in omp_env: - env[var] = omp_env[var] for i, case in enumerate(cases): command = f'python {lib}_bench/{algorithm}.py ' \ + no_intel_optimize \ @@ -205,7 +179,7 @@ class GenerationArgs: if not args.dummy_run: case = f'{lib},{algorithm} ' + case stdout, stderr = utils.read_output_from_command( - command, env=env) + command, env=os.environ.copy()) stdout, extra_stdout = utils.filter_stdout(stdout) stderr = utils.filter_stderr(stderr) @@ -215,8 +189,7 @@ class GenerationArgs: stderr += f'CASE {case} EXTRA OUTPUT:\n' \ + f'{extra_stdout}\n' try: - json_result['results'].extend( - json.loads(stdout)) + json_result['results'] = json.loads(stdout) except json.JSONDecodeError as decoding_exception: stderr += f'CASE {case} JSON DECODING ERROR:\n' \ + f'{decoding_exception}\n{stdout}\n' diff --git a/utils.py b/utils.py index d7db2e814..5a62413d0 100755 --- a/utils.py +++ b/utils.py @@ -15,18 +15,17 @@ # =============================================================================== import json -import logging -import multiprocessing import os import pathlib import platform import subprocess import sys +from typing import Any, Dict, List, Tuple, Union, cast from datasets.load_datasets import try_load_dataset -def filter_stderr(text): +def filter_stderr(text: str) -> str: # delete 'Intel(R) DAAL usage in sklearn' messages fake_error_message = 'Intel(R) oneAPI Data Analytics Library solvers ' + \ 'for sklearn enabled: ' + \ @@ -34,7 +33,7 @@ def filter_stderr(text): return ''.join(text.split(fake_error_message)) -def filter_stdout(text): +def filter_stdout(text: str) -> Tuple[str, str]: verbosity_letters = 'EWIDT' filtered, extra = '', '' for line in text.split('\n'): @@ -53,14 +52,12 @@ def filter_stdout(text): def find_the_dataset(name: str, fullpath: str) -> bool: - if not os.path.isfile(fullpath): - if not try_load_dataset(dataset_name=name, - output_directory=pathlib.Path(fullpath).parent): - return False - return True + return os.path.isfile(fullpath) or try_load_dataset( + dataset_name=name, output_directory=pathlib.Path(fullpath).parent) -def read_output_from_command(command, env=os.environ.copy()): +def read_output_from_command(command: str, + env: Dict[str, str] = os.environ.copy()) -> Tuple[str, str]: if "PYTHONPATH" in env: env["PYTHONPATH"] += ":" + os.path.dirname(os.path.abspath(__file__)) else: @@ -70,85 +67,51 @@ def read_output_from_command(command, env=os.environ.copy()): return res.stdout[:-1], res.stderr[:-1] -def _is_ht_enabled(): +def get_hw_parameters() -> Dict[str, Union[Dict[str, str], float]]: + if 'Linux' not in platform.platform(): + return {} + + hw_params: Dict[str, Union[Dict[str, str], float]] = {'CPU': {}} + # get CPU information + lscpu_info, _ = read_output_from_command('lscpu') + lscpu_info = ' '.join(lscpu_info.split()) + for line in lscpu_info.split('\n'): + k, v = line.split(": ")[:2] + if k == 'CPU MHz': + continue + cast(Dict[str, str], hw_params['CPU'])[k] = v + + # get RAM size + mem_info, _ = read_output_from_command('free -b') + mem_info = mem_info.split('\n')[1] + mem_info = ' '.join(mem_info.split()) + hw_params['RAM size[GB]'] = int(mem_info.split(' ')[1]) / 2 ** 30 + + # get GPU information try: - cpu_info, _ = 
read_output_from_command('lscpu') - cpu_info = cpu_info.split('\n') - for el in cpu_info: - if 'Thread(s) per core' in el: - threads_per_core = int(el[-1]) - if threads_per_core > 1: - return True - else: - return False - return False - except FileNotFoundError: - logging.info('Impossible to check hyperthreading via lscpu') - return False - - -def get_omp_env(): - cpu_count = multiprocessing.cpu_count() - omp_num_threads = str(cpu_count // 2) if _is_ht_enabled() else str(cpu_count) - - return { - 'OMP_PLACES': f'{{0}}:{cpu_count}:1', - 'OMP_NUM_THREADS': omp_num_threads - } - - -def get_hw_parameters(): - hw_params = {} - - if 'Linux' in platform.platform(): - # get CPU information - lscpu_info, _ = read_output_from_command('lscpu') - # remove excess spaces in CPU info output - while ' ' in lscpu_info: - lscpu_info = lscpu_info.replace(' ', ' ') - lscpu_info = lscpu_info.split('\n') - for i in range(len(lscpu_info)): - lscpu_info[i] = lscpu_info[i].split(': ') - hw_params.update( - {'CPU': {line[0]: line[1] for line in lscpu_info}}) - if 'CPU MHz' in hw_params['CPU'].keys(): - del hw_params['CPU']['CPU MHz'] - # get RAM size - mem_info, _ = read_output_from_command('free -b') - mem_info = mem_info.split('\n')[1] - while ' ' in mem_info: - mem_info = mem_info.replace(' ', ' ') - mem_info = int(mem_info.split(' ')[1]) / 2 ** 30 - hw_params.update({'RAM size[GB]': mem_info}) - # get GPU information - try: - gpu_info, _ = read_output_from_command( - 'nvidia-smi --query-gpu=name,memory.total,driver_version,pstate ' - '--format=csv,noheader') - gpu_info = gpu_info.split(', ') - hw_params.update({ - 'GPU': { - 'Name': gpu_info[0], - 'Memory size': gpu_info[1], - 'Performance mode': gpu_info[3] - } - }) - except (FileNotFoundError, json.JSONDecodeError): - pass + gpu_info, _ = read_output_from_command( + 'nvidia-smi --query-gpu=name,memory.total,driver_version,pstate ' + '--format=csv,noheader') + info_arr = gpu_info.split(', ') + hw_params['GPU'] = { + 'Name': info_arr[0], + 'Memory size': info_arr[1], + 'Performance mode': info_arr[3] + } + except (FileNotFoundError, json.JSONDecodeError): + pass return hw_params -def get_sw_parameters(): +def get_sw_parameters() -> Dict[str, Dict[str, Any]]: sw_params = {} try: gpu_info, _ = read_output_from_command( 'nvidia-smi --query-gpu=name,memory.total,driver_version,pstate ' '--format=csv,noheader') - gpu_info = gpu_info.split(', ') - - sw_params.update( - {'GPU_driver': {'version': gpu_info[2]}}) + info_arr = gpu_info.split(', ') + sw_params['GPU_driver'] = {'version': info_arr[2]} # alert if GPU is already running any processes gpu_processes, _ = read_output_from_command( 'nvidia-smi --query-compute-apps=name,pid,used_memory ' @@ -163,14 +126,35 @@ def get_sw_parameters(): try: conda_list, _ = read_output_from_command('conda list --json') needed_columns = ['version', 'build_string', 'channel'] - conda_list = json.loads(conda_list) - for pkg in conda_list: + conda_list_json: List[Dict[str, str]] = json.loads(conda_list) + for pkg in conda_list_json: pkg_info = {} for col in needed_columns: - if col in pkg.keys(): - pkg_info.update({col: pkg[col]}) - sw_params.update({pkg['name']: pkg_info}) + if col in pkg: + pkg_info[col] = pkg[col] + sw_params[pkg['name']] = pkg_info except (FileNotFoundError, json.JSONDecodeError): pass return sw_params + + +def generate_cases(params: Dict[str, Union[List[Any], Any]]) -> List[str]: + ''' + Generate cases for benchmarking by iterating the parameter values + ''' + commands = [''] + for param, values in params.items(): 
+ if isinstance(values, list): + prev_len = len(commands) + commands *= len(values) + dashes = '-' if len(param) == 1 else '--' + for command_num in range(prev_len): + for value_num in range(len(values)): + commands[prev_len * value_num + command_num] += ' ' + \ + dashes + param + ' ' + str(values[value_num]) + else: + dashes = '-' if len(param) == 1 else '--' + for command_num in range(len(commands)): + commands[command_num] += ' ' + dashes + param + ' ' + str(values) + return commands diff --git a/xgboost_bench/gbt.py b/xgboost_bench/gbt.py index b19f1ef90..43229d096 100644 --- a/xgboost_bench/gbt.py +++ b/xgboost_bench/gbt.py @@ -15,7 +15,6 @@ # =============================================================================== import argparse -import os import bench import numpy as np @@ -118,9 +117,6 @@ def convert_xgb_predictions(y_pred, objective): if params.threads != -1: xgb_params.update({'nthread': params.threads}) -if 'OMP_NUM_THREADS' in os.environ.keys(): - xgb_params['nthread'] = int(os.environ['OMP_NUM_THREADS']) - if params.objective.startswith('reg'): task = 'regression' metric_name, metric_func = 'rmse', bench.rmse_score From 04e7a64d76731c0d11d6443c3402a43e49275e58 Mon Sep 17 00:00:00 2001 From: Igor Rukhovich Date: Mon, 29 Mar 2021 16:50:50 +0300 Subject: [PATCH 08/31] removed E265 ignoring for flake8 job --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index eaef6f6e1..784acad03 100755 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -63,7 +63,7 @@ jobs: - script: | python -m pip install --upgrade pip setuptools pip install flake8 - flake8 --ignore=E265 --max-line-length=90 --count + flake8 --max-line-length=90 --count displayName: 'PEP 8 check' - job: Mypy pool: From b6a7eb04a24427e13c4e58c2d1e96fa127e91172 Mon Sep 17 00:00:00 2001 From: Igor Rukhovich Date: Tue, 30 Mar 2021 14:16:27 +0300 Subject: [PATCH 09/31] NVidia benchmarks are working now --- bench.py | 2 +- configs/xgb_cpu_nvda_config.json | 10 ++--- runner.py | 1 - xgboost_bench/gbt.py | 75 +++++++++++++++++--------------- 4 files changed, 47 insertions(+), 41 deletions(-) diff --git a/bench.py b/bench.py index 2f8f3d768..dc98e34db 100644 --- a/bench.py +++ b/bench.py @@ -486,7 +486,7 @@ def print_output(library, algorithm, stages, params, functions, output = [] for i in range(len(stages)): result = gen_basic_dict(library, algorithm, stages[i], params, - data[i], alg_instance, alg_params) + data[i], alg_instance, alg_params if i == 0 else None) result.update({'time[s]': times[i]}) if accuracy_type is not None: result.update({f'{accuracy_type}': accuracies[i]}) diff --git a/configs/xgb_cpu_nvda_config.json b/configs/xgb_cpu_nvda_config.json index 3efad4440..d387c861b 100644 --- a/configs/xgb_cpu_nvda_config.json +++ b/configs/xgb_cpu_nvda_config.json @@ -15,7 +15,7 @@ "cases": [ { "objective": "binary:logistic", - "scale-pos-weight": 1, + "scale-pos-weight": 2.1067817411664587, "dataset": [ { "source": "npy", @@ -35,7 +35,7 @@ }, { "objective": "binary:logistic", - "scale-pos-weight": 1, + "scale-pos-weight": 173.63348001466812, "dataset": [ { "source": "npy", @@ -74,7 +74,7 @@ }, { "objective": "binary:logistic", - "scale-pos-weight": 1, + "scale-pos-weight": 2.0017715678375363, "dataset": [ { "source": "npy", @@ -94,7 +94,7 @@ }, { "objective": "binary:logistic", - "scale-pos-weight": 1, + "scale-pos-weight": 578.2868020304569, "dataset": [ { "source": "npy", @@ -114,7 +114,7 @@ }, { "objective": "binary:logistic", - 
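[Editorial note] The scale-pos-weight values substituted into the NVIDIA config above look like per-dataset class-imbalance ratios. A worked sketch of the usual way such a number is obtained (the negatives-to-positives ratio recommended for XGBoost; the patch itself does not show this computation, so treat it as an assumption):

import numpy as np

y_train = np.load('data/airline-ohe_y_train.npy')   # any binary {0, 1} target
scale_pos_weight = float((y_train == 0).sum()) / float((y_train == 1).sum())
# Near-balanced data stays close to 1; heavily imbalanced targets produce large
# values such as the 173.63... and 578.28... entries above.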
"scale-pos-weight": 1, + "scale-pos-weight": 1.8872389605086624, "dataset": [ { "source": "npy", diff --git a/runner.py b/runner.py index cdd7c0d9a..7f6ca08fd 100755 --- a/runner.py +++ b/runner.py @@ -24,7 +24,6 @@ import datasets.make_datasets as make_datasets import utils - if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--configs', metavar='ConfigPath', type=str, diff --git a/xgboost_bench/gbt.py b/xgboost_bench/gbt.py index 43229d096..d7366b563 100644 --- a/xgboost_bench/gbt.py +++ b/xgboost_bench/gbt.py @@ -35,54 +35,58 @@ def convert_xgb_predictions(y_pred, objective): parser = argparse.ArgumentParser(description='xgboost gradient boosted trees benchmark') -parser.add_argument('--n-estimators', type=int, default=100, - help='Number of gradient boosted trees') -parser.add_argument('--learning-rate', '--eta', type=float, default=0.3, - help='Step size shrinkage used in update ' - 'to prevents overfitting') -parser.add_argument('--min-split-loss', '--gamma', type=float, default=0, - help='Minimum loss reduction required to make' - ' partition on a leaf node') -parser.add_argument('--max-depth', type=int, default=6, - help='Maximum depth of a tree') -parser.add_argument('--min-child-weight', type=float, default=1, - help='Minimum sum of instance weight needed in a child') -parser.add_argument('--max-delta-step', type=float, default=0, - help='Maximum delta step we allow each leaf output to be') -parser.add_argument('--subsample', type=float, default=1, - help='Subsample ratio of the training instances') + parser.add_argument('--colsample-bytree', type=float, default=1, help='Subsample ratio of columns ' 'when constructing each tree') -parser.add_argument('--reg-lambda', type=float, default=1, - help='L2 regularization term on weights') -parser.add_argument('--reg-alpha', type=float, default=0, - help='L1 regularization term on weights') -parser.add_argument('--tree-method', type=str, required=True, - help='The tree construction algorithm used in XGBoost') -parser.add_argument('--scale-pos-weight', type=float, default=1, - help='Controls a balance of positive and negative weights') +parser.add_argument('--count-dmatrix', default=False, action='store_true', + help='Count DMatrix creation in time measurements') +parser.add_argument('--enable-experimental-json-serialization', default=True, + choices=('True', 'False'), help='Use JSON to store memory snapshots') parser.add_argument('--grow-policy', type=str, default='depthwise', help='Controls a way new nodes are added to the tree') -parser.add_argument('--max-leaves', type=int, default=0, - help='Maximum number of nodes to be added') +parser.add_argument('--inplace-predict', default=False, action='store_true', + help='Perform inplace_predict instead of default') +parser.add_argument('--learning-rate', '--eta', type=float, default=0.3, + help='Step size shrinkage used in update ' + 'to prevents overfitting') parser.add_argument('--max-bin', type=int, default=256, help='Maximum number of discrete bins to ' 'bucket continuous features') +parser.add_argument('--max-delta-step', type=float, default=0, + help='Maximum delta step we allow each leaf output to be') +parser.add_argument('--max-depth', type=int, default=6, + help='Maximum depth of a tree') +parser.add_argument('--max-leaves', type=int, default=0, + help='Maximum number of nodes to be added') +parser.add_argument('--min-child-weight', type=float, default=1, + help='Minimum sum of instance weight needed in a child') 
+parser.add_argument('--min-split-loss', '--gamma', type=float, default=0, + help='Minimum loss reduction required to make' + ' partition on a leaf node') +parser.add_argument('--n-estimators', type=int, default=100, + help='Number of gradient boosted trees') parser.add_argument('--objective', type=str, required=True, choices=('reg:squarederror', 'binary:logistic', 'multi:softmax', 'multi:softprob'), - help='Control a balance of positive and negative weights') -parser.add_argument('--count-dmatrix', default=False, action='store_true', - help='Count DMatrix creation in time measurements') -parser.add_argument('--inplace-predict', default=False, action='store_true', - help='Perform inplace_predict instead of default') + help='Specifies the learning task') +parser.add_argument('--reg-alpha', type=float, default=0, + help='L1 regularization term on weights') +parser.add_argument('--reg-lambda', type=float, default=1, + help='L2 regularization term on weights') +parser.add_argument('--scale-pos-weight', type=float, default=1, + help='Controls a balance of positive and negative weights') parser.add_argument('--single-precision-histogram', default=False, action='store_true', help='Build histograms instead of double precision') -parser.add_argument('--enable-experimental-json-serialization', default=True, - choices=('True', 'False'), help='Use JSON to store memory snapshots') +parser.add_argument('--subsample', type=float, default=1, + help='Subsample ratio of the training instances') +parser.add_argument('--tree-method', type=str, required=True, + help='The tree construction algorithm used in XGBoost') params = bench.parse_args(parser) +# Default seed +if params.seed == 12345: + params.seed = 0 # Load and convert data X_train, X_test, y_train, y_test = bench.load_data(params) @@ -128,6 +132,9 @@ def convert_xgb_predictions(y_pred, objective): params.n_classes = y_train[y_train.columns[0]].nunique() else: params.n_classes = len(np.unique(y_train)) + # BE VERY CAREFUL ON IT!! 
It should only work for COVTYPE DATASET + if params.objective.startswith('multi:softmax'): + params.n_classes += 1 if params.n_classes > 2: xgb_params['num_class'] = params.n_classes @@ -165,4 +172,4 @@ def predict(): params=params, functions=['gbt.fit', 'gbt.predict'], times=[fit_time, predict_time], accuracy_type=metric_name, accuracies=[train_metric, test_metric], data=[X_train, X_test], - alg_instance=booster) + alg_instance=booster, alg_params=xgb_params) From 7e780bbd4d4fe6239d5972a732926e8b6b2bec7b Mon Sep 17 00:00:00 2001 From: Igor Rukhovich Date: Tue, 30 Mar 2021 15:24:58 +0300 Subject: [PATCH 10/31] Added higgs, msrank and airline fetching --- configs/xgb_cpu_config.json | 33 +++++-- datasets/loader.py | 173 +++++++++++++++++++++++++++++++++++- 2 files changed, 194 insertions(+), 12 deletions(-) diff --git a/configs/xgb_cpu_config.json b/configs/xgb_cpu_config.json index 56ab27929..7bbc09b2d 100644 --- a/configs/xgb_cpu_config.json +++ b/configs/xgb_cpu_config.json @@ -79,12 +79,17 @@ { "dataset": [ { - "source": "csv", + "source": "npy", "name": "airline-ohe", "training": { - "x": "data/airline-ohe_x_train.csv", - "y": "data/airline-ohe_y_train.csv" + "x": "data/airline-ohe_x_train.npy", + "y": "data/airline-ohe_y_train.npy" + }, + "testing": + { + "x": "data/airline-ohe_x_test.npy", + "y": "data/airline-ohe_y_test.npy" } } ], @@ -103,12 +108,17 @@ { "dataset": [ { - "source": "csv", + "source": "npy", "name": "higgs1m", "training": { - "x": "data/higgs1m_x_train.csv", - "y": "data/higgs1m_y_train.csv" + "x": "data/higgs1m_x_train.npy", + "y": "data/higgs1m_y_train.npy" + }, + "testing": + { + "x": "data/higgs1m_x_test.npy", + "y": "data/higgs1m_y_test.npy" } } ], @@ -129,12 +139,17 @@ { "dataset": [ { - "source": "csv", + "source": "npy", "name": "msrank", "training": { - "x": "data/mlsr_x_train.csv", - "y": "data/mlsr_y_train.csv" + "x": "data/msrank_x_train.npy", + "y": "data/msrank_y_train.npy" + }, + "testing": + { + "x": "data/msrank_x_test.npy", + "y": "data/msrank_y_test.npy" } } ], diff --git a/datasets/loader.py b/datasets/loader.py index 50bc2a5be..98409e73b 100755 --- a/datasets/loader.py +++ b/datasets/loader.py @@ -16,6 +16,8 @@ import logging import os +import re +import tarfile from pathlib import Path from typing import Any from urllib.request import urlretrieve @@ -46,6 +48,42 @@ def _retrieve(url: str, filename: str) -> None: urlretrieve(url, filename, reporthook=_show_progress) +def _read_libsvm_msrank(file_obj, n_samples, n_features, dtype): + X = np.zeros((n_samples, n_features)) + y = np.zeros((n_samples,)) + + counter = 0 + + regexp = re.compile(r'[A-Za-z0-9]+:(-?\d*\.?\d+)') + + for line in file_obj: + line = str(line).replace("\\n'", "") + line = regexp.sub('\g<1>', line) + line = line.rstrip(" \n\r").split(' ') + + y[counter] = int(line[0]) + X[counter] = [float(i) for i in line[1:]] + + counter += 1 + if counter == n_samples: + break + + return np.array(X, dtype=dtype), np.array(y, dtype=dtype) + + +def _make_gen(reader): + b = reader(1024 * 1024) + while b: + yield b + b = reader(1024 * 1024) + + +def _count_lines(filename): + with open(filename, 'rb') as f: + f_gen = _make_gen(f.read) + return sum(buf.count(b'\n') for buf in f_gen) + + def a_nine_a(dataset_dir: Path) -> bool: """ Author: Ronny Kohavi","Barry Becker @@ -136,7 +174,56 @@ def airline(dataset_dir: Path) -> bool: def airline_ohe(dataset_dir: Path) -> bool: - return False + """ + Dataset from szilard benchmarks: https://github.com/szilard/GBM-perf + TaskType:binclass + 
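[Editorial note] On the params.n_classes += 1 adjustment guarded by the "BE VERY CAREFUL" comment in gbt.py above: a plausible reading, offered as an assumption rather than something the patch states, is that the covtype labels stay 1-based (1..7) instead of being re-indexed from 0, and XGBoost requires every label to lie in [0, num_class), so num_class has to exceed the largest label value rather than equal the number of distinct classes. A sketch of that constraint:

import numpy as np

y_train = np.array([1, 2, 7, 3])        # covtype-style, 1-based labels
n_classes = len(np.unique(y_train))     # 4 distinct labels in this toy sample
num_class = int(y_train.max()) + 1      # what XGBoost actually needs here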
NumberOfFeatures:700 + NumberOfInstances:10100000 + """ + dataset_name = 'airline-ohe' + os.makedirs(dataset_dir, exist_ok=True) + + url_train = 'https://s3.amazonaws.com/benchm-ml--main/train-10m.csv' + url_test = 'https://s3.amazonaws.com/benchm-ml--main/test.csv' + local_url_train = os.path.join(dataset_dir, os.path.basename(url_train)) + local_url_test = os.path.join(dataset_dir, os.path.basename(url_test)) + if not os.path.isfile(local_url_train): + logging.info(f'Started loading {dataset_name}') + _retrieve(url_train, local_url_train) + if not os.path.isfile(local_url_test): + logging.info(f'Started loading {dataset_name}') + _retrieve(url_test, local_url_test) + logging.info(f'{dataset_name} is loaded, started parsing...') + + sets = [] + labels = [] + + categorical_names = ["Month", "DayofMonth", + "DayOfWeek", "UniqueCarrier", "Origin", "Dest"] + + for local_url in [local_url_train, local_url_train]: + df = pd.read_csv(local_url, nrows=1000000 + if local_url.endswith('train-10m.csv') else None) + X = df.drop('dep_delayed_15min', 1) + y = df["dep_delayed_15min"] + + y_num = np.where(y == "Y", 1, 0) + + sets.append(X) + labels.append(y_num) + + n_samples_train = sets[0].shape[0] + + X_final: Any = pd.concat(sets) + X_final = pd.get_dummies(X_final, columns=categorical_names) + sets = [X_final[:n_samples_train], X_final[n_samples_train:]] + + for data, name in zip((sets[0], sets[1], labels[0], labels[1]), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset {dataset_name} is ready.') + return True def bosch(dataset_dir: Path) -> bool: @@ -454,7 +541,43 @@ def higgs(dataset_dir: Path) -> bool: def higgs_one_m(dataset_dir: Path) -> bool: - return False + """ + Higgs dataset from UCI machine learning repository ( + https://archive.ics.uci.edu/ml/datasets/HIGGS). 
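[Editorial note] In airline_ohe above, the train and test frames are concatenated before pd.get_dummies and split apart again afterwards. Encoding the two splits separately could produce different dummy columns whenever a categorical level is missing from one of them; a small sketch with made-up values:

import pandas as pd

train = pd.DataFrame({'Origin': ['JFK', 'SFO']})
test = pd.DataFrame({'Origin': ['JFK', 'ORD']})   # 'ORD' never occurs in train

# Encoded separately, the splits end up with different column sets.
assert list(pd.get_dummies(train)) != list(pd.get_dummies(test))

# Encoded together and re-split, both shares carry the same dummy columns.
both = pd.get_dummies(pd.concat([train, test]))
x_train, x_test = both[:len(train)], both[len(train):]
assert list(x_train) == list(x_test)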
+ TaskType:binclass + NumberOfFeatures:28 + NumberOfInstances:11M + """ + dataset_name = 'higgs1m' + os.makedirs(dataset_dir, exist_ok=True) + + url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz' + local_url = os.path.join(dataset_dir, os.path.basename(url)) + if not os.path.isfile(local_url): + logging.info(f'Started loading {dataset_name}') + _retrieve(url, local_url) + logging.info(f'{dataset_name} is loaded, started parsing...') + + nrows_train, nrows_test, dtype = 1000000, 500000, np.float32 + data: Any = pd.read_csv(local_url, delimiter=",", header=None, + compression="gzip", dtype=dtype, nrows=nrows_train+nrows_test) + + data = data[list(data.columns[1:])+list(data.columns[0:1])] + n_features = data.shape[1]-1 + train_data = np.ascontiguousarray(data.values[:nrows_train, :n_features], dtype=dtype) + train_label = np.ascontiguousarray(data.values[:nrows_train, n_features], dtype=dtype) + test_data = np.ascontiguousarray( + data.values[nrows_train: nrows_train + nrows_test, : n_features], + dtype=dtype) + test_label = np.ascontiguousarray( + data.values[nrows_train: nrows_train + nrows_test, n_features], + dtype=dtype) + for data, name in zip((train_data, test_data, train_label, test_label), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset {dataset_name} is ready.') + return True def ijcnn(dataset_dir: Path) -> bool: @@ -576,7 +699,51 @@ def mortgage_first_q(dataset_dir: Path) -> bool: def msrank(dataset_dir: Path) -> bool: - return False + """ + Dataset from szilard benchmarks: https://github.com/szilard/GBM-perf + TaskType:binclass + NumberOfFeatures:700 + NumberOfInstances:10100000 + """ + dataset_name = 'msrank' + os.makedirs(dataset_dir, exist_ok=True) + url = "https://storage.mds.yandex.net/get-devtools-opensource/471749/msrank.tar.gz" + local_url = os.path.join(dataset_dir, os.path.basename(url)) + if not os.path.isfile(local_url): + logging.info(f'Started loading {dataset_name}') + _retrieve(url, local_url) + logging.info(f'{dataset_name} is loaded, unzipping...') + tar = tarfile.open(local_url, "r:gz") + tar.extractall(dataset_dir) + tar.close() + logging.info(f'{dataset_name} is unzipped, started parsing...') + + sets = [] + labels = [] + n_features = 137 + + for set_name in ['train.txt', 'vali.txt', 'test.txt']: + file_name = str(dataset_dir) + os.path.join('MSRank', set_name) + + n_samples = _count_lines(file_name) + with open(file_name, 'r') as file_obj: + X, y = _read_libsvm_msrank(file_obj, n_samples, n_features, np.float32) + + sets.append(X) + labels.append(y) + + sets[0] = np.vstack((sets[0], sets[1])) + labels[0] = np.hstack((labels[0], labels[1])) + + sets = [np.ascontiguousarray(sets[i]) for i in [0, 2]] + labels = [np.ascontiguousarray(labels[i]) for i in [0, 2]] + + for data, name in zip((sets[0], sets[1], labels[0], labels[1]), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset {dataset_name} is ready.') + return True def plasticc(dataset_dir: Path) -> bool: From 670c28992d86889b4cd129a6b3bf525867c3b624 Mon Sep 17 00:00:00 2001 From: Igor Rukhovich Date: Tue, 30 Mar 2021 15:45:32 +0300 Subject: [PATCH 11/31] small fixes of env --- azure-pipelines.yml | 10 +++++----- datasets/loader.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml 
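[Editorial note] The msrank loader above leans on _read_libsvm_msrank, which strips the qid:/<index>: prefixes with a regex and then treats every number after the label as a feature column. Assuming the usual MSLR line layout (relevance label, qid, 136 features -- an assumption about the upstream files, not shown in the patch), that is what makes n_features = 137 work, with the query id landing in the first column. The same regex on one made-up line:

import re

regexp = re.compile(r'[A-Za-z0-9]+:(-?\d*\.?\d+)')   # same pattern as the loader
line = '2 qid:10 1:0.03 2:0.000 3:1.5'               # illustrative MSLR-style row
tokens = regexp.sub(r'\g<1>', line).rstrip(' \n\r').split(' ')
label, features = int(tokens[0]), [float(v) for v in tokens[1:]]
# label == 2, features == [10.0, 0.03, 0.0, 1.5] (qid kept as the first feature)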
index 784acad03..67cef5c81 100755 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -6,7 +6,7 @@ jobs: - script: | conda update -y -q conda export FORCE_DAAL4PY_SKLEARN=yes - conda create -q -y -n bench -c conda-forge python=3.7 pandas scikit-learn daal4py + conda create -q -y -n bench -c conda-forge python=3.7 pandas scikit-learn daal4py tqdm displayName: Create Anaconda environment - script: | . /usr/share/miniconda/etc/profile.d/conda.sh @@ -19,7 +19,7 @@ jobs: steps: - script: | conda update -y -q conda - conda create -n bench -q -y -c conda-forge python=3.7 pandas xgboost scikit-learn + conda create -n bench -q -y -c conda-forge python=3.7 pandas xgboost scikit-learn tqdm displayName: Create Anaconda environment - script: | . /usr/share/miniconda/etc/profile.d/conda.sh @@ -32,7 +32,7 @@ jobs: steps: - script: | conda update -y -q conda - conda create -n bench -q -y -c conda-forge python=3.7 pandas scikit-learn daal4py + conda create -n bench -q -y -c conda-forge python=3.7 pandas scikit-learn daal4py tqdm displayName: Create Anaconda environment - script: | . /usr/share/miniconda/etc/profile.d/conda.sh @@ -45,7 +45,7 @@ jobs: steps: - script: | conda update -y -q conda - conda create -n bench -q -y -c conda-forge python=3.7 pandas xgboost scikit-learn daal4py + conda create -n bench -q -y -c conda-forge python=3.7 pandas xgboost scikit-learn daal4py tqdm displayName: Create Anaconda environment - script: | . /usr/share/miniconda/etc/profile.d/conda.sh @@ -63,7 +63,7 @@ jobs: - script: | python -m pip install --upgrade pip setuptools pip install flake8 - flake8 --max-line-length=90 --count + flake8 --max-line-length=100 --count displayName: 'PEP 8 check' - job: Mypy pool: diff --git a/datasets/loader.py b/datasets/loader.py index 98409e73b..e5fe3387d 100755 --- a/datasets/loader.py +++ b/datasets/loader.py @@ -58,7 +58,7 @@ def _read_libsvm_msrank(file_obj, n_samples, n_features, dtype): for line in file_obj: line = str(line).replace("\\n'", "") - line = regexp.sub('\g<1>', line) + line = regexp.sub(r'\g<1>', line) line = line.rstrip(" \n\r").split(' ') y[counter] = int(line[0]) From dc0e9c975822995ff9a1b048767bcf8ea5d60d66 Mon Sep 17 00:00:00 2001 From: Igor Rukhovich Date: Thu, 1 Apr 2021 11:01:26 +0300 Subject: [PATCH 12/31] Applying comments --- azure-pipelines.yml | 2 +- datasets/load_datasets.py | 4 +- sklearn_bench/df_clsf.py | 100 +++++++++++++------------- sklearn_bench/df_regr.py | 100 +++++++++++++------------- sklearn_bench/distances.py | 36 +++++----- sklearn_bench/elasticnet.py | 84 +++++++++++----------- sklearn_bench/knn_clsf.py | 114 ++++++++++++++++-------------- sklearn_bench/lasso.py | 70 +++++++++--------- sklearn_bench/pca.py | 72 ++++++++++--------- sklearn_bench/ridge.py | 76 ++++++++++---------- sklearn_bench/svm.py | 94 ++++++++++++------------ sklearn_bench/train_test_split.py | 82 +++++++++++---------- 12 files changed, 437 insertions(+), 397 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 784acad03..4a59a3c9e 100755 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -63,7 +63,7 @@ jobs: - script: | python -m pip install --upgrade pip setuptools pip install flake8 - flake8 --max-line-length=90 --count + flake8 --max-line-length=100 --count displayName: 'PEP 8 check' - job: Mypy pool: diff --git a/datasets/load_datasets.py b/datasets/load_datasets.py index a6e79dc2b..e16c6c918 100755 --- a/datasets/load_datasets.py +++ b/datasets/load_datasets.py @@ -40,8 +40,8 @@ def try_load_dataset(dataset_name, output_directory): if 
dataset_name in dataset_loaders.keys(): try: return dataset_loaders[dataset_name](output_directory) - except BaseException: - logging.warning("Internal error loading dataset") + except BaseException as ex: + logging.warning(f"Internal error loading dataset:\n{ex}") return False else: logging.warning(f"There is no script to download the dataset: {dataset_name}. " diff --git a/sklearn_bench/df_clsf.py b/sklearn_bench/df_clsf.py index d3351dbc4..bb69185a3 100644 --- a/sklearn_bench/df_clsf.py +++ b/sklearn_bench/df_clsf.py @@ -20,60 +20,64 @@ import numpy as np from sklearn.metrics import accuracy_score -parser = argparse.ArgumentParser(description='scikit-learn random forest ' - 'classification benchmark') -parser.add_argument('--criterion', type=str, default='gini', - choices=('gini', 'entropy'), - help='The function to measure the quality of a split') -parser.add_argument('--num-trees', type=int, default=100, - help='Number of trees in the forest') -parser.add_argument('--max-features', type=bench.float_or_int, default=None, - help='Upper bound on features used at each split') -parser.add_argument('--max-depth', type=int, default=None, - help='Upper bound on depth of constructed trees') -parser.add_argument('--min-samples-split', type=bench.float_or_int, default=2, - help='Minimum samples number for node splitting') -parser.add_argument('--max-leaf-nodes', type=int, default=None, - help='Maximum leaf nodes per tree') -parser.add_argument('--min-impurity-decrease', type=float, default=0., - help='Needed impurity decrease for node splitting') -parser.add_argument('--no-bootstrap', dest='bootstrap', default=True, - action='store_false', help="Don't control bootstraping") +def main(): + from sklearn.ensemble import RandomForestClassifier -params = bench.parse_args(parser) + # Load and convert data + X_train, X_test, y_train, y_test = bench.load_data(params) -if not params.no_intel_optimized: - from sklearn.ensemble import RandomForestClassifier + # Create our random forest classifier + clf = RandomForestClassifier(criterion=params.criterion, + n_estimators=params.num_trees, + max_depth=params.max_depth, + max_features=params.max_features, + min_samples_split=params.min_samples_split, + max_leaf_nodes=params.max_leaf_nodes, + min_impurity_decrease=params.min_impurity_decrease, + bootstrap=params.bootstrap, + random_state=params.seed, + n_jobs=params.n_jobs) + + params.n_classes = len(np.unique(y_train)) + + fit_time, _ = bench.measure_function_time(clf.fit, X_train, y_train, params=params) + y_pred = clf.predict(X_train) + train_acc = 100 * accuracy_score(y_pred, y_train) -# Load and convert data -X_train, X_test, y_train, y_test = bench.load_data(params) + predict_time, y_pred = bench.measure_function_time( + clf.predict, X_test, params=params) + test_acc = 100 * accuracy_score(y_pred, y_test) -# Create our random forest classifier -clf = RandomForestClassifier(criterion=params.criterion, - n_estimators=params.num_trees, - max_depth=params.max_depth, - max_features=params.max_features, - min_samples_split=params.min_samples_split, - max_leaf_nodes=params.max_leaf_nodes, - min_impurity_decrease=params.min_impurity_decrease, - bootstrap=params.bootstrap, - random_state=params.seed, - n_jobs=params.n_jobs) + bench.print_output(library='sklearn', algorithm='decision_forest_classification', + stages=['training', 'prediction'], params=params, + functions=['df_clsf.fit', 'df_clsf.predict'], + times=[fit_time, predict_time], accuracy_type='accuracy[%]', + accuracies=[train_acc, test_acc], 
data=[X_train, X_test], + alg_instance=clf) -params.n_classes = len(np.unique(y_train)) -fit_time, _ = bench.measure_function_time(clf.fit, X_train, y_train, params=params) -y_pred = clf.predict(X_train) -train_acc = 100 * accuracy_score(y_pred, y_train) +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='scikit-learn random forest ' + 'classification benchmark') -predict_time, y_pred = bench.measure_function_time( - clf.predict, X_test, params=params) -test_acc = 100 * accuracy_score(y_pred, y_test) + parser.add_argument('--criterion', type=str, default='gini', + choices=('gini', 'entropy'), + help='The function to measure the quality of a split') + parser.add_argument('--num-trees', type=int, default=100, + help='Number of trees in the forest') + parser.add_argument('--max-features', type=bench.float_or_int, default=None, + help='Upper bound on features used at each split') + parser.add_argument('--max-depth', type=int, default=None, + help='Upper bound on depth of constructed trees') + parser.add_argument('--min-samples-split', type=bench.float_or_int, default=2, + help='Minimum samples number for node splitting') + parser.add_argument('--max-leaf-nodes', type=int, default=None, + help='Maximum leaf nodes per tree') + parser.add_argument('--min-impurity-decrease', type=float, default=0., + help='Needed impurity decrease for node splitting') + parser.add_argument('--no-bootstrap', dest='bootstrap', default=True, + action='store_false', help="Don't control bootstraping") -bench.print_output(library='sklearn', algorithm='decision_forest_classification', - stages=['training', 'prediction'], params=params, - functions=['df_clsf.fit', 'df_clsf.predict'], - times=[fit_time, predict_time], accuracy_type='accuracy[%]', - accuracies=[train_acc, test_acc], data=[X_train, X_test], - alg_instance=clf) + params = bench.parse_args(parser) + bench.run_with_context(params, main) diff --git a/sklearn_bench/df_regr.py b/sklearn_bench/df_regr.py index a9f29743a..53d3c8afd 100644 --- a/sklearn_bench/df_regr.py +++ b/sklearn_bench/df_regr.py @@ -18,60 +18,64 @@ import bench -parser = argparse.ArgumentParser(description='scikit-learn random forest ' - 'regression benchmark') -parser.add_argument('--criterion', type=str, default='mse', - choices=('mse', 'mae'), - help='The function to measure the quality of a split') -parser.add_argument('--num-trees', type=int, default=100, - help='Number of trees in the forest') -parser.add_argument('--max-features', type=bench.float_or_int, default=None, - help='Upper bound on features used at each split') -parser.add_argument('--max-depth', type=int, default=None, - help='Upper bound on depth of constructed trees') -parser.add_argument('--min-samples-split', type=bench.float_or_int, default=2, - help='Minimum samples number for node splitting') -parser.add_argument('--max-leaf-nodes', type=int, default=None, - help='Grow trees with max_leaf_nodes in best-first fashion' - 'if it is not None') -parser.add_argument('--min-impurity-decrease', type=float, default=0., - help='Needed impurity decrease for node splitting') -parser.add_argument('--no-bootstrap', dest='bootstrap', default=True, - action='store_false', help="Don't control bootstraping") +def main(): + from sklearn.ensemble import RandomForestRegressor -params = bench.parse_args(parser) + # Load and convert data + X_train, X_test, y_train, y_test = bench.load_data(params) -if not params.no_intel_optimized: - from sklearn.ensemble import RandomForestRegressor + # Create our random forest 
regressor + regr = RandomForestRegressor(criterion=params.criterion, + n_estimators=params.num_trees, + max_depth=params.max_depth, + max_features=params.max_features, + min_samples_split=params.min_samples_split, + max_leaf_nodes=params.max_leaf_nodes, + min_impurity_decrease=params.min_impurity_decrease, + bootstrap=params.bootstrap, + random_state=params.seed, + n_jobs=params.n_jobs) + + fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train, params=params) + + y_pred = regr.predict(X_train) + train_rmse = bench.rmse_score(y_pred, y_train) -# Load and convert data -X_train, X_test, y_train, y_test = bench.load_data(params) + predict_time, y_pred = bench.measure_function_time( + regr.predict, X_test, params=params) + test_rmse = bench.rmse_score(y_pred, y_test) -# Create our random forest regressor -regr = RandomForestRegressor(criterion=params.criterion, - n_estimators=params.num_trees, - max_depth=params.max_depth, - max_features=params.max_features, - min_samples_split=params.min_samples_split, - max_leaf_nodes=params.max_leaf_nodes, - min_impurity_decrease=params.min_impurity_decrease, - bootstrap=params.bootstrap, - random_state=params.seed, - n_jobs=params.n_jobs) + bench.print_output(library='sklearn', algorithm='decision_forest_regression', + stages=['training', 'prediction'], params=params, + functions=['df_regr.fit', 'df_regr.predict'], + times=[fit_time, predict_time], accuracy_type='rmse', + accuracies=[train_rmse, test_rmse], data=[X_train, X_test], + alg_instance=regr) -fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train, params=params) -y_pred = regr.predict(X_train) -train_rmse = bench.rmse_score(y_pred, y_train) +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='scikit-learn random forest ' + 'regression benchmark') -predict_time, y_pred = bench.measure_function_time( - regr.predict, X_test, params=params) -test_rmse = bench.rmse_score(y_pred, y_test) + parser.add_argument('--criterion', type=str, default='mse', + choices=('mse', 'mae'), + help='The function to measure the quality of a split') + parser.add_argument('--num-trees', type=int, default=100, + help='Number of trees in the forest') + parser.add_argument('--max-features', type=bench.float_or_int, default=None, + help='Upper bound on features used at each split') + parser.add_argument('--max-depth', type=int, default=None, + help='Upper bound on depth of constructed trees') + parser.add_argument('--min-samples-split', type=bench.float_or_int, default=2, + help='Minimum samples number for node splitting') + parser.add_argument('--max-leaf-nodes', type=int, default=None, + help='Grow trees with max_leaf_nodes in best-first fashion' + 'if it is not None') + parser.add_argument('--min-impurity-decrease', type=float, default=0., + help='Needed impurity decrease for node splitting') + parser.add_argument('--no-bootstrap', dest='bootstrap', default=True, + action='store_false', help="Don't control bootstraping") -bench.print_output(library='sklearn', algorithm='decision_forest_regression', - stages=['training', 'prediction'], params=params, - functions=['df_regr.fit', 'df_regr.predict'], - times=[fit_time, predict_time], accuracy_type='rmse', - accuracies=[train_rmse, test_rmse], data=[X_train, X_test], - alg_instance=regr) + params = bench.parse_args(parser) + bench.run_with_context(params, main) diff --git a/sklearn_bench/distances.py b/sklearn_bench/distances.py index d676a2b36..b3f7c1a46 100644 --- a/sklearn_bench/distances.py +++ b/sklearn_bench/distances.py 
@@ -18,23 +18,27 @@ import bench -parser = argparse.ArgumentParser(description='scikit-learn pairwise distances ' - 'benchmark') -parser.add_argument('--metric', default='cosine', - choices=['cosine', 'correlation'], - help='Metric to test for pairwise distances') -params = bench.parse_args(parser) - -if not params.no_intel_optimized: + +def main(): from sklearn.metrics.pairwise import pairwise_distances -# Load data -X, _, _, _ = bench.load_data(params, generated_data=['X_train'], add_dtype=True) + # Load data + X, _, _, _ = bench.load_data(params, generated_data=['X_train'], add_dtype=True) + + time, _ = bench.measure_function_time(pairwise_distances, X, metric=params.metric, + n_jobs=params.n_jobs, params=params) + + bench.print_output(library='sklearn', algorithm='distances', stages=['computation'], + params=params, functions=[params.metric.capitalize()], times=[time], + accuracy_type=None, accuracies=[None], data=[X], + alg_params={'metric': params.metric}) -time, _ = bench.measure_function_time(pairwise_distances, X, metric=params.metric, - n_jobs=params.n_jobs, params=params) -bench.print_output(library='sklearn', algorithm='distances', stages=['computation'], - params=params, functions=[params.metric.capitalize()], times=[time], - accuracy_type=None, accuracies=[None], data=[X], - alg_params={'metric': params.metric}) +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='scikit-learn pairwise distances ' + 'benchmark') + parser.add_argument('--metric', default='cosine', + choices=['cosine', 'correlation'], + help='Metric to test for pairwise distances') + params = bench.parse_args(parser) + bench.run_with_context(params, main) diff --git a/sklearn_bench/elasticnet.py b/sklearn_bench/elasticnet.py index b3f5ff2f5..ac7f34050 100755 --- a/sklearn_bench/elasticnet.py +++ b/sklearn_bench/elasticnet.py @@ -18,45 +18,49 @@ import bench -parser = argparse.ArgumentParser(description='scikit-learn elastic-net regression ' - 'benchmark') -parser.add_argument('--no-fit-intercept', dest='fit_intercept', default=False, - action='store_false', - help="Don't fit intercept (assume data already centered)") -parser.add_argument('--alpha', dest='alpha', type=float, default=1.0, - help='Regularization parameter') -parser.add_argument('--maxiter', type=int, default=1000, - help='Maximum iterations for the iterative solver') -parser.add_argument('--l1_ratio', dest='l1_ratio', type=float, default=0.5, - help='Regularization parameter') -parser.add_argument('--tol', type=float, default=0.0, - help='Tolerance for solver.') -params = bench.parse_args(parser) - -if not params.no_intel_optimized: + +def main(): from sklearn.linear_model import ElasticNet -# Load data -X_train, X_test, y_train, y_test = bench.load_data(params) - -# Create our regression object -regr = ElasticNet(fit_intercept=params.fit_intercept, l1_ratio=params.l1_ratio, - alpha=params.alpha, tol=params.tol, - max_iter=params.maxiter, copy_X=False) -# Time fit -fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train, params=params) - -# Time predict -predict_time, pred_train = bench.measure_function_time(regr.predict, - X_train, params=params) - -train_rmse = bench.rmse_score(pred_train, y_train) -pred_test = regr.predict(X_test) -test_rmse = bench.rmse_score(pred_test, y_test) - -bench.print_output(library='sklearn', algorithm='elastic-net', - stages=['training', 'prediction'], params=params, - functions=['ElasticNet.fit', 'ElasticNet.predict'], - times=[fit_time, predict_time], accuracy_type='rmse', - 
accuracies=[train_rmse, test_rmse], data=[X_train, X_train], - alg_instance=regr) + # Load data + X_train, X_test, y_train, y_test = bench.load_data(params) + + # Create our regression object + regr = ElasticNet(fit_intercept=params.fit_intercept, l1_ratio=params.l1_ratio, + alpha=params.alpha, tol=params.tol, + max_iter=params.maxiter, copy_X=False) + # Time fit + fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train, params=params) + + # Time predict + predict_time, pred_train = bench.measure_function_time(regr.predict, + X_train, params=params) + + train_rmse = bench.rmse_score(pred_train, y_train) + pred_test = regr.predict(X_test) + test_rmse = bench.rmse_score(pred_test, y_test) + + bench.print_output(library='sklearn', algorithm='elastic-net', + stages=['training', 'prediction'], params=params, + functions=['ElasticNet.fit', 'ElasticNet.predict'], + times=[fit_time, predict_time], accuracy_type='rmse', + accuracies=[train_rmse, test_rmse], data=[X_train, X_train], + alg_instance=regr) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='scikit-learn elastic-net regression ' + 'benchmark') + parser.add_argument('--no-fit-intercept', dest='fit_intercept', default=False, + action='store_false', + help="Don't fit intercept (assume data already centered)") + parser.add_argument('--alpha', dest='alpha', type=float, default=1.0, + help='Regularization parameter') + parser.add_argument('--maxiter', type=int, default=1000, + help='Maximum iterations for the iterative solver') + parser.add_argument('--l1_ratio', dest='l1_ratio', type=float, default=0.5, + help='Regularization parameter') + parser.add_argument('--tol', type=float, default=0.0, + help='Tolerance for solver.') + params = bench.parse_args(parser) + bench.run_with_context(params, main) diff --git a/sklearn_bench/knn_clsf.py b/sklearn_bench/knn_clsf.py index 749a6ae82..2674bf631 100755 --- a/sklearn_bench/knn_clsf.py +++ b/sklearn_bench/knn_clsf.py @@ -20,65 +20,69 @@ import numpy as np from sklearn.metrics import accuracy_score -parser = argparse.ArgumentParser( - description='scikit-learn kNN classifier benchmark') -parser.add_argument('--task', default='classification', type=str, - choices=('search', 'classification'), - help='kNN task: search or classification') -parser.add_argument('--n-neighbors', default=5, type=int, - help='Number of neighbors to use') -parser.add_argument('--weights', type=str, default='uniform', - help='Weight function used in prediction') -parser.add_argument('--method', type=str, default='brute', - choices=('brute', 'kd_tree', 'ball_tree', 'auto'), - help='Algorithm used to compute the nearest neighbors') -parser.add_argument('--metric', type=str, default='euclidean', - help='Distance metric to use') -params = bench.parse_args(parser) - -if not params.no_intel_optimized: +def main(): from sklearn.neighbors import KNeighborsClassifier -# Load generated data -X_train, X_test, y_train, y_test = bench.load_data(params) -params.n_classes = len(np.unique(y_train)) + # Load generated data + X_train, X_test, y_train, y_test = bench.load_data(params) + params.n_classes = len(np.unique(y_train)) + + # Create classification object + knn_clsf = KNeighborsClassifier(n_neighbors=params.n_neighbors, + weights=params.weights, + algorithm=params.method, + metric=params.metric, + n_jobs=params.n_jobs) + + # Measure time and accuracy on fitting + train_time, _ = bench.measure_function_time(knn_clsf.fit, X_train, y_train, params=params) + if params.task == 'classification': + 
y_pred = knn_clsf.predict(X_train) + train_acc = 100 * accuracy_score(y_pred, y_train) + + # Measure time and accuracy on prediction + if params.task == 'classification': + predict_time, yp = bench.measure_function_time(knn_clsf.predict, X_test, + params=params) + test_acc = 100 * accuracy_score(yp, y_test) + else: + predict_time, _ = bench.measure_function_time(knn_clsf.kneighbors, X_test, + params=params) -# Create classification object -knn_clsf = KNeighborsClassifier(n_neighbors=params.n_neighbors, - weights=params.weights, - algorithm=params.method, - metric=params.metric, - n_jobs=params.n_jobs) + if params.task == 'classification': + bench.print_output(library='sklearn', + algorithm=knn_clsf._fit_method + '_knn_classification', + stages=['training', 'prediction'], params=params, + functions=['knn_clsf.fit', 'knn_clsf.predict'], + times=[train_time, predict_time], + accuracies=[train_acc, test_acc], accuracy_type='accuracy[%]', + data=[X_train, X_test], alg_instance=knn_clsf) + else: + bench.print_output(library='sklearn', + algorithm=knn_clsf._fit_method + '_knn_search', + stages=['training', 'search'], params=params, + functions=['knn_clsf.fit', 'knn_clsf.kneighbors'], + times=[train_time, predict_time], + accuracies=[], accuracy_type=None, + data=[X_train, X_test], alg_instance=knn_clsf) -# Measure time and accuracy on fitting -train_time, _ = bench.measure_function_time(knn_clsf.fit, X_train, y_train, params=params) -if params.task == 'classification': - y_pred = knn_clsf.predict(X_train) - train_acc = 100 * accuracy_score(y_pred, y_train) -# Measure time and accuracy on prediction -if params.task == 'classification': - predict_time, yp = bench.measure_function_time(knn_clsf.predict, X_test, - params=params) - test_acc = 100 * accuracy_score(yp, y_test) -else: - predict_time, _ = bench.measure_function_time(knn_clsf.kneighbors, X_test, - params=params) +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description='scikit-learn kNN classifier benchmark') -if params.task == 'classification': - bench.print_output(library='sklearn', - algorithm=knn_clsf._fit_method + '_knn_classification', - stages=['training', 'prediction'], params=params, - functions=['knn_clsf.fit', 'knn_clsf.predict'], - times=[train_time, predict_time], - accuracies=[train_acc, test_acc], accuracy_type='accuracy[%]', - data=[X_train, X_test], alg_instance=knn_clsf) -else: - bench.print_output(library='sklearn', - algorithm=knn_clsf._fit_method + '_knn_search', - stages=['training', 'search'], params=params, - functions=['knn_clsf.fit', 'knn_clsf.kneighbors'], - times=[train_time, predict_time], - accuracies=[], accuracy_type=None, - data=[X_train, X_test], alg_instance=knn_clsf) + parser.add_argument('--task', default='classification', type=str, + choices=('search', 'classification'), + help='kNN task: search or classification') + parser.add_argument('--n-neighbors', default=5, type=int, + help='Number of neighbors to use') + parser.add_argument('--weights', type=str, default='uniform', + help='Weight function used in prediction') + parser.add_argument('--method', type=str, default='brute', + choices=('brute', 'kd_tree', 'ball_tree', 'auto'), + help='Algorithm used to compute the nearest neighbors') + parser.add_argument('--metric', type=str, default='euclidean', + help='Distance metric to use') + params = bench.parse_args(parser) + bench.run_with_context(params, main) diff --git a/sklearn_bench/lasso.py b/sklearn_bench/lasso.py index 51fd09181..32fd0d591 100755 --- a/sklearn_bench/lasso.py +++ 
b/sklearn_bench/lasso.py @@ -18,43 +18,47 @@ import bench -parser = argparse.ArgumentParser(description='scikit-learn lasso regression ' - 'benchmark') -parser.add_argument('--no-fit-intercept', dest='fit_intercept', default=False, - action='store_false', - help="Don't fit intercept (assume data already centered)") -parser.add_argument('--alpha', dest='alpha', type=float, default=1.0, - help='Regularization parameter') -parser.add_argument('--maxiter', type=int, default=1000, - help='Maximum iterations for the iterative solver') -parser.add_argument('--tol', type=float, default=0.0, - help='Tolerance for solver.') -params = bench.parse_args(parser) - -if not params.no_intel_optimized: + +def main(): from sklearn.linear_model import Lasso -# Load data -X_train, X_test, y_train, y_test = bench.load_data(params) + # Load data + X_train, X_test, y_train, y_test = bench.load_data(params) + + # Create our regression object + regr = Lasso(fit_intercept=params.fit_intercept, alpha=params.alpha, + tol=params.tol, max_iter=params.maxiter, copy_X=False) + + # Time fit + fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train, params=params) -# Create our regression object -regr = Lasso(fit_intercept=params.fit_intercept, alpha=params.alpha, - tol=params.tol, max_iter=params.maxiter, copy_X=False) + # Time predict + predict_time, pred_train = bench.measure_function_time( + regr.predict, X_train, params=params) -# Time fit -fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train, params=params) + train_rmse = bench.rmse_score(pred_train, y_train) + pred_test = regr.predict(X_test) + test_rmse = bench.rmse_score(pred_test, y_test) -# Time predict -predict_time, pred_train = bench.measure_function_time( - regr.predict, X_train, params=params) + bench.print_output(library='sklearn', algorithm='lasso', + stages=['training', 'prediction'], params=params, + functions=['Lasso.fit', 'Lasso.predict'], + times=[fit_time, predict_time], accuracy_type='rmse', + accuracies=[train_rmse, test_rmse], data=[X_train, X_test], + alg_instance=regr) -train_rmse = bench.rmse_score(pred_train, y_train) -pred_test = regr.predict(X_test) -test_rmse = bench.rmse_score(pred_test, y_test) -bench.print_output(library='sklearn', algorithm='lasso', - stages=['training', 'prediction'], params=params, - functions=['Lasso.fit', 'Lasso.predict'], - times=[fit_time, predict_time], accuracy_type='rmse', - accuracies=[train_rmse, test_rmse], data=[X_train, X_test], - alg_instance=regr) +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='scikit-learn lasso regression ' + 'benchmark') + parser.add_argument('--no-fit-intercept', dest='fit_intercept', default=False, + action='store_false', + help="Don't fit intercept (assume data already centered)") + parser.add_argument('--alpha', dest='alpha', type=float, default=1.0, + help='Regularization parameter') + parser.add_argument('--maxiter', type=int, default=1000, + help='Maximum iterations for the iterative solver') + parser.add_argument('--tol', type=float, default=0.0, + help='Tolerance for solver.') + params = bench.parse_args(parser) + bench.run_with_context(params, main) diff --git a/sklearn_bench/pca.py b/sklearn_bench/pca.py index 3eb15465f..b810603a8 100644 --- a/sklearn_bench/pca.py +++ b/sklearn_bench/pca.py @@ -18,39 +18,43 @@ import bench -parser = argparse.ArgumentParser(description='scikit-learn PCA benchmark') -parser.add_argument('--svd-solver', type=str, choices=['full'], - default='full', help='SVD solver to use') 
-parser.add_argument('--n-components', type=int, default=None, - help='Number of components to find') -parser.add_argument('--whiten', action='store_true', default=False, - help='Perform whitening') -params = bench.parse_args(parser) - -if not params.no_intel_optimized: - from sklearn.decomposition import PCA - -# Load random data -X_train, X_test, _, _ = bench.load_data(params, generated_data=['X_train']) - -if params.n_components is None: - p, n = X_train.shape - params.n_components = min((n, (2 + min((n, p))) // 3)) -# Create our PCA object -pca = PCA(svd_solver=params.svd_solver, whiten=params.whiten, - n_components=params.n_components) - -# Time fit -fit_time, _ = bench.measure_function_time(pca.fit, X_train, params=params) - -# Time transform -transform_time, _ = bench.measure_function_time( - pca.transform, X_train, params=params) +def main(): + from sklearn.decomposition import PCA -bench.print_output(library='sklearn', algorithm='pca', - stages=['training', 'transformation'], - params=params, functions=['PCA.fit', 'PCA.transform'], - times=[fit_time, transform_time], accuracy_type=None, - accuracies=[None, None], data=[X_train, X_test], - alg_instance=pca) + # Load random data + X_train, X_test, _, _ = bench.load_data(params, generated_data=['X_train']) + + if params.n_components is None: + p, n = X_train.shape + params.n_components = min((n, (2 + min((n, p))) // 3)) + + # Create our PCA object + pca = PCA(svd_solver=params.svd_solver, whiten=params.whiten, + n_components=params.n_components) + + # Time fit + fit_time, _ = bench.measure_function_time(pca.fit, X_train, params=params) + + # Time transform + transform_time, _ = bench.measure_function_time( + pca.transform, X_train, params=params) + + bench.print_output(library='sklearn', algorithm='pca', + stages=['training', 'transformation'], + params=params, functions=['PCA.fit', 'PCA.transform'], + times=[fit_time, transform_time], accuracy_type=None, + accuracies=[None, None], data=[X_train, X_test], + alg_instance=pca) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='scikit-learn PCA benchmark') + parser.add_argument('--svd-solver', type=str, choices=['full'], + default='full', help='SVD solver to use') + parser.add_argument('--n-components', type=int, default=None, + help='Number of components to find') + parser.add_argument('--whiten', action='store_true', default=False, + help='Perform whitening') + params = bench.parse_args(parser) + bench.run_with_context(params, main) diff --git a/sklearn_bench/ridge.py b/sklearn_bench/ridge.py index 229fb29dc..029b1f1d6 100644 --- a/sklearn_bench/ridge.py +++ b/sklearn_bench/ridge.py @@ -18,41 +18,45 @@ import bench -parser = argparse.ArgumentParser(description='scikit-learn ridge regression ' - 'benchmark') -parser.add_argument('--no-fit-intercept', dest='fit_intercept', default=True, - action='store_false', - help="Don't fit intercept (assume data already centered)") -parser.add_argument('--solver', default='auto', - help='Solver used for training') -parser.add_argument('--alpha', type=float, default=1.0, - help='Regularization strength') -params = bench.parse_args(parser) - -if not params.no_intel_optimized: - from sklearn.linear_model import Ridge - -# Load data -X_train, X_test, y_train, y_test = bench.load_data(params, - generated_data=['X_train', 'y_train']) - -# Create our regression object -regr = Ridge(fit_intercept=params.fit_intercept, alpha=params.alpha, - solver=params.solver) -# Time fit -fit_time, _ = bench.measure_function_time(regr.fit, 
X_train, y_train, params=params) - -# Time predict -predict_time, yp = bench.measure_function_time(regr.predict, X_test, params=params) - -test_rmse = bench.rmse_score(yp, y_test) -yp = regr.predict(X_train) -train_rmse = bench.rmse_score(yp, y_train) +def main(): + from sklearn.linear_model import Ridge -bench.print_output(library='sklearn', algorithm='ridge_regression', - stages=['training', 'prediction'], params=params, - functions=['Ridge.fit', 'Ridge.predict'], - times=[fit_time, predict_time], accuracy_type='rmse', - accuracies=[train_rmse, test_rmse], data=[X_train, X_test], - alg_instance=regr) + # Load data + X_train, X_test, y_train, y_test = bench.load_data(params, + generated_data=['X_train', 'y_train']) + + # Create our regression object + regr = Ridge(fit_intercept=params.fit_intercept, alpha=params.alpha, + solver=params.solver) + + # Time fit + fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train, params=params) + + # Time predict + predict_time, yp = bench.measure_function_time(regr.predict, X_test, params=params) + + test_rmse = bench.rmse_score(yp, y_test) + yp = regr.predict(X_train) + train_rmse = bench.rmse_score(yp, y_train) + + bench.print_output(library='sklearn', algorithm='ridge_regression', + stages=['training', 'prediction'], params=params, + functions=['Ridge.fit', 'Ridge.predict'], + times=[fit_time, predict_time], accuracy_type='rmse', + accuracies=[train_rmse, test_rmse], data=[X_train, X_test], + alg_instance=regr) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='scikit-learn ridge regression ' + 'benchmark') + parser.add_argument('--no-fit-intercept', dest='fit_intercept', default=True, + action='store_false', + help="Don't fit intercept (assume data already centered)") + parser.add_argument('--solver', default='auto', + help='Solver used for training') + parser.add_argument('--alpha', type=float, default=1.0, + help='Regularization strength') + params = bench.parse_args(parser) + bench.run_with_context(params, main) diff --git a/sklearn_bench/svm.py b/sklearn_bench/svm.py index 0ea8611c6..102a6e62a 100644 --- a/sklearn_bench/svm.py +++ b/sklearn_bench/svm.py @@ -20,58 +20,62 @@ import numpy as np from sklearn.metrics import accuracy_score -parser = argparse.ArgumentParser(description='scikit-learn SVM benchmark') -parser.add_argument('-C', dest='C', type=float, default=1.0, - help='SVM regularization parameter') -parser.add_argument('--kernel', choices=('linear', 'rbf'), - default='linear', help='SVM kernel function') -parser.add_argument('--gamma', type=float, default=None, - help='Parameter for kernel="rbf"') -parser.add_argument('--maxiter', type=int, default=-1, - help='Maximum iterations for the iterative solver. 
' - '-1 means no limit.') -parser.add_argument('--max-cache-size', type=int, default=8, - help='Maximum cache size, in gigabytes, for SVM.') -parser.add_argument('--tol', type=float, default=1e-3, - help='Tolerance passed to sklearn.svm.SVC') -parser.add_argument('--no-shrinking', action='store_false', default=True, - dest='shrinking', help="Don't use shrinking heuristic") -params = bench.parse_args(parser, loop_types=('fit', 'predict')) - -if not params.no_intel_optimized: +def main(): from sklearn.svm import SVC -# Load data -X_train, X_test, y_train, y_test = bench.load_data(params) + # Load data + X_train, X_test, y_train, y_test = bench.load_data(params) + + if params.gamma is None: + params.gamma = 1.0 / X_train.shape[1] + + cache_size_bytes = bench.get_optimal_cache_size(X_train.shape[0], + max_cache=params.max_cache_size) + params.cache_size_mb = cache_size_bytes / 1024**2 + params.n_classes = len(np.unique(y_train)) + + # Create our C-SVM classifier + clf = SVC(C=params.C, kernel=params.kernel, max_iter=params.maxiter, + cache_size=params.cache_size_mb, tol=params.tol, + shrinking=params.shrinking, gamma=params.gamma) -if params.gamma is None: - params.gamma = 1.0 / X_train.shape[1] + # Time fit and predict + fit_time, _ = bench.measure_function_time(clf.fit, X_train, y_train, params=params) + params.sv_len = clf.support_.shape[0] -cache_size_bytes = bench.get_optimal_cache_size(X_train.shape[0], - max_cache=params.max_cache_size) -params.cache_size_mb = cache_size_bytes / 1024**2 -params.n_classes = len(np.unique(y_train)) + predict_time, y_pred = bench.measure_function_time( + clf.predict, X_train, params=params) + train_acc = 100 * accuracy_score(y_pred, y_train) -# Create our C-SVM classifier -clf = SVC(C=params.C, kernel=params.kernel, max_iter=params.maxiter, - cache_size=params.cache_size_mb, tol=params.tol, - shrinking=params.shrinking, gamma=params.gamma) + y_pred = clf.predict(X_test) + test_acc = 100 * accuracy_score(y_pred, y_test) -# Time fit and predict -fit_time, _ = bench.measure_function_time(clf.fit, X_train, y_train, params=params) -params.sv_len = clf.support_.shape[0] + bench.print_output(library='sklearn', algorithm='svc', + stages=['training', 'prediction'], + params=params, functions=['SVM.fit', 'SVM.predict'], + times=[fit_time, predict_time], accuracy_type='accuracy[%]', + accuracies=[train_acc, test_acc], data=[X_train, X_train], + alg_instance=clf) -predict_time, y_pred = bench.measure_function_time( - clf.predict, X_train, params=params) -train_acc = 100 * accuracy_score(y_pred, y_train) -y_pred = clf.predict(X_test) -test_acc = 100 * accuracy_score(y_pred, y_test) +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='scikit-learn SVM benchmark') -bench.print_output(library='sklearn', algorithm='svc', - stages=['training', 'prediction'], - params=params, functions=['SVM.fit', 'SVM.predict'], - times=[fit_time, predict_time], accuracy_type='accuracy[%]', - accuracies=[train_acc, test_acc], data=[X_train, X_train], - alg_instance=clf) + parser.add_argument('-C', dest='C', type=float, default=1.0, + help='SVM regularization parameter') + parser.add_argument('--kernel', choices=('linear', 'rbf'), + default='linear', help='SVM kernel function') + parser.add_argument('--gamma', type=float, default=None, + help='Parameter for kernel="rbf"') + parser.add_argument('--maxiter', type=int, default=-1, + help='Maximum iterations for the iterative solver. 
' + '-1 means no limit.') + parser.add_argument('--max-cache-size', type=int, default=8, + help='Maximum cache size, in gigabytes, for SVM.') + parser.add_argument('--tol', type=float, default=1e-3, + help='Tolerance passed to sklearn.svm.SVC') + parser.add_argument('--no-shrinking', action='store_false', default=True, + dest='shrinking', help="Don't use shrinking heuristic") + params = bench.parse_args(parser, loop_types=('fit', 'predict')) + bench.run_with_context(params, main) diff --git a/sklearn_bench/train_test_split.py b/sklearn_bench/train_test_split.py index 5ecaa157e..aac1ec2e3 100644 --- a/sklearn_bench/train_test_split.py +++ b/sklearn_bench/train_test_split.py @@ -19,50 +19,54 @@ import bench -parser = argparse.ArgumentParser( - description='scikit-learn train_test_split benchmark') -parser.add_argument('--train-size', type=float, default=0.75, - help='Size of training subset') -parser.add_argument('--test-size', type=float, default=0.25, - help='Size of testing subset') -parser.add_argument('--do-not-shuffle', default=False, action='store_true', - help='Do not perform data shuffle before splitting') -parser.add_argument('--include-y', default=False, action='store_true', - help='Include label (Y) in splitting') -parser.add_argument('--rng', default=None, - choices=('MT19937', 'SFMT19937', 'MT2203', 'R250', 'WH', - 'MCG31', 'MCG59', 'MRG32K3A', 'PHILOX4X32X10', - 'NONDETERM', None), - help='Random numbers generator for shuffling ' - '(only for IDP scikit-learn)') -params = bench.parse_args(parser) -if not params.no_intel_optimized: +def main(): from sklearn.model_selection import train_test_split -# Load generated data -X, y, _, _ = bench.load_data(params) + # Load generated data + X, y, _, _ = bench.load_data(params) -data_args: Iterable -if params.include_y: - data_args = (X, y) -else: - data_args = (X, ) + data_args: Iterable + if params.include_y: + data_args = (X, y) + else: + data_args = (X, ) -tts_params = { - 'train_size': params.train_size, - 'test_size': params.test_size, - 'shuffle': not params.do_not_shuffle, - 'random_state': params.seed -} + tts_params = { + 'train_size': params.train_size, + 'test_size': params.test_size, + 'shuffle': not params.do_not_shuffle, + 'random_state': params.seed + } -if params.rng is not None: - tts_params['rng'] = params.rng + if params.rng is not None: + tts_params['rng'] = params.rng -time, _ = bench.measure_function_time( - train_test_split, *data_args, params=params, **tts_params) + time, _ = bench.measure_function_time( + train_test_split, *data_args, params=params, **tts_params) -bench.print_output(library='sklearn', algorithm='train_test_split', - stages=['training'], params=params, - functions=['train_test_split'], times=[time], accuracies=[None], - accuracy_type=None, data=[X], alg_params=tts_params) + bench.print_output(library='sklearn', algorithm='train_test_split', + stages=['training'], params=params, + functions=['train_test_split'], times=[time], accuracies=[None], + accuracy_type=None, data=[X], alg_params=tts_params) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description='scikit-learn train_test_split benchmark') + parser.add_argument('--train-size', type=float, default=0.75, + help='Size of training subset') + parser.add_argument('--test-size', type=float, default=0.25, + help='Size of testing subset') + parser.add_argument('--do-not-shuffle', default=False, action='store_true', + help='Do not perform data shuffle before splitting') + parser.add_argument('--include-y', default=False, 
action='store_true', + help='Include label (Y) in splitting') + parser.add_argument('--rng', default=None, + choices=('MT19937', 'SFMT19937', 'MT2203', 'R250', 'WH', + 'MCG31', 'MCG59', 'MRG32K3A', 'PHILOX4X32X10', + 'NONDETERM', None), + help='Random numbers generator for shuffling ' + '(only for IDP scikit-learn)') + params = bench.parse_args(parser) + bench.run_with_context(params, main) From 873754bf2ababd0616c052e81d7f7ef93435a818 Mon Sep 17 00:00:00 2001 From: Igor Rukhovich Date: Thu, 1 Apr 2021 12:23:06 +0300 Subject: [PATCH 13/31] Split dataset loading to different files --- datasets/load_datasets.py | 13 +- datasets/{loader.py => loader_clf.py} | 1403 ++++++++++--------------- datasets/loader_mul.py | 241 +++++ datasets/loader_reg.py | 57 + datasets/loader_utils.py | 76 ++ 5 files changed, 931 insertions(+), 859 deletions(-) rename datasets/{loader.py => loader_clf.py} (64%) mode change 100755 => 100644 create mode 100644 datasets/loader_mul.py create mode 100644 datasets/loader_reg.py create mode 100755 datasets/loader_utils.py diff --git a/datasets/load_datasets.py b/datasets/load_datasets.py index 8a34611b0..a3e6b0adb 100755 --- a/datasets/load_datasets.py +++ b/datasets/load_datasets.py @@ -21,11 +21,12 @@ from pathlib import Path from typing import Callable, Dict -from .loader import (a_nine_a, airline, airline_ohe, bosch, codrnanorm, - connect, covertype, covtype, epsilon, fraud, gisette, - higgs, higgs_one_m, ijcnn, klaverjas, mnist, - mortgage_first_q, msrank, plasticc, santander, sensit, - skin_segmentation, year) + +from .loader_clf import ( + a_nine_a, airline, airline_ohe, bosch, codrnanorm, epsilon, fraud, gisette, higgs, higgs_one_m, + ijcnn, klaverjas, santander, skin_segmentation) +from .loader_reg import (mortgage_first_q, year_prediction_msd) +from .loader_mul import (connect, covertype, covtype, mnist, msrank, plasticc, sensit) dataset_loaders: Dict[str, Callable[[Path], bool]] = { @@ -51,7 +52,7 @@ "santander": santander, "sensit": sensit, "skin_segmentation": skin_segmentation, - "year": year, + "year_prediction_msd": year_prediction_msd, } diff --git a/datasets/loader.py b/datasets/loader_clf.py old mode 100755 new mode 100644 similarity index 64% rename from datasets/loader.py rename to datasets/loader_clf.py index e5fe3387d..cf204ab95 --- a/datasets/loader.py +++ b/datasets/loader_clf.py @@ -1,853 +1,550 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
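With the loaders split across loader_clf.py, loader_reg.py and loader_mul.py, every dataset still registers through the same dataset_loaders mapping of names to Callable[[Path], bool] entries in load_datasets.py. A minimal sketch of adding one more loader under this layout; the synthetic_reg name, its placement in loader_reg.py, and the make_regression parameters are illustrative assumptions, not part of this patch:

import logging
import os
from pathlib import Path

import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split


def synthetic_reg(dataset_dir: Path) -> bool:
    """Illustrative regression loader following the split layout."""
    dataset_name = 'synthetic_reg'
    os.makedirs(dataset_dir, exist_ok=True)
    X, y = make_regression(n_samples=10000, n_features=20, noise=1.0,
                           random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        random_state=42)
    # same on-disk naming convention as the existing loaders
    for data, name in zip((X_train, X_test, y_train, y_test),
                          ('x_train', 'x_test', 'y_train', 'y_test')):
        np.save(os.path.join(dataset_dir, f'{dataset_name}_{name}.npy'), data)
    logging.info(f'dataset {dataset_name} is ready.')
    return True

# registration in datasets/load_datasets.py would then be:
# from .loader_reg import synthetic_reg
# dataset_loaders["synthetic_reg"] = synthetic_reg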
-# =============================================================================== - -import logging -import os -import re -import tarfile -from pathlib import Path -from typing import Any -from urllib.request import urlretrieve - -import numpy as np -import pandas as pd -import tqdm -from sklearn.datasets import fetch_covtype, fetch_openml, load_svmlight_file -from sklearn.model_selection import train_test_split - -pbar: tqdm.tqdm = None - - -def _show_progress(block_num: int, block_size: int, total_size: int) -> None: - global pbar - if pbar is None: - pbar = tqdm.tqdm(total=total_size / 1024, unit='kB') - - downloaded = block_num * block_size - if downloaded < total_size: - pbar.update(block_size / 1024) - else: - pbar.close() - pbar = None - - -def _retrieve(url: str, filename: str) -> None: - urlretrieve(url, filename, reporthook=_show_progress) - - -def _read_libsvm_msrank(file_obj, n_samples, n_features, dtype): - X = np.zeros((n_samples, n_features)) - y = np.zeros((n_samples,)) - - counter = 0 - - regexp = re.compile(r'[A-Za-z0-9]+:(-?\d*\.?\d+)') - - for line in file_obj: - line = str(line).replace("\\n'", "") - line = regexp.sub(r'\g<1>', line) - line = line.rstrip(" \n\r").split(' ') - - y[counter] = int(line[0]) - X[counter] = [float(i) for i in line[1:]] - - counter += 1 - if counter == n_samples: - break - - return np.array(X, dtype=dtype), np.array(y, dtype=dtype) - - -def _make_gen(reader): - b = reader(1024 * 1024) - while b: - yield b - b = reader(1024 * 1024) - - -def _count_lines(filename): - with open(filename, 'rb') as f: - f_gen = _make_gen(f.read) - return sum(buf.count(b'\n') for buf in f_gen) - - -def a_nine_a(dataset_dir: Path) -> bool: - """ - Author: Ronny Kohavi","Barry Becker - libSVM","AAD group - Source: original - Date unknown - Cite: http://archive.ics.uci.edu/ml/datasets/Adult - - Classification task. n_classes = 2. 
- a9a X train dataset (39073, 123) - a9a y train dataset (39073, 1) - a9a X test dataset (9769, 123) - a9a y test dataset (9769, 1) - """ - dataset_name = 'a9a' - os.makedirs(dataset_dir, exist_ok=True) - - X, y = fetch_openml(name='a9a', return_X_y=True, - as_frame=False, data_home=dataset_dir) - X = pd.DataFrame(X.todense()) - y = pd.DataFrame(y) - - y[y == -1] = 0 - - logging.info('a9a dataset is downloaded') - logging.info('reading CSV file...') - - x_train, x_test, y_train, y_test = train_test_split( - X, y, test_size=0.2, random_state=11) - for data, name in zip((x_train, x_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.csv' - data.to_csv(os.path.join(dataset_dir, filename), - header=False, index=False) - logging.info(f'dataset {dataset_name} ready.') - return True - - -def airline(dataset_dir: Path) -> bool: - dataset_name = 'airline' - os.makedirs(dataset_dir, exist_ok=True) - - url = 'http://kt.ijs.si/elena_ikonomovska/datasets/airline/airline_14col.data.bz2' - local_url = os.path.join(dataset_dir, os.path.basename(url)) - if not os.path.isfile(local_url): - logging.info(f'Started loading {dataset_name}') - _retrieve(url, local_url) - logging.info(f'{dataset_name} is loaded, started parsing...') - - cols = [ - "Year", "Month", "DayofMonth", "DayofWeek", "CRSDepTime", - "CRSArrTime", "UniqueCarrier", "FlightNum", "ActualElapsedTime", - "Origin", "Dest", "Distance", "Diverted", "ArrDelay" - ] - - # load the data as int16 - dtype = np.int16 - - dtype_columns = { - "Year": dtype, "Month": dtype, "DayofMonth": dtype, "DayofWeek": dtype, - "CRSDepTime": dtype, "CRSArrTime": dtype, "FlightNum": dtype, - "ActualElapsedTime": dtype, "Distance": - dtype, - "Diverted": dtype, "ArrDelay": dtype, - } - - df: Any = pd.read_csv(local_url, names=cols, dtype=dtype_columns) - - # Encode categoricals as numeric - for col in df.select_dtypes(['object']).columns: - df[col] = df[col].astype("category").cat.codes - - # Turn into binary classification problem - df["ArrDelayBinary"] = 1 * (df["ArrDelay"] > 0) - - X = df[df.columns.difference(["ArrDelay", "ArrDelayBinary"]) - ].to_numpy(dtype=np.float32) - y = df["ArrDelayBinary"].to_numpy(dtype=np.float32) - del df - X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77, - test_size=0.2, - ) - for data, name in zip((X_train, X_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def airline_ohe(dataset_dir: Path) -> bool: - """ - Dataset from szilard benchmarks: https://github.com/szilard/GBM-perf - TaskType:binclass - NumberOfFeatures:700 - NumberOfInstances:10100000 - """ - dataset_name = 'airline-ohe' - os.makedirs(dataset_dir, exist_ok=True) - - url_train = 'https://s3.amazonaws.com/benchm-ml--main/train-10m.csv' - url_test = 'https://s3.amazonaws.com/benchm-ml--main/test.csv' - local_url_train = os.path.join(dataset_dir, os.path.basename(url_train)) - local_url_test = os.path.join(dataset_dir, os.path.basename(url_test)) - if not os.path.isfile(local_url_train): - logging.info(f'Started loading {dataset_name}') - _retrieve(url_train, local_url_train) - if not os.path.isfile(local_url_test): - logging.info(f'Started loading {dataset_name}') - _retrieve(url_test, local_url_test) - logging.info(f'{dataset_name} is loaded, started parsing...') - - sets = [] - labels = [] - - categorical_names = 
["Month", "DayofMonth", - "DayOfWeek", "UniqueCarrier", "Origin", "Dest"] - - for local_url in [local_url_train, local_url_train]: - df = pd.read_csv(local_url, nrows=1000000 - if local_url.endswith('train-10m.csv') else None) - X = df.drop('dep_delayed_15min', 1) - y = df["dep_delayed_15min"] - - y_num = np.where(y == "Y", 1, 0) - - sets.append(X) - labels.append(y_num) - - n_samples_train = sets[0].shape[0] - - X_final: Any = pd.concat(sets) - X_final = pd.get_dummies(X_final, columns=categorical_names) - sets = [X_final[:n_samples_train], X_final[n_samples_train:]] - - for data, name in zip((sets[0], sets[1], labels[0], labels[1]), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def bosch(dataset_dir: Path) -> bool: - dataset_name = 'bosch' - os.makedirs(dataset_dir, exist_ok=True) - - filename = "train_numeric.csv.zip" - local_url = os.path.join(dataset_dir, filename) - - if not os.path.isfile(local_url): - logging.info(f'Started loading {dataset_name}') - os.system( - "kaggle competitions download -c bosch-production-line-performance -f " + - filename + " -p " + str(dataset_dir)) - logging.info(f'{dataset_name} is loaded, started parsing...') - X = pd.read_csv(local_url, index_col=0, compression='zip', dtype=np.float32) - y = X.iloc[:, -1].to_numpy(dtype=np.float32) - X.drop(X.columns[-1], axis=1, inplace=True) - X_np = X.to_numpy(dtype=np.float32) - X_train, X_test, y_train, y_test = train_test_split(X_np, y, random_state=77, - test_size=0.2, - ) - for data, name in zip((X_train, X_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def codrnanorm(dataset_dir: Path) -> bool: - """ - Abstract: Detection of non-coding RNAs on the basis of predicted secondary - structure formation free energy change. - Author: Andrew V Uzilov,Joshua M Keegan,David H Mathews. - Source: [original](http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets) - - Classification task. n_classes = 2. - codrnanorm X train dataset (390852, 8) - codrnanorm y train dataset (390852, 1) - codrnanorm X test dataset (97713, 8) - codrnanorm y test dataset (97713, 1) - """ - dataset_name = 'codrnanorm' - os.makedirs(dataset_dir, exist_ok=True) - - X, y = fetch_openml(name='codrnaNorm', return_X_y=True, - as_frame=False, data_home=dataset_dir) - X = pd.DataFrame(X.todense()) - y = pd.DataFrame(y) - - logging.info(f'{dataset_name} dataset is downloaded') - logging.info('reading CSV file...') - - x_train, x_test, y_train, y_test = train_test_split( - X, y, test_size=0.2, random_state=42) - for data, name in zip((x_train, x_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.csv' - data.to_csv(os.path.join(dataset_dir, filename), - header=False, index=False) - logging.info(f'dataset {dataset_name} ready.') - return True - - -def connect(dataset_dir: Path) -> bool: - """ - Source: - UC Irvine Machine Learning Repository - http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass.htm - - Classification task. n_classes = 3. 
- connect X train dataset (196045, 127) - connect y train dataset (196045, 1) - connect X test dataset (49012, 127) - connect y test dataset (49012, 1) - """ - dataset_name = 'connect' - os.makedirs(dataset_dir, exist_ok=True) - - X, y = fetch_openml(name='connect-4', return_X_y=True, - as_frame=False, data_home=dataset_dir) - X = pd.DataFrame(X.todense()) - y = pd.DataFrame(y) - y = y.astype(int) - - logging.info(f'{dataset_name} dataset is downloaded') - logging.info('reading CSV file...') - - x_train, x_test, y_train, y_test = train_test_split( - X, y, test_size=0.1, random_state=42) - for data, name in zip((x_train, x_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.csv' - data.to_csv(os.path.join(dataset_dir, filename), - header=False, index=False) - logging.info(f'dataset {dataset_name} ready.') - return True - - -def covertype(dataset_dir: Path) -> bool: - """ - Abstract: This is the original version of the famous - covertype dataset in ARFF format. - Author: Jock A. Blackard, Dr. Denis J. Dean, Dr. Charles W. Anderson - Source: [original](https://archive.ics.uci.edu/ml/datasets/covertype) - - Classification task. n_classes = 7. - covertype X train dataset (390852, 54) - covertype y train dataset (390852, 1) - covertype X test dataset (97713, 54) - covertype y test dataset (97713, 1) - """ - dataset_name = 'covertype' - os.makedirs(dataset_dir, exist_ok=True) - - X, y = fetch_openml(name='covertype', version=3, return_X_y=True, - as_frame=True, data_home=dataset_dir) - y = y.astype(int) - - logging.info(f'{dataset_name} dataset is downloaded') - logging.info('reading CSV file...') - - x_train, x_test, y_train, y_test = train_test_split( - X, y, test_size=0.2, random_state=42) - for data, name in zip((x_train, x_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.csv' - data.to_csv(os.path.join(dataset_dir, filename), - header=False, index=False) - logging.info(f'dataset {dataset_name} ready.') - return True - - -def covtype(dataset_dir: Path) -> bool: - dataset_name = 'covtype' - os.makedirs(dataset_dir, exist_ok=True) - - logging.info(f'Started loading {dataset_name}') - X, y = fetch_covtype(return_X_y=True) # pylint: disable=unexpected-keyword-arg - logging.info(f'{dataset_name} is loaded, started parsing...') - - X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77, - test_size=0.2, - ) - for data, name in zip((X_train, X_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def epsilon(dataset_dir: Path) -> bool: - dataset_name = 'epsilon' - os.makedirs(dataset_dir, exist_ok=True) - - url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ - '/epsilon_normalized.bz2' - url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ - '/epsilon_normalized.t.bz2' - local_url_train = os.path.join(dataset_dir, os.path.basename(url_train)) - local_url_test = os.path.join(dataset_dir, os.path.basename(url_test)) - - if not os.path.isfile(local_url_train): - logging.info(f'Started loading {dataset_name}, train') - _retrieve(url_train, local_url_train) - if not os.path.isfile(local_url_test): - logging.info(f'Started loading {dataset_name}, test') - _retrieve(url_test, local_url_test) - logging.info(f'{dataset_name} is loaded, started 
parsing...') - X_train, y_train = load_svmlight_file(local_url_train, - dtype=np.float32) - X_test, y_test = load_svmlight_file(local_url_test, - dtype=np.float32) - X_train = X_train.toarray() - X_test = X_test.toarray() - y_train[y_train <= 0] = 0 - y_test[y_test <= 0] = 0 - - for data, name in zip((X_train, X_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def fraud(dataset_dir: Path) -> bool: - dataset_name = 'fraud' - os.makedirs(dataset_dir, exist_ok=True) - - filename = "creditcard.csv" - local_url = os.path.join(dataset_dir, filename) - - if not os.path.isfile(local_url): - logging.info(f'Started loading {dataset_name}') - os.system("kaggle datasets download mlg-ulb/creditcardfraud -f" + - filename + " -p " + str(dataset_dir)) - logging.info(f'{dataset_name} is loaded, started parsing...') - - df = pd.read_csv(local_url + ".zip", dtype=np.float32) - X = df[[col for col in df.columns if col.startswith('V')]].to_numpy(dtype=np.float32) - y = df['Class'].to_numpy(dtype=np.float32) - X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77, - test_size=0.2, - ) - for data, name in zip((X_train, X_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def gisette(dataset_dir: Path) -> bool: - """ - GISETTE is a handwritten digit recognition problem. - The problem is to separate the highly confusable digits '4' and '9'. - This dataset is one of five datasets of the NIPS 2003 feature selection challenge. - - Classification task. n_classes = 2. 
- gisette X train dataset (6000, 5000) - gisette y train dataset (6000, 1) - gisette X test dataset (1000, 5000) - gisette y test dataset (1000, 1) - """ - dataset_name = 'gisette' - os.makedirs(dataset_dir, exist_ok=True) - - cache_dir = os.path.join(dataset_dir, '_gisette') - os.makedirs(cache_dir, exist_ok=True) - - domen_hhtp = 'http://archive.ics.uci.edu/ml/machine-learning-databases/' - - gisette_train_data_url = domen_hhtp + '/gisette/GISETTE/gisette_train.data' - filename_train_data = os.path.join(cache_dir, 'gisette_train.data') - if not os.path.exists(filename_train_data): - _retrieve(gisette_train_data_url, filename_train_data) - - gisette_train_labels_url = domen_hhtp + '/gisette/GISETTE/gisette_train.labels' - filename_train_labels = os.path.join(cache_dir, 'gisette_train.labels') - if not os.path.exists(filename_train_labels): - _retrieve(gisette_train_labels_url, filename_train_labels) - - gisette_test_data_url = domen_hhtp + '/gisette/GISETTE/gisette_valid.data' - filename_test_data = os.path.join(cache_dir, 'gisette_valid.data') - if not os.path.exists(filename_test_data): - _retrieve(gisette_test_data_url, filename_test_data) - - gisette_test_labels_url = domen_hhtp + '/gisette/gisette_valid.labels' - filename_test_labels = os.path.join(cache_dir, 'gisette_valid.labels') - if not os.path.exists(filename_test_labels): - _retrieve(gisette_test_labels_url, filename_test_labels) - - logging.info('gisette dataset is downloaded') - logging.info('reading CSV file...') - - num_cols = 5000 - - df_train = pd.read_csv(filename_train_data, header=None) - df_labels = pd.read_csv(filename_train_labels, header=None) - num_train = 6000 - x_train_arr = df_train.iloc[:num_train].values - x_train = pd.DataFrame(np.array([np.fromstring( - elem[0], dtype=int, count=num_cols, sep=' ') for elem in x_train_arr])) - y_train_arr = df_labels.iloc[:num_train].values - y_train = pd.DataFrame((y_train_arr > 0).astype(int)) - - num_train = 1000 - df_test = pd.read_csv(filename_test_data, header=None) - df_labels = pd.read_csv(filename_test_labels, header=None) - x_test_arr = df_test.iloc[:num_train].values - x_test = pd.DataFrame(np.array( - [np.fromstring( - elem[0], - dtype=int, count=num_cols, sep=' ') - for elem in x_test_arr])) - y_test_arr = df_labels.iloc[:num_train].values - y_test = pd.DataFrame((y_test_arr > 0).astype(int)) - - for data, name in zip((x_train, x_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.csv' - data.to_csv(os.path.join(dataset_dir, filename), - header=False, index=False) - - logging.info('dataset gisette ready.') - return True - - -def higgs(dataset_dir: Path) -> bool: - dataset_name = 'higgs' - os.makedirs(dataset_dir, exist_ok=True) - - url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz' - local_url = os.path.join(dataset_dir, os.path.basename(url)) - if not os.path.isfile(local_url): - logging.info(f'Started loading {dataset_name}') - _retrieve(url, local_url) - logging.info(f'{dataset_name} is loaded, started parsing...') - - higgs = pd.read_csv(local_url) - X = higgs.iloc[:, 1:].to_numpy(dtype=np.float32) - y = higgs.iloc[:, 0].to_numpy(dtype=np.float32) - X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77, - test_size=0.2, - ) - for data, name in zip((X_train, X_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset 
{dataset_name} is ready.') - return True - - -def higgs_one_m(dataset_dir: Path) -> bool: - """ - Higgs dataset from UCI machine learning repository ( - https://archive.ics.uci.edu/ml/datasets/HIGGS). - TaskType:binclass - NumberOfFeatures:28 - NumberOfInstances:11M - """ - dataset_name = 'higgs1m' - os.makedirs(dataset_dir, exist_ok=True) - - url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz' - local_url = os.path.join(dataset_dir, os.path.basename(url)) - if not os.path.isfile(local_url): - logging.info(f'Started loading {dataset_name}') - _retrieve(url, local_url) - logging.info(f'{dataset_name} is loaded, started parsing...') - - nrows_train, nrows_test, dtype = 1000000, 500000, np.float32 - data: Any = pd.read_csv(local_url, delimiter=",", header=None, - compression="gzip", dtype=dtype, nrows=nrows_train+nrows_test) - - data = data[list(data.columns[1:])+list(data.columns[0:1])] - n_features = data.shape[1]-1 - train_data = np.ascontiguousarray(data.values[:nrows_train, :n_features], dtype=dtype) - train_label = np.ascontiguousarray(data.values[:nrows_train, n_features], dtype=dtype) - test_data = np.ascontiguousarray( - data.values[nrows_train: nrows_train + nrows_test, : n_features], - dtype=dtype) - test_label = np.ascontiguousarray( - data.values[nrows_train: nrows_train + nrows_test, n_features], - dtype=dtype) - for data, name in zip((train_data, test_data, train_label, test_label), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def ijcnn(dataset_dir: Path) -> bool: - """ - Author: Danil Prokhorov. - libSVM,AAD group - Cite: Danil Prokhorov. IJCNN 2001 neural network competition. - Slide presentation in IJCNN'01, - Ford Research Laboratory, 2001. http://www.geocities.com/ijcnn/nnc_ijcnn01.pdf. - - Classification task. n_classes = 2. - ijcnn X train dataset (153344, 22) - ijcnn y train dataset (153344, 1) - ijcnn X test dataset (38337, 22) - ijcnn y test dataset (38337, 1) - """ - dataset_name = 'ijcnn' - os.makedirs(dataset_dir, exist_ok=True) - - X, y = fetch_openml(name='ijcnn', return_X_y=True, - as_frame=False, data_home=dataset_dir) - X = pd.DataFrame(X.todense()) - y = pd.DataFrame(y) - - y[y == -1] = 0 - - logging.info(f'{dataset_name} dataset is downloaded') - logging.info('reading CSV file...') - - x_train, x_test, y_train, y_test = train_test_split( - X, y, test_size=0.2, random_state=42) - for data, name in zip((x_train, x_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.csv' - data.to_csv(os.path.join(dataset_dir, filename), - header=False, index=False) - logging.info(f'dataset {dataset_name} ready.') - return True - - -def klaverjas(dataset_dir: Path) -> bool: - """ - Abstract: - Klaverjas is an example of the Jack-Nine card games, - which are characterized as trick-taking games where the the Jack - and nine of the trump suit are the highest-ranking trumps, and - the tens and aces of other suits are the most valuable cards - of these suits. It is played by four players in two teams. - - Task Information: - Classification task. n_classes = 2. 
- klaverjas X train dataset (196045, 3) - klaverjas y train dataset (196045, 1) - klaverjas X test dataset (49012, 3) - klaverjas y test dataset (49012, 1) - """ - dataset_name = 'klaverjas' - os.makedirs(dataset_dir, exist_ok=True) - - X, y = fetch_openml(name='Klaverjas2018', return_X_y=True, - as_frame=True, data_home=dataset_dir) - - y = y.cat.codes - logging.info(f'{dataset_name} dataset is downloaded') - logging.info('reading CSV file...') - - x_train, x_test, y_train, y_test = train_test_split( - X, y, train_size=0.2, random_state=42) - for data, name in zip((x_train, x_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.csv' - data.to_csv(os.path.join(dataset_dir, filename), - header=False, index=False) - logging.info(f'dataset {dataset_name} ready.') - return True - - -def mnist(dataset_dir: Path) -> bool: - """ - Abstract: - The MNIST database of handwritten digits with 784 features. - It can be split in a training set of the first 60,000 examples, - and a test set of 10,000 examples - Source: - Yann LeCun, Corinna Cortes, Christopher J.C. Burges - http://yann.lecun.com/exdb/mnist/ - - Classification task. n_classes = 10. - mnist X train dataset (60000, 784) - mnist y train dataset (60000, 1) - mnist X test dataset (10000, 784) - mnist y test dataset (10000, 1) - """ - dataset_name = 'mnist' - - os.makedirs(dataset_dir, exist_ok=True) - - X, y = fetch_openml(name='mnist_784', return_X_y=True, - as_frame=True, data_home=dataset_dir) - y = y.astype(int) - X = X / 255 - - logging.info(f'{dataset_name} dataset is downloaded') - logging.info('reading CSV file...') - - x_train, x_test, y_train, y_test = train_test_split( - X, y, test_size=10000, shuffle=False) - for data, name in zip((x_train, x_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.csv' - data.to_csv(os.path.join(dataset_dir, filename), - header=False, index=False) - logging.info(f'dataset {dataset_name} ready.') - return True - - -def mortgage_first_q(dataset_dir: Path) -> bool: - return False - - -def msrank(dataset_dir: Path) -> bool: - """ - Dataset from szilard benchmarks: https://github.com/szilard/GBM-perf - TaskType:binclass - NumberOfFeatures:700 - NumberOfInstances:10100000 - """ - dataset_name = 'msrank' - os.makedirs(dataset_dir, exist_ok=True) - url = "https://storage.mds.yandex.net/get-devtools-opensource/471749/msrank.tar.gz" - local_url = os.path.join(dataset_dir, os.path.basename(url)) - if not os.path.isfile(local_url): - logging.info(f'Started loading {dataset_name}') - _retrieve(url, local_url) - logging.info(f'{dataset_name} is loaded, unzipping...') - tar = tarfile.open(local_url, "r:gz") - tar.extractall(dataset_dir) - tar.close() - logging.info(f'{dataset_name} is unzipped, started parsing...') - - sets = [] - labels = [] - n_features = 137 - - for set_name in ['train.txt', 'vali.txt', 'test.txt']: - file_name = str(dataset_dir) + os.path.join('MSRank', set_name) - - n_samples = _count_lines(file_name) - with open(file_name, 'r') as file_obj: - X, y = _read_libsvm_msrank(file_obj, n_samples, n_features, np.float32) - - sets.append(X) - labels.append(y) - - sets[0] = np.vstack((sets[0], sets[1])) - labels[0] = np.hstack((labels[0], labels[1])) - - sets = [np.ascontiguousarray(sets[i]) for i in [0, 2]] - labels = [np.ascontiguousarray(labels[i]) for i in [0, 2]] - - for data, name in zip((sets[0], sets[1], labels[0], labels[1]), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = 
f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def plasticc(dataset_dir: Path) -> bool: - return False - - -def santander(dataset_dir: Path) -> bool: - return False - - -def sensit(dataset_dir: Path) -> bool: - """ - Abstract: Vehicle classification in distributed sensor networks. - Author: M. Duarte, Y. H. Hu - Source: [original](http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets) - - Classification task. n_classes = 2. - sensit X train dataset (196045, 3) - sensit y train dataset (196045, 1) - sensit X test dataset (49012, 3) - sensit y test dataset (49012, 1) - """ - dataset_name = 'sensit' - os.makedirs(dataset_dir, exist_ok=True) - - X, y = fetch_openml(name='SensIT-Vehicle-Combined', - return_X_y=True, as_frame=False, data_home=dataset_dir) - X = pd.DataFrame(X.todense()) - y = pd.DataFrame(y) - y = y.astype(int) - - logging.info(f'{dataset_name} dataset is downloaded') - logging.info('reading CSV file...') - - x_train, x_test, y_train, y_test = train_test_split( - X, y, test_size=0.2, random_state=42) - for data, name in zip((x_train, x_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.csv' - data.to_csv(os.path.join(dataset_dir, filename), - header=False, index=False) - logging.info(f'dataset {dataset_name} ready.') - return True - - -def skin_segmentation(dataset_dir: Path) -> bool: - """ - Abstract: - The Skin Segmentation dataset is constructed over B, G, R color space. - Skin and Nonskin dataset is generated using skin textures from - face images of diversity of age, gender, and race people. - Author: Rajen Bhatt, Abhinav Dhall, rajen.bhatt '@' gmail.com, IIT Delhi. - - Classification task. n_classes = 2. 
- skin_segmentation X train dataset (196045, 3) - skin_segmentation y train dataset (196045, 1) - skin_segmentation X test dataset (49012, 3) - skin_segmentation y test dataset (49012, 1) - """ - dataset_name = 'skin_segmentation' - os.makedirs(dataset_dir, exist_ok=True) - - X, y = fetch_openml(name='skin-segmentation', - return_X_y=True, as_frame=True, data_home=dataset_dir) - y = y.astype(int) - y[y == 2] = 0 - - logging.info(f'{dataset_name} dataset is downloaded') - logging.info('reading CSV file...') - - x_train, x_test, y_train, y_test = train_test_split( - X, y, test_size=0.2, random_state=42) - for data, name in zip((x_train, x_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.csv' - data.to_csv(os.path.join(dataset_dir, filename), - header=False, index=False) - logging.info(f'dataset {dataset_name} ready.') - return True - - -def year(dataset_dir: Path) -> bool: - dataset_name = 'year' - os.makedirs(dataset_dir, exist_ok=True) - - url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00203/YearPredictionMSD.txt' \ - '.zip' - local_url = os.path.join(dataset_dir, os.path.basename(url)) - if not os.path.isfile(local_url): - logging.info(f'Started loading {dataset_name}') - _retrieve(url, local_url) - logging.info(f'{dataset_name} is loaded, started parsing...') - - year = pd.read_csv(local_url, header=None) - X = year.iloc[:, 1:].to_numpy(dtype=np.float32) - y = year.iloc[:, 0].to_numpy(dtype=np.float32) - - X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, - train_size=463715, - test_size=51630) - - for data, name in zip((X_train, X_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True +# =============================================================================== +# Copyright 2020-2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =============================================================================== + +import logging +import os +from pathlib import Path +from typing import Any + +import numpy as np +import pandas as pd +from sklearn.datasets import fetch_openml, load_svmlight_file +from sklearn.model_selection import train_test_split + +from .loader_utils import retrieve + + +def a_nine_a(dataset_dir: Path) -> bool: + """ + Author: Ronny Kohavi","Barry Becker + libSVM","AAD group + Source: original - Date unknown + Cite: http://archive.ics.uci.edu/ml/datasets/Adult + + Classification task. n_classes = 2. 
+ a9a X train dataset (39073, 123) + a9a y train dataset (39073, 1) + a9a X test dataset (9769, 123) + a9a y test dataset (9769, 1) + """ + dataset_name = 'a9a' + os.makedirs(dataset_dir, exist_ok=True) + + X, y = fetch_openml(name='a9a', return_X_y=True, + as_frame=False, data_home=dataset_dir) + X = pd.DataFrame(X.todense()) + y = pd.DataFrame(y) + + y[y == -1] = 0 + + logging.info('a9a dataset is downloaded') + logging.info('reading CSV file...') + + x_train, x_test, y_train, y_test = train_test_split( + X, y, test_size=0.2, random_state=11) + for data, name in zip((x_train, x_test, y_train, y_test), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.csv' + data.to_csv(os.path.join(dataset_dir, filename), + header=False, index=False) + logging.info(f'dataset {dataset_name} ready.') + return True + + +def airline(dataset_dir: Path) -> bool: + dataset_name = 'airline' + os.makedirs(dataset_dir, exist_ok=True) + + url = 'http://kt.ijs.si/elena_ikonomovska/datasets/airline/airline_14col.data.bz2' + local_url = os.path.join(dataset_dir, os.path.basename(url)) + if not os.path.isfile(local_url): + logging.info(f'Started loading {dataset_name}') + retrieve(url, local_url) + logging.info(f'{dataset_name} is loaded, started parsing...') + + cols = [ + "Year", "Month", "DayofMonth", "DayofWeek", "CRSDepTime", + "CRSArrTime", "UniqueCarrier", "FlightNum", "ActualElapsedTime", + "Origin", "Dest", "Distance", "Diverted", "ArrDelay" + ] + + # load the data as int16 + dtype = np.int16 + + dtype_columns = { + "Year": dtype, "Month": dtype, "DayofMonth": dtype, "DayofWeek": dtype, + "CRSDepTime": dtype, "CRSArrTime": dtype, "FlightNum": dtype, + "ActualElapsedTime": dtype, "Distance": + dtype, + "Diverted": dtype, "ArrDelay": dtype, + } + + df: Any = pd.read_csv(local_url, names=cols, dtype=dtype_columns) + + # Encode categoricals as numeric + for col in df.select_dtypes(['object']).columns: + df[col] = df[col].astype("category").cat.codes + + # Turn into binary classification problem + df["ArrDelayBinary"] = 1 * (df["ArrDelay"] > 0) + + X = df[df.columns.difference(["ArrDelay", "ArrDelayBinary"]) + ].to_numpy(dtype=np.float32) + y = df["ArrDelayBinary"].to_numpy(dtype=np.float32) + del df + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77, + test_size=0.2, + ) + for data, name in zip((X_train, X_test, y_train, y_test), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset {dataset_name} is ready.') + return True + + +def airline_ohe(dataset_dir: Path) -> bool: + """ + Dataset from szilard benchmarks: https://github.com/szilard/GBM-perf + TaskType:binclass + NumberOfFeatures:700 + NumberOfInstances:10100000 + """ + dataset_name = 'airline-ohe' + os.makedirs(dataset_dir, exist_ok=True) + + url_train = 'https://s3.amazonaws.com/benchm-ml--main/train-10m.csv' + url_test = 'https://s3.amazonaws.com/benchm-ml--main/test.csv' + local_url_train = os.path.join(dataset_dir, os.path.basename(url_train)) + local_url_test = os.path.join(dataset_dir, os.path.basename(url_test)) + if not os.path.isfile(local_url_train): + logging.info(f'Started loading {dataset_name}') + retrieve(url_train, local_url_train) + if not os.path.isfile(local_url_test): + logging.info(f'Started loading {dataset_name}') + retrieve(url_test, local_url_test) + logging.info(f'{dataset_name} is loaded, started parsing...') + + sets = [] + labels = [] + + categorical_names = 
["Month", "DayofMonth", + "DayOfWeek", "UniqueCarrier", "Origin", "Dest"] + + for local_url in [local_url_train, local_url_train]: + df = pd.read_csv(local_url, nrows=1000000 + if local_url.endswith('train-10m.csv') else None) + X = df.drop('dep_delayed_15min', 1) + y = df["dep_delayed_15min"] + + y_num = np.where(y == "Y", 1, 0) + + sets.append(X) + labels.append(y_num) + + n_samples_train = sets[0].shape[0] + + X_final: Any = pd.concat(sets) + X_final = pd.get_dummies(X_final, columns=categorical_names) + sets = [X_final[:n_samples_train], X_final[n_samples_train:]] + + for data, name in zip((sets[0], sets[1], labels[0], labels[1]), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset {dataset_name} is ready.') + return True + + +def bosch(dataset_dir: Path) -> bool: + dataset_name = 'bosch' + os.makedirs(dataset_dir, exist_ok=True) + + filename = "train_numeric.csv.zip" + local_url = os.path.join(dataset_dir, filename) + + if not os.path.isfile(local_url): + logging.info(f'Started loading {dataset_name}') + os.system( + "kaggle competitions download -c bosch-production-line-performance -f " + + filename + " -p " + str(dataset_dir)) + logging.info(f'{dataset_name} is loaded, started parsing...') + X = pd.read_csv(local_url, index_col=0, compression='zip', dtype=np.float32) + y = X.iloc[:, -1].to_numpy(dtype=np.float32) + X.drop(X.columns[-1], axis=1, inplace=True) + X_np = X.to_numpy(dtype=np.float32) + X_train, X_test, y_train, y_test = train_test_split(X_np, y, random_state=77, + test_size=0.2, + ) + for data, name in zip((X_train, X_test, y_train, y_test), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset {dataset_name} is ready.') + return True + + +def codrnanorm(dataset_dir: Path) -> bool: + """ + Abstract: Detection of non-coding RNAs on the basis of predicted secondary + structure formation free energy change. + Author: Andrew V Uzilov,Joshua M Keegan,David H Mathews. + Source: [original](http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets) + + Classification task. n_classes = 2. 
+ codrnanorm X train dataset (390852, 8) + codrnanorm y train dataset (390852, 1) + codrnanorm X test dataset (97713, 8) + codrnanorm y test dataset (97713, 1) + """ + dataset_name = 'codrnanorm' + os.makedirs(dataset_dir, exist_ok=True) + + X, y = fetch_openml(name='codrnaNorm', return_X_y=True, + as_frame=False, data_home=dataset_dir) + X = pd.DataFrame(X.todense()) + y = pd.DataFrame(y) + + logging.info(f'{dataset_name} dataset is downloaded') + logging.info('reading CSV file...') + + x_train, x_test, y_train, y_test = train_test_split( + X, y, test_size=0.2, random_state=42) + for data, name in zip((x_train, x_test, y_train, y_test), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.csv' + data.to_csv(os.path.join(dataset_dir, filename), + header=False, index=False) + logging.info(f'dataset {dataset_name} ready.') + return True + + +def epsilon(dataset_dir: Path) -> bool: + dataset_name = 'epsilon' + os.makedirs(dataset_dir, exist_ok=True) + + url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ + '/epsilon_normalized.bz2' + url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ + '/epsilon_normalized.t.bz2' + local_url_train = os.path.join(dataset_dir, os.path.basename(url_train)) + local_url_test = os.path.join(dataset_dir, os.path.basename(url_test)) + + if not os.path.isfile(local_url_train): + logging.info(f'Started loading {dataset_name}, train') + retrieve(url_train, local_url_train) + if not os.path.isfile(local_url_test): + logging.info(f'Started loading {dataset_name}, test') + retrieve(url_test, local_url_test) + logging.info(f'{dataset_name} is loaded, started parsing...') + X_train, y_train = load_svmlight_file(local_url_train, + dtype=np.float32) + X_test, y_test = load_svmlight_file(local_url_test, + dtype=np.float32) + X_train = X_train.toarray() + X_test = X_test.toarray() + y_train[y_train <= 0] = 0 + y_test[y_test <= 0] = 0 + + for data, name in zip((X_train, X_test, y_train, y_test), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset {dataset_name} is ready.') + return True + + +def fraud(dataset_dir: Path) -> bool: + dataset_name = 'fraud' + os.makedirs(dataset_dir, exist_ok=True) + + filename = "creditcard.csv" + local_url = os.path.join(dataset_dir, filename) + + if not os.path.isfile(local_url): + logging.info(f'Started loading {dataset_name}') + os.system("kaggle datasets download mlg-ulb/creditcardfraud -f" + + filename + " -p " + str(dataset_dir)) + logging.info(f'{dataset_name} is loaded, started parsing...') + + df = pd.read_csv(local_url + ".zip", dtype=np.float32) + X = df[[col for col in df.columns if col.startswith('V')]].to_numpy(dtype=np.float32) + y = df['Class'].to_numpy(dtype=np.float32) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77, + test_size=0.2, + ) + for data, name in zip((X_train, X_test, y_train, y_test), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset {dataset_name} is ready.') + return True + + +def gisette(dataset_dir: Path) -> bool: + """ + GISETTE is a handwritten digit recognition problem. + The problem is to separate the highly confusable digits '4' and '9'. + This dataset is one of five datasets of the NIPS 2003 feature selection challenge. + + Classification task. n_classes = 2. 
+ gisette X train dataset (6000, 5000) + gisette y train dataset (6000, 1) + gisette X test dataset (1000, 5000) + gisette y test dataset (1000, 1) + """ + dataset_name = 'gisette' + os.makedirs(dataset_dir, exist_ok=True) + + cache_dir = os.path.join(dataset_dir, '_gisette') + os.makedirs(cache_dir, exist_ok=True) + + domen_hhtp = 'http://archive.ics.uci.edu/ml/machine-learning-databases/' + + gisette_train_data_url = domen_hhtp + '/gisette/GISETTE/gisette_train.data' + filename_train_data = os.path.join(cache_dir, 'gisette_train.data') + if not os.path.exists(filename_train_data): + retrieve(gisette_train_data_url, filename_train_data) + + gisette_train_labels_url = domen_hhtp + '/gisette/GISETTE/gisette_train.labels' + filename_train_labels = os.path.join(cache_dir, 'gisette_train.labels') + if not os.path.exists(filename_train_labels): + retrieve(gisette_train_labels_url, filename_train_labels) + + gisette_test_data_url = domen_hhtp + '/gisette/GISETTE/gisette_valid.data' + filename_test_data = os.path.join(cache_dir, 'gisette_valid.data') + if not os.path.exists(filename_test_data): + retrieve(gisette_test_data_url, filename_test_data) + + gisette_test_labels_url = domen_hhtp + '/gisette/gisette_valid.labels' + filename_test_labels = os.path.join(cache_dir, 'gisette_valid.labels') + if not os.path.exists(filename_test_labels): + retrieve(gisette_test_labels_url, filename_test_labels) + + logging.info('gisette dataset is downloaded') + logging.info('reading CSV file...') + + num_cols = 5000 + + df_train = pd.read_csv(filename_train_data, header=None) + df_labels = pd.read_csv(filename_train_labels, header=None) + num_train = 6000 + x_train_arr = df_train.iloc[:num_train].values + x_train = pd.DataFrame(np.array([np.fromstring( + elem[0], dtype=int, count=num_cols, sep=' ') for elem in x_train_arr])) + y_train_arr = df_labels.iloc[:num_train].values + y_train = pd.DataFrame((y_train_arr > 0).astype(int)) + + num_train = 1000 + df_test = pd.read_csv(filename_test_data, header=None) + df_labels = pd.read_csv(filename_test_labels, header=None) + x_test_arr = df_test.iloc[:num_train].values + x_test = pd.DataFrame(np.array( + [np.fromstring( + elem[0], + dtype=int, count=num_cols, sep=' ') + for elem in x_test_arr])) + y_test_arr = df_labels.iloc[:num_train].values + y_test = pd.DataFrame((y_test_arr > 0).astype(int)) + + for data, name in zip((x_train, x_test, y_train, y_test), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.csv' + data.to_csv(os.path.join(dataset_dir, filename), + header=False, index=False) + + logging.info('dataset gisette ready.') + return True + + +def higgs(dataset_dir: Path) -> bool: + dataset_name = 'higgs' + os.makedirs(dataset_dir, exist_ok=True) + + url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz' + local_url = os.path.join(dataset_dir, os.path.basename(url)) + if not os.path.isfile(local_url): + logging.info(f'Started loading {dataset_name}') + retrieve(url, local_url) + logging.info(f'{dataset_name} is loaded, started parsing...') + + higgs = pd.read_csv(local_url) + X = higgs.iloc[:, 1:].to_numpy(dtype=np.float32) + y = higgs.iloc[:, 0].to_numpy(dtype=np.float32) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77, + test_size=0.2, + ) + for data, name in zip((X_train, X_test, y_train, y_test), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset 
{dataset_name} is ready.') + return True + + +def higgs_one_m(dataset_dir: Path) -> bool: + """ + Higgs dataset from UCI machine learning repository ( + https://archive.ics.uci.edu/ml/datasets/HIGGS). + TaskType:binclass + NumberOfFeatures:28 + NumberOfInstances:11M + """ + dataset_name = 'higgs1m' + os.makedirs(dataset_dir, exist_ok=True) + + url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz' + local_url = os.path.join(dataset_dir, os.path.basename(url)) + if not os.path.isfile(local_url): + logging.info(f'Started loading {dataset_name}') + retrieve(url, local_url) + logging.info(f'{dataset_name} is loaded, started parsing...') + + nrows_train, nrows_test, dtype = 1000000, 500000, np.float32 + data: Any = pd.read_csv(local_url, delimiter=",", header=None, + compression="gzip", dtype=dtype, nrows=nrows_train+nrows_test) + + data = data[list(data.columns[1:])+list(data.columns[0:1])] + n_features = data.shape[1]-1 + train_data = np.ascontiguousarray(data.values[:nrows_train, :n_features], dtype=dtype) + train_label = np.ascontiguousarray(data.values[:nrows_train, n_features], dtype=dtype) + test_data = np.ascontiguousarray( + data.values[nrows_train: nrows_train + nrows_test, : n_features], + dtype=dtype) + test_label = np.ascontiguousarray( + data.values[nrows_train: nrows_train + nrows_test, n_features], + dtype=dtype) + for data, name in zip((train_data, test_data, train_label, test_label), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset {dataset_name} is ready.') + return True + + +def ijcnn(dataset_dir: Path) -> bool: + """ + Author: Danil Prokhorov. + libSVM,AAD group + Cite: Danil Prokhorov. IJCNN 2001 neural network competition. + Slide presentation in IJCNN'01, + Ford Research Laboratory, 2001. http://www.geocities.com/ijcnn/nnc_ijcnn01.pdf. + + Classification task. n_classes = 2. + ijcnn X train dataset (153344, 22) + ijcnn y train dataset (153344, 1) + ijcnn X test dataset (38337, 22) + ijcnn y test dataset (38337, 1) + """ + dataset_name = 'ijcnn' + os.makedirs(dataset_dir, exist_ok=True) + + X, y = fetch_openml(name='ijcnn', return_X_y=True, + as_frame=False, data_home=dataset_dir) + X = pd.DataFrame(X.todense()) + y = pd.DataFrame(y) + + y[y == -1] = 0 + + logging.info(f'{dataset_name} dataset is downloaded') + logging.info('reading CSV file...') + + x_train, x_test, y_train, y_test = train_test_split( + X, y, test_size=0.2, random_state=42) + for data, name in zip((x_train, x_test, y_train, y_test), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.csv' + data.to_csv(os.path.join(dataset_dir, filename), + header=False, index=False) + logging.info(f'dataset {dataset_name} ready.') + return True + + +def klaverjas(dataset_dir: Path) -> bool: + """ + Abstract: + Klaverjas is an example of the Jack-Nine card games, + which are characterized as trick-taking games where the the Jack + and nine of the trump suit are the highest-ranking trumps, and + the tens and aces of other suits are the most valuable cards + of these suits. It is played by four players in two teams. + + Task Information: + Classification task. n_classes = 2. 
+ klaverjas X train dataset (196045, 3) + klaverjas y train dataset (196045, 1) + klaverjas X test dataset (49012, 3) + klaverjas y test dataset (49012, 1) + """ + dataset_name = 'klaverjas' + os.makedirs(dataset_dir, exist_ok=True) + + X, y = fetch_openml(name='Klaverjas2018', return_X_y=True, + as_frame=True, data_home=dataset_dir) + + y = y.cat.codes + logging.info(f'{dataset_name} dataset is downloaded') + logging.info('reading CSV file...') + + x_train, x_test, y_train, y_test = train_test_split( + X, y, train_size=0.2, random_state=42) + for data, name in zip((x_train, x_test, y_train, y_test), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.csv' + data.to_csv(os.path.join(dataset_dir, filename), + header=False, index=False) + logging.info(f'dataset {dataset_name} ready.') + return True + + +def santander(dataset_dir: Path) -> bool: + return False + + +def skin_segmentation(dataset_dir: Path) -> bool: + """ + Abstract: + The Skin Segmentation dataset is constructed over B, G, R color space. + Skin and Nonskin dataset is generated using skin textures from + face images of diversity of age, gender, and race people. + Author: Rajen Bhatt, Abhinav Dhall, rajen.bhatt '@' gmail.com, IIT Delhi. + + Classification task. n_classes = 2. + skin_segmentation X train dataset (196045, 3) + skin_segmentation y train dataset (196045, 1) + skin_segmentation X test dataset (49012, 3) + skin_segmentation y test dataset (49012, 1) + """ + dataset_name = 'skin_segmentation' + os.makedirs(dataset_dir, exist_ok=True) + + X, y = fetch_openml(name='skin-segmentation', + return_X_y=True, as_frame=True, data_home=dataset_dir) + y = y.astype(int) + y[y == 2] = 0 + + logging.info(f'{dataset_name} dataset is downloaded') + logging.info('reading CSV file...') + + x_train, x_test, y_train, y_test = train_test_split( + X, y, test_size=0.2, random_state=42) + for data, name in zip((x_train, x_test, y_train, y_test), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.csv' + data.to_csv(os.path.join(dataset_dir, filename), + header=False, index=False) + logging.info(f'dataset {dataset_name} ready.') + return True diff --git a/datasets/loader_mul.py b/datasets/loader_mul.py new file mode 100644 index 000000000..662a22338 --- /dev/null +++ b/datasets/loader_mul.py @@ -0,0 +1,241 @@ +# =============================================================================== +# Copyright 2020-2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# =============================================================================== + +import logging +import os +import tarfile +from pathlib import Path + +import numpy as np +import pandas as pd +from sklearn.datasets import fetch_covtype, fetch_openml +from sklearn.model_selection import train_test_split + +from .loader_utils import count_lines, read_libsvm_msrank, retrieve + + +def connect(dataset_dir: Path) -> bool: + """ + Source: + UC Irvine Machine Learning Repository + http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass.htm + + Classification task. n_classes = 3. + connect X train dataset (196045, 127) + connect y train dataset (196045, 1) + connect X test dataset (49012, 127) + connect y test dataset (49012, 1) + """ + dataset_name = 'connect' + os.makedirs(dataset_dir, exist_ok=True) + + X, y = fetch_openml(name='connect-4', return_X_y=True, + as_frame=False, data_home=dataset_dir) + X = pd.DataFrame(X.todense()) + y = pd.DataFrame(y) + y = y.astype(int) + + logging.info(f'{dataset_name} dataset is downloaded') + logging.info('reading CSV file...') + + x_train, x_test, y_train, y_test = train_test_split( + X, y, test_size=0.1, random_state=42) + for data, name in zip((x_train, x_test, y_train, y_test), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.csv' + data.to_csv(os.path.join(dataset_dir, filename), + header=False, index=False) + logging.info(f'dataset {dataset_name} ready.') + return True + + +def covertype(dataset_dir: Path) -> bool: + """ + Abstract: This is the original version of the famous + covertype dataset in ARFF format. + Author: Jock A. Blackard, Dr. Denis J. Dean, Dr. Charles W. Anderson + Source: [original](https://archive.ics.uci.edu/ml/datasets/covertype) + + Classification task. n_classes = 7. + covertype X train dataset (390852, 54) + covertype y train dataset (390852, 1) + covertype X test dataset (97713, 54) + covertype y test dataset (97713, 1) + """ + dataset_name = 'covertype' + os.makedirs(dataset_dir, exist_ok=True) + + X, y = fetch_openml(name='covertype', version=3, return_X_y=True, + as_frame=True, data_home=dataset_dir) + y = y.astype(int) + + logging.info(f'{dataset_name} dataset is downloaded') + logging.info('reading CSV file...') + + x_train, x_test, y_train, y_test = train_test_split( + X, y, test_size=0.2, random_state=42) + for data, name in zip((x_train, x_test, y_train, y_test), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.csv' + data.to_csv(os.path.join(dataset_dir, filename), + header=False, index=False) + logging.info(f'dataset {dataset_name} ready.') + return True + + +def covtype(dataset_dir: Path) -> bool: + dataset_name = 'covtype' + os.makedirs(dataset_dir, exist_ok=True) + + logging.info(f'Started loading {dataset_name}') + X, y = fetch_covtype(return_X_y=True) # pylint: disable=unexpected-keyword-arg + logging.info(f'{dataset_name} is loaded, started parsing...') + + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77, + test_size=0.2, + ) + for data, name in zip((X_train, X_test, y_train, y_test), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset {dataset_name} is ready.') + return True + + +def mnist(dataset_dir: Path) -> bool: + """ + Abstract: + The MNIST database of handwritten digits with 784 features. 
+ It can be split in a training set of the first 60,000 examples, + and a test set of 10,000 examples + Source: + Yann LeCun, Corinna Cortes, Christopher J.C. Burges + http://yann.lecun.com/exdb/mnist/ + + Classification task. n_classes = 10. + mnist X train dataset (60000, 784) + mnist y train dataset (60000, 1) + mnist X test dataset (10000, 784) + mnist y test dataset (10000, 1) + """ + dataset_name = 'mnist' + + os.makedirs(dataset_dir, exist_ok=True) + + X, y = fetch_openml(name='mnist_784', return_X_y=True, + as_frame=True, data_home=dataset_dir) + y = y.astype(int) + X = X / 255 + + logging.info(f'{dataset_name} dataset is downloaded') + logging.info('reading CSV file...') + + x_train, x_test, y_train, y_test = train_test_split( + X, y, test_size=10000, shuffle=False) + for data, name in zip((x_train, x_test, y_train, y_test), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.csv' + data.to_csv(os.path.join(dataset_dir, filename), + header=False, index=False) + logging.info(f'dataset {dataset_name} ready.') + return True + + +def msrank(dataset_dir: Path) -> bool: + """ + Dataset from szilard benchmarks: https://github.com/szilard/GBM-perf + TaskType:binclass + NumberOfFeatures:700 + NumberOfInstances:10100000 + """ + dataset_name = 'msrank' + os.makedirs(dataset_dir, exist_ok=True) + url = "https://storage.mds.yandex.net/get-devtools-opensource/471749/msrank.tar.gz" + local_url = os.path.join(dataset_dir, os.path.basename(url)) + if not os.path.isfile(local_url): + logging.info(f'Started loading {dataset_name}') + retrieve(url, local_url) + logging.info(f'{dataset_name} is loaded, unzipping...') + tar = tarfile.open(local_url, "r:gz") + tar.extractall(dataset_dir) + tar.close() + logging.info(f'{dataset_name} is unzipped, started parsing...') + + sets = [] + labels = [] + n_features = 137 + + for set_name in ['train.txt', 'vali.txt', 'test.txt']: + file_name = str(dataset_dir) + os.path.join('MSRank', set_name) + + n_samples = count_lines(file_name) + with open(file_name, 'r') as file_obj: + X, y = read_libsvm_msrank(file_obj, n_samples, n_features, np.float32) + + sets.append(X) + labels.append(y) + + sets[0] = np.vstack((sets[0], sets[1])) + labels[0] = np.hstack((labels[0], labels[1])) + + sets = [np.ascontiguousarray(sets[i]) for i in [0, 2]] + labels = [np.ascontiguousarray(labels[i]) for i in [0, 2]] + + for data, name in zip((sets[0], sets[1], labels[0], labels[1]), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset {dataset_name} is ready.') + return True + + +def plasticc(dataset_dir: Path) -> bool: + return False + + +def sensit(dataset_dir: Path) -> bool: + """ + Abstract: Vehicle classification in distributed sensor networks. + Author: M. Duarte, Y. H. Hu + Source: [original](http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets) + + Classification task. n_classes = 2. 
+ sensit X train dataset (196045, 3) + sensit y train dataset (196045, 1) + sensit X test dataset (49012, 3) + sensit y test dataset (49012, 1) + """ + dataset_name = 'sensit' + os.makedirs(dataset_dir, exist_ok=True) + + X, y = fetch_openml(name='SensIT-Vehicle-Combined', + return_X_y=True, as_frame=False, data_home=dataset_dir) + X = pd.DataFrame(X.todense()) + y = pd.DataFrame(y) + y = y.astype(int) + + logging.info(f'{dataset_name} dataset is downloaded') + logging.info('reading CSV file...') + + x_train, x_test, y_train, y_test = train_test_split( + X, y, test_size=0.2, random_state=42) + for data, name in zip((x_train, x_test, y_train, y_test), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.csv' + data.to_csv(os.path.join(dataset_dir, filename), + header=False, index=False) + logging.info(f'dataset {dataset_name} ready.') + return True diff --git a/datasets/loader_reg.py b/datasets/loader_reg.py new file mode 100644 index 000000000..73ce477c6 --- /dev/null +++ b/datasets/loader_reg.py @@ -0,0 +1,57 @@ +# =============================================================================== +# Copyright 2020-2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =============================================================================== + +import logging +import os +from pathlib import Path + +import numpy as np +import pandas as pd +from sklearn.model_selection import train_test_split + +from .loader_utils import retrieve + + +def mortgage_first_q(dataset_dir: Path) -> bool: + return False + + +def year_prediction_msd(dataset_dir: Path) -> bool: + dataset_name = 'year_prediction_msd' + os.makedirs(dataset_dir, exist_ok=True) + + url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00203/YearPredictionMSD.txt' \ + '.zip' + local_url = os.path.join(dataset_dir, os.path.basename(url)) + if not os.path.isfile(local_url): + logging.info(f'Started loading {dataset_name}') + retrieve(url, local_url) + logging.info(f'{dataset_name} is loaded, started parsing...') + + year = pd.read_csv(local_url, header=None) + X = year.iloc[:, 1:].to_numpy(dtype=np.float32) + y = year.iloc[:, 0].to_numpy(dtype=np.float32) + + X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, + train_size=463715, + test_size=51630) + + for data, name in zip((X_train, X_test, y_train, y_test), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset {dataset_name} is ready.') + return True diff --git a/datasets/loader_utils.py b/datasets/loader_utils.py new file mode 100755 index 000000000..29366eccb --- /dev/null +++ b/datasets/loader_utils.py @@ -0,0 +1,76 @@ +# =============================================================================== +# Copyright 2020-2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =============================================================================== + +import re +from urllib.request import urlretrieve + +import numpy as np +import tqdm + +pbar: tqdm.tqdm = None + + +def _show_progress(block_num: int, block_size: int, total_size: int) -> None: + global pbar + if pbar is None: + pbar = tqdm.tqdm(total=total_size / 1024, unit='kB') + + downloaded = block_num * block_size + if downloaded < total_size: + pbar.update(block_size / 1024) + else: + pbar.close() + pbar = None + + +def retrieve(url: str, filename: str) -> None: + urlretrieve(url, filename, reporthook=_show_progress) + + +def read_libsvm_msrank(file_obj, n_samples, n_features, dtype): + X = np.zeros((n_samples, n_features)) + y = np.zeros((n_samples,)) + + counter = 0 + + regexp = re.compile(r'[A-Za-z0-9]+:(-?\d*\.?\d+)') + + for line in file_obj: + line = str(line).replace("\\n'", "") + line = regexp.sub(r'\g<1>', line) + line = line.rstrip(" \n\r").split(' ') + + y[counter] = int(line[0]) + X[counter] = [float(i) for i in line[1:]] + + counter += 1 + if counter == n_samples: + break + + return np.array(X, dtype=dtype), np.array(y, dtype=dtype) + + +def _make_gen(reader): + b = reader(1024 * 1024) + while b: + yield b + b = reader(1024 * 1024) + + +def count_lines(filename): + with open(filename, 'rb') as f: + f_gen = _make_gen(f.read) + return sum(buf.count(b'\n') for buf in f_gen) From dcfc5b91c95f8708ed231e86ff10207ea58ee3d3 Mon Sep 17 00:00:00 2001 From: Igor Rukhovich Date: Thu, 1 Apr 2021 12:48:25 +0300 Subject: [PATCH 14/31] Why doesnt mypy work? 
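For reference, a rough sketch of how the helpers in datasets/loader_utils.py above fit together. The URL, output paths and feature count below are placeholders rather than values taken from this repository, and the import assumes the snippet is run from the repository root:

# Rough usage sketch for the datasets/loader_utils.py helpers defined above.
# The URL, output paths and feature count are placeholders, not values used by this repository.
import os

import numpy as np

from datasets.loader_utils import count_lines, read_libsvm_msrank, retrieve

data_dir = 'data'
os.makedirs(data_dir, exist_ok=True)

url = 'https://example.com/some_ranking_set.txt'  # placeholder URL for a libsvm-format file
local_path = os.path.join(data_dir, os.path.basename(url))
if not os.path.isfile(local_path):
    retrieve(url, local_path)  # download with the tqdm-based progress hook

n_samples = count_lines(local_path)  # count rows by scanning the file in 1 MiB chunks
with open(local_path, 'r') as f:
    X, y = read_libsvm_msrank(f, n_samples, 137, np.float32)

np.save(os.path.join(data_dir, 'example_x_train.npy'), X)
np.save(os.path.join(data_dir, 'example_y_train.npy'), y)

Each dataset loader in this patch follows the same pattern before saving the four <name>_{x|y}_{train|test} files that the benchmark configs reference.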
--- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 67cef5c81..30725479d 100755 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -71,7 +71,7 @@ jobs: steps: - task: UsePythonVersion@0 inputs: - versionSpec: '3.7' + versionSpec: '3.8' addToPath: true - script: | python -m pip install --upgrade pip setuptools From 340402e1176f7f5839c1372e172871b6f5186a47 Mon Sep 17 00:00:00 2001 From: Igor Rukhovich Date: Thu, 15 Apr 2021 08:48:07 +0300 Subject: [PATCH 15/31] Added abalone + letters, updated all GB configs --- configs/lgbm_mb_cpu_config.json | 144 +++++++++--------- configs/xgb_cpu_config.json | 155 +++++++++++++------- configs/xgb_cpu_nvda_config.json | 16 +- configs/xgb_gpu_config.json | 241 +++++++++++++++++++------------ configs/xgb_mb_cpu_config.json | 152 ++++++++++--------- datasets/load_datasets.py | 8 +- datasets/loader_mul.py | 35 +++++ datasets/loader_reg.py | 34 +++++ runner.py | 2 + xgboost_bench/gbt.py | 2 + 10 files changed, 489 insertions(+), 300 deletions(-) diff --git a/configs/lgbm_mb_cpu_config.json b/configs/lgbm_mb_cpu_config.json index 3f1e12e5f..fbf8a538d 100755 --- a/configs/lgbm_mb_cpu_config.json +++ b/configs/lgbm_mb_cpu_config.json @@ -1,108 +1,120 @@ { "common": { - "lib": ["modelbuilders"], - "data-format": ["pandas"], - "data-order": ["F"], - "dtype": ["float32"] + "lib": "modelbuilders", + "data-format": "pandas", + "data-order": "F", + "dtype": "float32", + "algorithm": "lgbm_mb" }, "cases": [ { - "algorithm": "lgbm_mb", "dataset": [ { - "source": "csv", - "name": "mortgage1Q", + "source": "npy", + "name": "airline-ohe", "training": { - "x": "data/mortgage_x.csv", - "y": "data/mortgage_y.csv" + "x": "data/airline-ohe_x_train.npy", + "y": "data/airline-ohe_y_train.npy" + }, + "testing": + { + "x": "data/airline-ohe_x_test.npy", + "y": "data/airline-ohe_y_test.npy" } } ], - "n-estimators": [100], - "objective": ["regression"], - "max-depth": [8], - "scale-pos-weight": [2], - "learning-rate": [0.1], - "subsample": [1], - "reg-alpha": [0.9], - "reg-lambda": [1], - "min-child-weight": [0], - "max-leaves": [256] + "reg-alpha": 0.9, + "max-bin": 256, + "scale-pos-weight": 2, + "learning-rate": 0.1, + "subsample": 1, + "reg-lambda": 1, + "min-child-weight": 0, + "max-depth": 8, + "max-leaves": 256, + "n-estimators": 1000, + "objective": "binary" }, { - "algorithm": "lgbm_mb", "dataset": [ { - "source": "csv", - "name": "airline-ohe", + "source": "npy", + "name": "higgs1m", "training": { - "x": "data/airline-ohe_x_train.csv", - "y": "data/airline-ohe_y_train.csv" + "x": "data/higgs1m_x_train.npy", + "y": "data/higgs1m_y_train.npy" + }, + "testing": + { + "x": "data/higgs1m_x_test.npy", + "y": "data/higgs1m_y_test.npy" } } ], - "reg-alpha": [0.9], - "max-bin": [256], - "scale-pos-weight": [2], - "learning-rate": [0.1], - "subsample": [1], - "reg-lambda": [1], - "min-child-weight": [0], - "max-depth": [8], - "max-leaves": [256], - "n-estimators": [1000], - "objective": ["binary"] + "reg-alpha": 0.9, + "max-bin": 256, + "scale-pos-weight": 2, + "learning-rate": 0.1, + "subsample": 1, + "reg-lambda": 1, + "min-child-weight": 0, + "max-depth": 8, + "max-leaves": 256, + "n-estimators": 1000, + "objective": "binary" }, { - "algorithm": "lgbm_mb", "dataset": [ { - "source": "csv", - "name": "higgs1m", + "source": "csv", + "name": "mortgage1Q", "training": { - "x": "data/higgs1m_x_train.csv", - "y": "data/higgs1m_y_train.csv" + "x": "data/mortgage_x.csv", + "y": "data/mortgage_y.csv" } 
} ], - "reg-alpha": [0.9], - "max-bin": [256], - "scale-pos-weight": [2], - "learning-rate": [0.1], - "subsample": [1], - "reg-lambda": [1], - "min-child-weight": [0], - "max-depth": [8], - "max-leaves": [256], - "n-estimators": [1000], - "objective": ["binary"] + "n-estimators": 100, + "objective": "regression", + "max-depth": 8, + "scale-pos-weight": 2, + "learning-rate": 0.1, + "subsample": 1, + "reg-alpha": 0.9, + "reg-lambda": 1, + "min-child-weight": 0, + "max-leaves": 256 }, { - "algorithm": "lgbm_mb", "dataset": [ { - "source": "csv", - "name": "msrank", + "source": "npy", + "name": "msrank", "training": { - "x": "data/mlsr_x_train.csv", - "y": "data/mlsr_y_train.csv" + "x": "data/msrank_x_train.npy", + "y": "data/msrank_y_train.npy" + }, + "testing": + { + "x": "data/msrank_x_test.npy", + "y": "data/msrank_y_test.npy" } } ], - "max-bin": [256], - "learning-rate": [0.3], - "subsample": [1], - "reg-lambda": [2], - "min-child-weight": [1], - "min-split-gain": [0.1], - "max-depth": [8], - "max-leaves": [256], - "n-estimators": [200], - "objective": ["multiclass"] + "max-bin": 256, + "learning-rate": 0.3, + "subsample": 1, + "reg-lambda": 2, + "min-child-weight": 1, + "min-split-loss": 0.1, + "max-depth": 8, + "max-leaves": 256, + "n-estimators": 200, + "objective": "multiclass" } ] } diff --git a/configs/xgb_cpu_config.json b/configs/xgb_cpu_config.json index 7bbc09b2d..43bc5640b 100644 --- a/configs/xgb_cpu_config.json +++ b/configs/xgb_cpu_config.json @@ -4,77 +4,32 @@ "data-format": "pandas", "data-order": "F", "dtype": "float32", - "count-dmatrix":"", "algorithm": "gbt", "tree-method": "hist", - "num-threads": 56 + "count-dmatrix":"" }, "cases": [ { "dataset": [ { - "source": "csv", - "name": "plasticc", + "source": "npy", + "name": "abalone", "training": { - "x": "data/plasticc_x_train.csv", - "y": "data/plasticc_y_train.csv" + "x": "data/abalone_x_train.npy", + "y": "data/abalone_y_train.npy" }, "testing": { - "x": "data/plasticc_x_test.csv", - "y": "data/plasticc_y_test.csv" - } - } - ], - "n-estimators": 60, - "objective": "multi:softprob", - "max-depth": 7, - "subsample": 0.7, - "colsample-bytree": 0.7 - }, - { - "dataset": [ - { - "source": "csv", - "name": "santander", - "training": - { - "x": "data/santander_x_train.csv", - "y": "data/santander_y_train.csv" - } - } - ], - "n-estimators": 10000, - "objective": "binary:logistic", - "max-depth": 1, - "subsample": 0.5, - "eta": 0.1, - "colsample-bytree": 0.05, - "single-precision-histogram": "" - }, - { - "dataset": [ - { - "source": "csv", - "name": "mortgage1Q", - "training": - { - "x": "data/mortgage_x.csv", - "y": "data/mortgage_y.csv" + "x": "data/abalone_x_test.npy", + "y": "data/abalone_y_test.npy" } } ], - "n-estimators": 100, - "objective": "reg:squarederror", - "max-depth": 8, - "scale-pos-weight": 2, - "learning-rate": 0.1, - "subsample": 1, - "reg-alpha": 0.9, - "reg-lambda": 1, - "min-child-weight": 0, - "max-leaves": 256 + "learning-rate": 0.03, + "max-depth": 6, + "n-estimators": 1000, + "objective": "reg:squarederror" }, { "dataset": [ @@ -136,6 +91,51 @@ "enable-experimental-json-serialization": "False", "inplace-predict": "" }, + { + "dataset": [ + { + "source": "npy", + "name": "letters", + "training": + { + "x": "data/letters_x_train.npy", + "y": "data/letters_y_train.npy" + }, + "testing": + { + "x": "data/letters_x_test.npy", + "y": "data/letters_y_test.npy" + } + } + ], + "learning-rate": 0.03, + "max-depth": 6, + "n-estimators": 1000, + "objective": "multi:softprob" + }, + { + "dataset": [ + { + 
"source": "csv", + "name": "mortgage1Q", + "training": + { + "x": "data/mortgage_x.csv", + "y": "data/mortgage_y.csv" + } + } + ], + "n-estimators": 100, + "objective": "reg:squarederror", + "max-depth": 8, + "scale-pos-weight": 2, + "learning-rate": 0.1, + "subsample": 1, + "reg-alpha": 0.9, + "reg-lambda": 1, + "min-child-weight": 0, + "max-leaves": 256 + }, { "dataset": [ { @@ -163,6 +163,49 @@ "n-estimators": 200, "objective": "multi:softprob", "single-precision-histogram": "" + }, + { + "dataset": [ + { + "source": "csv", + "name": "plasticc", + "training": + { + "x": "data/plasticc_x_train.csv", + "y": "data/plasticc_y_train.csv" + }, + "testing": + { + "x": "data/plasticc_x_test.csv", + "y": "data/plasticc_y_test.csv" + } + } + ], + "n-estimators": 60, + "objective": "multi:softprob", + "max-depth": 7, + "subsample": 0.7, + "colsample-bytree": 0.7 + }, + { + "dataset": [ + { + "source": "csv", + "name": "santander", + "training": + { + "x": "data/santander_x_train.csv", + "y": "data/santander_y_train.csv" + } + } + ], + "n-estimators": 10000, + "objective": "binary:logistic", + "max-depth": 1, + "subsample": 0.5, + "eta": 0.1, + "colsample-bytree": 0.05, + "single-precision-histogram": "" } ] } diff --git a/configs/xgb_cpu_nvda_config.json b/configs/xgb_cpu_nvda_config.json index d387c861b..a3f738c00 100644 --- a/configs/xgb_cpu_nvda_config.json +++ b/configs/xgb_cpu_nvda_config.json @@ -1,16 +1,16 @@ { "common": { - "lib": ["xgboost"], + "lib": "xgboost", "data-format": "pandas", "data-order": "F", "dtype": "float32", "algorithm": "gbt", "tree-method": "hist", + "count-dmatrix":"", "max-depth": 8, "learning-rate":0.1, "reg-lambda": 1, - "max-leaves": 256, - "num-threads": 48 + "max-leaves": 256 }, "cases": [ { @@ -137,16 +137,16 @@ "dataset": [ { "source": "npy", - "name": "year", + "name": "year_prediction_msd", "training": { - "x": "data/year_x_train.npy", - "y": "data/year_y_train.npy" + "x": "data/year_prediction_msd_x_train.npy", + "y": "data/year_prediction_msd_y_train.npy" }, "testing": { - "x": "data/year_x_test.npy", - "y": "data/year_y_test.npy" + "x": "data/year_prediction_msd_x_test.npy", + "y": "data/year_prediction_msd_y_test.npy" } } ] diff --git a/configs/xgb_gpu_config.json b/configs/xgb_gpu_config.json index 7fa81e828..5fadb80fa 100644 --- a/configs/xgb_gpu_config.json +++ b/configs/xgb_gpu_config.json @@ -1,159 +1,208 @@ { "common": { - "lib": ["xgboost"], - "data-format": ["cudf"], - "data-order": ["F"], - "dtype": ["float32"], - "count-dmatrix": [""] + "lib": "xgboost", + "data-format": "cudf", + "data-order": "F", + "dtype": "float32", + "algorithm": "gbt", + "tree-method": "gpu_hist", + "count-dmatrix":"" }, "cases": [ { - "algorithm": "gbt", "dataset": [ { - "source": "csv", - "name": "plasticc", + "source": "npy", + "name": "abalone", "training": { - "x": "data/plasticc_x_train.csv", - "y": "data/plasticc_y_train.csv" + "x": "data/abalone_x_train.npy", + "y": "data/abalone_y_train.npy" }, "testing": { - "x": "data/plasticc_x_test.csv", - "y": "data/plasticc_y_test.csv" + "x": "data/abalone_x_test.npy", + "y": "data/abalone_y_test.npy" } } ], - "n-estimators": [60], - "objective": ["multi:softprob"], - "tree-method": ["gpu_hist"], - "max-depth": [7], - "subsample": [0.7], - "colsample-bytree": [0.7] + "learning-rate": 0.03, + "max-depth": 6, + "n-estimators": 1000, + "objective": "reg:squarederror" }, { - "algorithm": "gbt", "dataset": [ { - "source": "csv", - "name": "santander", + "source": "npy", + "name": "airline-ohe", "training": { - "x": 
"data/santander_x_train.csv", - "y": "data/santander_y_train.csv" + "x": "data/airline-ohe_x_train.npy", + "y": "data/airline-ohe_y_train.npy" + }, + "testing": + { + "x": "data/airline-ohe_x_test.npy", + "y": "data/airline-ohe_y_test.npy" } } ], - "n-estimators": [10000], - "objective": ["binary:logistic"], - "tree-method": ["gpu_hist"], - "max-depth": [1], - "subsample": [0.5], - "eta": [0.1], - "colsample-bytree": [0.05] + "reg-alpha": 0.9, + "max-bin": 256, + "scale-pos-weight": 2, + "learning-rate": 0.1, + "subsample": 1, + "reg-lambda": 1, + "min-child-weight": 0, + "max-depth": 8, + "max-leaves": 256, + "n-estimators": 1000, + "objective": "binary:logistic" }, { - "algorithm": "gbt", "dataset": [ { - "source": "csv", - "name": "mortgage1Q", + "source": "npy", + "name": "higgs1m", "training": { - "x": "data/mortgage_x.csv", - "y": "data/mortgage_y.csv" + "x": "data/higgs1m_x_train.npy", + "y": "data/higgs1m_y_train.npy" + }, + "testing": + { + "x": "data/higgs1m_x_test.npy", + "y": "data/higgs1m_y_test.npy" } } ], - "n-estimators": [100], - "objective": ["reg:squarederror"], - "tree-method": ["gpu_hist"], - "max-depth": [8], - "scale-pos-weight": [2], - "learning-rate": [0.1], - "subsample": [1], - "reg-alpha": [0.9], - "reg-lambda": [1], - "min-child-weight": [0], - "max-leaves": [256] + "reg-alpha": 0.9, + "max-bin": 256, + "scale-pos-weight": 2, + "learning-rate": 0.1, + "subsample": 1, + "reg-lambda": 1, + "min-child-weight": 0, + "max-depth": 8, + "max-leaves": 256, + "n-estimators": 1000, + "objective": "binary:logistic", + "inplace-predict": "" }, { - "algorithm": "gbt", "dataset": [ { - "source": "csv", - "name": "airline-ohe", + "source": "npy", + "name": "letters", "training": { - "x": "data/airline-ohe_x_train.csv", - "y": "data/airline-ohe_y_train.csv" + "x": "data/letters_x_train.npy", + "y": "data/letters_y_train.npy" + }, + "testing": + { + "x": "data/letters_x_test.npy", + "y": "data/letters_y_test.npy" } } ], - "reg-alpha": [0.9], - "max-bin": [256], - "scale-pos-weight": [2], - "learning-rate": [0.1], - "subsample": [1], - "reg-lambda": [1], - "min-child-weight": [0], - "max-depth": [8], - "max-leaves": [256], - "n-estimators": [1000], - "objective": ["binary:logistic"], - "tree-method": ["gpu_hist"] + "learning-rate": 0.03, + "max-depth": 6, + "n-estimators": 1000, + "objective": "multi:softprob" }, { - "algorithm": "gbt", "dataset": [ { - "source": "csv", - "name": "higgs1m", + "source": "csv", + "name": "mortgage1Q", "training": { - "x": "data/higgs1m_x_train.csv", - "y": "data/higgs1m_y_train.csv" + "x": "data/mortgage_x.csv", + "y": "data/mortgage_y.csv" + } + } + ], + "n-estimators": 100, + "objective": "reg:squarederror", + "max-depth": 8, + "scale-pos-weight": 2, + "learning-rate": 0.1, + "subsample": 1, + "reg-alpha": 0.9, + "reg-lambda": 1, + "min-child-weight": 0, + "max-leaves": 256 + }, + { + "dataset": [ + { + "source": "npy", + "name": "msrank", + "training": + { + "x": "data/msrank_x_train.npy", + "y": "data/msrank_y_train.npy" + }, + "testing": + { + "x": "data/msrank_x_test.npy", + "y": "data/msrank_y_test.npy" + } + } + ], + "max-bin": 256, + "learning-rate": 0.3, + "subsample": 1, + "reg-lambda": 2, + "min-child-weight": 1, + "min-split-loss": 0.1, + "max-depth": 8, + "n-estimators": 200, + "objective": "multi:softprob" + }, + { + "dataset": [ + { + "source": "csv", + "name": "plasticc", + "training": + { + "x": "data/plasticc_x_train.csv", + "y": "data/plasticc_y_train.csv" + }, + "testing": + { + "x": "data/plasticc_x_test.csv", + "y": 
"data/plasticc_y_test.csv" } } ], - "reg-alpha": [0.9], - "max-bin": [256], - "scale-pos-weight": [2], - "learning-rate": [0.1], - "subsample": [1], - "reg-lambda": [1], - "min-child-weight": [0], - "max-depth": [8], - "max-leaves": [256], - "n-estimators": [1000], - "objective": ["binary:logistic"], - "tree-method": ["gpu_hist"], - "inplace-predict": [""] + "n-estimators": 60, + "objective": "multi:softprob", + "max-depth": 7, + "subsample": 0.7, + "colsample-bytree": 0.7 }, { - "algorithm": "gbt", "dataset": [ { - "source": "csv", - "name": "msrank", + "source": "csv", + "name": "santander", "training": { - "x": "data/mlsr_x_train.csv", - "y": "data/mlsr_y_train.csv" + "x": "data/santander_x_train.csv", + "y": "data/santander_y_train.csv" } } ], - "max-bin": [256], - "learning-rate": [0.3], - "subsample": [1], - "reg-lambda": [2], - "min-child-weight": [1], - "min-split-loss": [0.1], - "max-depth": [8], - "n-estimators": [200], - "objective": ["multi:softprob"], - "tree-method": ["gpu_hist"] + "n-estimators": 10000, + "objective": "binary:logistic", + "max-depth": 1, + "subsample": 0.5, + "eta": 0.1, + "colsample-bytree": 0.05 } ] } diff --git a/configs/xgb_mb_cpu_config.json b/configs/xgb_mb_cpu_config.json index eefc97fed..2b10e5592 100755 --- a/configs/xgb_mb_cpu_config.json +++ b/configs/xgb_mb_cpu_config.json @@ -1,113 +1,123 @@ { "common": { - "lib": ["modelbuilders"], - "data-format": ["pandas"], - "data-order": ["F"], - "dtype": ["float32"], - "count-dmatrix": [""] + "lib": "modelbuilders", + "data-format": "pandas", + "data-order": "F", + "dtype": "float32", + "algorithm": "xgb_mb", + "tree-method": "hist", + "count-dmatrix":"" }, "cases": [ { - "algorithm": "xgb_mb", "dataset": [ { - "source": "csv", - "name": "mortgage1Q", + "source": "npy", + "name": "airline-ohe", "training": { - "x": "data/mortgage_x.csv", - "y": "data/mortgage_y.csv" + "x": "data/airline-ohe_x_train.npy", + "y": "data/airline-ohe_y_train.npy" + }, + "testing": + { + "x": "data/airline-ohe_x_test.npy", + "y": "data/airline-ohe_y_test.npy" } } ], - "n-estimators": [100], - "objective": ["reg:squarederror"], - "tree-method": ["hist"], - "max-depth": [8], - "scale-pos-weight": [2], - "learning-rate": [0.1], - "subsample": [1], - "reg-alpha": [0.9], - "reg-lambda": [1], - "min-child-weight": [0], - "max-leaves": [256] + "reg-alpha": 0.9, + "max-bin": 256, + "scale-pos-weight": 2, + "learning-rate": 0.1, + "subsample": 1, + "reg-lambda": 1, + "min-child-weight": 0, + "max-depth": 8, + "max-leaves": 256, + "n-estimators": 1000, + "objective": "binary:logistic" }, { - "algorithm": "xgb_mb", "dataset": [ { - "source": "csv", - "name": "airline-ohe", + "source": "npy", + "name": "higgs1m", "training": { - "x": "data/airline-ohe_x_train.csv", - "y": "data/airline-ohe_y_train.csv" + "x": "data/higgs1m_x_train.npy", + "y": "data/higgs1m_y_train.npy" + }, + "testing": + { + "x": "data/higgs1m_x_test.npy", + "y": "data/higgs1m_y_test.npy" } } ], - "reg-alpha": [0.9], - "max-bin": [256], - "scale-pos-weight": [2], - "learning-rate": [0.1], - "subsample": [1], - "reg-lambda": [1], - "min-child-weight": [0], - "max-depth": [8], - "max-leaves": [256], - "n-estimators": [1000], - "objective": ["binary:logistic"], - "tree-method": ["hist"] + "reg-alpha": 0.9, + "max-bin": 256, + "scale-pos-weight": 2, + "learning-rate": 0.1, + "subsample": 1, + "reg-lambda": 1, + "min-child-weight": 0, + "max-depth": 8, + "max-leaves": 256, + "n-estimators": 1000, + "objective": "binary:logistic", + "enable-experimental-json-serialization": 
"False", + "inplace-predict": "" }, { - "algorithm": "xgb_mb", "dataset": [ { - "source": "csv", - "name": "higgs1m", + "source": "csv", + "name": "mortgage1Q", "training": { - "x": "data/higgs1m_x_train.csv", - "y": "data/higgs1m_y_train.csv" + "x": "data/mortgage_x.csv", + "y": "data/mortgage_y.csv" } } ], - "reg-alpha": [0.9], - "max-bin": [256], - "scale-pos-weight": [2], - "learning-rate": [0.1], - "subsample": [1], - "reg-lambda": [1], - "min-child-weight": [0], - "max-depth": [8], - "max-leaves": [256], - "n-estimators": [1000], - "objective": ["binary:logistic"], - "tree-method": ["hist"], - "enable-experimental-json-serialization": ["False"] + "n-estimators": 100, + "objective": "reg:squarederror", + "max-depth": 8, + "scale-pos-weight": 2, + "learning-rate": 0.1, + "subsample": 1, + "reg-alpha": 0.9, + "reg-lambda": 1, + "min-child-weight": 0, + "max-leaves": 256 }, { - "algorithm": "xgb_mb", "dataset": [ { - "source": "csv", - "name": "msrank", + "source": "npy", + "name": "msrank", "training": { - "x": "data/mlsr_x_train.csv", - "y": "data/mlsr_y_train.csv" + "x": "data/msrank_x_train.npy", + "y": "data/msrank_y_train.npy" + }, + "testing": + { + "x": "data/msrank_x_test.npy", + "y": "data/msrank_y_test.npy" } } ], - "max-bin": [256], - "learning-rate": [0.3], - "subsample": [1], - "reg-lambda": [2], - "min-child-weight": [1], - "min-split-loss": [0.1], - "max-depth": [8], - "n-estimators": [200], - "objective": ["multi:softprob"], - "tree-method": ["hist"] + "max-bin": 256, + "learning-rate": 0.3, + "subsample": 1, + "reg-lambda": 2, + "min-child-weight": 1, + "min-split-loss": 0.1, + "max-depth": 8, + "n-estimators": 200, + "objective": "multi:softprob" } ] } diff --git a/datasets/load_datasets.py b/datasets/load_datasets.py index 91c643081..ad0822ae9 100755 --- a/datasets/load_datasets.py +++ b/datasets/load_datasets.py @@ -24,12 +24,13 @@ from .loader_clf import (a_nine_a, airline, airline_ohe, bosch, codrnanorm, epsilon, fraud, gisette, higgs, higgs_one_m, ijcnn, klaverjas, santander, skin_segmentation) -from .loader_mul import (connect, covertype, covtype, mnist, msrank, plasticc, - sensit) -from .loader_reg import mortgage_first_q, year_prediction_msd +from .loader_mul import (connect, covertype, covtype, letters, mnist, msrank, + plasticc, sensit) +from .loader_reg import abalone, mortgage_first_q, year_prediction_msd dataset_loaders: Dict[str, Callable[[Path], bool]] = { "a9a": a_nine_a, + "abalone": abalone, "airline": airline, "airline-ohe": airline_ohe, "bosch": bosch, @@ -44,6 +45,7 @@ "higgs1m": higgs_one_m, "ijcnn": ijcnn, "klaverjas": klaverjas, + "letters": letters, "mnist": mnist, "mortgage1Q": mortgage_first_q, "msrank": msrank, diff --git a/datasets/loader_mul.py b/datasets/loader_mul.py index 662a22338..d5043743b 100644 --- a/datasets/loader_mul.py +++ b/datasets/loader_mul.py @@ -18,6 +18,7 @@ import os import tarfile from pathlib import Path +from typing import Any import numpy as np import pandas as pd @@ -115,6 +116,40 @@ def covtype(dataset_dir: Path) -> bool: return True +def letters(dataset_dir: Path) -> bool: + """ + http://archive.ics.uci.edu/ml/datasets/Letter+Recognition + + TaskType:multiclass + NumberOfFeatures:16 + NumberOfInstances:20.000 + """ + dataset_name = 'letters' + os.makedirs(dataset_dir, exist_ok=True) + + url = ('http://archive.ics.uci.edu/ml/machine-learning-databases/' + + 'letter-recognition/letter-recognition.data') + local_url = os.path.join(dataset_dir, os.path.basename(url)) + if not os.path.isfile(local_url): + 
logging.info(f'Started loading {dataset_name}') + retrieve(url, local_url) + logging.info(f'{dataset_name} is loaded, started parsing...') + + letters = pd.read_csv(local_url, header=None) + X = letters.iloc[:, 1:].values + y: Any = letters.iloc[:, 0] + y = y.astype('category').cat.codes.values + + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) + + for data, name in zip((X_train, X_test, y_train, y_test), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset {dataset_name} is ready.') + return True + + def mnist(dataset_dir: Path) -> bool: """ Abstract: diff --git a/datasets/loader_reg.py b/datasets/loader_reg.py index 73ce477c6..86cee1597 100644 --- a/datasets/loader_reg.py +++ b/datasets/loader_reg.py @@ -17,6 +17,7 @@ import logging import os from pathlib import Path +from typing import Any import numpy as np import pandas as pd @@ -25,6 +26,39 @@ from .loader_utils import retrieve +def abalone(dataset_dir: Path) -> bool: + """ + https://archive.ics.uci.edu/ml/machine-learning-databases/abalone + + TaskType:regression + NumberOfFeatures:8 + NumberOfInstances:4177 + """ + dataset_name = 'abalone' + os.makedirs(dataset_dir, exist_ok=True) + + url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data' + local_url = os.path.join(dataset_dir, os.path.basename(url)) + if not os.path.isfile(local_url): + logging.info(f'Started loading {dataset_name}') + retrieve(url, local_url) + logging.info(f'{dataset_name} is loaded, started parsing...') + + abalone: Any = pd.read_csv(local_url, header=None) + abalone[0] = abalone[0].astype('category').cat.codes + X = abalone.iloc[:, :-1].values + y = abalone.iloc[:, -1].values + + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) + + for data, name in zip((X_train, X_test, y_train, y_test), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset {dataset_name} is ready.') + return True + + def mortgage_first_q(dataset_dir: Path) -> bool: return False diff --git a/runner.py b/runner.py index 7f6ca08fd..4c8c0745b 100755 --- a/runner.py +++ b/runner.py @@ -72,6 +72,8 @@ params.update(params_set.copy()) algorithm = params['algorithm'] libs = params['lib'] + if not isinstance(libs, list): + libs = [libs] del params['dataset'], params['algorithm'], params['lib'] cases = utils.generate_cases(params) logging.info(f'{algorithm} algorithm: {len(libs) * len(cases)} case(s),' diff --git a/xgboost_bench/gbt.py b/xgboost_bench/gbt.py index d7366b563..b7a8b54db 100644 --- a/xgboost_bench/gbt.py +++ b/xgboost_bench/gbt.py @@ -132,9 +132,11 @@ def convert_xgb_predictions(y_pred, objective): params.n_classes = y_train[y_train.columns[0]].nunique() else: params.n_classes = len(np.unique(y_train)) + # BE VERY CAREFUL ON IT!! 
It should only work for COVTYPE DATASET if params.objective.startswith('multi:softmax'): params.n_classes += 1 + if params.n_classes > 2: xgb_params['num_class'] = params.n_classes From 6e4742363777fa273b3e76f7c9fa78774643b4f0 Mon Sep 17 00:00:00 2001 From: Igor Rukhovich Date: Thu, 15 Apr 2021 09:42:57 +0300 Subject: [PATCH 16/31] Added links and descriptions for new datasets --- configs/xgb_cpu_config.json | 5 --- configs/xgb_gpu_config.json | 67 +++++++++++++++++-------------------- datasets/loader_clf.py | 64 +++++++++++++++++++++++++++++++---- datasets/loader_mul.py | 19 +++++++++-- datasets/loader_reg.py | 11 ++++++ 5 files changed, 117 insertions(+), 49 deletions(-) diff --git a/configs/xgb_cpu_config.json b/configs/xgb_cpu_config.json index 43bc5640b..a155fcbb1 100644 --- a/configs/xgb_cpu_config.json +++ b/configs/xgb_cpu_config.json @@ -173,11 +173,6 @@ { "x": "data/plasticc_x_train.csv", "y": "data/plasticc_y_train.csv" - }, - "testing": - { - "x": "data/plasticc_x_test.csv", - "y": "data/plasticc_y_test.csv" } } ], diff --git a/configs/xgb_gpu_config.json b/configs/xgb_gpu_config.json index 5fadb80fa..30cf73bc8 100644 --- a/configs/xgb_gpu_config.json +++ b/configs/xgb_gpu_config.json @@ -77,18 +77,18 @@ } } ], - "reg-alpha": 0.9, - "max-bin": 256, - "scale-pos-weight": 2, - "learning-rate": 0.1, - "subsample": 1, - "reg-lambda": 1, - "min-child-weight": 0, - "max-depth": 8, - "max-leaves": 256, - "n-estimators": 1000, - "objective": "binary:logistic", - "inplace-predict": "" + "reg-alpha": 0.9, + "max-bin": 256, + "scale-pos-weight": 2, + "learning-rate": 0.1, + "subsample": 1, + "reg-lambda": 1, + "min-child-weight": 0, + "max-depth": 8, + "max-leaves": 256, + "n-estimators": 1000, + "objective": "binary:logistic", + "inplace-predict": "" }, { "dataset": [ @@ -107,10 +107,10 @@ } } ], - "learning-rate": 0.03, - "max-depth": 6, - "n-estimators": 1000, - "objective": "multi:softprob" + "learning-rate":0.03, + "max-depth": 6, + "n-estimators": 1000, + "objective": "multi:softprob" }, { "dataset": [ @@ -152,15 +152,15 @@ } } ], - "max-bin": 256, - "learning-rate": 0.3, - "subsample": 1, - "reg-lambda": 2, - "min-child-weight": 1, - "min-split-loss": 0.1, - "max-depth": 8, - "n-estimators": 200, - "objective": "multi:softprob" + "max-bin": 256, + "learning-rate": 0.3, + "subsample": 1, + "reg-lambda": 2, + "min-child-weight": 1, + "min-split-loss": 0.1, + "max-depth": 8, + "n-estimators": 200, + "objective": "multi:softprob" }, { "dataset": [ @@ -171,11 +171,6 @@ { "x": "data/plasticc_x_train.csv", "y": "data/plasticc_y_train.csv" - }, - "testing": - { - "x": "data/plasticc_x_test.csv", - "y": "data/plasticc_y_test.csv" } } ], @@ -197,12 +192,12 @@ } } ], - "n-estimators": 10000, - "objective": "binary:logistic", - "max-depth": 1, - "subsample": 0.5, - "eta": 0.1, - "colsample-bytree": 0.05 + "n-estimators": 10000, + "objective": "binary:logistic", + "max-depth": 1, + "subsample": 0.5, + "eta": 0.1, + "colsample-bytree": 0.05 } ] } diff --git a/datasets/loader_clf.py b/datasets/loader_clf.py index cf204ab95..9e63ac30d 100644 --- a/datasets/loader_clf.py +++ b/datasets/loader_clf.py @@ -32,7 +32,7 @@ def a_nine_a(dataset_dir: Path) -> bool: Author: Ronny Kohavi","Barry Becker libSVM","AAD group Source: original - Date unknown - Cite: http://archive.ics.uci.edu/ml/datasets/Adult + Site: http://archive.ics.uci.edu/ml/datasets/Adult Classification task. n_classes = 2. 
a9a X train dataset (39073, 123) @@ -65,6 +65,14 @@ def a_nine_a(dataset_dir: Path) -> bool: def airline(dataset_dir: Path) -> bool: + """ + Airline dataset + http://kt.ijs.si/elena_ikonomovska/data.html + + TaskType:binclass + NumberOfFeatures:13 + NumberOfInstances:115M + """ dataset_name = 'airline' os.makedirs(dataset_dir, exist_ok=True) @@ -131,10 +139,10 @@ def airline_ohe(dataset_dir: Path) -> bool: local_url_train = os.path.join(dataset_dir, os.path.basename(url_train)) local_url_test = os.path.join(dataset_dir, os.path.basename(url_test)) if not os.path.isfile(local_url_train): - logging.info(f'Started loading {dataset_name}') + logging.info(f'Started loading {dataset_name} train') retrieve(url_train, local_url_train) if not os.path.isfile(local_url_test): - logging.info(f'Started loading {dataset_name}') + logging.info(f'Started loading {dataset_name} test') retrieve(url_test, local_url_test) logging.info(f'{dataset_name} is loaded, started parsing...') @@ -170,6 +178,17 @@ def airline_ohe(dataset_dir: Path) -> bool: def bosch(dataset_dir: Path) -> bool: + """ + Bosch Production Line Performance data set + https://www.kaggle.com/c/bosch-production-line-performance + + Requires Kaggle API and API token (https://github.com/Kaggle/kaggle-api) + Contains missing values as NaN. + + TaskType:binclass + NumberOfFeatures:968 + NumberOfInstances:1.184M + """ dataset_name = 'bosch' os.makedirs(dataset_dir, exist_ok=True) @@ -233,6 +252,14 @@ def codrnanorm(dataset_dir: Path) -> bool: def epsilon(dataset_dir: Path) -> bool: + """ + Epsilon dataset + https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html + + TaskType:binclass + NumberOfFeatures:2000 + NumberOfInstances:500K + """ dataset_name = 'epsilon' os.makedirs(dataset_dir, exist_ok=True) @@ -268,6 +295,17 @@ def epsilon(dataset_dir: Path) -> bool: def fraud(dataset_dir: Path) -> bool: + """ + Credit Card Fraud Detection contest + https://www.kaggle.com/mlg-ulb/creditcardfraud + + Requires Kaggle API and API token (https://github.com/Kaggle/kaggle-api) + Contains missing values as NaN. + + TaskType:binclass + NumberOfFeatures:30 + NumberOfInstances:285K + """ dataset_name = 'fraud' os.makedirs(dataset_dir, exist_ok=True) @@ -371,6 +409,14 @@ def gisette(dataset_dir: Path) -> bool: def higgs(dataset_dir: Path) -> bool: + """ + Higgs dataset from UCI machine learning repository + https://archive.ics.uci.edu/ml/datasets/HIGGS + + TaskType:binclass + NumberOfFeatures:28 + NumberOfInstances:11M + """ dataset_name = 'higgs' os.makedirs(dataset_dir, exist_ok=True) @@ -397,11 +443,14 @@ def higgs(dataset_dir: Path) -> bool: def higgs_one_m(dataset_dir: Path) -> bool: """ - Higgs dataset from UCI machine learning repository ( - https://archive.ics.uci.edu/ml/datasets/HIGGS). 
+ Higgs dataset from UCI machine learning repository + https://archive.ics.uci.edu/ml/datasets/HIGGS + + Only first 1.5M samples is taken + TaskType:binclass NumberOfFeatures:28 - NumberOfInstances:11M + NumberOfInstances:1.5M """ dataset_name = 'higgs1m' os.makedirs(dataset_dir, exist_ok=True) @@ -511,6 +560,9 @@ def klaverjas(dataset_dir: Path) -> bool: def santander(dataset_dir: Path) -> bool: + """ + Still doesn't have an loading instruction + """ return False diff --git a/datasets/loader_mul.py b/datasets/loader_mul.py index d5043743b..a2653d200 100644 --- a/datasets/loader_mul.py +++ b/datasets/loader_mul.py @@ -98,6 +98,15 @@ def covertype(dataset_dir: Path) -> bool: def covtype(dataset_dir: Path) -> bool: + """ + Cover type dataset from UCI machine learning repository + https://archive.ics.uci.edu/ml/datasets/covertype + + y contains 7 unique class labels from 1 to 7 inclusive. + TaskType:multiclass + NumberOfFeatures:54 + NumberOfInstances:581012 + """ dataset_name = 'covtype' os.makedirs(dataset_dir, exist_ok=True) @@ -192,6 +201,7 @@ def mnist(dataset_dir: Path) -> bool: def msrank(dataset_dir: Path) -> bool: """ Dataset from szilard benchmarks: https://github.com/szilard/GBM-perf + TaskType:binclass NumberOfFeatures:700 NumberOfInstances:10100000 @@ -200,21 +210,23 @@ def msrank(dataset_dir: Path) -> bool: os.makedirs(dataset_dir, exist_ok=True) url = "https://storage.mds.yandex.net/get-devtools-opensource/471749/msrank.tar.gz" local_url = os.path.join(dataset_dir, os.path.basename(url)) + unzipped_url = os.path.join(dataset_dir, "MSRank") if not os.path.isfile(local_url): logging.info(f'Started loading {dataset_name}') retrieve(url, local_url) + if not os.path.isdir(unzipped_url): logging.info(f'{dataset_name} is loaded, unzipping...') tar = tarfile.open(local_url, "r:gz") tar.extractall(dataset_dir) tar.close() - logging.info(f'{dataset_name} is unzipped, started parsing...') + logging.info(f'{dataset_name} is unzipped, started parsing...') sets = [] labels = [] n_features = 137 for set_name in ['train.txt', 'vali.txt', 'test.txt']: - file_name = str(dataset_dir) + os.path.join('MSRank', set_name) + file_name = os.path.join(unzipped_url, set_name) n_samples = count_lines(file_name) with open(file_name, 'r') as file_obj: @@ -238,6 +250,9 @@ def msrank(dataset_dir: Path) -> bool: def plasticc(dataset_dir: Path) -> bool: + """ + Still doesn't have an loading instruction + """ return False diff --git a/datasets/loader_reg.py b/datasets/loader_reg.py index 86cee1597..627ae4ba5 100644 --- a/datasets/loader_reg.py +++ b/datasets/loader_reg.py @@ -60,10 +60,21 @@ def abalone(dataset_dir: Path) -> bool: def mortgage_first_q(dataset_dir: Path) -> bool: + """ + Still doesn't have an loading instruction + """ return False def year_prediction_msd(dataset_dir: Path) -> bool: + """ + YearPredictionMSD dataset from UCI repository + https://archive.ics.uci.edu/ml/datasets/yearpredictionmsd + + TaskType:regression + NumberOfFeatures:90 + NumberOfInstances:515345 + """ dataset_name = 'year_prediction_msd' os.makedirs(dataset_dir, exist_ok=True) From 4be37201471c88d4c14a8e8332f4b55ca3c47a81 Mon Sep 17 00:00:00 2001 From: Igor Rukhovich Date: Thu, 15 Apr 2021 11:11:47 +0300 Subject: [PATCH 17/31] handling mypy --- datasets/loader_clf.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datasets/loader_clf.py b/datasets/loader_clf.py index 9e63ac30d..fed0d9332 100644 --- a/datasets/loader_clf.py +++ b/datasets/loader_clf.py @@ -158,7 +158,7 @@ def airline_ohe(dataset_dir: 
Path) -> bool: X = df.drop('dep_delayed_15min', 1) y = df["dep_delayed_15min"] - y_num = np.where(y == "Y", 1, 0) + y_num = np.where((y == "Y").tolist(), 1, 0) sets.append(X) labels.append(y_num) @@ -382,7 +382,7 @@ def gisette(dataset_dir: Path) -> bool: num_train = 6000 x_train_arr = df_train.iloc[:num_train].values x_train = pd.DataFrame(np.array([np.fromstring( - elem[0], dtype=int, count=num_cols, sep=' ') for elem in x_train_arr])) + elem[0], dtype=int, count=num_cols, sep=' ').tolist() for elem in x_train_arr])) y_train_arr = df_labels.iloc[:num_train].values y_train = pd.DataFrame((y_train_arr > 0).astype(int)) @@ -393,7 +393,7 @@ def gisette(dataset_dir: Path) -> bool: x_test = pd.DataFrame(np.array( [np.fromstring( elem[0], - dtype=int, count=num_cols, sep=' ') + dtype=int, count=num_cols, sep=' ').tolist() for elem in x_test_arr])) y_test_arr = df_labels.iloc[:num_train].values y_test = pd.DataFrame((y_test_arr > 0).astype(int)) From 8184016d47a77c8a5c491f6e6800451ec2263033 Mon Sep 17 00:00:00 2001 From: Igor Rukhovich Date: Thu, 15 Apr 2021 11:25:38 +0300 Subject: [PATCH 18/31] Handled skex fake message throwing --- utils.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/utils.py b/utils.py index 5a62413d0..99f57bbf2 100755 --- a/utils.py +++ b/utils.py @@ -27,10 +27,15 @@ def filter_stderr(text: str) -> str: # delete 'Intel(R) DAAL usage in sklearn' messages - fake_error_message = 'Intel(R) oneAPI Data Analytics Library solvers ' + \ - 'for sklearn enabled: ' + \ - 'https://intelpython.github.io/daal4py/sklearn.html' - return ''.join(text.split(fake_error_message)) + daal_fake_error_message = ('Intel(R) oneAPI Data Analytics Library solvers ' + + 'for sklearn enabled: ' + + 'https://intelpython.github.io/daal4py/sklearn.html') + skex_fake_error_message = ('Intel(R) Extension for Scikit-learn* enabled ' + + '(https://github.com/intel/scikit-learn-intelex)') + + text = ''.join(text.split(daal_fake_error_message)) + text = ''.join(text.split(skex_fake_error_message)) + return text def filter_stdout(text: str) -> Tuple[str, str]: From cf5ee76dc0460c4b7812322278feb90f45cb8757 Mon Sep 17 00:00:00 2001 From: Igor Rukhovich Date: Thu, 15 Apr 2021 11:27:34 +0300 Subject: [PATCH 19/31] Trying to handle mypy, at. 
3 --- datasets/loader_clf.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/datasets/loader_clf.py b/datasets/loader_clf.py index fed0d9332..a13c3babc 100644 --- a/datasets/loader_clf.py +++ b/datasets/loader_clf.py @@ -156,9 +156,9 @@ def airline_ohe(dataset_dir: Path) -> bool: df = pd.read_csv(local_url, nrows=1000000 if local_url.endswith('train-10m.csv') else None) X = df.drop('dep_delayed_15min', 1) - y = df["dep_delayed_15min"] + y: Any = df["dep_delayed_15min"] - y_num = np.where((y == "Y").tolist(), 1, 0) + y_num = np.where(y == "Y", 1, 0) sets.append(X) labels.append(y_num) @@ -380,7 +380,7 @@ def gisette(dataset_dir: Path) -> bool: df_train = pd.read_csv(filename_train_data, header=None) df_labels = pd.read_csv(filename_train_labels, header=None) num_train = 6000 - x_train_arr = df_train.iloc[:num_train].values + x_train_arr: Any = df_train.iloc[:num_train].values x_train = pd.DataFrame(np.array([np.fromstring( elem[0], dtype=int, count=num_cols, sep=' ').tolist() for elem in x_train_arr])) y_train_arr = df_labels.iloc[:num_train].values @@ -389,7 +389,7 @@ def gisette(dataset_dir: Path) -> bool: num_train = 1000 df_test = pd.read_csv(filename_test_data, header=None) df_labels = pd.read_csv(filename_test_labels, header=None) - x_test_arr = df_test.iloc[:num_train].values + x_test_arr: Any = df_test.iloc[:num_train].values x_test = pd.DataFrame(np.array( [np.fromstring( elem[0], From 9db31774fea79670f7508088472618d64d7a8a02 Mon Sep 17 00:00:00 2001 From: Igor Rukhovich Date: Thu, 15 Apr 2021 11:39:44 +0300 Subject: [PATCH 20/31] Trying to handle mypy, at. 4 --- datasets/loader_clf.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/datasets/loader_clf.py b/datasets/loader_clf.py index a13c3babc..38c9ef9e2 100644 --- a/datasets/loader_clf.py +++ b/datasets/loader_clf.py @@ -172,7 +172,7 @@ def airline_ohe(dataset_dir: Path) -> bool: for data, name in zip((sets[0], sets[1], labels[0], labels[1]), ('x_train', 'x_test', 'y_train', 'y_test')): filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) + np.save(os.path.join(dataset_dir, filename), data) # type: ignore logging.info(f'dataset {dataset_name} is ready.') return True @@ -380,21 +380,21 @@ def gisette(dataset_dir: Path) -> bool: df_train = pd.read_csv(filename_train_data, header=None) df_labels = pd.read_csv(filename_train_labels, header=None) num_train = 6000 - x_train_arr: Any = df_train.iloc[:num_train].values + x_train_arr = df_train.iloc[:num_train].values x_train = pd.DataFrame(np.array([np.fromstring( - elem[0], dtype=int, count=num_cols, sep=' ').tolist() for elem in x_train_arr])) + elem[0], dtype=int, count=num_cols, sep=' ') for elem in x_train_arr])) # type: ignore y_train_arr = df_labels.iloc[:num_train].values y_train = pd.DataFrame((y_train_arr > 0).astype(int)) num_train = 1000 df_test = pd.read_csv(filename_test_data, header=None) df_labels = pd.read_csv(filename_test_labels, header=None) - x_test_arr: Any = df_test.iloc[:num_train].values + x_test_arr = df_test.iloc[:num_train].values x_test = pd.DataFrame(np.array( [np.fromstring( elem[0], - dtype=int, count=num_cols, sep=' ').tolist() - for elem in x_test_arr])) + dtype=int, count=num_cols, sep=' ') + for elem in x_test_arr])) # type: ignore y_test_arr = df_labels.iloc[:num_train].values y_test = pd.DataFrame((y_test_arr > 0).astype(int)) From 5e76a0bc2595993eb47015bbedf165d7876f0fef Mon Sep 17 00:00:00 2001 From: Igor Rukhovich Date: Thu, 15 Apr 2021 11:42:00 +0300 
Subject: [PATCH 21/31] Trying to handle mypy, at. 5 --- datasets/loader_clf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datasets/loader_clf.py b/datasets/loader_clf.py index 38c9ef9e2..29011b068 100644 --- a/datasets/loader_clf.py +++ b/datasets/loader_clf.py @@ -393,8 +393,8 @@ def gisette(dataset_dir: Path) -> bool: x_test = pd.DataFrame(np.array( [np.fromstring( elem[0], - dtype=int, count=num_cols, sep=' ') - for elem in x_test_arr])) # type: ignore + dtype=int, count=num_cols, sep=' ') # type: ignore + for elem in x_test_arr])) y_test_arr = df_labels.iloc[:num_train].values y_test = pd.DataFrame((y_test_arr > 0).astype(int)) From 13fcd20204a2aeecc1569d8f061fdeaca1ea2e1e Mon Sep 17 00:00:00 2001 From: Igor Rukhovich Date: Tue, 20 Apr 2021 11:45:29 +0300 Subject: [PATCH 22/31] Changed configs readme and made small fixes in GB testing configs --- configs/README.md | 60 ++++++++++++++-------------- configs/testing/daal4py_xgboost.json | 28 ++++++------- configs/testing/xgboost.json | 53 ++++++++++++------------ 3 files changed, 69 insertions(+), 72 deletions(-) diff --git a/configs/README.md b/configs/README.md index eaacb493f..0f1e67cd9 100644 --- a/configs/README.md +++ b/configs/README.md @@ -1,4 +1,4 @@ -## Config JSON Schema +## Config JSON Schema Configure benchmarks by editing the `config.json` file. You can configure some algorithm parameters, datasets, a list of frameworks to use, and the usage of some environment variables. @@ -11,57 +11,57 @@ Refer to the tables below for descriptions of all fields in the configuration fi - [Training Object](#training-object) - [Testing Object](#testing-object) -### Root Config Object +### Root Config Object | Field Name | Type | Description | | ----- | ---- |------------ | -|omp_env| array[string] | For xgboost only. Specify an environment variable to set the number of omp threads | |common| [Common Object](#common-object)| **REQUIRED** common benchmarks setting: frameworks and input data settings | -|cases| array[[Case Object](#case-object)] | **REQUIRED** list of algorithms, their parameters and training data | +|cases| List[[Case Object](#case-object)] | **REQUIRED** list of algorithms, their parameters and training data | -### Common Object +### Common Object | Field Name | Type | Description | | ----- | ---- |------------ | -|lib| array[string] | **REQUIRED** list of test frameworks. It can be *sklearn*, *daal4py*, *cuml* or *xgboost* | -|data-format| array[string] | **REQUIRED** input data format. Data formats: *numpy*, *pandas* or *cudf* | -|data-order| array[string] | **REQUIRED** input data order. Data order: *C* (row-major, default) or *F* (column-major) | -|dtype| array[string] | **REQUIRED** input data type. Data type: *float64* (default) or *float32* | -|check-finitness| array[] | Check finiteness in sklearn input check(disabled by default) | +|data-format| Union[str, List[str]] | **REQUIRED** input data format. Data formats: *numpy*, *pandas* or *cudf* | +|data-order| Union[str, List[str]] | **REQUIRED** input data order. Data order: *C* (row-major, default) or *F* (column-major) | +|dtype| Union[str, List[str]] | **REQUIRED** input data type. Data type: *float64* (default) or *float32* | +|check-finitness| List[] | Check finiteness in sklearn input check(disabled by default) | -### Case Object +### Case Object | Field Name | Type | Description | | ----- | ---- |------------ | -|lib| array[string] | **REQUIRED** list of test frameworks. 
It can be *sklearn*, *daal4py*, *cuml* or *xgboost*| -|algorithm| string | **REQUIRED** benchmark name | -|dataset| array[[Dataset Object](#dataset-object)] | **REQUIRED** input data specifications. | -|benchmark parameters| array[Any] | **REQUIRED** algorithm parameters. a list of supported parameters can be found here | +|lib| Union[str, List[str]] | **REQUIRED** Test framework or list of frameworks. Must be from [*sklearn*, *daal4py*, *cuml* or *xgboost*] | +|algorithm| string | **REQUIRED** benchmark file name. | +|dataset| List[[Dataset Object](#dataset-object)] | **REQUIRED** input data specifications. | +|**specific algorithm parameters**| Union[int, float, str, List[int], List[float], List[str]] | other specific algorithm parameters. The list of supported parameters can be found here | -### Dataset Object +#### **Important:** feel free to move any parameter from **cases** to **common** section since this parameter is common for all cases + +### Dataset Object | Field Name | Type | Description | | ----- | ---- |------------ | -|source| string | **REQUIRED** data source. It can be *synthetic* or *csv* | -|type| string | **REQUIRED** for synthetic data only. The type of task for which the dataset is generated. It can be *classification*, *blobs* or *regression* | +|source| string | **REQUIRED** data source. It can be *synthetic*, *csv* or *npy* | +|type| string | **REQUIRED for synthetic data**. The type of task for which the dataset is generated. It can be *classification*, *blobs* or *regression* | |n_classes| int | For *synthetic* data and for *classification* type only. The number of classes (or labels) of the classification problem | |n_clusters| int | For *synthetic* data and for *blobs* type only. The number of centers to generate | -|n_features| int | **REQUIRED** For *synthetic* data only. The number of features to generate | -|name| string | Name of dataset | -|training| [Training Object](#training-object) | **REQUIRED** algorithm parameters. a list of supported parameters can be found here | -|testing| [Testing Object](#testing-object) | **REQUIRED** algorithm parameters. a list of supported parameters can be found here | +|n_features| int | **REQUIRED for *synthetic* data**. The number of features to generate | +|name| string | Name of the dataset | +|training| [Training Object](#training-object) | **REQUIRED** An object with training dataset paths | +|testing| [Testing Object](#testing-object) | An object with testing dataset paths. 
If not provided, training datasets are used | -### Training Object +### Training Object | Field Name | Type | Description | | ----- | ---- |------------ | -| n_samples | int | The total number of the training points | -| x | str | The path to the training samples | -| y | str | The path to the training labels | +| n_samples | int | **REQUIRED** The total number of the training samples | +| x | str | **REQUIRED** The path to the training samples | +| y | str | **REQUIRED** The path to the training labels | -### Testing Object +### Testing Object | Field Name | Type | Description | | ----- | ---- |------------ | -| n_samples | int | The total number of the testing points | -| x | str | The path to the testing samples | -| y | str | The path to the testing labels | +| n_samples | int | **REQUIRED** The total number of the testing samples | +| x | str | **REQUIRED** The path to the testing samples | +| y | str | **REQUIRED** The path to the testing labels | diff --git a/configs/testing/daal4py_xgboost.json b/configs/testing/daal4py_xgboost.json index 56accdce3..548ec82bf 100755 --- a/configs/testing/daal4py_xgboost.json +++ b/configs/testing/daal4py_xgboost.json @@ -1,20 +1,21 @@ { - "omp_env": ["OMP_NUM_THREADS", "OMP_PLACES"], "common": { - "lib": ["modelbuilders"], - "data-format": ["pandas"], - "data-order": ["F"], - "dtype": ["float32"] + "lib": "modelbuilders", + "data-format": "pandas", + "data-order": "F", + "dtype": "float32", + "algorithm": "xgb_mb", + "tree-method": "hist", + "count-dmatrix":"" }, "cases": [ { - "algorithm": "xgb_mb", "dataset": [ { - "source": "synthetic", - "type": "classification", - "n_classes": 5, - "n_features": 10, + "source": "synthetic", + "type": "classification", + "n_classes": 5, + "n_features": 10, "training": { "n_samples": 100 }, @@ -23,10 +24,9 @@ } } ], - "n-estimators": [10], - "tree-method": ["hist"], - "objective": ["multi:softprob"], - "max-depth": [8] + "n-estimators": 10, + "max-depth": 8, + "objective": "multi:softprob" } ] } diff --git a/configs/testing/xgboost.json b/configs/testing/xgboost.json index 5107ee793..33242a630 100755 --- a/configs/testing/xgboost.json +++ b/configs/testing/xgboost.json @@ -1,21 +1,21 @@ { - "omp_env": ["OMP_NUM_THREADS", "OMP_PLACES"], "common": { - "lib": ["xgboost"], - "data-format": ["pandas"], - "data-order": ["F"], - "dtype": ["float64"] + "lib": "xgboost", + "data-format": "pandas", + "data-order": "F", + "dtype": "float32", + "algorithm": "gbt", + "tree-method": "hist", + "count-dmatrix":"" }, "cases": [ - { - "algorithm": "gbt", "dataset": [ { - "source": "synthetic", - "type": "classification", - "n_classes": 5, - "n_features": 10, + "source": "synthetic", + "type": "classification", + "n_classes": 5, + "n_features": 10, "training": { "n_samples": 1000 }, @@ -24,21 +24,19 @@ } } ], - "n-estimators": [50], - "objective": ["multi:softprob"], - "tree-method": ["hist"], - "max-depth": [7], - "subsample": [0.7], - "colsample-bytree": [0.7] + "n-estimators": 50, + "max-depth": 7, + "subsample": 0.7, + "colsample-bytree": 0.7, + "objective": "multi:softprob" }, { - "algorithm": "gbt", "dataset": [ { - "source": "synthetic", - "type": "regression", - "n_classes": 5, - "n_features": 10, + "source": "synthetic", + "type": "regression", + "n_classes": 5, + "n_features": 10, "training": { "n_samples": 100 }, @@ -47,12 +45,11 @@ } } ], - "n-estimators": [50], - "objective": ["reg:squarederror"], - "tree-method": ["hist"], - "max-depth": [8], - "learning-rate": [0.1], - "reg-alpha": [0.9] + "n-estimators": 50, + 
"max-depth": 8, + "learning-rate": 0.1, + "reg-alpha": 0.9, + "objective": "reg:squarederror" } ] } From 877e0fd7429d79408c7f21d7e337008894ff661f Mon Sep 17 00:00:00 2001 From: Igor Rukhovich Date: Tue, 20 Apr 2021 12:41:59 +0300 Subject: [PATCH 23/31] Applying more comments, updating readme's --- README.md | 75 ++++++++++++++++++++++------------------- azure-pipelines.yml | 4 --- configs/README.md | 17 +++++----- datasets/loader_clf.py | 2 +- datasets/loader_mul.py | 2 +- datasets/loader_reg.py | 2 +- sklearn_bench/README.md | 34 +++++++++---------- xgboost_bench/README.md | 11 +++--- 8 files changed, 76 insertions(+), 71 deletions(-) diff --git a/README.md b/README.md index 1c17f69ca..c54dfc5de 100755 --- a/README.md +++ b/README.md @@ -26,42 +26,42 @@ We publish blogs on Medium, so [follow us](https://medium.com/intel-analytics-so ## Table of content -* [How to create conda environment for benchmarking](#how-to-create-conda-environment-for-benchmarking) -* [Running Python benchmarks with runner script](#running-python-benchmarks-with-runner-script) -* [Benchmark supported algorithms](#benchmark-supported-algorithms) -* [Intel(R) Extension for Scikit-learn* support](#intelr-extension-for-scikit-learn-support) -* [Algorithms parameters](#algorithms-parameters) +- [How to create conda environment for benchmarking](#how-to-create-conda-environment-for-benchmarking) +- [Running Python benchmarks with runner script](#running-python-benchmarks-with-runner-script) +- [Benchmark supported algorithms](#benchmark-supported-algorithms) +- [Intel(R) Extension for Scikit-learn* support](#intelr-extension-for-scikit-learn-support) +- [Algorithms parameters](#algorithms-parameters) ## How to create conda environment for benchmarking Create a suitable conda environment for each framework to test. Each item in the list below links to instructions to create an appropriate conda environment for the framework. -* [**scikit-learn**](sklearn_bench#how-to-create-conda-environment-for-benchmarking) +- [**scikit-learn**](sklearn_bench#how-to-create-conda-environment-for-benchmarking) ```bash pip install -r sklearn_bench/requirements.txt # or -conda install -c intel scikit-learn scikit-learn-intelex pandas +conda install -c intel scikit-learn scikit-learn-intelex pandas tqdm ``` -* [**daal4py**](daal4py_bench#how-to-create-conda-environment-for-benchmarking) +- [**daal4py**](daal4py_bench#how-to-create-conda-environment-for-benchmarking) ```bash -conda install -c conda-forge scikit-learn daal4py pandas +conda install -c conda-forge scikit-learn daal4py pandas tqdm ``` -* [**cuml**](cuml_bench#how-to-create-conda-environment-for-benchmarking) +- [**cuml**](cuml_bench#how-to-create-conda-environment-for-benchmarking) ```bash -conda install -c rapidsai -c conda-forge cuml pandas cudf +conda install -c rapidsai -c conda-forge cuml pandas cudf tqdm ``` -* [**xgboost**](xgboost_bench#how-to-create-conda-environment-for-benchmarking) +- [**xgboost**](xgboost_bench#how-to-create-conda-environment-for-benchmarking) ```bash pip install -r xgboost_bench/requirements.txt # or -conda install -c conda-forge xgboost pandas +conda install -c conda-forge xgboost scikit-learn pandas tqdm ``` ## Running Python benchmarks with runner script @@ -69,12 +69,13 @@ conda install -c conda-forge xgboost pandas Run `python runner.py --configs configs/config_example.json [--output-file result.json --verbose INFO --report]` to launch benchmarks. Options: -* ``--configs``: specify the path to a configuration file. 
-* ``--no-intel-optimized``: use Scikit-learn without [Intel(R) Extension for Scikit-learn*](#intelr-extension-for-scikit-learn-support). Now available for [scikit-learn benchmarks](https://github.com/IntelPython/scikit-learn_bench/tree/master/sklearn_bench). By default, the runner uses Intel(R) Extension for Scikit-learn. -* ``--output-file``: output file name for the benchmark result. The default name is `result.json` -* ``--report``: create an Excel report based on benchmark results. The `openpyxl` library is required. -* ``--dummy-run``: run configuration parser and dataset generation without benchmarks running. -* ``--verbose``: *WARNING*, *INFO*, *DEBUG*. print additional information during benchmarks running. Default is *INFO*. + +- ``--configs``: specify the path to a configuration file. +- ``--no-intel-optimized``: use Scikit-learn without [Intel(R) Extension for Scikit-learn*](#intelr-extension-for-scikit-learn-support). Now available for [scikit-learn benchmarks](https://github.com/IntelPython/scikit-learn_bench/tree/master/sklearn_bench). By default, the runner uses Intel(R) Extension for Scikit-learn. +- ``--output-file``: output file name for the benchmark result. The default name is `result.json` +- ``--report``: create an Excel report based on benchmark results. The `openpyxl` library is required. +- ``--dummy-run``: run configuration parser and dataset generation without benchmarks running. +- ``--verbose``: *WARNING*, *INFO*, *DEBUG*. print additional information during benchmarks running. Default is *INFO*. | Level | Description | |-----------|---------------| @@ -83,10 +84,11 @@ Options: | *WARNING* | An indication that something unexpected happened, or indicative of some problem in the near future (e.g. ‘disk space low’). The software is still working as expected. | Benchmarks currently support the following frameworks: -* **scikit-learn** -* **daal4py** -* **cuml** -* **xgboost** + +- **scikit-learn** +- **daal4py** +- **cuml** +- **xgboost** The configuration of benchmarks allows you to select the frameworks to run, select datasets for measurements and configure the parameters of the algorithms. @@ -116,27 +118,32 @@ The configuration of benchmarks allows you to select the frameworks to run, sele When you run scikit-learn benchmarks on CPU, [Intel(R) Extension for Scikit-learn](https://github.com/intel/scikit-learn-intelex) is used by default. Use the ``--no-intel-optimized`` option to run the benchmarks without the extension. The following benchmarks have a GPU support: -* dbscan -* kmeans -* linear -* log_reg + +- dbscan +- kmeans +- linear +- log_reg You may use the [configuration file for these benchmarks](https://github.com/IntelPython/scikit-learn_bench/blob/master/configs/skl_xpu_config.json) to run them on both CPU and GPU. -## Algorithms parameters +## Algorithms parameters You can launch benchmarks for each algorithm separately. 
To do this, go to the directory with the benchmark: - cd +```bash +cd +``` Run the following command: - python --dataset-name +```bash +python --dataset-name +``` The list of supported parameters for each algorithm you can find here: -* [**scikit-learn**](sklearn_bench#algorithms-parameters) -* [**daal4py**](daal4py_bench#algorithms-parameters) -* [**cuml**](cuml_bench#algorithms-parameters) -* [**xgboost**](xgboost_bench#algorithms-parameters) +- [**scikit-learn**](sklearn_bench#algorithms-parameters) +- [**daal4py**](daal4py_bench#algorithms-parameters) +- [**cuml**](cuml_bench#algorithms-parameters) +- [**xgboost**](xgboost_bench#algorithms-parameters) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 0d58e74a9..34b1efec5 100755 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -72,11 +72,7 @@ jobs: steps: - task: UsePythonVersion@0 inputs: -<<<<<<< HEAD - versionSpec: '3.8' -======= versionSpec: '$(python.version)' ->>>>>>> 4d6bafcd1503b8d13c4ae885d387cf0c523ef49b addToPath: true - script: | python -m pip install --upgrade pip setuptools diff --git a/configs/README.md b/configs/README.md index 5d8ad733d..b1c8221a6 100644 --- a/configs/README.md +++ b/configs/README.md @@ -1,4 +1,4 @@ -## Config JSON Schema +# Config JSON Schema Configure benchmarks by editing the `config.json` file. You can configure some algorithm parameters, datasets, a list of frameworks to use, and the usage of some environment variables. @@ -11,13 +11,14 @@ Refer to the tables below for descriptions of all fields in the configuration fi - [Training Object](#training-object) - [Testing Object](#testing-object) -### Root Config Object +## Root Config Object + | Field Name | Type | Description | | ----- | ---- |------------ | |common| [Common Object](#common-object)| **REQUIRED** common benchmarks setting: frameworks and input data settings | |cases| List[[Case Object](#case-object)] | **REQUIRED** list of algorithms, their parameters and training data | -### Common Object +## Common Object | Field Name | Type | Description | | ----- | ---- |------------ | @@ -27,7 +28,7 @@ Refer to the tables below for descriptions of all fields in the configuration fi |check-finitness| List[] | Check finiteness in sklearn input check(disabled by default) | |device| array[string] | For scikit-learn only. The list of devices to run the benchmarks on.
It can be *None* (default, run on CPU without a SYCL context) or one of the SYCL device types: *cpu*, *gpu*, *host*.
Refer to [SYCL specification](https://www.khronos.org/files/sycl/sycl-2020-reference-guide.pdf) for details| -### Case Object +## Case Object | Field Name | Type | Description | | ----- | ---- |------------ | @@ -36,9 +37,9 @@ Refer to the tables below for descriptions of all fields in the configuration fi |dataset| List[[Dataset Object](#dataset-object)] | **REQUIRED** input data specifications. | |**specific algorithm parameters**| Union[int, float, str, List[int], List[float], List[str]] | other specific algorithm parameters. The list of supported parameters can be found here | -#### **Important:** feel free to move any parameter from **cases** to **common** section since this parameter is common for all cases +### **Important:** feel free to move any parameter from **cases** to **common** section since this parameter is common for all cases -### Dataset Object +## Dataset Object | Field Name | Type | Description | | ----- | ---- |------------ | @@ -51,7 +52,7 @@ Refer to the tables below for descriptions of all fields in the configuration fi |training| [Training Object](#training-object) | **REQUIRED** An object with training dataset paths | |testing| [Testing Object](#testing-object) | An object with testing dataset paths. If not provided, training datasets are used | -### Training Object +## Training Object | Field Name | Type | Description | | ----- | ---- |------------ | @@ -59,7 +60,7 @@ Refer to the tables below for descriptions of all fields in the configuration fi | x | str | **REQUIRED** The path to the training samples | | y | str | **REQUIRED** The path to the training labels | -### Testing Object +## Testing Object | Field Name | Type | Description | | ----- | ---- |------------ | diff --git a/datasets/loader_clf.py b/datasets/loader_clf.py index 29011b068..5f804e888 100644 --- a/datasets/loader_clf.py +++ b/datasets/loader_clf.py @@ -561,7 +561,7 @@ def klaverjas(dataset_dir: Path) -> bool: def santander(dataset_dir: Path) -> bool: """ - Still doesn't have an loading instruction + # TODO: add an loading instruction """ return False diff --git a/datasets/loader_mul.py b/datasets/loader_mul.py index a2653d200..c6a0bc0e3 100644 --- a/datasets/loader_mul.py +++ b/datasets/loader_mul.py @@ -251,7 +251,7 @@ def msrank(dataset_dir: Path) -> bool: def plasticc(dataset_dir: Path) -> bool: """ - Still doesn't have an loading instruction + # TODO: add an loading instruction """ return False diff --git a/datasets/loader_reg.py b/datasets/loader_reg.py index 627ae4ba5..c19cdf55c 100644 --- a/datasets/loader_reg.py +++ b/datasets/loader_reg.py @@ -61,7 +61,7 @@ def abalone(dataset_dir: Path) -> bool: def mortgage_first_q(dataset_dir: Path) -> bool: """ - Still doesn't have an loading instruction + # TODO: add an loading instruction """ return False diff --git a/sklearn_bench/README.md b/sklearn_bench/README.md index b21da94da..8cca0f81d 100644 --- a/sklearn_bench/README.md +++ b/sklearn_bench/README.md @@ -1,15 +1,14 @@ - -## How to create conda environment for benchmarking +# How to create conda environment for benchmarking If you want to test scikit-learn, then use ```bash pip install -r sklearn_bench/requirements.txt # or -conda install -c intel scikit-learn scikit-learn-intelex pandas +conda install -c intel scikit-learn scikit-learn-intelex pandas tqdm ``` -## Algorithms parameters +## Algorithms parameters You can launch benchmarks for each algorithm separately. 
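To make the config schema documented above more concrete, here is a minimal sketch that assembles a configuration in Python and writes it to a JSON file. It mirrors the small synthetic-dataset testing configs shown earlier in this patch set; the output path `my_config.json` is a placeholder for this example only, not a file that ships with the benchmarks.

```python
import json

# A minimal config following the schema above: one "common" section shared by
# all cases, and one case with a dataset and its specific parameters.
# Values mirror the synthetic xgboost testing config shown earlier.
config = {
    "common": {
        "lib": "xgboost",
        "data-format": "pandas",
        "data-order": "F",
        "dtype": "float32",
        "algorithm": "gbt",
        "tree-method": "hist"
    },
    "cases": [
        {
            "dataset": [
                {
                    "source": "synthetic",
                    "type": "classification",
                    "n_classes": 5,
                    "n_features": 10,
                    "training": {"n_samples": 1000},
                    "testing": {"n_samples": 100}
                }
            ],
            # specific algorithm parameters live directly in the case
            "n-estimators": 50,
            "max-depth": 7,
            "objective": "multi:softprob"
        }
    ]
}

# Placeholder output path; pass it to the runner via --configs.
with open("my_config.json", "w") as f:
    json.dump(config, f, indent=4)
```

As the note in the schema points out, any of the case parameters could equally be moved into `common`, since parameters there are shared by every case.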
The tables below list all supported parameters for each algorithm: @@ -27,7 +26,8 @@ You can launch benchmarks for each algorithm separately. The tables below list a - [SVC](#svc) - [train_test_split](#train_test_split) -#### General +### General + | parameter Name | Type | default value | description | | ----- | ---- |---- |---- | |num-threads|int|-1| The number of threads to use| @@ -50,14 +50,14 @@ You can launch benchmarks for each algorithm separately. The tables below list a |seed|int|12345|Seed to pass as random_state| |dataset-name|str|None|Dataset name| +### DBSCAN -#### DBSCAN | parameter Name | Type | default value | description | | ----- | ---- |---- |---- | | epsilon | float | 10 | Radius of neighborhood of a point| | min_samples | int | 5 | The minimum number of samples required in a 'neighborhood to consider a point a core point | -#### RandomForestClassifier +### RandomForestClassifier | parameter Name | Type | default value | description | | ----- | ---- |---- |---- | @@ -70,7 +70,7 @@ You can launch benchmarks for each algorithm separately. The tables below list a | min-impurity-decrease | float | 0 | Needed impurity decrease for node splitting | | no-bootstrap | store_false | True | Don't control bootstraping | -#### RandomForestRegressor +### RandomForestRegressor | parameter Name | Type | default value | description | | ----- | ---- |---- |---- | @@ -84,13 +84,13 @@ You can launch benchmarks for each algorithm separately. The tables below list a | no-bootstrap | action | True | Don't control bootstraping | | use-sklearn-class | action | | Force use of sklearn.ensemble.RandomForestClassifier | -#### pairwise_distances +### pairwise_distances | parameter Name | Type | default value | description | | ----- | ---- |---- |---- | | metric | str | cosine | *cosine* or *correlation* Metric to test for pairwise distances | -#### KMeans +### KMeans | parameter Name | Type | default value | description | | ----- | ---- |---- |---- | @@ -99,7 +99,7 @@ You can launch benchmarks for each algorithm separately. The tables below list a | maxiter | inte | 100 | Maximum number of iterations | | n-clusters | int | | The number of clusters | -#### KNeighborsClassifier +### KNeighborsClassifier | parameter Name | Type | default value | description | | ----- | ---- |---- |---- | @@ -108,13 +108,13 @@ You can launch benchmarks for each algorithm separately. The tables below list a | method | str | brute | Algorithm used to compute the nearest neighbors | | metric | str | euclidean | Distance metric to use | -#### LinearRegression +### LinearRegression | parameter Name | Type | default value | description | | ----- | ---- |---- |---- | | no-fit-intercept | action | True | Don't fit intercept (assume data already centered) | -#### LogisticRegression +### LogisticRegression | parameter Name | Type | default value | description | | ----- | ---- |---- |---- | @@ -125,7 +125,7 @@ You can launch benchmarks for each algorithm separately. The tables below list a | C | float | 1.0 | Regularization parameter | | tol | float | None | Tolerance for solver | -#### PCA +### PCA | parameter Name | Type | default value | description | | ----- | ---- |---- |---- | @@ -133,7 +133,7 @@ You can launch benchmarks for each algorithm separately. 
The tables below list a | n-components | int | None | The number of components to find | | whiten | action | False | Perform whitening | -#### Ridge +### Ridge | parameter Name | Type | default value | description | | ----- | ---- |---- |---- | @@ -141,7 +141,7 @@ You can launch benchmarks for each algorithm separately. The tables below list a | solver | str | auto | Solver used for training | | alpha | float | 1.0 | Regularization strength | -#### SVC +### SVC | parameter Name | Type | default value | description | | ----- | ---- |---- |---- | @@ -152,7 +152,7 @@ You can launch benchmarks for each algorithm separately. The tables below list a | tol | float | 1e-16 | Tolerance passed to sklearn.svm.SVC | | probability | action | True | Use probability for SVC | -#### train_test_split +### train_test_split | parameter Name | Type | default value | description | | ----- | ---- |---- |---- | diff --git a/xgboost_bench/README.md b/xgboost_bench/README.md index 2b4e93ec5..45f27be87 100644 --- a/xgboost_bench/README.md +++ b/xgboost_bench/README.md @@ -1,16 +1,17 @@ -## How to create conda environment for benchmarking +# How to create conda environment for benchmarking ```bash pip install -r xgboost_bench/requirements.txt # or -conda install -c conda-forge xgboost pandas +conda install -c intel scikit-learn scikit-learn-intelex pandas tqdm ``` -## Algorithms parameters +## Algorithms parameters You can launch benchmarks for each algorithm separately. The table below lists all supported parameters for each algorithm. -#### General +### General + | parameter Name | Type | default value | description | | ----- | ---- |---- |---- | |num-threads|int|-1| The number of threads to use| @@ -33,7 +34,7 @@ You can launch benchmarks for each algorithm separately. The table below lists a |seed|int|12345|Seed to pass as random_state| |dataset-name|str|None|Dataset name| -#### GradientBoostingTrees +### GradientBoostingTrees | parameter Name | Type | default value | description | | ----- | ---- |---- |---- | From 8bdc7f28758350ff832ef26f867568a72e63450f Mon Sep 17 00:00:00 2001 From: Igor Rukhovich Date: Tue, 20 Apr 2021 13:07:39 +0300 Subject: [PATCH 24/31] Applying comments: renamed configs --- configs/{ => modelbuilders}/lgbm_mb_cpu_config.json | 0 configs/{ => modelbuilders}/xgb_mb_cpu_config.json | 0 .../xgb_cpu_main_config.json} | 0 .../xgb_cpu_nvidia_config.json} | 0 configs/{ => xgboost}/xgb_gpu_config.json | 0 datasets/load_datasets.py | 12 ++++++------ datasets/{loader_clf.py => loader_classification.py} | 0 datasets/{loader_mul.py => loader_multiclass.py} | 0 datasets/{loader_reg.py => loader_regression.py} | 0 9 files changed, 6 insertions(+), 6 deletions(-) rename configs/{ => modelbuilders}/lgbm_mb_cpu_config.json (100%) rename configs/{ => modelbuilders}/xgb_mb_cpu_config.json (100%) rename configs/{xgb_cpu_config.json => xgboost/xgb_cpu_main_config.json} (100%) rename configs/{xgb_cpu_nvda_config.json => xgboost/xgb_cpu_nvidia_config.json} (100%) rename configs/{ => xgboost}/xgb_gpu_config.json (100%) rename datasets/{loader_clf.py => loader_classification.py} (100%) rename datasets/{loader_mul.py => loader_multiclass.py} (100%) rename datasets/{loader_reg.py => loader_regression.py} (100%) diff --git a/configs/lgbm_mb_cpu_config.json b/configs/modelbuilders/lgbm_mb_cpu_config.json similarity index 100% rename from configs/lgbm_mb_cpu_config.json rename to configs/modelbuilders/lgbm_mb_cpu_config.json diff --git a/configs/xgb_mb_cpu_config.json b/configs/modelbuilders/xgb_mb_cpu_config.json 
similarity index 100% rename from configs/xgb_mb_cpu_config.json rename to configs/modelbuilders/xgb_mb_cpu_config.json diff --git a/configs/xgb_cpu_config.json b/configs/xgboost/xgb_cpu_main_config.json similarity index 100% rename from configs/xgb_cpu_config.json rename to configs/xgboost/xgb_cpu_main_config.json diff --git a/configs/xgb_cpu_nvda_config.json b/configs/xgboost/xgb_cpu_nvidia_config.json similarity index 100% rename from configs/xgb_cpu_nvda_config.json rename to configs/xgboost/xgb_cpu_nvidia_config.json diff --git a/configs/xgb_gpu_config.json b/configs/xgboost/xgb_gpu_config.json similarity index 100% rename from configs/xgb_gpu_config.json rename to configs/xgboost/xgb_gpu_config.json diff --git a/datasets/load_datasets.py b/datasets/load_datasets.py index ad0822ae9..29fbc3533 100755 --- a/datasets/load_datasets.py +++ b/datasets/load_datasets.py @@ -21,12 +21,12 @@ from pathlib import Path from typing import Callable, Dict -from .loader_clf import (a_nine_a, airline, airline_ohe, bosch, codrnanorm, - epsilon, fraud, gisette, higgs, higgs_one_m, ijcnn, - klaverjas, santander, skin_segmentation) -from .loader_mul import (connect, covertype, covtype, letters, mnist, msrank, - plasticc, sensit) -from .loader_reg import abalone, mortgage_first_q, year_prediction_msd +from .loader_classification import ( + a_nine_a, airline, airline_ohe, bosch, codrnanorm, epsilon, fraud, gisette, + higgs, higgs_one_m, ijcnn, klaverjas, santander, skin_segmentation) +from .loader_multiclass import (connect, covertype, covtype, letters, mnist, + msrank, plasticc, sensit) +from .loader_regression import abalone, mortgage_first_q, year_prediction_msd dataset_loaders: Dict[str, Callable[[Path], bool]] = { "a9a": a_nine_a, diff --git a/datasets/loader_clf.py b/datasets/loader_classification.py similarity index 100% rename from datasets/loader_clf.py rename to datasets/loader_classification.py diff --git a/datasets/loader_mul.py b/datasets/loader_multiclass.py similarity index 100% rename from datasets/loader_mul.py rename to datasets/loader_multiclass.py diff --git a/datasets/loader_reg.py b/datasets/loader_regression.py similarity index 100% rename from datasets/loader_reg.py rename to datasets/loader_regression.py From f9cf09b83fa0d8ce99b58c64a5dc9980a51e5c4a Mon Sep 17 00:00:00 2001 From: Igor Rukhovich Date: Fri, 23 Apr 2021 12:17:44 +0300 Subject: [PATCH 25/31] Changed all datasets to npy, applied Kirill's comments --- bench.py | 2 +- configs/cuml_config.json | 94 +++++------ configs/modelbuilders/lgbm_mb_cpu_config.json | 17 +- configs/modelbuilders/xgb_mb_cpu_config.json | 17 +- configs/skl_config.json | 146 +++++++++--------- configs/svm/svc_proba_cuml.json | 100 ++++++------ configs/svm/svc_proba_sklearn.json | 100 ++++++------ configs/xgboost/xgb_cpu_main_config.json | 73 +++++---- configs/xgboost/xgb_gpu_config.json | 71 +++++---- datasets/load_datasets.py | 13 +- datasets/loader_classification.py | 92 ++++++----- datasets/loader_multiclass.py | 73 +++++---- runner.py | 6 +- 13 files changed, 402 insertions(+), 402 deletions(-) diff --git a/bench.py b/bench.py index b99fdb382..411fceea6 100644 --- a/bench.py +++ b/bench.py @@ -419,7 +419,7 @@ def load_data(params, generated_data=[], add_dtype=False, label_2d=False, # load and convert data from npy/csv file if path is specified if param_vars[file_arg] is not None: if param_vars[file_arg].name.endswith('.npy'): - data = np.load(param_vars[file_arg].name) + data = np.load(param_vars[file_arg].name, allow_pickle=True) else: data = 
read_csv(param_vars[file_arg].name, params) full_data[element] = convert_data( diff --git a/configs/cuml_config.json b/configs/cuml_config.json index 70361023e..6217b6e96 100755 --- a/configs/cuml_config.json +++ b/configs/cuml_config.json @@ -103,31 +103,31 @@ "dtype": ["float32"], "dataset": [ { - "source": "csv", + "source": "npy", "name": "higgs1m", "training": { - "x": "data/higgs1m_x_train.csv", - "y": "data/higgs1m_y_train.csv" + "x": "data/higgs1m_x_train.npy", + "y": "data/higgs1m_y_train.npy" }, "testing": { - "x": "data/higgs1m_x_test.csv", - "y": "data/higgs1m_y_test.csv" + "x": "data/higgs1m_x_test.npy", + "y": "data/higgs1m_y_test.npy" } }, { - "source": "csv", + "source": "npy", "name": "airline-ohe", "training": { - "x": "data/airline-ohe_x_train.csv", - "y": "data/airline-ohe_y_train.csv" + "x": "data/airline-ohe_x_train.npy", + "y": "data/airline-ohe_y_train.npy" }, "testing": { - "x": "data/airline-ohe_x_test.csv", - "y": "data/airline-ohe_y_test.csv" + "x": "data/airline-ohe_x_test.npy", + "y": "data/airline-ohe_y_test.npy" } } ], @@ -226,17 +226,17 @@ "algorithm": "svm", "dataset": [ { - "source": "csv", + "source": "npy", "name": "ijcnn", "training": { - "x": "data/ijcnn_x_train.csv", - "y": "data/ijcnn_y_train.csv" + "x": "data/ijcnn_x_train.npy", + "y": "data/ijcnn_y_train.npy" }, "testing": { - "x": "data/ijcnn_x_test.csv", - "y": "data/ijcnn_y_test.csv" + "x": "data/ijcnn_x_test.npy", + "y": "data/ijcnn_y_test.npy" } } ], @@ -247,17 +247,17 @@ "algorithm": "svm", "dataset": [ { - "source": "csv", + "source": "npy", "name": "a9a", "training": { - "x": "data/a9a_x_train.csv", - "y": "data/a9a_y_train.csv" + "x": "data/a9a_x_train.npy", + "y": "data/a9a_y_train.npy" }, "testing": { - "x": "data/a9a_x_test.csv", - "y": "data/a9a_y_test.csv" + "x": "data/a9a_x_test.npy", + "y": "data/a9a_y_test.npy" } } ], @@ -268,17 +268,17 @@ "algorithm": "svm", "dataset": [ { - "source": "csv", + "source": "npy", "name": "gisette", "training": { - "x": "data/gisette_x_train.csv", - "y": "data/gisette_y_train.csv" + "x": "data/gisette_x_train.npy", + "y": "data/gisette_y_train.npy" }, "testing": { - "x": "data/gisette_x_test.csv", - "y": "data/gisette_y_test.csv" + "x": "data/gisette_x_test.npy", + "y": "data/gisette_y_test.npy" } } ], @@ -289,17 +289,17 @@ "algorithm": "svm", "dataset": [ { - "source": "csv", + "source": "npy", "name": "klaverjas", "training": { - "x": "data/klaverjas_x_train.csv", - "y": "data/klaverjas_y_train.csv" + "x": "data/klaverjas_x_train.npy", + "y": "data/klaverjas_y_train.npy" }, "testing": { - "x": "data/klaverjas_x_test.csv", - "y": "data/klaverjas_y_test.csv" + "x": "data/klaverjas_x_test.npy", + "y": "data/klaverjas_y_test.npy" } } ], @@ -310,17 +310,17 @@ "algorithm": "svm", "dataset": [ { - "source": "csv", + "source": "npy", "name": "skin_segmentation", "training": { - "x": "data/skin_segmentation_x_train.csv", - "y": "data/skin_segmentation_y_train.csv" + "x": "data/skin_segmentation_x_train.npy", + "y": "data/skin_segmentation_y_train.npy" }, "testing": { - "x": "data/skin_segmentation_x_test.csv", - "y": "data/skin_segmentation_y_test.csv" + "x": "data/skin_segmentation_x_test.npy", + "y": "data/skin_segmentation_y_test.npy" } } ], @@ -452,12 +452,12 @@ "algorithm": "train_test_split", "dataset": [ { - "source": "csv", + "source": "npy", "name": "census", "training": { - "x": "data/census_x.csv", - "y": "data/census_y.csv" + "x": "data/census_x_train.npy", + "y": "data/census_y_train.npy" } } ], @@ -468,12 +468,12 @@ "algorithm": "lasso", 
"dataset": [ { - "source": "csv", - "name": "mortgage", + "source": "npy", + "name": "mortgage1Q", "training": { - "x": "data/mortgage_x.csv", - "y": "data/mortgage_y.csv" + "x": "data/mortgage1Q_x_train.npy", + "y": "data/mortgage1Q_y_train.npy" } } ], @@ -484,17 +484,17 @@ "algorithm": "elasticnet", "dataset": [ { - "source": "csv", + "source": "npy", "name": "year_prediction_msd", "training": { - "x": "data/year_prediction_msd_x_train.csv", - "y": "data/year_prediction_msd_y_train.csv" + "x": "data/year_prediction_msd_x_train.npy", + "y": "data/year_prediction_msd_y_train.npy" }, "testing": { - "x": "data/year_prediction_msd_x_test.csv", - "y": "data/year_prediction_msd_y_test.csv" + "x": "data/year_prediction_msd_x_test.npy", + "y": "data/year_prediction_msd_y_test.npy" } } ], diff --git a/configs/modelbuilders/lgbm_mb_cpu_config.json b/configs/modelbuilders/lgbm_mb_cpu_config.json index fbf8a538d..a0dabdffa 100755 --- a/configs/modelbuilders/lgbm_mb_cpu_config.json +++ b/configs/modelbuilders/lgbm_mb_cpu_config.json @@ -68,12 +68,12 @@ { "dataset": [ { - "source": "csv", + "source": "npy", "name": "mortgage1Q", "training": { - "x": "data/mortgage_x.csv", - "y": "data/mortgage_y.csv" + "x": "data/mortgage1Q_x_train.npy", + "y": "data/mortgage1Q_y_train.npy" } } ], @@ -92,16 +92,11 @@ "dataset": [ { "source": "npy", - "name": "msrank", + "name": "mlsr", "training": { - "x": "data/msrank_x_train.npy", - "y": "data/msrank_y_train.npy" - }, - "testing": - { - "x": "data/msrank_x_test.npy", - "y": "data/msrank_y_test.npy" + "x": "data/mlsr_x_train.npy", + "y": "data/mlsr_y_train.npy" } } ], diff --git a/configs/modelbuilders/xgb_mb_cpu_config.json b/configs/modelbuilders/xgb_mb_cpu_config.json index 2b10e5592..483f3c158 100755 --- a/configs/modelbuilders/xgb_mb_cpu_config.json +++ b/configs/modelbuilders/xgb_mb_cpu_config.json @@ -72,12 +72,12 @@ { "dataset": [ { - "source": "csv", + "source": "npy", "name": "mortgage1Q", "training": { - "x": "data/mortgage_x.csv", - "y": "data/mortgage_y.csv" + "x": "data/mortgage1Q_x_train.npy", + "y": "data/mortgage1Q_y_train.npy" } } ], @@ -96,16 +96,11 @@ "dataset": [ { "source": "npy", - "name": "msrank", + "name": "mlsr", "training": { - "x": "data/msrank_x_train.npy", - "y": "data/msrank_y_train.npy" - }, - "testing": - { - "x": "data/msrank_x_test.npy", - "y": "data/msrank_y_test.npy" + "x": "data/mlsr_x_train.npy", + "y": "data/mlsr_y_train.npy" } } ], diff --git a/configs/skl_config.json b/configs/skl_config.json index 93c23e068..a385e50be 100755 --- a/configs/skl_config.json +++ b/configs/skl_config.json @@ -115,31 +115,31 @@ "dtype": ["float32"], "dataset": [ { - "source": "csv", + "source": "npy", "name": "higgs1m", "training": { - "x": "data/higgs1m_x_train.csv", - "y": "data/higgs1m_y_train.csv" + "x": "data/higgs1m_x_train.npy", + "y": "data/higgs1m_y_train.npy" }, "testing": { - "x": "data/higgs1m_x_test.csv", - "y": "data/higgs1m_y_test.csv" + "x": "data/higgs1m_x_test.npy", + "y": "data/higgs1m_y_test.npy" } }, { - "source": "csv", + "source": "npy", "name": "airline-ohe", "training": { - "x": "data/airline-ohe_x_train.csv", - "y": "data/airline-ohe_y_train.csv" + "x": "data/airline-ohe_x_train.npy", + "y": "data/airline-ohe_y_train.npy" }, "testing": { - "x": "data/airline-ohe_x_test.csv", - "y": "data/airline-ohe_y_test.csv" + "x": "data/airline-ohe_x_test.npy", + "y": "data/airline-ohe_y_test.npy" } } ], @@ -238,17 +238,17 @@ "algorithm": "svm", "dataset": [ { - "source": "csv", + "source": "npy", "name": "ijcnn", "training": { - "x": 
"data/ijcnn_x_train.csv", - "y": "data/ijcnn_y_train.csv" + "x": "data/ijcnn_x_train.npy", + "y": "data/ijcnn_y_train.npy" }, "testing": { - "x": "data/ijcnn_x_test.csv", - "y": "data/ijcnn_y_test.csv" + "x": "data/ijcnn_x_test.npy", + "y": "data/ijcnn_y_test.npy" } } ], @@ -259,17 +259,17 @@ "algorithm": "svm", "dataset": [ { - "source": "csv", + "source": "npy", "name": "a9a", "training": { - "x": "data/a9a_x_train.csv", - "y": "data/a9a_y_train.csv" + "x": "data/a9a_x_train.npy", + "y": "data/a9a_y_train.npy" }, "testing": { - "x": "data/a9a_x_test.csv", - "y": "data/a9a_y_test.csv" + "x": "data/a9a_x_test.npy", + "y": "data/a9a_y_test.npy" } } ], @@ -280,17 +280,17 @@ "algorithm": "svm", "dataset": [ { - "source": "csv", + "source": "npy", "name": "gisette", "training": { - "x": "data/gisette_x_train.csv", - "y": "data/gisette_y_train.csv" + "x": "data/gisette_x_train.npy", + "y": "data/gisette_y_train.npy" }, "testing": { - "x": "data/gisette_x_test.csv", - "y": "data/gisette_y_test.csv" + "x": "data/gisette_x_test.npy", + "y": "data/gisette_y_test.npy" } } ], @@ -301,17 +301,17 @@ "algorithm": "svm", "dataset": [ { - "source": "csv", + "source": "npy", "name": "klaverjas", "training": { - "x": "data/klaverjas_x_train.csv", - "y": "data/klaverjas_y_train.csv" + "x": "data/klaverjas_x_train.npy", + "y": "data/klaverjas_y_train.npy" }, "testing": { - "x": "data/klaverjas_x_test.csv", - "y": "data/klaverjas_y_test.csv" + "x": "data/klaverjas_x_test.npy", + "y": "data/klaverjas_y_test.npy" } } ], @@ -322,17 +322,17 @@ "algorithm": "svm", "dataset": [ { - "source": "csv", - "name": "connect4", + "source": "npy", + "name": "connect", "training": { - "x": "data/connect_x_train.csv", - "y": "data/connect_y_train.csv" + "x": "data/connect_x_train.npy", + "y": "data/connect_y_train.npy" }, "testing": { - "x": "data/connect_x_test.csv", - "y": "data/connect_y_test.csv" + "x": "data/connect_x_test.npy", + "y": "data/connect_y_test.npy" } } ], @@ -343,17 +343,17 @@ "algorithm": "svm", "dataset": [ { - "source": "csv", + "source": "npy", "name": "mnist", "training": { - "x": "data/mnist_x_train.csv", - "y": "data/mnist_y_train.csv" + "x": "data/mnist_x_train.npy", + "y": "data/mnist_y_train.npy" }, "testing": { - "x": "data/mnist_x_test.csv", - "y": "data/mnist_y_test.csv" + "x": "data/mnist_x_test.npy", + "y": "data/mnist_y_test.npy" } } ], @@ -364,17 +364,17 @@ "algorithm": "svm", "dataset": [ { - "source": "csv", + "source": "npy", "name": "sensit", "training": { - "x": "data/sensit_x_train.csv", - "y": "data/sensit_y_train.csv" + "x": "data/sensit_x_train.npy", + "y": "data/sensit_y_train.npy" }, "testing": { - "x": "data/sensit_x_test.csv", - "y": "data/sensit_y_test.csv" + "x": "data/sensit_x_test.npy", + "y": "data/sensit_y_test.npy" } } ], @@ -385,17 +385,17 @@ "algorithm": "svm", "dataset": [ { - "source": "csv", + "source": "npy", "name": "skin_segmentation", "training": { - "x": "data/skin_segmentation_x_train.csv", - "y": "data/skin_segmentation_y_train.csv" + "x": "data/skin_segmentation_x_train.npy", + "y": "data/skin_segmentation_y_train.npy" }, "testing": { - "x": "data/skin_segmentation_x_test.csv", - "y": "data/skin_segmentation_y_test.csv" + "x": "data/skin_segmentation_x_test.npy", + "y": "data/skin_segmentation_y_test.npy" } } ], @@ -406,17 +406,17 @@ "algorithm": "svm", "dataset": [ { - "source": "csv", + "source": "npy", "name": "covertype", "training": { - "x": "data/covertype_x_train.csv", - "y": "data/covertype_y_train.csv" + "x": "data/covertype_x_train.npy", + "y": 
"data/covertype_y_train.npy" }, "testing": { - "x": "data/covertype_x_test.csv", - "y": "data/covertype_y_test.csv" + "x": "data/covertype_x_test.npy", + "y": "data/covertype_y_test.npy" } } ], @@ -427,17 +427,17 @@ "algorithm": "svm", "dataset": [ { - "source": "csv", + "source": "npy", "name": "codrnanorm", "training": { - "x": "data/codrnanorm_x_train.csv", - "y": "data/codrnanorm_y_train.csv" + "x": "data/codrnanorm_x_train.npy", + "y": "data/codrnanorm_y_train.npy" }, "testing": { - "x": "data/codrnanorm_x_test.csv", - "y": "data/codrnanorm_y_test.csv" + "x": "data/codrnanorm_x_test.npy", + "y": "data/codrnanorm_y_test.npy" } } ], @@ -570,12 +570,12 @@ "algorithm": "train_test_split", "dataset": [ { - "source": "csv", + "source": "npy", "name": "census", "training": { - "x": "data/census_x.csv", - "y": "data/census_y.csv" + "x": "data/census_x_train.npy", + "y": "data/census_y_train.npy" } } ], @@ -589,12 +589,12 @@ "algorithm": "lasso", "dataset": [ { - "source": "csv", - "name": "mortgage", + "source": "npy", + "name": "mortgage1Q", "training": { - "x": "data/mortgage_x.csv", - "y": "data/mortgage_y.csv" + "x": "data/mortgage1Q_x_train.npy", + "y": "data/mortgage1Q_y_train.npy" } } ], @@ -605,17 +605,17 @@ "algorithm": "elasticnet", "dataset": [ { - "source": "csv", + "source": "npy", "name": "year_prediction_msd", "training": { - "x": "data/year_prediction_msd_x_train.csv", - "y": "data/year_prediction_msd_y_train.csv" + "x": "data/year_prediction_msd_x_train.npy", + "y": "data/year_prediction_msd_y_train.npy" }, "testing": { - "x": "data/year_prediction_msd_x_test.csv", - "y": "data/year_prediction_msd_y_test.csv" + "x": "data/year_prediction_msd_x_test.npy", + "y": "data/year_prediction_msd_y_test.npy" } } ], diff --git a/configs/svm/svc_proba_cuml.json b/configs/svm/svc_proba_cuml.json index 85fe1f0df..c765a2164 100755 --- a/configs/svm/svc_proba_cuml.json +++ b/configs/svm/svc_proba_cuml.json @@ -12,17 +12,17 @@ "algorithm": "svm", "dataset": [ { - "source": "csv", + "source": "npy", "name": "ijcnn", "training": { - "x": "data/ijcnn_x_train.csv", - "y": "data/ijcnn_y_train.csv" + "x": "data/ijcnn_x_train.npy", + "y": "data/ijcnn_y_train.npy" }, "testing": { - "x": "data/ijcnn_x_test.csv", - "y": "data/ijcnn_y_test.csv" + "x": "data/ijcnn_x_test.npy", + "y": "data/ijcnn_y_test.npy" } } ], @@ -33,17 +33,17 @@ "algorithm": "svm", "dataset": [ { - "source": "csv", + "source": "npy", "name": "a9a", "training": { - "x": "data/a9a_x_train.csv", - "y": "data/a9a_y_train.csv" + "x": "data/a9a_x_train.npy", + "y": "data/a9a_y_train.npy" }, "testing": { - "x": "data/a9a_x_test.csv", - "y": "data/a9a_y_test.csv" + "x": "data/a9a_x_test.npy", + "y": "data/a9a_y_test.npy" } } ], @@ -54,17 +54,17 @@ "algorithm": "svm", "dataset": [ { - "source": "csv", + "source": "npy", "name": "gisette", "training": { - "x": "data/gisette_x_train.csv", - "y": "data/gisette_y_train.csv" + "x": "data/gisette_x_train.npy", + "y": "data/gisette_y_train.npy" }, "testing": { - "x": "data/gisette_x_test.csv", - "y": "data/gisette_y_test.csv" + "x": "data/gisette_x_test.npy", + "y": "data/gisette_y_test.npy" } } ], @@ -75,17 +75,17 @@ "algorithm": "svm", "dataset": [ { - "source": "csv", + "source": "npy", "name": "klaverjas", "training": { - "x": "data/klaverjas_x_train.csv", - "y": "data/klaverjas_y_train.csv" + "x": "data/klaverjas_x_train.npy", + "y": "data/klaverjas_y_train.npy" }, "testing": { - "x": "data/klaverjas_x_test.csv", - "y": "data/klaverjas_y_test.csv" + "x": "data/klaverjas_x_test.npy", + "y": 
"data/klaverjas_y_test.npy" } } ], @@ -96,17 +96,17 @@ "algorithm": "svm", "dataset": [ { - "source": "csv", + "source": "npy", "name": "connect", "training": { - "x": "data/connect_x_train.csv", - "y": "data/connect_y_train.csv" + "x": "data/connect_x_train.npy", + "y": "data/connect_y_train.npy" }, "testing": { - "x": "data/connect_x_test.csv", - "y": "data/connect_y_test.csv" + "x": "data/connect_x_test.npy", + "y": "data/connect_y_test.npy" } } ], @@ -117,17 +117,17 @@ "algorithm": "svm", "dataset": [ { - "source": "csv", + "source": "npy", "name": "mnist", "training": { - "x": "data/mnist_x_train.csv", - "y": "data/mnist_y_train.csv" + "x": "data/mnist_x_train.npy", + "y": "data/mnist_y_train.npy" }, "testing": { - "x": "data/mnist_x_test.csv", - "y": "data/mnist_y_test.csv" + "x": "data/mnist_x_test.npy", + "y": "data/mnist_y_test.npy" } } ], @@ -138,17 +138,17 @@ "algorithm": "svm", "dataset": [ { - "source": "csv", + "source": "npy", "name": "sensit", "training": { - "x": "data/sensit_x_train.csv", - "y": "data/sensit_y_train.csv" + "x": "data/sensit_x_train.npy", + "y": "data/sensit_y_train.npy" }, "testing": { - "x": "data/sensit_x_test.csv", - "y": "data/sensit_y_test.csv" + "x": "data/sensit_x_test.npy", + "y": "data/sensit_y_test.npy" } } ], @@ -159,17 +159,17 @@ "algorithm": "svm", "dataset": [ { - "source": "csv", + "source": "npy", "name": "skin_segmentation", "training": { - "x": "data/skin_segmentation_x_train.csv", - "y": "data/skin_segmentation_y_train.csv" + "x": "data/skin_segmentation_x_train.npy", + "y": "data/skin_segmentation_y_train.npy" }, "testing": { - "x": "data/skin_segmentation_x_test.csv", - "y": "data/skin_segmentation_y_test.csv" + "x": "data/skin_segmentation_x_test.npy", + "y": "data/skin_segmentation_y_test.npy" } } ], @@ -180,17 +180,17 @@ "algorithm": "svm", "dataset": [ { - "source": "csv", + "source": "npy", "name": "covertype", "training": { - "x": "data/covertype_x_train.csv", - "y": "data/covertype_y_train.csv" + "x": "data/covertype_x_train.npy", + "y": "data/covertype_y_train.npy" }, "testing": { - "x": "data/covertype_x_test.csv", - "y": "data/covertype_y_test.csv" + "x": "data/covertype_x_test.npy", + "y": "data/covertype_y_test.npy" } } ], @@ -201,17 +201,17 @@ "algorithm": "svm", "dataset": [ { - "source": "csv", + "source": "npy", "name": "codrnanorm", "training": { - "x": "data/codrnanorm_x_train.csv", - "y": "data/codrnanorm_y_train.csv" + "x": "data/codrnanorm_x_train.npy", + "y": "data/codrnanorm_y_train.npy" }, "testing": { - "x": "data/codrnanorm_x_test.csv", - "y": "data/codrnanorm_y_test.csv" + "x": "data/codrnanorm_x_test.npy", + "y": "data/codrnanorm_y_test.npy" } } ], diff --git a/configs/svm/svc_proba_sklearn.json b/configs/svm/svc_proba_sklearn.json index 53c1676cf..3ded70b29 100755 --- a/configs/svm/svc_proba_sklearn.json +++ b/configs/svm/svc_proba_sklearn.json @@ -12,17 +12,17 @@ "algorithm": "svm", "dataset": [ { - "source": "csv", + "source": "npy", "name": "ijcnn", "training": { - "x": "data/ijcnn_x_train.csv", - "y": "data/ijcnn_y_train.csv" + "x": "data/ijcnn_x_train.npy", + "y": "data/ijcnn_y_train.npy" }, "testing": { - "x": "data/ijcnn_x_test.csv", - "y": "data/ijcnn_y_test.csv" + "x": "data/ijcnn_x_test.npy", + "y": "data/ijcnn_y_test.npy" } } ], @@ -33,17 +33,17 @@ "algorithm": "svm", "dataset": [ { - "source": "csv", + "source": "npy", "name": "a9a", "training": { - "x": "data/a9a_x_train.csv", - "y": "data/a9a_y_train.csv" + "x": "data/a9a_x_train.npy", + "y": "data/a9a_y_train.npy" }, "testing": { - "x": 
"data/a9a_x_test.csv", - "y": "data/a9a_y_test.csv" + "x": "data/a9a_x_test.npy", + "y": "data/a9a_y_test.npy" } } ], @@ -54,17 +54,17 @@ "algorithm": "svm", "dataset": [ { - "source": "csv", + "source": "npy", "name": "gisette", "training": { - "x": "data/gisette_x_train.csv", - "y": "data/gisette_y_train.csv" + "x": "data/gisette_x_train.npy", + "y": "data/gisette_y_train.npy" }, "testing": { - "x": "data/gisette_x_test.csv", - "y": "data/gisette_y_test.csv" + "x": "data/gisette_x_test.npy", + "y": "data/gisette_y_test.npy" } } ], @@ -75,17 +75,17 @@ "algorithm": "svm", "dataset": [ { - "source": "csv", + "source": "npy", "name": "klaverjas", "training": { - "x": "data/klaverjas_x_train.csv", - "y": "data/klaverjas_y_train.csv" + "x": "data/klaverjas_x_train.npy", + "y": "data/klaverjas_y_train.npy" }, "testing": { - "x": "data/klaverjas_x_test.csv", - "y": "data/klaverjas_y_test.csv" + "x": "data/klaverjas_x_test.npy", + "y": "data/klaverjas_y_test.npy" } } ], @@ -96,17 +96,17 @@ "algorithm": "svm", "dataset": [ { - "source": "csv", + "source": "npy", "name": "connect", "training": { - "x": "data/connect_x_train.csv", - "y": "data/connect_y_train.csv" + "x": "data/connect_x_train.npy", + "y": "data/connect_y_train.npy" }, "testing": { - "x": "data/connect_x_test.csv", - "y": "data/connect_y_test.csv" + "x": "data/connect_x_test.npy", + "y": "data/connect_y_test.npy" } } ], @@ -117,17 +117,17 @@ "algorithm": "svm", "dataset": [ { - "source": "csv", + "source": "npy", "name": "mnist", "training": { - "x": "data/mnist_x_train.csv", - "y": "data/mnist_y_train.csv" + "x": "data/mnist_x_train.npy", + "y": "data/mnist_y_train.npy" }, "testing": { - "x": "data/mnist_x_test.csv", - "y": "data/mnist_y_test.csv" + "x": "data/mnist_x_test.npy", + "y": "data/mnist_y_test.npy" } } ], @@ -138,17 +138,17 @@ "algorithm": "svm", "dataset": [ { - "source": "csv", + "source": "npy", "name": "sensit", "training": { - "x": "data/sensit_x_train.csv", - "y": "data/sensit_y_train.csv" + "x": "data/sensit_x_train.npy", + "y": "data/sensit_y_train.npy" }, "testing": { - "x": "data/sensit_x_test.csv", - "y": "data/sensit_y_test.csv" + "x": "data/sensit_x_test.npy", + "y": "data/sensit_y_test.npy" } } ], @@ -159,17 +159,17 @@ "algorithm": "svm", "dataset": [ { - "source": "csv", + "source": "npy", "name": "skin_segmentation", "training": { - "x": "data/skin_segmentation_x_train.csv", - "y": "data/skin_segmentation_y_train.csv" + "x": "data/skin_segmentation_x_train.npy", + "y": "data/skin_segmentation_y_train.npy" }, "testing": { - "x": "data/skin_segmentation_x_test.csv", - "y": "data/skin_segmentation_y_test.csv" + "x": "data/skin_segmentation_x_test.npy", + "y": "data/skin_segmentation_y_test.npy" } } ], @@ -180,17 +180,17 @@ "algorithm": "svm", "dataset": [ { - "source": "csv", + "source": "npy", "name": "covertype", "training": { - "x": "data/covertype_x_train.csv", - "y": "data/covertype_y_train.csv" + "x": "data/covertype_x_train.npy", + "y": "data/covertype_y_train.npy" }, "testing": { - "x": "data/covertype_x_test.csv", - "y": "data/covertype_y_test.csv" + "x": "data/covertype_x_test.npy", + "y": "data/covertype_y_test.npy" } } ], @@ -201,17 +201,17 @@ "algorithm": "svm", "dataset": [ { - "source": "csv", + "source": "npy", "name": "codrnanorm", "training": { - "x": "data/codrnanorm_x_train.csv", - "y": "data/codrnanorm_y_train.csv" + "x": "data/codrnanorm_x_train.npy", + "y": "data/codrnanorm_y_train.npy" }, "testing": { - "x": "data/codrnanorm_x_test.csv", - "y": "data/codrnanorm_y_test.csv" + "x": 
"data/codrnanorm_x_test.npy", + "y": "data/codrnanorm_y_test.npy" } } ], diff --git a/configs/xgboost/xgb_cpu_main_config.json b/configs/xgboost/xgb_cpu_main_config.json index a155fcbb1..f5a2c4b67 100644 --- a/configs/xgboost/xgb_cpu_main_config.json +++ b/configs/xgboost/xgb_cpu_main_config.json @@ -116,12 +116,35 @@ { "dataset": [ { - "source": "csv", + "source": "npy", + "name": "mlsr", + "training": + { + "x": "data/mlsr_x_train.npy", + "y": "data/mlsr_y_train.npy" + } + } + ], + "max-bin": 256, + "learning-rate": 0.3, + "subsample": 1, + "reg-lambda": 2, + "min-child-weight": 1, + "min-split-loss": 0.1, + "max-depth": 8, + "n-estimators": 200, + "objective": "multi:softprob", + "single-precision-histogram": "" + }, + { + "dataset": [ + { + "source": "npy", "name": "mortgage1Q", "training": { - "x": "data/mortgage_x.csv", - "y": "data/mortgage_y.csv" + "x": "data/mortgage1Q_x_train.npy", + "y": "data/mortgage1Q_y_train.npy" } } ], @@ -140,39 +163,16 @@ "dataset": [ { "source": "npy", - "name": "msrank", + "name": "plasticc", "training": { - "x": "data/msrank_x_train.npy", - "y": "data/msrank_y_train.npy" + "x": "data/plasticc_x_train.npy", + "y": "data/plasticc_y_train.npy" }, "testing": { - "x": "data/msrank_x_test.npy", - "y": "data/msrank_y_test.npy" - } - } - ], - "max-bin": 256, - "learning-rate": 0.3, - "subsample": 1, - "reg-lambda": 2, - "min-child-weight": 1, - "min-split-loss": 0.1, - "max-depth": 8, - "n-estimators": 200, - "objective": "multi:softprob", - "single-precision-histogram": "" - }, - { - "dataset": [ - { - "source": "csv", - "name": "plasticc", - "training": - { - "x": "data/plasticc_x_train.csv", - "y": "data/plasticc_y_train.csv" + "x": "data/plasticc_x_test.npy", + "y": "data/plasticc_y_test.npy" } } ], @@ -185,12 +185,17 @@ { "dataset": [ { - "source": "csv", + "source": "npy", "name": "santander", "training": { - "x": "data/santander_x_train.csv", - "y": "data/santander_y_train.csv" + "x": "data/santander_x_train.npy", + "y": "data/santander_y_train.npy" + }, + "testing": + { + "x": "data/santander_x_test.npy", + "y": "data/santander_y_test.npy" } } ], diff --git a/configs/xgboost/xgb_gpu_config.json b/configs/xgboost/xgb_gpu_config.json index 30cf73bc8..506ac0cfd 100644 --- a/configs/xgboost/xgb_gpu_config.json +++ b/configs/xgboost/xgb_gpu_config.json @@ -115,12 +115,34 @@ { "dataset": [ { - "source": "csv", + "source": "npy", + "name": "mlsr", + "training": + { + "x": "data/mlsr_x_train.npy", + "y": "data/mlsr_y_train.npy" + } + } + ], + "max-bin": 256, + "learning-rate": 0.3, + "subsample": 1, + "reg-lambda": 2, + "min-child-weight": 1, + "min-split-loss": 0.1, + "max-depth": 8, + "n-estimators": 200, + "objective": "multi:softprob" + }, + { + "dataset": [ + { + "source": "npy", "name": "mortgage1Q", "training": { - "x": "data/mortgage_x.csv", - "y": "data/mortgage_y.csv" + "x": "data/mortgage1Q_x_train.npy", + "y": "data/mortgage1Q_y_train.npy" } } ], @@ -139,38 +161,16 @@ "dataset": [ { "source": "npy", - "name": "msrank", + "name": "plasticc", "training": { - "x": "data/msrank_x_train.npy", - "y": "data/msrank_y_train.npy" + "x": "data/plasticc_x_train.npy", + "y": "data/plasticc_y_train.npy" }, "testing": { - "x": "data/msrank_x_test.npy", - "y": "data/msrank_y_test.npy" - } - } - ], - "max-bin": 256, - "learning-rate": 0.3, - "subsample": 1, - "reg-lambda": 2, - "min-child-weight": 1, - "min-split-loss": 0.1, - "max-depth": 8, - "n-estimators": 200, - "objective": "multi:softprob" - }, - { - "dataset": [ - { - "source": "csv", - "name": "plasticc", 
- "training": - { - "x": "data/plasticc_x_train.csv", - "y": "data/plasticc_y_train.csv" + "x": "data/plasticc_x_test.npy", + "y": "data/plasticc_y_test.npy" } } ], @@ -183,12 +183,17 @@ { "dataset": [ { - "source": "csv", + "source": "npy", "name": "santander", "training": { - "x": "data/santander_x_train.csv", - "y": "data/santander_y_train.csv" + "x": "data/santander_x_train.npy", + "y": "data/santander_y_train.npy" + }, + "testing": + { + "x": "data/santander_x_test.npy", + "y": "data/santander_y_test.npy" } } ], diff --git a/datasets/load_datasets.py b/datasets/load_datasets.py index 29fbc3533..5fad3ac4b 100755 --- a/datasets/load_datasets.py +++ b/datasets/load_datasets.py @@ -21,11 +21,12 @@ from pathlib import Path from typing import Callable, Dict -from .loader_classification import ( - a_nine_a, airline, airline_ohe, bosch, codrnanorm, epsilon, fraud, gisette, - higgs, higgs_one_m, ijcnn, klaverjas, santander, skin_segmentation) -from .loader_multiclass import (connect, covertype, covtype, letters, mnist, - msrank, plasticc, sensit) +from .loader_classification import (a_nine_a, airline, airline_ohe, bosch, + census, codrnanorm, epsilon, fraud, + gisette, higgs, higgs_one_m, ijcnn, + klaverjas, santander, skin_segmentation) +from .loader_multiclass import (connect, covertype, covtype, letters, mlsr, + mnist, msrank, plasticc, sensit) from .loader_regression import abalone, mortgage_first_q, year_prediction_msd dataset_loaders: Dict[str, Callable[[Path], bool]] = { @@ -34,6 +35,7 @@ "airline": airline, "airline-ohe": airline_ohe, "bosch": bosch, + "census": census, "codrnanorm": codrnanorm, "connect": connect, "covertype": covertype, @@ -46,6 +48,7 @@ "ijcnn": ijcnn, "klaverjas": klaverjas, "letters": letters, + "mlsr": mlsr, "mnist": mnist, "mortgage1Q": mortgage_first_q, "msrank": msrank, diff --git a/datasets/loader_classification.py b/datasets/loader_classification.py index 5f804e888..1a945eb22 100644 --- a/datasets/loader_classification.py +++ b/datasets/loader_classification.py @@ -16,6 +16,7 @@ import logging import os +import subprocess from pathlib import Path from typing import Any @@ -50,17 +51,15 @@ def a_nine_a(dataset_dir: Path) -> bool: y[y == -1] = 0 - logging.info('a9a dataset is downloaded') - logging.info('reading CSV file...') + logging.info(f'{dataset_name} is loaded, started parsing...') x_train, x_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=11) for data, name in zip((x_train, x_test, y_train, y_test), ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.csv' - data.to_csv(os.path.join(dataset_dir, filename), - header=False, index=False) - logging.info(f'dataset {dataset_name} ready.') + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset {dataset_name} is ready.') return True @@ -152,7 +151,7 @@ def airline_ohe(dataset_dir: Path) -> bool: categorical_names = ["Month", "DayofMonth", "DayOfWeek", "UniqueCarrier", "Origin", "Dest"] - for local_url in [local_url_train, local_url_train]: + for local_url in [local_url_train, local_url_test]: df = pd.read_csv(local_url, nrows=1000000 if local_url.endswith('train-10m.csv') else None) X = df.drop('dep_delayed_15min', 1) @@ -197,9 +196,9 @@ def bosch(dataset_dir: Path) -> bool: if not os.path.isfile(local_url): logging.info(f'Started loading {dataset_name}') - os.system( - "kaggle competitions download -c bosch-production-line-performance -f " + - filename + " -p " + str(dataset_dir)) + args = 
["kaggle", "competitions", "download", "-c", + "bosch-production-line-performance", "-f", filename, "-p", str(dataset_dir)] + _ = subprocess.check_output(args) logging.info(f'{dataset_name} is loaded, started parsing...') X = pd.read_csv(local_url, index_col=0, compression='zip', dtype=np.float32) y = X.iloc[:, -1].to_numpy(dtype=np.float32) @@ -216,6 +215,13 @@ def bosch(dataset_dir: Path) -> bool: return True +def census(dataset_dir: Path) -> bool: + """ + # TODO: add an loading instruction + """ + return False + + def codrnanorm(dataset_dir: Path) -> bool: """ Abstract: Detection of non-coding RNAs on the basis of predicted secondary @@ -237,17 +243,15 @@ def codrnanorm(dataset_dir: Path) -> bool: X = pd.DataFrame(X.todense()) y = pd.DataFrame(y) - logging.info(f'{dataset_name} dataset is downloaded') - logging.info('reading CSV file...') + logging.info(f'{dataset_name} is loaded, started parsing...') x_train, x_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42) for data, name in zip((x_train, x_test, y_train, y_test), ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.csv' - data.to_csv(os.path.join(dataset_dir, filename), - header=False, index=False) - logging.info(f'dataset {dataset_name} ready.') + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset {dataset_name} is ready.') return True @@ -303,7 +307,7 @@ def fraud(dataset_dir: Path) -> bool: Contains missing values as NaN. TaskType:binclass - NumberOfFeatures:30 + NumberOfFeatures:28 NumberOfInstances:285K """ dataset_name = 'fraud' @@ -314,8 +318,9 @@ def fraud(dataset_dir: Path) -> bool: if not os.path.isfile(local_url): logging.info(f'Started loading {dataset_name}') - os.system("kaggle datasets download mlg-ulb/creditcardfraud -f" + - filename + " -p " + str(dataset_dir)) + args = ["kaggle", "datasets", "download", "mlg-ulb/creditcardfraud", "-f", + filename, "-p", str(dataset_dir)] + _ = subprocess.check_output(args) logging.info(f'{dataset_name} is loaded, started parsing...') df = pd.read_csv(local_url + ".zip", dtype=np.float32) @@ -372,8 +377,7 @@ def gisette(dataset_dir: Path) -> bool: if not os.path.exists(filename_test_labels): retrieve(gisette_test_labels_url, filename_test_labels) - logging.info('gisette dataset is downloaded') - logging.info('reading CSV file...') + logging.info(f'{dataset_name} is loaded, started parsing...') num_cols = 5000 @@ -400,11 +404,9 @@ def gisette(dataset_dir: Path) -> bool: for data, name in zip((x_train, x_test, y_train, y_test), ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.csv' - data.to_csv(os.path.join(dataset_dir, filename), - header=False, index=False) - - logging.info('dataset gisette ready.') + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info('dataset gisette is ready.') return True @@ -508,17 +510,15 @@ def ijcnn(dataset_dir: Path) -> bool: y[y == -1] = 0 - logging.info(f'{dataset_name} dataset is downloaded') - logging.info('reading CSV file...') + logging.info(f'{dataset_name} is loaded, started parsing...') x_train, x_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42) for data, name in zip((x_train, x_test, y_train, y_test), ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.csv' - data.to_csv(os.path.join(dataset_dir, filename), - header=False, index=False) - logging.info(f'dataset {dataset_name} 
ready.') + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset {dataset_name} is ready.') return True @@ -533,10 +533,10 @@ def klaverjas(dataset_dir: Path) -> bool: Task Information: Classification task. n_classes = 2. - klaverjas X train dataset (196045, 3) - klaverjas y train dataset (196045, 1) - klaverjas X test dataset (49012, 3) - klaverjas y test dataset (49012, 1) + klaverjas X train dataset (196308, 32) + klaverjas y train dataset (196308, 1) + klaverjas X test dataset (785233, 32) + klaverjas y test dataset (785233, 1) """ dataset_name = 'klaverjas' os.makedirs(dataset_dir, exist_ok=True) @@ -545,17 +545,15 @@ def klaverjas(dataset_dir: Path) -> bool: as_frame=True, data_home=dataset_dir) y = y.cat.codes - logging.info(f'{dataset_name} dataset is downloaded') - logging.info('reading CSV file...') + logging.info(f'{dataset_name} is loaded, started parsing...') x_train, x_test, y_train, y_test = train_test_split( X, y, train_size=0.2, random_state=42) for data, name in zip((x_train, x_test, y_train, y_test), ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.csv' - data.to_csv(os.path.join(dataset_dir, filename), - header=False, index=False) - logging.info(f'dataset {dataset_name} ready.') + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset {dataset_name} is ready.') return True @@ -588,15 +586,13 @@ def skin_segmentation(dataset_dir: Path) -> bool: y = y.astype(int) y[y == 2] = 0 - logging.info(f'{dataset_name} dataset is downloaded') - logging.info('reading CSV file...') + logging.info(f'{dataset_name} is loaded, started parsing...') x_train, x_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42) for data, name in zip((x_train, x_test, y_train, y_test), ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.csv' - data.to_csv(os.path.join(dataset_dir, filename), - header=False, index=False) - logging.info(f'dataset {dataset_name} ready.') + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset {dataset_name} is ready.') return True diff --git a/datasets/loader_multiclass.py b/datasets/loader_multiclass.py index c6a0bc0e3..69b1da1e6 100644 --- a/datasets/loader_multiclass.py +++ b/datasets/loader_multiclass.py @@ -35,10 +35,10 @@ def connect(dataset_dir: Path) -> bool: http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass.htm Classification task. n_classes = 3. 
- connect X train dataset (196045, 127) - connect y train dataset (196045, 1) - connect X test dataset (49012, 127) - connect y test dataset (49012, 1) + connect X train dataset (60801, 126) + connect y train dataset (60801, 1) + connect X test dataset (6756, 126) + connect y test dataset (6756, 1) """ dataset_name = 'connect' os.makedirs(dataset_dir, exist_ok=True) @@ -49,17 +49,15 @@ def connect(dataset_dir: Path) -> bool: y = pd.DataFrame(y) y = y.astype(int) - logging.info(f'{dataset_name} dataset is downloaded') - logging.info('reading CSV file...') + logging.info(f'{dataset_name} is loaded, started parsing...') x_train, x_test, y_train, y_test = train_test_split( X, y, test_size=0.1, random_state=42) for data, name in zip((x_train, x_test, y_train, y_test), ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.csv' - data.to_csv(os.path.join(dataset_dir, filename), - header=False, index=False) - logging.info(f'dataset {dataset_name} ready.') + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset {dataset_name} is ready.') return True @@ -83,17 +81,15 @@ def covertype(dataset_dir: Path) -> bool: as_frame=True, data_home=dataset_dir) y = y.astype(int) - logging.info(f'{dataset_name} dataset is downloaded') - logging.info('reading CSV file...') + logging.info(f'{dataset_name} is loaded, started parsing...') x_train, x_test, y_train, y_test = train_test_split( - X, y, test_size=0.2, random_state=42) + X, y, test_size=0.4, random_state=42) for data, name in zip((x_train, x_test, y_train, y_test), ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.csv' - data.to_csv(os.path.join(dataset_dir, filename), - header=False, index=False) - logging.info(f'dataset {dataset_name} ready.') + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset {dataset_name} is ready.') return True @@ -159,6 +155,13 @@ def letters(dataset_dir: Path) -> bool: return True +def mlsr(dataset: Path) -> bool: + """ + # TODO: add an loading instruction + """ + return False + + def mnist(dataset_dir: Path) -> bool: """ Abstract: @@ -184,17 +187,15 @@ def mnist(dataset_dir: Path) -> bool: y = y.astype(int) X = X / 255 - logging.info(f'{dataset_name} dataset is downloaded') - logging.info('reading CSV file...') + logging.info(f'{dataset_name} is loaded, started parsing...') x_train, x_test, y_train, y_test = train_test_split( X, y, test_size=10000, shuffle=False) for data, name in zip((x_train, x_test, y_train, y_test), ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.csv' - data.to_csv(os.path.join(dataset_dir, filename), - header=False, index=False) - logging.info(f'dataset {dataset_name} ready.') + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset {dataset_name} is ready.') return True @@ -202,9 +203,9 @@ def msrank(dataset_dir: Path) -> bool: """ Dataset from szilard benchmarks: https://github.com/szilard/GBM-perf - TaskType:binclass - NumberOfFeatures:700 - NumberOfInstances:10100000 + TaskType:multiclass + NumberOfFeatures:137 + NumberOfInstances:1.2M """ dataset_name = 'msrank' os.makedirs(dataset_dir, exist_ok=True) @@ -262,11 +263,11 @@ def sensit(dataset_dir: Path) -> bool: Author: M. Duarte, Y. H. Hu Source: [original](http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets) - Classification task. n_classes = 2. 
- sensit X train dataset (196045, 3) - sensit y train dataset (196045, 1) - sensit X test dataset (49012, 3) - sensit y test dataset (49012, 1) + Multiclass classification task + sensit X train dataset (78822, 100) + sensit y train dataset (78822, 1) + sensit X test dataset (19706, 100) + sensit y test dataset (19706, 1) """ dataset_name = 'sensit' os.makedirs(dataset_dir, exist_ok=True) @@ -277,15 +278,13 @@ def sensit(dataset_dir: Path) -> bool: y = pd.DataFrame(y) y = y.astype(int) - logging.info(f'{dataset_name} dataset is downloaded') - logging.info('reading CSV file...') + logging.info(f'{dataset_name} is loaded, started parsing...') x_train, x_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42) for data, name in zip((x_train, x_test, y_train, y_test), ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.csv' - data.to_csv(os.path.join(dataset_dir, filename), - header=False, index=False) - logging.info(f'dataset {dataset_name} ready.') + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset {dataset_name} is ready.') return True diff --git a/runner.py b/runner.py index 1763ff5ed..c4cba2449 100755 --- a/runner.py +++ b/runner.py @@ -20,6 +20,7 @@ import os import socket import sys +from typing import Any, Dict, List, Union import datasets.make_datasets as make_datasets import utils @@ -53,7 +54,7 @@ # make directory for data if it doesn't exist os.makedirs('data', exist_ok=True) - json_result = { + json_result: Dict[str, Union[Dict[str, Any], List[Any]]] = { 'hardware': utils.get_hw_parameters(), 'software': utils.get_sw_parameters(), 'results': [] @@ -190,7 +191,8 @@ class GenerationArgs: stderr += f'CASE {case} EXTRA OUTPUT:\n' \ + f'{extra_stdout}\n' try: - json_result['results'] = json.loads(stdout) + if isinstance(json_result['results'], list): + json_result['results'].extend(json.loads(stdout)) except json.JSONDecodeError as decoding_exception: stderr += f'CASE {case} JSON DECODING ERROR:\n' \ + f'{decoding_exception}\n{stdout}\n' From 523df303296535187401ed00ecdba2498718325d Mon Sep 17 00:00:00 2001 From: Igor Rukhovich Date: Fri, 23 Apr 2021 12:52:39 +0300 Subject: [PATCH 26/31] Cleanup after someone's commit --- utils.py | 52 +++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 37 insertions(+), 15 deletions(-) diff --git a/utils.py b/utils.py index f2cd5b84e..bd1ebeabf 100755 --- a/utils.py +++ b/utils.py @@ -15,7 +15,6 @@ # =============================================================================== import json -import logging import os import pathlib import platform @@ -68,6 +67,14 @@ def read_output_from_command(command: str, return res.stdout[:-1], res.stderr[:-1] +def parse_lscpu_lscl_info(command_output: str) -> Dict[str, str]: + res: Dict[str, str] = {} + for elem in command_output.strip().split('\n'): + splt = elem.split(':') + res[splt[0]] = splt[1] + return res + + def get_hw_parameters() -> Union[bool, Dict[Any, Any]]: if 'Linux' not in platform.platform(): return {} @@ -88,21 +95,36 @@ def get_hw_parameters() -> Union[bool, Dict[Any, Any]]: mem_info = ' '.join(mem_info.split()) hw_params['RAM size[GB]'] = int(mem_info.split(' ')[1]) / 2 ** 30 - # get GPU information + # get Intel GPU information + try: + lsgpu_info, _ = read_output_from_command( + 'lscl --device-type=gpu --platform-vendor=Intel') + device_num = 0 + start_idx = lsgpu_info.find('Device ') + while start_idx >= 0: + start_idx = lsgpu_info.find(':', start_idx) + 1 
+ end_idx = lsgpu_info.find('Device ', start_idx) + hw_params[f'GPU Intel #{device_num + 1}'] = parse_lscpu_lscl_info( + lsgpu_info[start_idx: end_idx]) + device_num += 1 + start_idx = end_idx + except (FileNotFoundError, json.JSONDecodeError): + pass + + # get Nvidia GPU information try: - nfo, _ = read_output_from_command('lscpu') - cpu_info = nfo.split('\n') - for el in cpu_info: - if 'Thread(s) per core' in el: - threads_per_core = int(el[-1]) - if threads_per_core > 1: - return True - else: - return False - return False - except FileNotFoundError: - logging.info('Impossible to check hyperthreading via lscpu') - return False + gpu_info, _ = read_output_from_command( + 'nvidia-smi --query-gpu=name,memory.total,driver_version,pstate ' + '--format=csv,noheader') + gpu_info_arr = gpu_info.split(', ') + hw_params['GPU Nvidia'] = { + 'Name': gpu_info_arr[0], + 'Memory size': gpu_info_arr[1], + 'Performance mode': gpu_info_arr[3] + } + except (FileNotFoundError, json.JSONDecodeError): + pass + return hw_params def get_sw_parameters() -> Dict[str, Dict[str, Any]]: From 59303fa124b8c9fb7f100f6fa9ec1d62c91e0457 Mon Sep 17 00:00:00 2001 From: Igor Rukhovich Date: Fri, 23 Apr 2021 13:25:07 +0300 Subject: [PATCH 27/31] Applying mypy --- datasets/loader_classification.py | 2 +- utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/datasets/loader_classification.py b/datasets/loader_classification.py index 1a945eb22..be981952e 100644 --- a/datasets/loader_classification.py +++ b/datasets/loader_classification.py @@ -405,7 +405,7 @@ def gisette(dataset_dir: Path) -> bool: for data, name in zip((x_train, x_test, y_train, y_test), ('x_train', 'x_test', 'y_train', 'y_test')): filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) + np.save(os.path.join(dataset_dir, filename), data.to_numpy()) logging.info('dataset gisette is ready.') return True diff --git a/utils.py b/utils.py index bd1ebeabf..5593ef443 100755 --- a/utils.py +++ b/utils.py @@ -75,7 +75,7 @@ def parse_lscpu_lscl_info(command_output: str) -> Dict[str, str]: return res -def get_hw_parameters() -> Union[bool, Dict[Any, Any]]: +def get_hw_parameters() -> Dict[str, Union[Dict[str, Any], float]]: if 'Linux' not in platform.platform(): return {} From b56e42c8434ba534da95ec07b4eb847cea92c09d Mon Sep 17 00:00:00 2001 From: Igor Rukhovich Date: Fri, 23 Apr 2021 20:08:25 +0300 Subject: [PATCH 28/31] Applied Ekaterina's suggestions Co-authored-by: Ekaterina Mekhnetsova --- README.md | 4 ++-- configs/README.md | 30 +++++++++++++++--------------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 657c01a1e..97bee26e1 100755 --- a/README.md +++ b/README.md @@ -73,10 +73,10 @@ Options: - ``--configs``: specify the path to a configuration file. - ``--no-intel-optimized``: use Scikit-learn without [Intel(R) Extension for Scikit-learn*](#intelr-extension-for-scikit-learn-support). Now available for [scikit-learn benchmarks](https://github.com/IntelPython/scikit-learn_bench/tree/master/sklearn_bench). By default, the runner uses Intel(R) Extension for Scikit-learn. -- ``--output-file``: output file name for the benchmark result. The default name is `result.json` +- ``--output-file``: specify the name of the output file for the benchmark result. The default name is `result.json` - ``--report``: create an Excel report based on benchmark results. The `openpyxl` library is required. 
- ``--dummy-run``: run configuration parser and dataset generation without benchmarks running. -- ``--verbose``: *WARNING*, *INFO*, *DEBUG*. print additional information during benchmarks running. Default is *INFO*. +- ``--verbose``: *WARNING*, *INFO*, *DEBUG*. Print out additional information when the benchmarks are running. The default is *INFO*. | Level | Description | |-----------|---------------| diff --git a/configs/README.md b/configs/README.md index b1c8221a6..33fc77d33 100644 --- a/configs/README.md +++ b/configs/README.md @@ -22,20 +22,20 @@ Refer to the tables below for descriptions of all fields in the configuration fi | Field Name | Type | Description | | ----- | ---- |------------ | -|data-format| Union[str, List[str]] | **REQUIRED** input data format. Data formats: *numpy*, *pandas* or *cudf* | -|data-order| Union[str, List[str]] | **REQUIRED** input data order. Data order: *C* (row-major, default) or *F* (column-major) | -|dtype| Union[str, List[str]] | **REQUIRED** input data type. Data type: *float64* (default) or *float32* | -|check-finitness| List[] | Check finiteness in sklearn input check(disabled by default) | -|device| array[string] | For scikit-learn only. The list of devices to run the benchmarks on.
It can be *None* (default, run on CPU without sycl context) or one of the types of sycl devices: *cpu*, *gpu*, *host*.
Refer to [SYCL specification](https://www.khronos.org/files/sycl/sycl-2020-reference-guide.pdf) for details| +|data-format| Union[str, List[str]] | **REQUIRED** Input data format: *numpy*, *pandas*, or *cudf*. | +|data-order| Union[str, List[str]] | **REQUIRED** Input data order: *C* (row-major, default) or *F* (column-major). | +|dtype| Union[str, List[str]] | **REQUIRED** Input data type: *float64* (default) or *float32*. | +|check-finitness| List[] | Check finiteness during scikit-learn input check (disabled by default). | +|device| array[string] | For scikit-learn only. The list of devices to run the benchmarks on.
It can be *None* (default, run on CPU without a SYCL context) or one of the SYCL device types: *cpu*, *gpu*, *host*.
Refer to [SYCL specification](https://www.khronos.org/files/sycl/sycl-2020-reference-guide.pdf) for details.| ## Case Object | Field Name | Type | Description | | ----- | ---- |------------ | -|lib| Union[str, List[str]] | **REQUIRED** Test framework or list of frameworks. Must be from [*sklearn*, *daal4py*, *cuml* or *xgboost*] | -|algorithm| string | **REQUIRED** benchmark file name. | -|dataset| List[[Dataset Object](#dataset-object)] | **REQUIRED** input data specifications. | -|**specific algorithm parameters**| Union[int, float, str, List[int], List[float], List[str]] | other specific algorithm parameters. The list of supported parameters can be found here | +|lib| Union[str, List[str]] | **REQUIRED** A test framework or a list of frameworks. Must be from [*sklearn*, *daal4py*, *cuml*, *xgboost*]. | +|algorithm| string | **REQUIRED** Benchmark file name. | +|dataset| List[[Dataset Object](#dataset-object)] | **REQUIRED** Input data specifications. | +|**specific algorithm parameters**| Union[int, float, str, List[int], List[float], List[str]] | Other algorithm-specific parameters. The list of supported parameters can be found here. | ### **Important:** feel free to move any parameter from **cases** to **common** section since this parameter is common for all cases @@ -43,14 +43,14 @@ Refer to the tables below for descriptions of all fields in the configuration fi | Field Name | Type | Description | | ----- | ---- |------------ | -|source| string | **REQUIRED** data source. It can be *synthetic*, *csv* or *npy* | -|type| string | **REQUIRED for synthetic data**. The type of task for which the dataset is generated. It can be *classification*, *blobs* or *regression* | +|source| string | **REQUIRED** Data source: *synthetic*, *csv*, or *npy*. | +|type| string | **REQUIRED for synthetic data**. The type of task for which the dataset is generated: *classification*, *blobs*, or *regression*. | |n_classes| int | For *synthetic* data and for *classification* type only. The number of classes (or labels) of the classification problem | |n_clusters| int | For *synthetic* data and for *blobs* type only. The number of centers to generate | -|n_features| int | **REQUIRED for *synthetic* data**. The number of features to generate | -|name| string | Name of the dataset | -|training| [Training Object](#training-object) | **REQUIRED** An object with training dataset paths | -|testing| [Testing Object](#testing-object) | An object with testing dataset paths. If not provided, training datasets are used | +|n_features| int | **REQUIRED for *synthetic* data**. The number of features to generate. | +|name| string | Name of the dataset. | +|training| [Training Object](#training-object) | **REQUIRED** An object with the paths to the training datasets. | +|testing| [Testing Object](#testing-object) | An object with the paths to the testing datasets. If not provided, the training datasets are used. 
| ## Training Object From ad176e5409c19fa88b7d3bde7db90302db589cfb Mon Sep 17 00:00:00 2001 From: Igor Rukhovich Date: Fri, 23 Apr 2021 20:23:33 +0300 Subject: [PATCH 29/31] Applied other Ekaterina's comments --- configs/README.md | 4 ++-- xgboost_bench/gbt.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/configs/README.md b/configs/README.md index b1c8221a6..c4658dc97 100644 --- a/configs/README.md +++ b/configs/README.md @@ -35,9 +35,9 @@ Refer to the tables below for descriptions of all fields in the configuration fi |lib| Union[str, List[str]] | **REQUIRED** Test framework or list of frameworks. Must be from [*sklearn*, *daal4py*, *cuml* or *xgboost*] | |algorithm| string | **REQUIRED** benchmark file name. | |dataset| List[[Dataset Object](#dataset-object)] | **REQUIRED** input data specifications. | -|**specific algorithm parameters**| Union[int, float, str, List[int], List[float], List[str]] | other specific algorithm parameters. The list of supported parameters can be found here | +|**specific algorithm parameters**| Union[int, float, str, List[int], List[float], List[str]] | Other specific algorithm parameters | -### **Important:** feel free to move any parameter from **cases** to **common** section since this parameter is common for all cases +**Important:** You can move any parameter from **"cases"** to **"common"** if this parameter is common to all cases ## Dataset Object diff --git a/xgboost_bench/gbt.py b/xgboost_bench/gbt.py index b7a8b54db..8b991b389 100644 --- a/xgboost_bench/gbt.py +++ b/xgboost_bench/gbt.py @@ -65,7 +65,7 @@ def convert_xgb_predictions(y_pred, objective): help='Minimum loss reduction required to make' ' partition on a leaf node') parser.add_argument('--n-estimators', type=int, default=100, - help='Number of gradient boosted trees') + help='The number of gradient boosted trees') parser.add_argument('--objective', type=str, required=True, choices=('reg:squarederror', 'binary:logistic', 'multi:softmax', 'multi:softprob'), From 11a8ffc9658c5df05a3b05547a1ceb0f58f5fa3a Mon Sep 17 00:00:00 2001 From: Igor Rukhovich Date: Mon, 26 Apr 2021 12:19:03 +0300 Subject: [PATCH 30/31] Final commits applying --- ..._cpu_nvidia_config.json => xgb_cpu_additional_config.json} | 0 xgboost_bench/gbt.py | 4 ++-- 2 files changed, 2 insertions(+), 2 deletions(-) rename configs/xgboost/{xgb_cpu_nvidia_config.json => xgb_cpu_additional_config.json} (100%) diff --git a/configs/xgboost/xgb_cpu_nvidia_config.json b/configs/xgboost/xgb_cpu_additional_config.json similarity index 100% rename from configs/xgboost/xgb_cpu_nvidia_config.json rename to configs/xgboost/xgb_cpu_additional_config.json diff --git a/xgboost_bench/gbt.py b/xgboost_bench/gbt.py index 8b991b389..0c44acfaa 100644 --- a/xgboost_bench/gbt.py +++ b/xgboost_bench/gbt.py @@ -133,8 +133,8 @@ def convert_xgb_predictions(y_pred, objective): else: params.n_classes = len(np.unique(y_train)) - # BE VERY CAREFUL ON IT!! 
It should only work for COVTYPE DATASET - if params.objective.startswith('multi:softmax'): + # Covtype has one class more than there is in train + if params.dataset_name == 'covtype': params.n_classes += 1 if params.n_classes > 2: From 37d54610c68e70dac0e33a011cc1578c52cf818c Mon Sep 17 00:00:00 2001 From: Igor Rukhovich Date: Mon, 26 Apr 2021 22:18:34 +0300 Subject: [PATCH 31/31] Alexander's final comments --- bench.py | 2 +- cuml_bench/README.md | 2 +- daal4py_bench/README.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bench.py b/bench.py index 05aaaa3b1..cd26c166e 100644 --- a/bench.py +++ b/bench.py @@ -494,7 +494,7 @@ def print_output(library, algorithm, stages, params, functions, output = [] for i in range(len(stages)): result = gen_basic_dict(library, algorithm, stages[i], params, - data[i], alg_instance, alg_params if i == 0 else None) + data[i], alg_instance, alg_params) result.update({'time[s]': times[i]}) if accuracy_type is not None: result.update({f'{accuracy_type}': accuracies[i]}) diff --git a/cuml_bench/README.md b/cuml_bench/README.md index e65f11432..e36e77f3b 100644 --- a/cuml_bench/README.md +++ b/cuml_bench/README.md @@ -1,6 +1,6 @@ ## How to create conda environment for benchmarking -`conda create -n bench -c rapidsai -c conda-forge python=3.7 cuml pandas cudf` +`conda create -n bench -c rapidsai -c conda-forge python=3.7 scikit-learn cuml pandas cudf tqdm` ## Algorithms parameters diff --git a/daal4py_bench/README.md b/daal4py_bench/README.md index 85c7831df..c1c940ef0 100644 --- a/daal4py_bench/README.md +++ b/daal4py_bench/README.md @@ -1,7 +1,7 @@ ## How to create conda environment for benchmarking -`conda create -n bench -c intel python=3.7 daal4py pandas scikit-learn` +`conda create -n bench -c intel python=3.7 daal4py pandas scikit-learn tqdm` ## Algorithms parameters
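As a quick illustration of the `parse_lscpu_lscl_info` helper introduced in the `utils.py` hunk above, the sketch below runs it on a made-up two-line excerpt of `lscpu`-style output. Only the helper itself comes from the patch; the sample string and the surrounding script are hypothetical and exist purely to show the shape of the parsed result.

```python
from typing import Dict


def parse_lscpu_lscl_info(command_output: str) -> Dict[str, str]:
    # Helper from the utils.py hunk above: expects one "Key: Value" pair per line.
    res: Dict[str, str] = {}
    for elem in command_output.strip().split('\n'):
        splt = elem.split(':')
        res[splt[0]] = splt[1]
    return res


# Hypothetical two-line excerpt of lscpu/lscl-style output; real output has
# many more fields, but the parsing works the same way.
sample = "Architecture: x86_64\nCPU(s): 8"
print(parse_lscpu_lscl_info(sample))
# {'Architecture': ' x86_64', 'CPU(s)': ' 8'}
```

Note that the parsed values keep the whitespace that follows the first colon, and any text after a second colon on the same line is dropped.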