From 88565f8a8c646fc8f83a242e52adc43d3c26979f Mon Sep 17 00:00:00 2001
From: dmitrii-kriukov
Date: Wed, 29 Dec 2021 14:09:23 +0300
Subject: [PATCH 1/7] sizes in configs

---
 runner.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/runner.py b/runner.py
index 980e40b87..5e57a2794 100755
--- a/runner.py
+++ b/runner.py
@@ -58,6 +58,9 @@ def get_configs(path: Path) -> List[str]:
                         help='Available floating point data types'
                              'This parameter only marks dtype as available, '
                              'make sure to add the dtype parameter to the config file ')
+    parser.add_argument('--size', type=str, default="small medium large", nargs='+',
+                        choices=("small", "medium", "large"),
+                        help='Available dataset sizes')
     parser.add_argument('--no-intel-optimized', default=False, action='store_true',
                         help='Use Scikit-learn without Intel optimizations')
     parser.add_argument('--output-file', default='results.json',
@@ -105,6 +108,11 @@ def get_configs(path: Path) -> List[str]:
             params = common_params.copy()
             params.update(params_set.copy())
 
+            if 'size' in params:
+                if params['size'] not in args.size:
+                    continue
+                del params['size']
+
             device = []
             if 'device' not in params:
                 if 'sklearn' in params['lib']:

From e668d94480be5fc149e38ddb28df9a0649d00b46 Mon Sep 17 00:00:00 2001
From: dmitrii-kriukov
Date: Wed, 12 Jan 2022 16:45:30 +0300
Subject: [PATCH 2/7] test config

---
 configs/skl_config.json | 71 +++++++++++++++++++++++++++++++++++++++--
 runner.py               |  8 ++---
 2 files changed, 72 insertions(+), 7 deletions(-)

diff --git a/configs/skl_config.json b/configs/skl_config.json
index f3f1fa93f..486177949 100644
--- a/configs/skl_config.json
+++ b/configs/skl_config.json
@@ -19,6 +19,7 @@
                     }
                 }
             ],
+            "workload-size": "small",
             "time-method": "box_filter",
             "time-limit": 50,
             "n-clusters": 1000,
@@ -38,6 +39,7 @@
                     }
                 }
             ],
+            "workload-size": "small",
             "time-method": "box_filter",
             "time-limit": 50,
             "n-clusters": 5,
@@ -47,6 +49,7 @@
         },
         {
             "algorithm": "kmeans",
+            "workload-size": "small",
             "dataset": [
                 {
                     "source": "synthetic",
@@ -66,6 +69,7 @@
         },
         {
             "algorithm": "pca",
+            "workload-size": "small",
             "dataset": [
                 {
                     "source": "synthetic",
@@ -113,6 +117,7 @@
         {
             "algorithm": "df_clsf",
             "dtype": "float32",
+            "workload-size": "small",
             "dataset": [
                 {
                     "source": "npy",
@@ -150,6 +155,7 @@
         },
         {
             "algorithm": "df_regr",
+            "workload-size": "small",
             "dtype": "float32",
             "dataset": [
                 {
@@ -184,6 +190,7 @@
         },
         {
             "algorithm": "ridge",
+            "workload-size": "small",
             "dataset": [
                 {
                     "source": "synthetic",
@@ -206,6 +213,7 @@
         },
         {
             "algorithm": "linear",
+            "workload-size": "small",
             "dataset": [
                 {
                     "source": "synthetic",
@@ -227,6 +235,7 @@
         },
         {
             "algorithm": "log_reg",
+            "workload-size": "small",
             "dataset": [
                 {
                     "source": "synthetic",
@@ -270,6 +279,7 @@
         },
         {
             "algorithm": "svm",
+            "workload-size": "medium",
             "dataset": [
                 {
                     "source": "npy",
@@ -291,6 +301,7 @@
         },
         {
             "algorithm": "svm",
+            "workload-size": "small",
             "dataset": [
                 {
                     "source": "npy",
@@ -312,6 +323,7 @@
         },
         {
             "algorithm": "svm",
+            "workload-size": "medium",
             "dataset": [
                 {
                     "source": "npy",
@@ -333,6 +345,7 @@
         },
         {
             "algorithm": "svm",
+            "workload-size": "medium",
             "dataset": [
                 {
                     "source": "npy",
@@ -354,6 +367,7 @@
         },
         {
             "algorithm": "svm",
+            "workload-size": "medium",
             "dataset": [
                 {
                     "source": "npy",
@@ -375,6 +389,7 @@
         },
         {
             "algorithm": "svm",
+            "workload-size": "large",
             "dataset": [
                 {
                     "source": "npy",
@@ -396,6 +411,7 @@
         },
         {
             "algorithm": "nusvc",
+            "workload-size": "small",
             "dataset": [
                 {
                     "source": "npy",
@@ -417,6 +433,7 @@
         },
         {
             "algorithm": "nusvc",
+            "workload-size": "large",
             "dataset": [
                 {
                     "source": "npy",
@@ -438,6 +455,7 @@
         },
         {
             "algorithm": "nusvc",
+            "workload-size": "medium",
             "dataset": [
                 {
                     "source": "npy",
@@ -459,6 +477,7 @@
         },
         {
             "algorithm": "nusvc",
+            "workload-size": "medium",
             "dataset": [
                 {
                     "source": "npy",
@@ -480,6 +499,7 @@
         },
         {
             "algorithm": "svr",
+            "workload-size": "small",
             "dataset": [
                 {
                     "source": "npy",
@@ -501,6 +521,7 @@
         },
         {
             "algorithm": "svr",
+            "workload-size": "medium",
             "dataset": [
                 {
                     "source": "npy",
@@ -522,6 +543,7 @@
         },
         {
             "algorithm": "nusvr",
+            "workload-size": "small",
             "dataset": [
                 {
                     "source": "npy",
@@ -544,6 +566,7 @@
         },
         {
             "algorithm": "nusvr",
+            "workload-size": "medium",
             "dataset": [
                 {
                     "source": "npy",
@@ -567,6 +590,7 @@
         },
         {
             "algorithm": "nusvr",
+            "workload-size": "large",
             "dataset": [
                 {
                     "source": "npy",
@@ -589,6 +613,7 @@
         },
         {
             "algorithm": "dbscan",
+            "workload-size": "small",
             "dataset": [
                 {
                     "source": "synthetic",
@@ -607,7 +632,13 @@
                     "training": {
                         "n_samples": 500000
                     }
-                },
+                }
+            ]
+        },
+        {
+            "algorithm": "dbscan",
+            "workload-size": "medium",
+            "dataset": [
                 {
                     "source": "synthetic",
                     "type": "blobs",
@@ -621,6 +652,7 @@
         },
         {
             "algorithm": "knn_clsf",
+            "workload-size": "small",
             "dtype": "float32",
             "dataset": [
                 {
@@ -658,7 +690,15 @@
                     "testing": {
                         "n_samples": 20000
                     }
-                },
+                }
+            ],
+            "method": ["brute", "kd_tree"]
+        },
+        {
+            "algorithm": "knn_clsf",
+            "workload-size": "small",
+            "dtype": "float32",
+            "dataset": [
                 {
                     "source": "synthetic",
                     "type": "classification",
@@ -672,10 +712,31 @@
                     }
                 }
             ],
-            "method": ["brute", "kd_tree"]
+            "method": "kd_tree"
+        },
+        {
+            "algorithm": "knn_clsf",
+            "workload-size": "medium",
+            "dtype": "float32",
+            "dataset": [
+                {
+                    "source": "synthetic",
+                    "type": "classification",
+                    "n_classes": 10,
+                    "n_features": 16,
+                    "training": {
+                        "n_samples": 250000
+                    },
+                    "testing": {
+                        "n_samples": 250000
+                    }
+                }
+            ],
+            "method": "brute"
         },
         {
             "algorithm": "train_test_split",
+            "workload-size": "small",
             "dataset": [
                 {
                     "source": "synthetic",
@@ -711,6 +772,7 @@
         },
         {
             "algorithm": "train_test_split",
+            "workload-size": "small",
             "dataset": [
                 {
                     "source": "npy",
@@ -730,6 +792,7 @@
         },
         {
             "algorithm": "lasso",
+            "workload-size": "small",
             "dataset": [
                 {
                     "source": "npy",
@@ -746,6 +809,7 @@
         },
         {
             "algorithm": "elasticnet",
+            "workload-size": "small",
             "dataset": [
                 {
                     "source": "npy",
@@ -768,6 +832,7 @@
         },
         {
             "algorithm": "tsne",
+            "workload-size": "large",
             "dataset": [
                 {
                     "source": "npy",
diff --git a/runner.py b/runner.py
index 5e57a2794..130b3c01d 100755
--- a/runner.py
+++ b/runner.py
@@ -58,7 +58,7 @@ def get_configs(path: Path) -> List[str]:
                         help='Available floating point data types'
                              'This parameter only marks dtype as available, '
                              'make sure to add the dtype parameter to the config file ')
-    parser.add_argument('--size', type=str, default="small medium large", nargs='+',
+    parser.add_argument('--workload-size', type=str, default="small medium large", nargs='+',
                         choices=("small", "medium", "large"),
                         help='Available dataset sizes')
     parser.add_argument('--no-intel-optimized', default=False, action='store_true',
@@ -108,10 +108,10 @@ def get_configs(path: Path) -> List[str]:
             params = common_params.copy()
             params.update(params_set.copy())
 
-            if 'size' in params:
-                if params['size'] not in args.size:
+            if 'workload-size' in params:
+                if params['workload-size'] not in args.workload_size:
                     continue
-                del params['size']
+                del params['workload-size']
 
             device = []
             if 'device' not in params:

From 6272ea525c8155efb896525e6d1672e779fca3b3 Mon Sep 17 00:00:00 2001
From: dmitrii-kriukov
Date: Mon, 17 Jan 2022 19:11:30 +0300
Subject: [PATCH 3/7] knn_svm

---
 configs/xpu/knn_clsf.json         | 162 +++++++++++++++++
 configs/xpu/knn_regr.json         |  69 +++++++
 configs/xpu/svm.json              | 192 ++++++++++++++++++++
 datasets/load_datasets.py         |  14 +-
 datasets/loader_classification.py | 289 +++++++++++++++++++++++++++++-
 5 files changed, 722 insertions(+), 4 deletions(-)
 create mode 100644 configs/xpu/knn_clsf.json
 create mode 100644 configs/xpu/knn_regr.json
 create mode 100644 configs/xpu/svm.json

diff --git a/configs/xpu/knn_clsf.json b/configs/xpu/knn_clsf.json
new file mode 100644
index 000000000..2d72c4ade
--- /dev/null
+++ b/configs/xpu/knn_clsf.json
@@ -0,0 +1,162 @@
+{
+    "common": {
+        "lib": "sklearn",
+        "algorithm": "knn_clsf",
+        "data-format": "pandas",
+        "data-order": "F",
+        "dtype": ["float32", "float64"],
+        "device": ["host", "cpu", "gpu", "none"]
+    },
+    "cases": [
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "epsilon_100K",
+                    "training":
+                    {
+                        "x": "data/epsilon_100K_x_train.npy",
+                        "y": "data/epsilon_100K_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/epsilon_100K_x_test.npy",
+                        "y": "data/epsilon_100K_y_test.npy"
+                    }
+                }
+            ],
+            "n-neighbors": [2, 100]
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "higgs_150K",
+                    "training":
+                    {
+                        "x": "data/higgs_150K_x_train.npy",
+                        "y": "data/higgs_150K_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/higgs_150K_x_test.npy",
+                        "y": "data/higgs_150K_y_test.npy"
+                    }
+                },
+                {
+                    "source": "npy",
+                    "name": "hepmass_150K",
+                    "training":
+                    {
+                        "x": "data/hepmass_150K_x_train.npy",
+                        "y": "data/hepmass_150K_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/hepmass_150K_x_test.npy",
+                        "y": "data/hepmass_150K_y_test.npy"
+                    }
+                }
+            ],
+            "n-neighbors": [5, 100]
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "cifar",
+                    "training":
+                    {
+                        "x": "data/cifar_x_train.npy",
+                        "y": "data/cifar_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/cifar_x_test.npy",
+                        "y": "data/cifar_y_test.npy"
+                    }
+                }
+            ],
+            "n-neighbors": 7
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "mnist",
+                    "training":
+                    {
+                        "x": "data/mnist_x_train.npy",
+                        "y": "data/mnist_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/mnist_x_test.npy",
+                        "y": "data/mnist_y_test.npy"
+                    }
+                }
+            ],
+            "n-neighbors": 5
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "epsilon_100K",
+                    "training":
+                    {
+                        "x": "data/epsilon_100K_x_train.npy",
+                        "y": "data/epsilon_100K_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/epsilon_100K_x_test.npy",
+                        "y": "data/epsilon_100K_y_test.npy"
+                    }
+                }
+            ],
+            "task": "search",
+            "n-neighbors": 2
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "higgs_150K",
+                    "training":
+                    {
+                        "x": "data/higgs_150K_x_train.npy",
+                        "y": "data/higgs_150K_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/higgs_150K_x_test.npy",
+                        "y": "data/higgs_150K_y_test.npy"
+                    }
+                }
+            ],
+            "task": "search",
+            "n-neighbors": 5
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "cifar",
+                    "training":
+                    {
+                        "x": "data/cifar_x_train.npy",
+                        "y": "data/cifar_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/cifar_x_test.npy",
+                        "y": "data/cifar_y_test.npy"
+                    }
+                }
+            ],
+            "task": "search",
+            "n-neighbors": 7
+        }
+    ]
+}
diff --git a/configs/xpu/knn_regr.json b/configs/xpu/knn_regr.json
new file mode 100644
index 000000000..ec1fbc9a9
--- /dev/null
+++ b/configs/xpu/knn_regr.json
@@ -0,0 +1,69 @@
+{
+    "common": {
+        "lib": "sklearn",
+        "algorithm": "knn_regr",
+        "data-format": "pandas",
+        "data-order": "F",
+        "dtype": ["float32", "float64"],
+        "device": ["host", "cpu", "gpu", "none"]
+    },
+    "cases": [
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "year_prediction_msd",
+                    "training":
+                    {
+                        "x": "data/year_prediction_msd_x_train.npy",
+                        "y": "data/year_prediction_msd_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/year_prediction_msd_x_test.npy",
+                        "y": "data/year_prediction_msd_y_test.npy"
+                    }
+                }
+            ],
+            "n-neighbors": 2
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "higgs_150K",
+                    "training":
+                    {
+                        "x": "data/higgs_150K_x_train.npy",
+                        "y": "data/higgs_150K_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/higgs_150K_x_test.npy",
+                        "y": "data/higgs_150K_y_test.npy"
+                    }
+                }
+            ],
+            "n-neighbors": 5
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "cifar",
+                    "training":
+                    {
+                        "x": "data/cifar_x_train.npy",
+                        "y": "data/cifar_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/cifar_x_test.npy",
+                        "y": "data/cifar_y_test.npy"
+                    }
+                }
+            ],
+            "n-neighbors": 7
+        }
+    ]
+}
diff --git a/configs/xpu/svm.json b/configs/xpu/svm.json
new file mode 100644
index 000000000..a98377532
--- /dev/null
+++ b/configs/xpu/svm.json
@@ -0,0 +1,192 @@
+{
+    "common": {
+        "lib": "sklearn",
+        "algorithm": "svm",
+        "data-format": "pandas",
+        "data-order": "F",
+        "dtype": ["float32", "float64"],
+        "device": ["host", "cpu", "gpu", "none"]
+    },
+    "cases": [
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "gisette",
+                    "training":
+                    {
+                        "x": "data/gisette_x_train.npy",
+                        "y": "data/gisette_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/gisette_x_test.npy",
+                        "y": "data/gisette_y_test.npy"
+                    }
+                }
+            ],
+            "C": 1.5e-3,
+            "kernel": "linear"
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "higgs_150K",
+                    "training":
+                    {
+                        "x": "data/higgs_150K_x_train.npy",
+                        "y": "data/higgs_150K_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/higgs_150K_x_test.npy",
+                        "y": "data/higgs_150K_y_test.npy"
+                    }
+                }
+            ],
+            "C": 1.0,
+            "kernel": "linear"
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "epsilon_80K",
+                    "training":
+                    {
+                        "x": "data/epsilon_80K_x_train.npy",
+                        "y": "data/epsilon_80K_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/epsilon_80K_x_test.npy",
+                        "y": "data/epsilon_80K_y_test.npy"
+                    }
+                }
+            ],
+            "C": 1.0,
+            "kernel": "linear"
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "cifar",
+                    "training":
+                    {
+                        "x": "data/cifar_x_train.npy",
+                        "y": "data/cifar_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/cifar_x_test.npy",
+                        "y": "data/cifar_y_test.npy"
+                    }
+                }
+            ],
+            "C": 1.0e-7,
+            "kernel": "linear"
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "imb_drama",
+                    "training":
+                    {
+                        "x": "data/imb_drama_x_train.npy",
+                        "y": "data/imb_drama_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/imb_drama_x_train.npy",
+                        "y": "data/imb_drama_y_train.npy"
+                    }
+                }
+            ],
+            "C": 1e-3,
+            "kernel": "linear"
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "epsilon_16K",
+                    "training":
+                    {
+                        "x": "data/epsilon_16K_x_train.npy",
+                        "y": "data/epsilon_16K_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/epsilon_16K_x_test.npy",
+                        "y": "data/epsilon_16K_y_test.npy"
+                    }
+                }
+            ],
+            "C": 9.0e2,
+            "kernel": "rbf"
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "covtype_binary",
+                    "training":
+                    {
+                        "x": "data/covtype_binary_x_train.npy",
+                        "y": "data/covtype_binary_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/covtype_binary_x_test.npy",
+                        "y": "data/covtype_binary_y_test.npy"
+                    }
+                }
+            ],
+            "C": 1000.0,
+            "kernel": "rbf"
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "epsilon_80K",
+                    "training":
+                    {
+                        "x": "data/epsilon_80K_x_train.npy",
+                        "y": "data/epsilon_80K_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/epsilon_80K_x_test.npy",
+                        "y": "data/epsilon_80K_y_test.npy"
+                    }
+                }
+            ],
+            "C": 1000.0,
+            "kernel": "rbf"
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "imb_drama",
+                    "training":
+                    {
+                        "x": "data/imb_drama_x_train.npy",
+                        "y": "data/imb_drama_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/imb_drama_x_train.npy",
+                        "y": "data/imb_drama_y_train.npy"
+                    }
+                }
+            ],
+            "C": 50,
+            "kernel": "rbf"
+        }
+    ]
+}
diff --git a/datasets/load_datasets.py b/datasets/load_datasets.py
index 0a7874d92..fbd7685d4 100644
--- a/datasets/load_datasets.py
+++ b/datasets/load_datasets.py
@@ -22,8 +22,10 @@ from typing import Callable, Dict
 
 from .loader_classification import (a_nine_a, airline, airline_ohe, bosch,
-                                    census, codrnanorm, creditcard, epsilon, fraud,
-                                    gisette, hepmass_150K, higgs, higgs_one_m, ijcnn,
+                                    census, cifar, codrnanorm, covtype_binary, creditcard,
+                                    epsilon_16K, epsilon_80K, epsilon, epsilon_100K,
+                                    fraud, gisette, hepmass_150K,
+                                    higgs, higgs_one_m, higgs_150K, ijcnn, imb_drama,
                                     klaverjas, santander, skin_segmentation, susy)
 from .loader_multiclass import (connect, covertype, covtype, letters, mlsr,
                                 mnist, msrank, plasticc, sensit)
@@ -40,19 +42,25 @@
     "bosch": bosch,
     "california_housing": california_housing,
     "census": census,
+    "cifar": cifar,
     "codrnanorm": codrnanorm,
     "connect": connect,
-    "covertype": covertype,
+    "covtype_binary": covtype_binary,
     "covtype": covtype,
     "creditcard": creditcard,
     "epsilon": epsilon,
+    "epsilon_16K": epsilon_16K,
+    "epsilon_80K": epsilon_80K,
+    "epsilon_100K": epsilon_100K,
     "fraud": fraud,
     "fried": fried,
     "gisette": gisette,
     "hepmass_150K": hepmass_150K,
     "higgs": higgs,
     "higgs1m": higgs_one_m,
+    "higgs_150K": higgs_150K,
     "ijcnn": ijcnn,
+    "imb_drama": imb_drama,
     "klaverjas": klaverjas,
     "letters": letters,
     "mlsr": mlsr,
diff --git a/datasets/loader_classification.py b/datasets/loader_classification.py
index fc3cb892d..ffb84f12f 100644
--- a/datasets/loader_classification.py
+++ b/datasets/loader_classification.py
@@ -22,7 +22,7 @@
 
 import numpy as np
 import pandas as pd
-from sklearn.datasets import fetch_openml, load_svmlight_file
+from sklearn.datasets import fetch_openml, load_svmlight_file, fetch_covtype
 from sklearn.model_selection import train_test_split
 
 from .loader_utils import retrieve
@@ -261,6 +261,41 @@ def codrnanorm(dataset_dir: Path) -> bool:
     return True
 
 
+def covtype_binary(dataset_dir: Path) -> bool:
+    """
+    Cover type dataset from UCI machine learning repository
+    https://archive.ics.uci.edu/ml/datasets/covertype
+
+    The original 7 class labels are binarized to y = (y > 3).
+    Classification task. n_classes = 2.
+    covtype_binary X train dataset (100000, 54)
+    covtype_binary y train dataset (100000, 1)
+    covtype_binary X test dataset (100000, 54)
+    covtype_binary y test dataset (100000, 1)
+    """
+    dataset_name = 'covtype_binary'
+    os.makedirs(dataset_dir, exist_ok=True)
+
+    nrows_train, nrows_test = 100000, 100000
+    logging.info(f'Started loading {dataset_name}')
+    X, y = fetch_covtype(return_X_y=True)  # pylint: disable=unexpected-keyword-arg
+    logging.info(f'{dataset_name} is loaded, started parsing...')
+
+    y = (y > 3).astype(int)
+
+    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77,
+                                                        train_size=nrows_train,
+                                                        test_size=nrows_test,
+                                                        shuffle=False
+                                                        )
+    for data, name in zip((X_train, X_test, y_train, y_test),
+                          ('x_train', 'x_test', 'y_train', 'y_test')):
+        filename = f'{dataset_name}_{name}.npy'
+        np.save(os.path.join(dataset_dir, filename), data)
+    logging.info(f'dataset {dataset_name} is ready.')
+    return True
+
+
 def creditcard(dataset_dir: Path) -> bool:
     """
     Classification task. n_classes = 2.
@@ -334,6 +369,150 @@ def epsilon(dataset_dir: Path) -> bool:
     return True
 
 
+def epsilon_16K(dataset_dir: Path) -> bool:
+    """
+    Epsilon dataset
+    https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html
+
+    Classification task. n_classes = 2.
+    epsilon_16K x train dataset (16000, 2000)
+    epsilon_16K y train dataset (16000, 1)
+    epsilon_16K x test dataset (16000, 2000)
+    epsilon_16K y test dataset (16000, 1)
+    """
+    dataset_name = 'epsilon_16K'
+    os.makedirs(dataset_dir, exist_ok=True)
+
+    url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \
+                '/epsilon_normalized.bz2'
+    url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \
+               '/epsilon_normalized.t.bz2'
+    local_url_train = os.path.join(dataset_dir, os.path.basename(url_train))
+    local_url_test = os.path.join(dataset_dir, os.path.basename(url_test))
+
+    num_train, num_test, dtype = 16000, 16000, np.float32
+    if not os.path.isfile(local_url_train):
+        logging.info(f'Started loading {dataset_name}, train')
+        retrieve(url_train, local_url_train)
+    if not os.path.isfile(local_url_test):
+        logging.info(f'Started loading {dataset_name}, test')
+        retrieve(url_test, local_url_test)
+    logging.info(f'{dataset_name} is loaded, started parsing...')
+    X_train, y_train = load_svmlight_file(local_url_train,
+                                          dtype=dtype)
+    X_test, y_test = load_svmlight_file(local_url_test,
+                                        dtype=dtype)
+    X_train = X_train.toarray()[:num_train]
+    X_test = X_test.toarray()[:num_test]
+    y_train = y_train[:num_train]
+    y_train[y_train <= 0] = 0
+    y_test = y_test[:num_test]
+    y_test[y_test <= 0] = 0
+
+    for data, name in zip((X_train, X_test, y_train, y_test),
+                          ('x_train', 'x_test', 'y_train', 'y_test')):
+        filename = f'{dataset_name}_{name}.npy'
+        np.save(os.path.join(dataset_dir, filename), data)
+    logging.info(f'dataset {dataset_name} is ready.')
+    return True
+
+
+def epsilon_100K(dataset_dir: Path) -> bool:
+    """
+    Epsilon dataset
+    https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html
+
+    Classification task. n_classes = 2.
+    epsilon_100K x train dataset (50000, 2000)
+    epsilon_100K y train dataset (50000, 1)
+    epsilon_100K x test dataset (50000, 2000)
+    epsilon_100K y test dataset (50000, 1)
+    """
+    dataset_name = 'epsilon_100K'
+    os.makedirs(dataset_dir, exist_ok=True)
+
+    url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \
+                '/epsilon_normalized.bz2'
+    url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \
+               '/epsilon_normalized.t.bz2'
+    local_url_train = os.path.join(dataset_dir, os.path.basename(url_train))
+    local_url_test = os.path.join(dataset_dir, os.path.basename(url_test))
+
+    num_train, num_test, dtype = 50000, 50000, np.float32
+    if not os.path.isfile(local_url_train):
+        logging.info(f'Started loading {dataset_name}, train')
+        retrieve(url_train, local_url_train)
+    if not os.path.isfile(local_url_test):
+        logging.info(f'Started loading {dataset_name}, test')
+        retrieve(url_test, local_url_test)
+    logging.info(f'{dataset_name} is loaded, started parsing...')
+    X_train, y_train = load_svmlight_file(local_url_train,
+                                          dtype=dtype)
+    X_test, y_test = load_svmlight_file(local_url_test,
+                                        dtype=dtype)
+    X_train = X_train.toarray()[:num_train]
+    X_test = X_test.toarray()[:num_test]
+    y_train = y_train[:num_train]
+    y_train[y_train <= 0] = 0
+    y_test = y_test[:num_test]
+    y_test[y_test <= 0] = 0
+
+    for data, name in zip((X_train, X_test, y_train, y_test),
+                          ('x_train', 'x_test', 'y_train', 'y_test')):
+        filename = f'{dataset_name}_{name}.npy'
+        np.save(os.path.join(dataset_dir, filename), data)
+    logging.info(f'dataset {dataset_name} is ready.')
+    return True
+
+
+def epsilon_80K(dataset_dir: Path) -> bool:
+    """
+    Epsilon dataset
+    https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html
+
+    Classification task. n_classes = 2.
+    epsilon_80K x train dataset (80000, 2000)
+    epsilon_80K y train dataset (80000, 1)
+    epsilon_80K x test dataset (80000, 2000)
+    epsilon_80K y test dataset (80000, 1)
+    """
+    dataset_name = 'epsilon_80K'
+    os.makedirs(dataset_dir, exist_ok=True)
+
+    url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \
+                '/epsilon_normalized.bz2'
+    url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \
+               '/epsilon_normalized.t.bz2'
+    local_url_train = os.path.join(dataset_dir, os.path.basename(url_train))
+    local_url_test = os.path.join(dataset_dir, os.path.basename(url_test))
+
+    num_train, num_test, dtype = 80000, 80000, np.float32
+    if not os.path.isfile(local_url_train):
+        logging.info(f'Started loading {dataset_name}, train')
+        retrieve(url_train, local_url_train)
+    if not os.path.isfile(local_url_test):
+        logging.info(f'Started loading {dataset_name}, test')
+        retrieve(url_test, local_url_test)
+    logging.info(f'{dataset_name} is loaded, started parsing...')
+    X_train, y_train = load_svmlight_file(local_url_train,
+                                          dtype=dtype)
+    X_test, y_test = load_svmlight_file(local_url_test,
+                                        dtype=dtype)
+    X_train = X_train.toarray()[:num_train]
+    X_test = X_test.toarray()[:num_test]
+    y_train = y_train[:num_train]
+    y_train[y_train <= 0] = 0
+    y_test = y_test[:num_test]
+    y_test[y_test <= 0] = 0
+
+    for data, name in zip((X_train, X_test, y_train, y_test),
+                          ('x_train', 'x_test', 'y_train', 'y_test')):
+        filename = f'{dataset_name}_{name}.npy'
+        np.save(os.path.join(dataset_dir, filename), data)
+    logging.info(f'dataset {dataset_name} is ready.')
+    return True
+
+
 def fraud(dataset_dir: Path) -> bool:
     """
     Credit Card Fraud Detection contest
@@ -576,6 +755,46 @@ def higgs_one_m(dataset_dir: Path) -> bool:
     return True
 
 
+def higgs_150K(dataset_dir: Path) -> bool:
+    """
+    Higgs dataset from UCI machine learning repository
+    https://archive.ics.uci.edu/ml/datasets/HIGGS
+
+    Classification task. n_classes = 2.
+    higgs_150K X train dataset (100000, 28)
+    higgs_150K y train dataset (100000, 1)
+    higgs_150K X test dataset (50000, 28)
+    higgs_150K y test dataset (50000, 1)
+    """
+    dataset_name = 'higgs_150K'
+    os.makedirs(dataset_dir, exist_ok=True)
+
+    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz'
+    local_url = os.path.join(dataset_dir, os.path.basename(url))
+    if not os.path.isfile(local_url):
+        logging.info(f'Started loading {dataset_name}')
+        retrieve(url, local_url)
+    logging.info(f'{dataset_name} is loaded, started parsing...')
+
+    nrows_train, nrows_test, dtype = 100000, 50000, np.float32
+    data: Any = pd.read_csv(local_url, delimiter=",", header=None,
+                            compression="gzip", dtype=dtype,
+                            nrows=nrows_train + nrows_test)
+
+    X = data[data.columns[1:]]
+    y = data[data.columns[0:1]]
+
+    x_train, x_test, y_train, y_test = train_test_split(
+        X, y, train_size=nrows_train, test_size=nrows_test, shuffle=False)
+
+    for data, name in zip((x_train, x_test, y_train, y_test),
+                          ('x_train', 'x_test', 'y_train', 'y_test')):
+        filename = f'{dataset_name}_{name}.npy'
+        np.save(os.path.join(dataset_dir, filename), data)
+    logging.info(f'dataset {dataset_name} is ready.')
+    return True
+
+
 def ijcnn(dataset_dir: Path) -> bool:
     """
     Author: Danil Prokhorov.
@@ -611,6 +830,28 @@ def ijcnn(dataset_dir: Path) -> bool:
     logging.info(f'dataset {dataset_name} is ready.')
     return True
 
+def imb_drama(dataset_dir: Path) -> bool:
+    """
+    IMDB.drama dataset from OpenML Datasets (
+    https://www.openml.org/d/273)
+
+    Classification task.
+    Number of features: 1001
+    Number of instances: 120919
+    """
+    dataset_name = 'imb_drama'
+    os.makedirs(dataset_dir, exist_ok=True)
+
+    x_train, y_train = fetch_openml('IMDB.drama', return_X_y=True,
+                                    as_frame=False, data_home=dataset_dir)
+    logging.info(f'{dataset_name} is loaded, started parsing...')
+    for data, name in zip((x_train.todense(), y_train),
+                          ('x_train', 'y_train')):
+        filename = f'{dataset_name}_{name}.npy'
+        np.save(os.path.join(dataset_dir, filename), data)
+    logging.info(f'dataset {dataset_name} is ready.')
+    return True
+
 
 def klaverjas(dataset_dir: Path) -> bool:
     """
@@ -726,3 +967,49 @@ def susy(dataset_dir: Path) -> bool:
         np.save(os.path.join(dataset_dir, filename), data)
     logging.info(f'dataset {dataset_name} is ready.')
     return True
+
+
+def cifar(dataset_dir: Path) -> bool:
+    """
+    CIFAR-10 dataset from LIBSVM Datasets (original source:
+    https://www.cs.toronto.edu/~kriz/cifar.html)
+    TaskType: Classification
+    cifar x train dataset (50000, 3072)
+    cifar y train dataset (50000, 1)
+    cifar x test dataset (10000, 3072)
+    cifar y test dataset (10000, 1)
+    """
+    dataset_name = 'cifar'
+    os.makedirs(dataset_dir, exist_ok=True)
+
+    url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/cifar10.bz2'
+    url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/cifar10.t.bz2'
+    local_url_train = os.path.join(dataset_dir, os.path.basename(url_train))
+    local_url_test = os.path.join(dataset_dir, os.path.basename(url_test))
+
+    if not os.path.isfile(local_url_train):
+        logging.info(f'Started loading {dataset_name}, train')
+        retrieve(url_train, local_url_train)
+    logging.info(f'{dataset_name} is loaded, started parsing...')
+    x_train, y_train = load_svmlight_file(local_url_train,
+                                          dtype=np.float32)
+
+    if not os.path.isfile(local_url_test):
+        logging.info(f'Started loading {dataset_name}, test')
+        retrieve(url_test, local_url_test)
+    logging.info(f'{dataset_name} is loaded, started parsing...')
+    x_test, y_test = load_svmlight_file(local_url_test,
+                                        dtype=np.float32)
+
+    x_train = x_train.toarray()
+    y_train[y_train <= 0] = 0
+
+    x_test = x_test.toarray()
+    y_test[y_test <= 0] = 0
+
+    for data, name in zip((x_train, x_test, y_train, y_test),
+                          ('x_train', 'x_test', 'y_train', 'y_test')):
+        filename = f'{dataset_name}_{name}.npy'
+        np.save(os.path.join(dataset_dir, filename), data)
+    return True
+
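The three epsilon_*K loaders added in PATCH 3 differ only in their row counts, and higgs_150K and cifar follow the same retrieve-parse-cache shape. A minimal sketch of that shared pattern, assuming the repository's datasets/loader_utils.py retrieve helper; the consolidated epsilon_subset function below is illustrative and not part of the patch:

import logging
import os
from pathlib import Path

import numpy as np
from sklearn.datasets import load_svmlight_file

from datasets.loader_utils import retrieve  # existing helper used by the loaders above


def epsilon_subset(dataset_dir: Path, dataset_name: str, num_rows: int) -> bool:
    # Download the LIBSVM epsilon archive once, then cache a truncated .npy copy.
    os.makedirs(dataset_dir, exist_ok=True)
    url = ('https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary'
           '/epsilon_normalized.bz2')
    local_path = os.path.join(dataset_dir, os.path.basename(url))
    if not os.path.isfile(local_path):
        logging.info(f'Started loading {dataset_name}')
        retrieve(url, local_path)
    X, y = load_svmlight_file(local_path, dtype=np.float32)
    X = X.toarray()[:num_rows]  # densify the sparse matrix, keep only num_rows rows
    y = y[:num_rows]
    y[y <= 0] = 0               # remap the {-1, +1} labels to {0, 1}
    for data, name in zip((X, y), ('x_train', 'y_train')):
        np.save(os.path.join(dataset_dir, f'{dataset_name}_{name}.npy'), data)
    return True
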
From 81868d37ec55ec0be33d1ebe62849e7c97305eb4 Mon Sep 17 00:00:00 2001
From: dmitrii-kriukov
Date: Mon, 17 Jan 2022 19:16:10 +0300
Subject: [PATCH 4/7] Revert "knn_svm"

This reverts commit 6272ea525c8155efb896525e6d1672e779fca3b3.
---
 configs/xpu/knn_clsf.json         | 162 -----------------
 configs/xpu/knn_regr.json         |  69 -------
 configs/xpu/svm.json              | 192 --------------------
 datasets/load_datasets.py         |  14 +-
 datasets/loader_classification.py | 289 +-----------------------------
 5 files changed, 4 insertions(+), 722 deletions(-)
 delete mode 100644 configs/xpu/knn_clsf.json
 delete mode 100644 configs/xpu/knn_regr.json
 delete mode 100644 configs/xpu/svm.json

diff --git a/configs/xpu/knn_clsf.json b/configs/xpu/knn_clsf.json
deleted file mode 100644
index 2d72c4ade..000000000
--- a/configs/xpu/knn_clsf.json
+++ /dev/null
@@ -1,162 +0,0 @@
-{
-    "common": {
-        "lib": "sklearn",
-        "algorithm": "knn_clsf",
-        "data-format": "pandas",
-        "data-order": "F",
-        "dtype": ["float32", "float64"],
-        "device": ["host", "cpu", "gpu", "none"]
-    },
-    "cases": [
-        {
-            "dataset": [
-                {
-                    "source": "npy",
-                    "name": "epsilon_100K",
-                    "training":
-                    {
-                        "x": "data/epsilon_100K_x_train.npy",
-                        "y": "data/epsilon_100K_y_train.npy"
-                    },
-                    "testing":
-                    {
-                        "x": "data/epsilon_100K_x_test.npy",
-                        "y": "data/epsilon_100K_y_test.npy"
-                    }
-                }
-            ],
-            "n-neighbors": [2, 100]
-        },
-        {
-            "dataset": [
-                {
-                    "source": "npy",
-                    "name": "higgs_150K",
-                    "training":
-                    {
-                        "x": "data/higgs_150K_x_train.npy",
-                        "y": "data/higgs_150K_y_train.npy"
-                    },
-                    "testing":
-                    {
-                        "x": "data/higgs_150K_x_test.npy",
-                        "y": "data/higgs_150K_y_test.npy"
-                    }
-                },
-                {
-                    "source": "npy",
-                    "name": "hepmass_150K",
-                    "training":
-                    {
-                        "x": "data/hepmass_150K_x_train.npy",
-                        "y": "data/hepmass_150K_y_train.npy"
-                    },
-                    "testing":
-                    {
-                        "x": "data/hepmass_150K_x_test.npy",
-                        "y": "data/hepmass_150K_y_test.npy"
-                    }
-                }
-            ],
-            "n-neighbors": [5, 100]
-        },
-        {
-            "dataset": [
-                {
-                    "source": "npy",
-                    "name": "cifar",
-                    "training":
-                    {
-                        "x": "data/cifar_x_train.npy",
-                        "y": "data/cifar_y_train.npy"
-                    },
-                    "testing":
-                    {
-                        "x": "data/cifar_x_test.npy",
-                        "y": "data/cifar_y_test.npy"
-                    }
-                }
-            ],
-            "n-neighbors": 7
-        },
-        {
-            "dataset": [
-                {
-                    "source": "npy",
-                    "name": "mnist",
-                    "training":
-                    {
-                        "x": "data/mnist_x_train.npy",
-                        "y": "data/mnist_y_train.npy"
-                    },
-                    "testing":
-                    {
-                        "x": "data/mnist_x_test.npy",
-                        "y": "data/mnist_y_test.npy"
-                    }
-                }
-            ],
-            "n-neighbors": 5
-        },
-        {
-            "dataset": [
-                {
-                    "source": "npy",
-                    "name": "epsilon_100K",
-                    "training":
-                    {
-                        "x": "data/epsilon_100K_x_train.npy",
-                        "y": "data/epsilon_100K_y_train.npy"
-                    },
-                    "testing":
-                    {
-                        "x": "data/epsilon_100K_x_test.npy",
-                        "y": "data/epsilon_100K_y_test.npy"
-                    }
-                }
-            ],
-            "task": "search",
-            "n-neighbors": 2
-        },
-        {
-            "dataset": [
-                {
-                    "source": "npy",
-                    "name": "higgs_150K",
-                    "training":
-                    {
-                        "x": "data/higgs_150K_x_train.npy",
-                        "y": "data/higgs_150K_y_train.npy"
-                    },
-                    "testing":
-                    {
-                        "x": "data/higgs_150K_x_test.npy",
-                        "y": "data/higgs_150K_y_test.npy"
-                    }
-                }
-            ],
-            "task": "search",
-            "n-neighbors": 5
-        },
-        {
-            "dataset": [
-                {
-                    "source": "npy",
-                    "name": "cifar",
-                    "training":
-                    {
-                        "x": "data/cifar_x_train.npy",
-                        "y": "data/cifar_y_train.npy"
-                    },
-                    "testing":
-                    {
-                        "x": "data/cifar_x_test.npy",
-                        "y": "data/cifar_y_test.npy"
-                    }
-                }
-            ],
-            "task": "search",
-            "n-neighbors": 7
-        }
-    ]
-}
diff --git a/configs/xpu/knn_regr.json b/configs/xpu/knn_regr.json
deleted file mode 100644
index ec1fbc9a9..000000000
--- a/configs/xpu/knn_regr.json
+++ /dev/null
@@ -1,69 +0,0 @@
-{
-    "common": {
-        "lib": "sklearn",
-        "algorithm": "knn_regr",
-        "data-format": "pandas",
-        "data-order": "F",
-        "dtype": ["float32", "float64"],
-        "device": ["host", "cpu", "gpu", "none"]
-    },
-    "cases": [
-        {
-            "dataset": [
-                {
-                    "source": "npy",
-                    "name": "year_prediction_msd",
-                    "training":
-                    {
-                        "x": "data/year_prediction_msd_x_train.npy",
-                        "y": "data/year_prediction_msd_y_train.npy"
-                    },
-                    "testing":
-                    {
-                        "x": "data/year_prediction_msd_x_test.npy",
-                        "y": "data/year_prediction_msd_y_test.npy"
-                    }
-                }
-            ],
-            "n-neighbors": 2
-        },
-        {
-            "dataset": [
-                {
-                    "source": "npy",
-                    "name": "higgs_150K",
-                    "training":
-                    {
-                        "x": "data/higgs_150K_x_train.npy",
-                        "y": "data/higgs_150K_y_train.npy"
-                    },
-                    "testing":
-                    {
-                        "x": "data/higgs_150K_x_test.npy",
-                        "y": "data/higgs_150K_y_test.npy"
-                    }
-                }
-            ],
-            "n-neighbors": 5
-        },
-        {
-            "dataset": [
-                {
-                    "source": "npy",
-                    "name": "cifar",
-                    "training":
-                    {
-                        "x": "data/cifar_x_train.npy",
-                        "y": "data/cifar_y_train.npy"
-                    },
-                    "testing":
-                    {
-                        "x": "data/cifar_x_test.npy",
-                        "y": "data/cifar_y_test.npy"
-                    }
-                }
-            ],
-            "n-neighbors": 7
-        }
-    ]
-}
diff --git a/configs/xpu/svm.json b/configs/xpu/svm.json
deleted file mode 100644
index a98377532..000000000
--- a/configs/xpu/svm.json
+++ /dev/null
@@ -1,192 +0,0 @@
-{
-    "common": {
-        "lib": "sklearn",
-        "algorithm": "svm",
-        "data-format": "pandas",
-        "data-order": "F",
-        "dtype": ["float32", "float64"],
-        "device": ["host", "cpu", "gpu", "none"]
-    },
-    "cases": [
-        {
-            "dataset": [
-                {
-                    "source": "npy",
-                    "name": "gisette",
-                    "training":
-                    {
-                        "x": "data/gisette_x_train.npy",
-                        "y": "data/gisette_y_train.npy"
-                    },
-                    "testing":
-                    {
-                        "x": "data/gisette_x_test.npy",
-                        "y": "data/gisette_y_test.npy"
-                    }
-                }
-            ],
-            "C": 1.5e-3,
-            "kernel": "linear"
-        },
-        {
-            "dataset": [
-                {
-                    "source": "npy",
-                    "name": "higgs_150K",
-                    "training":
-                    {
-                        "x": "data/higgs_150K_x_train.npy",
-                        "y": "data/higgs_150K_y_train.npy"
-                    },
-                    "testing":
-                    {
-                        "x": "data/higgs_150K_x_test.npy",
-                        "y": "data/higgs_150K_y_test.npy"
-                    }
-                }
-            ],
-            "C": 1.0,
-            "kernel": "linear"
-        },
-        {
-            "dataset": [
-                {
-                    "source": "npy",
-                    "name": "epsilon_80K",
-                    "training":
-                    {
-                        "x": "data/epsilon_80K_x_train.npy",
-                        "y": "data/epsilon_80K_y_train.npy"
-                    },
-                    "testing":
-                    {
-                        "x": "data/epsilon_80K_x_test.npy",
-                        "y": "data/epsilon_80K_y_test.npy"
-                    }
-                }
-            ],
-            "C": 1.0,
-            "kernel": "linear"
-        },
-        {
-            "dataset": [
-                {
-                    "source": "npy",
-                    "name": "cifar",
-                    "training":
-                    {
-                        "x": "data/cifar_x_train.npy",
-                        "y": "data/cifar_y_train.npy"
-                    },
-                    "testing":
-                    {
-                        "x": "data/cifar_x_test.npy",
-                        "y": "data/cifar_y_test.npy"
-                    }
-                }
-            ],
-            "C": 1.0e-7,
-            "kernel": "linear"
-        },
-        {
-            "dataset": [
-                {
-                    "source": "npy",
-                    "name": "imb_drama",
-                    "training":
-                    {
-                        "x": "data/imb_drama_x_train.npy",
-                        "y": "data/imb_drama_y_train.npy"
-                    },
-                    "testing":
-                    {
-                        "x": "data/imb_drama_x_train.npy",
-                        "y": "data/imb_drama_y_train.npy"
-                    }
-                }
-            ],
-            "C": 1e-3,
-            "kernel": "linear"
-        },
-        {
-            "dataset": [
-                {
-                    "source": "npy",
-                    "name": "epsilon_16K",
-                    "training":
-                    {
-                        "x": "data/epsilon_16K_x_train.npy",
-                        "y": "data/epsilon_16K_y_train.npy"
-                    },
-                    "testing":
-                    {
-                        "x": "data/epsilon_16K_x_test.npy",
-                        "y": "data/epsilon_16K_y_test.npy"
-                    }
-                }
-            ],
-            "C": 9.0e2,
-            "kernel": "rbf"
-        },
-        {
-            "dataset": [
-                {
-                    "source": "npy",
-                    "name": "covtype_binary",
-                    "training":
-                    {
-                        "x": "data/covtype_binary_x_train.npy",
-                        "y": "data/covtype_binary_y_train.npy"
-                    },
-                    "testing":
-                    {
-                        "x": "data/covtype_binary_x_test.npy",
-                        "y": "data/covtype_binary_y_test.npy"
-                    }
-                }
-            ],
-            "C": 1000.0,
-            "kernel": "rbf"
-        },
-        {
-            "dataset": [
-                {
-                    "source": "npy",
-                    "name": "epsilon_80K",
-                    "training":
-                    {
-                        "x": "data/epsilon_80K_x_train.npy",
-                        "y": "data/epsilon_80K_y_train.npy"
-                    },
-                    "testing":
-                    {
-                        "x": "data/epsilon_80K_x_test.npy",
-                        "y": "data/epsilon_80K_y_test.npy"
-                    }
-                }
-            ],
-            "C": 1000.0,
-            "kernel": "rbf"
-        },
-        {
-            "dataset": [
-                {
-                    "source": "npy",
-                    "name": "imb_drama",
-                    "training":
-                    {
-                        "x": "data/imb_drama_x_train.npy",
-                        "y": "data/imb_drama_y_train.npy"
-                    },
-                    "testing":
-                    {
-                        "x": "data/imb_drama_x_train.npy",
-                        "y": "data/imb_drama_y_train.npy"
-                    }
-                }
-            ],
-            "C": 50,
-            "kernel": "rbf"
-        }
-    ]
-}
diff --git a/datasets/load_datasets.py b/datasets/load_datasets.py
index fbd7685d4..0a7874d92 100644
--- a/datasets/load_datasets.py
+++ b/datasets/load_datasets.py
@@ -22,10 +22,8 @@ from typing import Callable, Dict
 
 from .loader_classification import (a_nine_a, airline, airline_ohe, bosch,
-                                    census, cifar, codrnanorm, covtype_binary, creditcard,
-                                    epsilon_16K, epsilon_80K, epsilon, epsilon_100K,
-                                    fraud, gisette, hepmass_150K,
-                                    higgs, higgs_one_m, higgs_150K, ijcnn, imb_drama,
+                                    census, codrnanorm, creditcard, epsilon, fraud,
+                                    gisette, hepmass_150K, higgs, higgs_one_m, ijcnn,
                                     klaverjas, santander, skin_segmentation, susy)
 from .loader_multiclass import (connect, covertype, covtype, letters, mlsr,
                                 mnist, msrank, plasticc, sensit)
@@ -42,25 +40,19 @@
     "bosch": bosch,
     "california_housing": california_housing,
     "census": census,
-    "cifar": cifar,
     "codrnanorm": codrnanorm,
     "connect": connect,
-    "covtype_binary": covtype_binary,
+    "covertype": covertype,
     "covtype": covtype,
     "creditcard": creditcard,
     "epsilon": epsilon,
-    "epsilon_16K": epsilon_16K,
-    "epsilon_80K": epsilon_80K,
-    "epsilon_100K": epsilon_100K,
     "fraud": fraud,
     "fried": fried,
     "gisette": gisette,
     "hepmass_150K": hepmass_150K,
     "higgs": higgs,
     "higgs1m": higgs_one_m,
-    "higgs_150K": higgs_150K,
     "ijcnn": ijcnn,
-    "imb_drama": imb_drama,
     "klaverjas": klaverjas,
     "letters": letters,
     "mlsr": mlsr,
diff --git a/datasets/loader_classification.py b/datasets/loader_classification.py
index ffb84f12f..fc3cb892d 100644
--- a/datasets/loader_classification.py
+++ b/datasets/loader_classification.py
@@ -22,7 +22,7 @@
 
 import numpy as np
 import pandas as pd
-from sklearn.datasets import fetch_openml, load_svmlight_file, fetch_covtype
+from sklearn.datasets import fetch_openml, load_svmlight_file
 from sklearn.model_selection import train_test_split
 
 from .loader_utils import retrieve
@@ -261,41 +261,6 @@ def codrnanorm(dataset_dir: Path) -> bool:
     return True
 
 
-def covtype_binary(dataset_dir: Path) -> bool:
-    """
-    Cover type dataset from UCI machine learning repository
-    https://archive.ics.uci.edu/ml/datasets/covertype
-
-    The original 7 class labels are binarized to y = (y > 3).
-    Classification task. n_classes = 2.
-    covtype_binary X train dataset (100000, 54)
-    covtype_binary y train dataset (100000, 1)
-    covtype_binary X test dataset (100000, 54)
-    covtype_binary y test dataset (100000, 1)
-    """
-    dataset_name = 'covtype_binary'
-    os.makedirs(dataset_dir, exist_ok=True)
-
-    nrows_train, nrows_test = 100000, 100000
-    logging.info(f'Started loading {dataset_name}')
-    X, y = fetch_covtype(return_X_y=True)  # pylint: disable=unexpected-keyword-arg
-    logging.info(f'{dataset_name} is loaded, started parsing...')
-
-    y = (y > 3).astype(int)
-
-    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77,
-                                                        train_size=nrows_train,
-                                                        test_size=nrows_test,
-                                                        shuffle=False
-                                                        )
-    for data, name in zip((X_train, X_test, y_train, y_test),
-                          ('x_train', 'x_test', 'y_train', 'y_test')):
-        filename = f'{dataset_name}_{name}.npy'
-        np.save(os.path.join(dataset_dir, filename), data)
-    logging.info(f'dataset {dataset_name} is ready.')
-    return True
-
-
 def creditcard(dataset_dir: Path) -> bool:
     """
     Classification task. n_classes = 2.
@@ -369,150 +334,6 @@ def epsilon(dataset_dir: Path) -> bool:
     return True
 
 
-def epsilon_16K(dataset_dir: Path) -> bool:
-    """
-    Epsilon dataset
-    https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html
-
-    Classification task. n_classes = 2.
-    epsilon_16K x train dataset (16000, 2000)
-    epsilon_16K y train dataset (16000, 1)
-    epsilon_16K x test dataset (16000, 2000)
-    epsilon_16K y test dataset (16000, 1)
-    """
-    dataset_name = 'epsilon_16K'
-    os.makedirs(dataset_dir, exist_ok=True)
-
-    url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \
-                '/epsilon_normalized.bz2'
-    url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \
-               '/epsilon_normalized.t.bz2'
-    local_url_train = os.path.join(dataset_dir, os.path.basename(url_train))
-    local_url_test = os.path.join(dataset_dir, os.path.basename(url_test))
-
-    num_train, num_test, dtype = 16000, 16000, np.float32
-    if not os.path.isfile(local_url_train):
-        logging.info(f'Started loading {dataset_name}, train')
-        retrieve(url_train, local_url_train)
-    if not os.path.isfile(local_url_test):
-        logging.info(f'Started loading {dataset_name}, test')
-        retrieve(url_test, local_url_test)
-    logging.info(f'{dataset_name} is loaded, started parsing...')
-    X_train, y_train = load_svmlight_file(local_url_train,
-                                          dtype=dtype)
-    X_test, y_test = load_svmlight_file(local_url_test,
-                                        dtype=dtype)
-    X_train = X_train.toarray()[:num_train]
-    X_test = X_test.toarray()[:num_test]
-    y_train = y_train[:num_train]
-    y_train[y_train <= 0] = 0
-    y_test = y_test[:num_test]
-    y_test[y_test <= 0] = 0
-
-    for data, name in zip((X_train, X_test, y_train, y_test),
-                          ('x_train', 'x_test', 'y_train', 'y_test')):
-        filename = f'{dataset_name}_{name}.npy'
-        np.save(os.path.join(dataset_dir, filename), data)
-    logging.info(f'dataset {dataset_name} is ready.')
-    return True
-
-
-def epsilon_100K(dataset_dir: Path) -> bool:
-    """
-    Epsilon dataset
-    https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html
-
-    Classification task. n_classes = 2.
-    epsilon_100K x train dataset (50000, 2000)
-    epsilon_100K y train dataset (50000, 1)
-    epsilon_100K x test dataset (50000, 2000)
-    epsilon_100K y test dataset (50000, 1)
-    """
-    dataset_name = 'epsilon_100K'
-    os.makedirs(dataset_dir, exist_ok=True)
-
-    url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \
-                '/epsilon_normalized.bz2'
-    url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \
-               '/epsilon_normalized.t.bz2'
-    local_url_train = os.path.join(dataset_dir, os.path.basename(url_train))
-    local_url_test = os.path.join(dataset_dir, os.path.basename(url_test))
-
-    num_train, num_test, dtype = 50000, 50000, np.float32
-    if not os.path.isfile(local_url_train):
-        logging.info(f'Started loading {dataset_name}, train')
-        retrieve(url_train, local_url_train)
-    if not os.path.isfile(local_url_test):
-        logging.info(f'Started loading {dataset_name}, test')
-        retrieve(url_test, local_url_test)
-    logging.info(f'{dataset_name} is loaded, started parsing...')
-    X_train, y_train = load_svmlight_file(local_url_train,
-                                          dtype=dtype)
-    X_test, y_test = load_svmlight_file(local_url_test,
-                                        dtype=dtype)
-    X_train = X_train.toarray()[:num_train]
-    X_test = X_test.toarray()[:num_test]
-    y_train = y_train[:num_train]
-    y_train[y_train <= 0] = 0
-    y_test = y_test[:num_test]
-    y_test[y_test <= 0] = 0
-
-    for data, name in zip((X_train, X_test, y_train, y_test),
-                          ('x_train', 'x_test', 'y_train', 'y_test')):
-        filename = f'{dataset_name}_{name}.npy'
-        np.save(os.path.join(dataset_dir, filename), data)
-    logging.info(f'dataset {dataset_name} is ready.')
-    return True
-
-
-def epsilon_80K(dataset_dir: Path) -> bool:
-    """
-    Epsilon dataset
-    https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html
-
-    Classification task. n_classes = 2.
-    epsilon_80K x train dataset (80000, 2000)
-    epsilon_80K y train dataset (80000, 1)
-    epsilon_80K x test dataset (80000, 2000)
-    epsilon_80K y test dataset (80000, 1)
-    """
-    dataset_name = 'epsilon_80K'
-    os.makedirs(dataset_dir, exist_ok=True)
-
-    url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \
-                '/epsilon_normalized.bz2'
-    url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \
-               '/epsilon_normalized.t.bz2'
-    local_url_train = os.path.join(dataset_dir, os.path.basename(url_train))
-    local_url_test = os.path.join(dataset_dir, os.path.basename(url_test))
-
-    num_train, num_test, dtype = 80000, 80000, np.float32
-    if not os.path.isfile(local_url_train):
-        logging.info(f'Started loading {dataset_name}, train')
-        retrieve(url_train, local_url_train)
-    if not os.path.isfile(local_url_test):
-        logging.info(f'Started loading {dataset_name}, test')
-        retrieve(url_test, local_url_test)
-    logging.info(f'{dataset_name} is loaded, started parsing...')
-    X_train, y_train = load_svmlight_file(local_url_train,
-                                          dtype=dtype)
-    X_test, y_test = load_svmlight_file(local_url_test,
-                                        dtype=dtype)
-    X_train = X_train.toarray()[:num_train]
-    X_test = X_test.toarray()[:num_test]
-    y_train = y_train[:num_train]
-    y_train[y_train <= 0] = 0
-    y_test = y_test[:num_test]
-    y_test[y_test <= 0] = 0
-
-    for data, name in zip((X_train, X_test, y_train, y_test),
-                          ('x_train', 'x_test', 'y_train', 'y_test')):
-        filename = f'{dataset_name}_{name}.npy'
-        np.save(os.path.join(dataset_dir, filename), data)
-    logging.info(f'dataset {dataset_name} is ready.')
-    return True
-
-
 def fraud(dataset_dir: Path) -> bool:
     """
     Credit Card Fraud Detection contest
@@ -755,46 +576,6 @@ def higgs_one_m(dataset_dir: Path) -> bool:
     return True
 
 
-def higgs_150K(dataset_dir: Path) -> bool:
-    """
-    Higgs dataset from UCI machine learning repository
-    https://archive.ics.uci.edu/ml/datasets/HIGGS
-
-    Classification task. n_classes = 2.
-    higgs_150K X train dataset (100000, 28)
-    higgs_150K y train dataset (100000, 1)
-    higgs_150K X test dataset (50000, 28)
-    higgs_150K y test dataset (50000, 1)
-    """
-    dataset_name = 'higgs_150K'
-    os.makedirs(dataset_dir, exist_ok=True)
-
-    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz'
-    local_url = os.path.join(dataset_dir, os.path.basename(url))
-    if not os.path.isfile(local_url):
-        logging.info(f'Started loading {dataset_name}')
-        retrieve(url, local_url)
-    logging.info(f'{dataset_name} is loaded, started parsing...')
-
-    nrows_train, nrows_test, dtype = 100000, 50000, np.float32
-    data: Any = pd.read_csv(local_url, delimiter=",", header=None,
-                            compression="gzip", dtype=dtype,
-                            nrows=nrows_train + nrows_test)
-
-    X = data[data.columns[1:]]
-    y = data[data.columns[0:1]]
-
-    x_train, x_test, y_train, y_test = train_test_split(
-        X, y, train_size=nrows_train, test_size=nrows_test, shuffle=False)
-
-    for data, name in zip((x_train, x_test, y_train, y_test),
-                          ('x_train', 'x_test', 'y_train', 'y_test')):
-        filename = f'{dataset_name}_{name}.npy'
-        np.save(os.path.join(dataset_dir, filename), data)
-    logging.info(f'dataset {dataset_name} is ready.')
-    return True
-
-
 def ijcnn(dataset_dir: Path) -> bool:
     """
     Author: Danil Prokhorov.
@@ -830,28 +611,6 @@ def ijcnn(dataset_dir: Path) -> bool:
     logging.info(f'dataset {dataset_name} is ready.')
     return True
 
-def imb_drama(dataset_dir: Path) -> bool:
-    """
-    IMDB.drama dataset from OpenML Datasets (
-    https://www.openml.org/d/273)
-
-    Classification task.
-    Number of features: 1001
-    Number of instances: 120919
-    """
-    dataset_name = 'imb_drama'
-    os.makedirs(dataset_dir, exist_ok=True)
-
-    x_train, y_train = fetch_openml('IMDB.drama', return_X_y=True,
-                                    as_frame=False, data_home=dataset_dir)
-    logging.info(f'{dataset_name} is loaded, started parsing...')
-    for data, name in zip((x_train.todense(), y_train),
-                          ('x_train', 'y_train')):
-        filename = f'{dataset_name}_{name}.npy'
-        np.save(os.path.join(dataset_dir, filename), data)
-    logging.info(f'dataset {dataset_name} is ready.')
-    return True
-
 
 def klaverjas(dataset_dir: Path) -> bool:
     """
@@ -967,49 +726,3 @@ def susy(dataset_dir: Path) -> bool:
         np.save(os.path.join(dataset_dir, filename), data)
     logging.info(f'dataset {dataset_name} is ready.')
     return True
-
-
-def cifar(dataset_dir: Path) -> bool:
-    """
-    CIFAR-10 dataset from LIBSVM Datasets (original source:
-    https://www.cs.toronto.edu/~kriz/cifar.html)
-    TaskType: Classification
-    cifar x train dataset (50000, 3072)
-    cifar y train dataset (50000, 1)
-    cifar x test dataset (10000, 3072)
-    cifar y test dataset (10000, 1)
-    """
-    dataset_name = 'cifar'
-    os.makedirs(dataset_dir, exist_ok=True)
-
-    url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/cifar10.bz2'
-    url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/cifar10.t.bz2'
-    local_url_train = os.path.join(dataset_dir, os.path.basename(url_train))
-    local_url_test = os.path.join(dataset_dir, os.path.basename(url_test))
-
-    if not os.path.isfile(local_url_train):
-        logging.info(f'Started loading {dataset_name}, train')
-        retrieve(url_train, local_url_train)
-    logging.info(f'{dataset_name} is loaded, started parsing...')
-    x_train, y_train = load_svmlight_file(local_url_train,
-                                          dtype=np.float32)
-
-    if not os.path.isfile(local_url_test):
-        logging.info(f'Started loading {dataset_name}, test')
-        retrieve(url_test, local_url_test)
-    logging.info(f'{dataset_name} is loaded, started parsing...')
-    x_test, y_test = load_svmlight_file(local_url_test,
-                                        dtype=np.float32)
-
-    x_train = x_train.toarray()
-    y_train[y_train <= 0] = 0
-
-    x_test = x_test.toarray()
-    y_test[y_test <= 0] = 0
-
-    for data, name in zip((x_train, x_test, y_train, y_test),
-                          ('x_train', 'x_test', 'y_train', 'y_test')):
-        filename = f'{dataset_name}_{name}.npy'
-        np.save(os.path.join(dataset_dir, filename), data)
-    return True
-

From 669f82e3372c2dc9d40d157e8d745531981b0776 Mon Sep 17 00:00:00 2001
From: dmitrii-kriukov
Date: Wed, 19 Jan 2022 12:02:00 +0300
Subject: [PATCH 5/7] revert changing skl config

---
 configs/skl_config.json | 71 ++---------------------------------------
 1 file changed, 3 insertions(+), 68 deletions(-)

diff --git a/configs/skl_config.json b/configs/skl_config.json
index 486177949..f3f1fa93f 100644
--- a/configs/skl_config.json
+++ b/configs/skl_config.json
@@ -19,7 +19,6 @@
                     }
                 }
             ],
-            "workload-size": "small",
             "time-method": "box_filter",
             "time-limit": 50,
             "n-clusters": 1000,
@@ -39,7 +38,6 @@
                     }
                 }
             ],
-            "workload-size": "small",
             "time-method": "box_filter",
             "time-limit": 50,
             "n-clusters": 5,
@@ -49,7 +47,6 @@
         },
         {
             "algorithm": "kmeans",
-            "workload-size": "small",
             "dataset": [
                 {
                     "source": "synthetic",
@@ -69,7 +66,6 @@
         },
         {
             "algorithm": "pca",
-            "workload-size": "small",
             "dataset": [
                 {
                     "source": "synthetic",
@@ -117,7 +113,6 @@
         {
             "algorithm": "df_clsf",
             "dtype": "float32",
-            "workload-size": "small",
             "dataset": [
                 {
                     "source": "npy",
@@ -155,7 +150,6 @@
         },
         {
            "algorithm": "df_regr",
-            "workload-size": "small",
             "dtype": "float32",
             "dataset": [
                 {
@@ -190,7 +184,6 @@
         },
         {
             "algorithm": "ridge",
-            "workload-size": "small",
             "dataset": [
                 {
                     "source": "synthetic",
@@ -213,7 +206,6 @@
         },
         {
             "algorithm": "linear",
-            "workload-size": "small",
             "dataset": [
                 {
                     "source": "synthetic",
@@ -235,7 +227,6 @@
         },
         {
             "algorithm": "log_reg",
-            "workload-size": "small",
             "dataset": [
                 {
                     "source": "synthetic",
@@ -279,7 +270,6 @@
         },
         {
             "algorithm": "svm",
-            "workload-size": "medium",
             "dataset": [
                 {
                     "source": "npy",
@@ -301,7 +291,6 @@
         },
         {
             "algorithm": "svm",
-            "workload-size": "small",
             "dataset": [
                 {
                     "source": "npy",
@@ -323,7 +312,6 @@
         },
         {
             "algorithm": "svm",
-            "workload-size": "medium",
             "dataset": [
                 {
                     "source": "npy",
@@ -345,7 +333,6 @@
         },
         {
             "algorithm": "svm",
-            "workload-size": "medium",
             "dataset": [
                 {
                     "source": "npy",
@@ -367,7 +354,6 @@
         },
         {
             "algorithm": "svm",
-            "workload-size": "medium",
             "dataset": [
                 {
                     "source": "npy",
@@ -389,7 +375,6 @@
         },
         {
             "algorithm": "svm",
-            "workload-size": "large",
             "dataset": [
                 {
                     "source": "npy",
@@ -411,7 +396,6 @@
         },
         {
             "algorithm": "nusvc",
-            "workload-size": "small",
             "dataset": [
                 {
                     "source": "npy",
@@ -433,7 +417,6 @@
         },
         {
             "algorithm": "nusvc",
-            "workload-size": "large",
             "dataset": [
                 {
                     "source": "npy",
@@ -455,7 +438,6 @@
         },
         {
             "algorithm": "nusvc",
-            "workload-size": "medium",
             "dataset": [
                 {
                     "source": "npy",
@@ -477,7 +459,6 @@
         },
         {
             "algorithm": "nusvc",
-            "workload-size": "medium",
             "dataset": [
                 {
                     "source": "npy",
@@ -499,7 +480,6 @@
         },
         {
             "algorithm": "svr",
-            "workload-size": "small",
             "dataset": [
                 {
                     "source": "npy",
@@ -521,7 +501,6 @@
         },
         {
             "algorithm": "svr",
-            "workload-size": "medium",
             "dataset": [
                 {
                     "source": "npy",
@@ -543,7 +522,6 @@
         },
         {
             "algorithm": "nusvr",
-            "workload-size": "small",
             "dataset": [
                 {
                     "source": "npy",
@@ -566,7 +544,6 @@
         },
         {
             "algorithm": "nusvr",
-            "workload-size": "medium",
             "dataset": [
                 {
                     "source": "npy",
@@ -590,7 +567,6 @@
         },
         {
             "algorithm": "nusvr",
-            "workload-size": "large",
             "dataset": [
                 {
                     "source": "npy",
@@ -613,7 +589,6 @@
         },
         {
             "algorithm": "dbscan",
-            "workload-size": "small",
             "dataset": [
                 {
                     "source": "synthetic",
@@ -632,13 +607,7 @@
                     "training": {
                         "n_samples": 500000
                     }
-                }
-            ]
-        },
-        {
-            "algorithm": "dbscan",
-            "workload-size": "medium",
-            "dataset": [
+                },
                 {
                     "source": "synthetic",
                     "type": "blobs",
@@ -652,7 +621,6 @@
         },
         {
             "algorithm": "knn_clsf",
-            "workload-size": "small",
             "dtype": "float32",
             "dataset": [
                 {
@@ -690,15 +658,7 @@
                     "testing": {
                         "n_samples": 20000
                     }
-                }
-            ],
-            "method": ["brute", "kd_tree"]
-        },
-        {
-            "algorithm": "knn_clsf",
-            "workload-size": "small",
-            "dtype": "float32",
-            "dataset": [
+                },
                 {
                     "source": "synthetic",
                     "type": "classification",
@@ -712,31 +672,10 @@
                     }
                 }
             ],
-            "method": "kd_tree"
-        },
-        {
-            "algorithm": "knn_clsf",
-            "workload-size": "medium",
-            "dtype": "float32",
-            "dataset": [
-                {
-                    "source": "synthetic",
-                    "type": "classification",
-                    "n_classes": 10,
-                    "n_features": 16,
-                    "training": {
-                        "n_samples": 250000
-                    },
-                    "testing": {
-                        "n_samples": 250000
-                    }
-                }
-            ],
-            "method": "brute"
+            "method": ["brute", "kd_tree"]
         },
         {
             "algorithm": "train_test_split",
-            "workload-size": "small",
             "dataset": [
                 {
                     "source": "synthetic",
@@ -772,7 +711,6 @@
         },
         {
             "algorithm": "train_test_split",
-            "workload-size": "small",
             "dataset": [
                 {
                     "source": "npy",
@@ -792,7 +730,6 @@
         },
         {
             "algorithm": "lasso",
-            "workload-size": "small",
             "dataset": [
                 {
                     "source": "npy",
@@ -809,7 +746,6 @@
         },
         {
             "algorithm": "elasticnet",
-            "workload-size": "small",
             "dataset": [
                 {
                     "source": "npy",
@@ -832,7 +768,6 @@
         },
         {
             "algorithm": "tsne",
-            "workload-size": "large",
             "dataset": [
                 {
                     "source": "npy",

From 08a66ee845aa03b6b951fd9d182074e0ec35ca9d Mon Sep 17 00:00:00 2001
From: dmitrii-kriukov
Date: Wed, 19 Jan 2022 12:11:21 +0300
Subject: [PATCH 6/7] sizes for df_clsf

---
 configs/xpu/df_clsf.json | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/configs/xpu/df_clsf.json b/configs/xpu/df_clsf.json
index 4d14763b8..0c7c25d70 100644
--- a/configs/xpu/df_clsf.json
+++ b/configs/xpu/df_clsf.json
@@ -26,6 +26,7 @@
                     }
                 }
             ],
+            "workload-size": "medium",
             "num-trees": 10,
             "max-depth": 5
         },
@@ -46,6 +47,7 @@
                     }
                 }
             ],
+            "workload-size": "large",
             "num-trees": 100,
             "max-depth": 8
         },
@@ -66,6 +68,7 @@
                     }
                 }
             ],
+            "workload-size": "medium",
             "num-trees": 20,
             "max-depth": 16
         },
@@ -86,6 +89,7 @@
                     }
                 }
             ],
+            "workload-size": "large",
             "num-trees": 100,
             "max-depth": 10
         },
@@ -106,6 +110,7 @@
                     }
                 }
             ],
+            "workload-size": "medium",
             "num-trees": 50,
             "max-depth": 15
         }

From caf58888915ff527d3b736c39678c345d7618169 Mon Sep 17 00:00:00 2001
From: dmitrii-kriukov
Date: Wed, 19 Jan 2022 14:41:03 +0300
Subject: [PATCH 7/7] update help info

---
 runner.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/runner.py b/runner.py
index 130b3c01d..f030a76b5 100755
--- a/runner.py
+++ b/runner.py
@@ -60,7 +60,9 @@ def get_configs(path: Path) -> List[str]:
                              'make sure to add the dtype parameter to the config file ')
     parser.add_argument('--workload-size', type=str, default="small medium large", nargs='+',
                         choices=("small", "medium", "large"),
-                        help='Available dataset sizes')
+                        help='Available workload sizes; '
+                             'make sure to add the workload-size parameter to the config file. '
+                             'Unmarked workloads will be launched anyway')
     parser.add_argument('--no-intel-optimized', default=False, action='store_true',
                         help='Use Scikit-learn without Intel optimizations')
     parser.add_argument('--output-file', default='results.json',
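With the series applied, runner.py keeps or skips each benchmark case as sketched below. This is a condensed, standalone rendition of the PATCH 1/2/7 hunks: the argparse flag and the filtering block are taken from the patches, while the config parsing is replaced by inline stand-in data:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--workload-size', type=str, default="small medium large",
                    nargs='+', choices=("small", "medium", "large"),
                    help='Available workload sizes')
args = parser.parse_args()

common_params = {'lib': 'sklearn'}  # stand-in for the config's "common" block
cases = [
    {'algorithm': 'svm', 'workload-size': 'large'},  # runs only when "large" is allowed
    {'algorithm': 'pca'},                            # unmarked case: always runs
]

for params_set in cases:
    params = common_params.copy()
    params.update(params_set.copy())

    if 'workload-size' in params:
        # With an explicit --workload-size, args.workload_size is a list and this
        # is list membership; with the default it is the single string
        # "small medium large", so the check degrades to substring matching.
        # Both accept the three valid sizes, so the default admits everything.
        if params['workload-size'] not in args.workload_size:
            continue
        del params['workload-size']

    print('would run:', params)

Cases whose config carries no workload-size key never reach the filter, which matches the final help text: unmarked workloads are launched regardless of the flag.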