From 6272ea525c8155efb896525e6d1672e779fca3b3 Mon Sep 17 00:00:00 2001 From: dmitrii-kriukov Date: Mon, 17 Jan 2022 19:11:30 +0300 Subject: [PATCH 01/20] knn_svm --- configs/xpu/knn_clsf.json | 162 +++++++++++++++++ configs/xpu/knn_regr.json | 69 +++++++ configs/xpu/svm.json | 192 ++++++++++++++++++++ datasets/load_datasets.py | 14 +- datasets/loader_classification.py | 289 +++++++++++++++++++++++++++++- 5 files changed, 722 insertions(+), 4 deletions(-) create mode 100644 configs/xpu/knn_clsf.json create mode 100644 configs/xpu/knn_regr.json create mode 100644 configs/xpu/svm.json diff --git a/configs/xpu/knn_clsf.json b/configs/xpu/knn_clsf.json new file mode 100644 index 000000000..2d72c4ade --- /dev/null +++ b/configs/xpu/knn_clsf.json @@ -0,0 +1,162 @@ +{ + "common": { + "lib": "sklearn", + "algorithm": "knn_clsf", + "data-format": "pandas", + "data-order": "F", + "dtype": ["float32", "float64"], + "device": ["host", "cpu", "gpu", "none"] + }, + "cases": [ + { + "dataset": [ + { + "source": "npy", + "name": "epsilon_100K", + "training": + { + "x": "data/epsilon_100K_x_train.npy", + "y": "data/epsilon_100K_y_train.npy" + }, + "testing": + { + "x": "data/epsilon_100K_x_test.npy", + "y": "data/epsilon_100K_y_test.npy" + } + } + ], + "n-neighbors": [2, 100] + }, + { + "dataset": [ + { + "source": "npy", + "name": "higgs_150K", + "training": + { + "x": "data/higgs_150K_x_train.npy", + "y": "data/higgs_150K_y_train.npy" + }, + "testing": + { + "x": "data/higgs_150K_x_test.npy", + "y": "data/higgs_150K_y_test.npy" + } + }, + { + "source": "npy", + "name": "hepmass_150K", + "training": + { + "x": "data/hepmass_150K_x_train.npy", + "y": "data/hepmass_150K_y_train.npy" + }, + "testing": + { + "x": "data/hepmass_150K_x_test.npy", + "y": "data/hepmass_150K_y_test.npy" + } + } + ], + "n-neighbors": [5, 100] + }, + { + "dataset": [ + { + "source": "npy", + "name": "cifar", + "training": + { + "x": "data/cifar_x_train.npy", + "y": "data/cifar_y_train.npy" + }, + "testing": + { + "x": "data/cifar_x_test.npy", + "y": "data/cifar_y_test.npy" + } + } + ], + "n-neighbors": 7 + }, + { + "dataset": [ + { + "source": "npy", + "name": "mnist", + "training": + { + "x": "data/mnist_x_train.npy", + "y": "data/mnist_y_train.npy" + }, + "testing": + { + "x": "data/mnist_x_test.npy", + "y": "data/mnist_y_test.npy" + } + } + ], + "n-neighbors": 5 + }, + { + "dataset": [ + { + "source": "npy", + "name": "epsilon_100K", + "training": + { + "x": "data/epsilon_100K_x_train.npy", + "y": "data/epsilon_100K_y_train.npy" + }, + "testing": + { + "x": "data/epsilon_100K_x_test.npy", + "y": "data/epsilon_100K_y_test.npy" + } + } + ], + "task": "search", + "n-neighbors": 2 + }, + { + "dataset": [ + { + "source": "npy", + "name": "higgs_150K", + "training": + { + "x": "data/higgs_150K_x_train.npy", + "y": "data/higgs_150K_y_train.npy" + }, + "testing": + { + "x": "data/higgs_150K_x_test.npy", + "y": "data/higgs_150K_y_test.npy" + } + } + ], + "task": "search", + "n-neighbors": 5 + }, + { + "dataset": [ + { + "source": "npy", + "name": "cifar", + "training": + { + "x": "data/cifar_x_train.npy", + "y": "data/cifar_y_train.npy" + }, + "testing": + { + "x": "data/cifar_x_test.npy", + "y": "data/cifar_y_test.npy" + } + } + ], + "task": "search", + "n-neighbors": 7 + } + ] +} diff --git a/configs/xpu/knn_regr.json b/configs/xpu/knn_regr.json new file mode 100644 index 000000000..ec1fbc9a9 --- /dev/null +++ b/configs/xpu/knn_regr.json @@ -0,0 +1,69 @@ +{ + "common": { + "lib": "sklearn", + "algorithm": "knn_regr", + "data-format": 
"pandas", + "data-order": "F", + "dtype": ["float32", "float64"], + "device": ["host", "cpu", "gpu", "none"] + }, + "cases": [ + { + "dataset": [ + { + "source": "npy", + "name": "year_prediction_msd", + "training": + { + "x": "data/year_prediction_msd_x_train.npy", + "y": "data/year_prediction_msd_y_train.npy" + }, + "testing": + { + "x": "data/year_prediction_msd_x_test.npy", + "y": "data/year_prediction_msd_y_test.npy" + } + } + ], + "n-neighbors": 2 + }, + { + "dataset": [ + { + "source": "npy", + "name": "higgs_150K", + "training": + { + "x": "data/higgs_150K_x_train.npy", + "y": "data/higgs_150K_y_train.npy" + }, + "testing": + { + "x": "data/higgs_150K_x_test.npy", + "y": "data/higgs_150K_y_test.npy" + } + } + ], + "n-neighbors": 5 + }, + { + "dataset": [ + { + "source": "npy", + "name": "cifar", + "training": + { + "x": "data/cifar_x_train.npy", + "y": "data/cifar_y_train.npy" + }, + "testing": + { + "x": "data/cifar_x_test.npy", + "y": "data/cifar_y_test.npy" + } + } + ], + "n-neighbors": 7 + } + ] +} diff --git a/configs/xpu/svm.json b/configs/xpu/svm.json new file mode 100644 index 000000000..a98377532 --- /dev/null +++ b/configs/xpu/svm.json @@ -0,0 +1,192 @@ +{ + "common": { + "lib": "sklearn", + "algorithm": "svm", + "data-format": "pandas", + "data-order": "F", + "dtype": ["float32", "float64"], + "device": ["host", "cpu", "gpu", "none"] + }, + "cases": [ + { + "dataset": [ + { + "source": "npy", + "name": "gisette", + "training": + { + "x": "data/gisette_x_train.npy", + "y": "data/gisette_y_train.npy" + }, + "testing": + { + "x": "data/gisette_x_test.npy", + "y": "data/gisette_y_test.npy" + } + } + ], + "C": 1.5e-3, + "kernel": "linear" + }, + { + "dataset": [ + { + "source": "npy", + "name": "higgs_150K", + "training": + { + "x": "data/higgs_150K_x_train.npy", + "y": "data/higgs_150K_y_train.npy" + }, + "testing": + { + "x": "data/higgs_150K_x_test.npy", + "y": "data/higgs_150K_y_test.npy" + } + } + ], + "C": 1.0, + "kernel": "linear" + }, + { + "dataset": [ + { + "source": "npy", + "name": "epsilon_80K", + "training": + { + "x": "data/epsilon_80K_x_train.npy", + "y": "data/epsilon_80K_y_train.npy" + }, + "testing": + { + "x": "data/epsilon_80K_x_test.npy", + "y": "data/epsilon_80K_y_test.npy" + } + } + ], + "C": 1.0, + "kernel": "linear" + }, + { + "dataset": [ + { + "source": "npy", + "name": "cifar", + "training": + { + "x": "data/cifar_x_train.npy", + "y": "data/cifar_y_train.npy" + }, + "testing": + { + "x": "data/cifar_x_test.npy", + "y": "data/cifar_y_test.npy" + } + } + ], + "C": 1.0e-7, + "kernel": "linear" + }, + { + "dataset": [ + { + "source": "npy", + "name": "imb_drama", + "training": + { + "x": "data/imb_drama_x_train.npy", + "y": "data/imb_drama_y_train.npy" + }, + "testing": + { + "x": "data/imb_drama_x_train.npy", + "y": "data/imb_drama_y_train.npy" + } + } + ], + "C": 1e-3, + "kernel": "linear" + }, + { + "dataset": [ + { + "source": "npy", + "name": "epsilon_16K", + "training": + { + "x": "data/epsilon_16K_x_train.npy", + "y": "data/epsilon_16K_y_train.npy" + }, + "testing": + { + "x": "data/epsilon_16K_x_test.npy", + "y": "data/epsilon_16K_y_test.npy" + } + } + ], + "C": 9.0e2, + "kernel": "rbf" + }, + { + "dataset": [ + { + "source": "npy", + "name": "covtype_binary", + "training": + { + "x": "data/covtype_binary_x_train.npy", + "y": "data/covtype_binary_y_train.npy" + }, + "testing": + { + "x": "data/covtype_binary_x_test.npy", + "y": "data/covtype_binary_y_test.npy" + } + } + ], + "C": 1000.0, + "kernel": "rbf" + }, + { + "dataset": [ + { + 
"source": "npy", + "name": "epsilon_80K", + "training": + { + "x": "data/epsilon_80K_x_train.npy", + "y": "data/epsilon_80K_y_train.npy" + }, + "testing": + { + "x": "data/epsilon_80K_x_test.npy", + "y": "data/epsilon_80K_y_test.npy" + } + } + ], + "C": 1000.0, + "kernel": "rbf" + }, + { + "dataset": [ + { + "source": "npy", + "name": "imb_drama", + "training": + { + "x": "data/imb_drama_x_train.npy", + "y": "data/imb_drama_y_train.npy" + }, + "testing": + { + "x": "data/imb_drama_x_train.npy", + "y": "data/imb_drama_y_train.npy" + } + } + ], + "C": 50, + "kernel": "rbf" + } + ] +} diff --git a/datasets/load_datasets.py b/datasets/load_datasets.py index 0a7874d92..fbd7685d4 100644 --- a/datasets/load_datasets.py +++ b/datasets/load_datasets.py @@ -22,8 +22,10 @@ from typing import Callable, Dict from .loader_classification import (a_nine_a, airline, airline_ohe, bosch, - census, codrnanorm, creditcard, epsilon, fraud, - gisette, hepmass_150K, higgs, higgs_one_m, ijcnn, + census, cifar, codrnanorm, covtype_binary, creditcard, + epsilon_16K, epsilon_80K, epsilon, epsilon_100K, + fraud, gisette, hepmass_150K, + higgs, higgs_one_m, higgs_150K, ijcnn, imb_drama, klaverjas, santander, skin_segmentation, susy) from .loader_multiclass import (connect, covertype, covtype, letters, mlsr, mnist, msrank, plasticc, sensit) @@ -40,19 +42,25 @@ "bosch": bosch, "california_housing": california_housing, "census": census, + "cifar": cifar, "codrnanorm": codrnanorm, "connect": connect, - "covertype": covertype, + "covtype_binary": covtype_binary, "covtype": covtype, "creditcard": creditcard, "epsilon": epsilon, + "epsilon_16K": epsilon_16K, + "epsilon_80K": epsilon_80K, + "epsilon_100K": epsilon_100K, "fraud": fraud, "fried": fried, "gisette": gisette, "hepmass_150K": hepmass_150K, "higgs": higgs, "higgs1m": higgs_one_m, + "higgs_150K": higgs_150K, "ijcnn": ijcnn, + "imb_drama": imb_drama, "klaverjas": klaverjas, "letters": letters, "mlsr": mlsr, diff --git a/datasets/loader_classification.py b/datasets/loader_classification.py index fc3cb892d..ffb84f12f 100644 --- a/datasets/loader_classification.py +++ b/datasets/loader_classification.py @@ -22,7 +22,7 @@ import numpy as np import pandas as pd -from sklearn.datasets import fetch_openml, load_svmlight_file +from sklearn.datasets import fetch_openml, load_svmlight_file, fetch_covtype from sklearn.model_selection import train_test_split from .loader_utils import retrieve @@ -261,6 +261,41 @@ def codrnanorm(dataset_dir: Path) -> bool: return True +def covtype_binary(dataset_dir: Path) -> bool: + """ + Cover type dataset from UCI machine learning repository + https://archive.ics.uci.edu/ml/datasets/covertype + + y contains 7 unique class labels from 1 to 7 inclusive. + Classification task. n_classes = 7. 
+ covtype X train dataset (464809, 54) + covtype y train dataset (464809, 1) + covtype X test dataset (116203, 54) + covtype y test dataset (116203, 1) + """ + dataset_name = 'covtype_binary' + os.makedirs(dataset_dir, exist_ok=True) + + nrows_train, nrows_test = 100000, 100000 + logging.info(f'Started loading {dataset_name}') + X, y = fetch_covtype(return_X_y=True) # pylint: disable=unexpected-keyword-arg + logging.info(f'{dataset_name} is loaded, started parsing...') + + y = (y > 3).astype(int) + + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77, + train_size=nrows_train, + test_size=nrows_test, + shuffle=False + ) + for data, name in zip((X_train, X_test, y_train, y_test), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset {dataset_name} is ready.') + return True + + def creditcard(dataset_dir: Path) -> bool: """ Classification task. n_classes = 2. @@ -334,6 +369,150 @@ def epsilon(dataset_dir: Path) -> bool: return True +def epsilon_16K(dataset_dir: Path) -> bool: + """ + Epsilon dataset + https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html + + Classification task. n_classes = 2. + epsilon_100K x train dataset (16000, 2000) + epsilon_100K y train dataset (16000, 1) + epsilon_100K x test dataset (16000, 2000) + epsilon_100K y test dataset (16000, 1) + """ + dataset_name = 'epsilon_16K' + os.makedirs(dataset_dir, exist_ok=True) + + url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ + '/epsilon_normalized.bz2' + url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ + '/epsilon_normalized.t.bz2' + local_url_train = os.path.join(dataset_dir, os.path.basename(url_train)) + local_url_test = os.path.join(dataset_dir, os.path.basename(url_test)) + + num_train, num_test, dtype = 16000, 16000, np.float32 + if not os.path.isfile(local_url_train): + logging.info(f'Started loading {dataset_name}, train') + retrieve(url_train, local_url_train) + if not os.path.isfile(local_url_test): + logging.info(f'Started loading {dataset_name}, test') + retrieve(url_test, local_url_test) + logging.info(f'{dataset_name} is loaded, started parsing...') + X_train, y_train = load_svmlight_file(local_url_train, + dtype=dtype) + X_test, y_test = load_svmlight_file(local_url_test, + dtype=dtype) + X_train = X_train.toarray()[:num_train] + X_test = X_test.toarray()[:num_test] + y_train = y_train[:num_train] + y_train[y_train <= 0] = 0 + y_test = y_test[:num_test] + y_test[y_test <= 0] = 0 + + for data, name in zip((X_train, X_test, y_train, y_test), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset {dataset_name} is ready.') + return True + + +def epsilon_100K(dataset_dir: Path) -> bool: + """ + Epsilon dataset + https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html + + Classification task. n_classes = 2. 
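+    (the loader keeps the first 50000 training and 50000 testing rows of the
+    source, 100K samples in total)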
+ epsilon_100K x train dataset (50000, 2000) + epsilon_100K y train dataset (50000, 1) + epsilon_100K x test dataset (50000, 2000) + epsilon_100K y test dataset (50000, 1) + """ + dataset_name = 'epsilon_100K' + os.makedirs(dataset_dir, exist_ok=True) + + url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ + '/epsilon_normalized.bz2' + url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ + '/epsilon_normalized.t.bz2' + local_url_train = os.path.join(dataset_dir, os.path.basename(url_train)) + local_url_test = os.path.join(dataset_dir, os.path.basename(url_test)) + + num_train, num_test, dtype = 50000, 50000, np.float32 + if not os.path.isfile(local_url_train): + logging.info(f'Started loading {dataset_name}, train') + retrieve(url_train, local_url_train) + if not os.path.isfile(local_url_test): + logging.info(f'Started loading {dataset_name}, test') + retrieve(url_test, local_url_test) + logging.info(f'{dataset_name} is loaded, started parsing...') + X_train, y_train = load_svmlight_file(local_url_train, + dtype=dtype) + X_test, y_test = load_svmlight_file(local_url_test, + dtype=dtype) + X_train = X_train.toarray()[:num_train] + X_test = X_test.toarray()[:num_test] + y_train = y_train[:num_train] + y_train[y_train <= 0] = 0 + y_test = y_test[:num_test] + y_test[y_test <= 0] = 0 + + for data, name in zip((X_train, X_test, y_train, y_test), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset {dataset_name} is ready.') + return True + + +def epsilon_80K(dataset_dir: Path) -> bool: + """ + Epsilon dataset + https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html + + Classification task. n_classes = 2. 
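+    (the shape lines below describe this epsilon_80K split: the first 80000
+    training and 80000 testing rows of the source are kept)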
+ epsilon_100K x train dataset (80000, 2000) + epsilon_100K y train dataset (80000, 1) + epsilon_100K x test dataset (80000, 2000) + epsilon_100K y test dataset (80000, 1) + """ + dataset_name = 'epsilon_80K' + os.makedirs(dataset_dir, exist_ok=True) + + url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ + '/epsilon_normalized.bz2' + url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ + '/epsilon_normalized.t.bz2' + local_url_train = os.path.join(dataset_dir, os.path.basename(url_train)) + local_url_test = os.path.join(dataset_dir, os.path.basename(url_test)) + + num_train, num_test, dtype = 80000, 80000, np.float32 + if not os.path.isfile(local_url_train): + logging.info(f'Started loading {dataset_name}, train') + retrieve(url_train, local_url_train) + if not os.path.isfile(local_url_test): + logging.info(f'Started loading {dataset_name}, test') + retrieve(url_test, local_url_test) + logging.info(f'{dataset_name} is loaded, started parsing...') + X_train, y_train = load_svmlight_file(local_url_train, + dtype=dtype) + X_test, y_test = load_svmlight_file(local_url_test, + dtype=dtype) + X_train = X_train.toarray()[:num_train] + X_test = X_test.toarray()[:num_test] + y_train = y_train[:num_train] + y_train[y_train <= 0] = 0 + y_test = y_test[:num_test] + y_test[y_test <= 0] = 0 + + for data, name in zip((X_train, X_test, y_train, y_test), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset {dataset_name} is ready.') + return True + + def fraud(dataset_dir: Path) -> bool: """ Credit Card Fraud Detection contest @@ -576,6 +755,46 @@ def higgs_one_m(dataset_dir: Path) -> bool: return True +def higgs_150K(dataset_dir: Path) -> bool: + """ + Higgs dataset from UCI machine learning repository + https://archive.ics.uci.edu/ml/datasets/HIGGS + + Classification task. n_classes = 2. + higgs_150K X train dataset (100000, 28) + higgs_150K y train dataset (50000, 1) + higgs_150K X test dataset (100000, 28) + higgs_150K y test dataset (50000, 1) + """ + dataset_name = 'higgs_150K' + os.makedirs(dataset_dir, exist_ok=True) + + url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz' + local_url = os.path.join(dataset_dir, os.path.basename(url)) + if not os.path.isfile(local_url): + logging.info(f'Started loading {dataset_name}') + retrieve(url, local_url) + logging.info(f'{dataset_name} is loaded, started parsing...') + + nrows_train, nrows_test, dtype = 100000, 50000, np.float32 + data: Any = pd.read_csv(local_url, delimiter=",", header=None, + compression="gzip", dtype=dtype, + nrows=nrows_train + nrows_test) + + X = data[data.columns[1:]] + y = data[data.columns[0:1]] + + x_train, x_test, y_train, y_test = train_test_split( + X, y, train_size=nrows_train, test_size=nrows_test, shuffle=False) + + for data, name in zip((x_train, x_test, y_train, y_test), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset {dataset_name} is ready.') + return True + + def ijcnn(dataset_dir: Path) -> bool: """ Author: Danil Prokhorov. @@ -611,6 +830,28 @@ def ijcnn(dataset_dir: Path) -> bool: logging.info(f'dataset {dataset_name} is ready.') return True +def imb_drama(dataset_dir: Path) -> bool: + """ + imdb_drama dataset from OpenML Datasets ( + https://www.openml.org/d/273) + + Classification task. 
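+    Only a training split is saved (x_train, y_train); the sparse OpenML
+    matrix is densified with .todense() before saving.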
+ Number of features: 1001 + Number of instances: 120919 + """ + dataset_name = 'imb_drama' + os.makedirs(dataset_dir, exist_ok=True) + + x_train, y_train = fetch_openml('IMDB.drama', return_X_y=True, + as_frame=False, data_home=dataset_dir) + logging.info(f'{dataset_name} is loaded, started parsing...') + for data, name in zip((x_train.todense(), y_train), + ('x_train', 'y_train')): + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset {dataset_name} is ready.') + return True + def klaverjas(dataset_dir: Path) -> bool: """ @@ -726,3 +967,49 @@ def susy(dataset_dir: Path) -> bool: np.save(os.path.join(dataset_dir, filename), data) logging.info(f'dataset {dataset_name} is ready.') return True + + +def cifar(dataset_dir: Path) -> bool: + """ + Cifar dataset from LIBSVM Datasets ( + https://www.cs.toronto.edu/~kriz/cifar.html#cifar) + TaskType: Classification + cifar x train dataset (50000, 3072) + cifar y train dataset (50000, 1) + cifar x test dataset (10000, 3072) + cifar y test dataset (10000, 1) + """ + dataset_name = 'cifar' + os.makedirs(dataset_dir, exist_ok=True) + + url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/cifar10.bz2' + url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/cifar10.t.bz2' + local_url_train = os.path.join(dataset_dir, os.path.basename(url_train)) + local_url_test = os.path.join(dataset_dir, os.path.basename(url_test)) + + if not os.path.isfile(local_url_train): + logging.info(f'Started loading {dataset_name}, train') + retrieve(url_train, local_url_train) + logging.info(f'{dataset_name} is loaded, started parsing...') + x_train, y_train = load_svmlight_file(local_url_train, + dtype=np.float32) + + if not os.path.isfile(local_url_test): + logging.info(f'Started loading {dataset_name}, test') + retrieve(url_test, local_url_test) + logging.info(f'{dataset_name} is loaded, started parsing...') + x_test, y_test = load_svmlight_file(local_url_test, + dtype=np.float32) + + x_train = x_train.toarray() + y_train[y_train <= 0] = 0 + + x_test = x_test.toarray() + y_test[y_test <= 0] = 0 + + for data, name in zip((x_train, x_test, y_train, y_test), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + return True + From 81868d37ec55ec0be33d1ebe62849e7c97305eb4 Mon Sep 17 00:00:00 2001 From: dmitrii-kriukov Date: Mon, 17 Jan 2022 19:16:10 +0300 Subject: [PATCH 02/20] Revert "knn_svm" This reverts commit 6272ea525c8155efb896525e6d1672e779fca3b3. 
--- configs/xpu/knn_clsf.json | 162 ----------------- configs/xpu/knn_regr.json | 69 ------- configs/xpu/svm.json | 192 -------------------- datasets/load_datasets.py | 14 +- datasets/loader_classification.py | 289 +----------------------------- 5 files changed, 4 insertions(+), 722 deletions(-) delete mode 100644 configs/xpu/knn_clsf.json delete mode 100644 configs/xpu/knn_regr.json delete mode 100644 configs/xpu/svm.json diff --git a/configs/xpu/knn_clsf.json b/configs/xpu/knn_clsf.json deleted file mode 100644 index 2d72c4ade..000000000 --- a/configs/xpu/knn_clsf.json +++ /dev/null @@ -1,162 +0,0 @@ -{ - "common": { - "lib": "sklearn", - "algorithm": "knn_clsf", - "data-format": "pandas", - "data-order": "F", - "dtype": ["float32", "float64"], - "device": ["host", "cpu", "gpu", "none"] - }, - "cases": [ - { - "dataset": [ - { - "source": "npy", - "name": "epsilon_100K", - "training": - { - "x": "data/epsilon_100K_x_train.npy", - "y": "data/epsilon_100K_y_train.npy" - }, - "testing": - { - "x": "data/epsilon_100K_x_test.npy", - "y": "data/epsilon_100K_y_test.npy" - } - } - ], - "n-neighbors": [2, 100] - }, - { - "dataset": [ - { - "source": "npy", - "name": "higgs_150K", - "training": - { - "x": "data/higgs_150K_x_train.npy", - "y": "data/higgs_150K_y_train.npy" - }, - "testing": - { - "x": "data/higgs_150K_x_test.npy", - "y": "data/higgs_150K_y_test.npy" - } - }, - { - "source": "npy", - "name": "hepmass_150K", - "training": - { - "x": "data/hepmass_150K_x_train.npy", - "y": "data/hepmass_150K_y_train.npy" - }, - "testing": - { - "x": "data/hepmass_150K_x_test.npy", - "y": "data/hepmass_150K_y_test.npy" - } - } - ], - "n-neighbors": [5, 100] - }, - { - "dataset": [ - { - "source": "npy", - "name": "cifar", - "training": - { - "x": "data/cifar_x_train.npy", - "y": "data/cifar_y_train.npy" - }, - "testing": - { - "x": "data/cifar_x_test.npy", - "y": "data/cifar_y_test.npy" - } - } - ], - "n-neighbors": 7 - }, - { - "dataset": [ - { - "source": "npy", - "name": "mnist", - "training": - { - "x": "data/mnist_x_train.npy", - "y": "data/mnist_y_train.npy" - }, - "testing": - { - "x": "data/mnist_x_test.npy", - "y": "data/mnist_y_test.npy" - } - } - ], - "n-neighbors": 5 - }, - { - "dataset": [ - { - "source": "npy", - "name": "epsilon_100K", - "training": - { - "x": "data/epsilon_100K_x_train.npy", - "y": "data/epsilon_100K_y_train.npy" - }, - "testing": - { - "x": "data/epsilon_100K_x_test.npy", - "y": "data/epsilon_100K_y_test.npy" - } - } - ], - "task": "search", - "n-neighbors": 2 - }, - { - "dataset": [ - { - "source": "npy", - "name": "higgs_150K", - "training": - { - "x": "data/higgs_150K_x_train.npy", - "y": "data/higgs_150K_y_train.npy" - }, - "testing": - { - "x": "data/higgs_150K_x_test.npy", - "y": "data/higgs_150K_y_test.npy" - } - } - ], - "task": "search", - "n-neighbors": 5 - }, - { - "dataset": [ - { - "source": "npy", - "name": "cifar", - "training": - { - "x": "data/cifar_x_train.npy", - "y": "data/cifar_y_train.npy" - }, - "testing": - { - "x": "data/cifar_x_test.npy", - "y": "data/cifar_y_test.npy" - } - } - ], - "task": "search", - "n-neighbors": 7 - } - ] -} diff --git a/configs/xpu/knn_regr.json b/configs/xpu/knn_regr.json deleted file mode 100644 index ec1fbc9a9..000000000 --- a/configs/xpu/knn_regr.json +++ /dev/null @@ -1,69 +0,0 @@ -{ - "common": { - "lib": "sklearn", - "algorithm": "knn_regr", - "data-format": "pandas", - "data-order": "F", - "dtype": ["float32", "float64"], - "device": ["host", "cpu", "gpu", "none"] - }, - "cases": [ - { - "dataset": [ - { - 
"source": "npy", - "name": "year_prediction_msd", - "training": - { - "x": "data/year_prediction_msd_x_train.npy", - "y": "data/year_prediction_msd_y_train.npy" - }, - "testing": - { - "x": "data/year_prediction_msd_x_test.npy", - "y": "data/year_prediction_msd_y_test.npy" - } - } - ], - "n-neighbors": 2 - }, - { - "dataset": [ - { - "source": "npy", - "name": "higgs_150K", - "training": - { - "x": "data/higgs_150K_x_train.npy", - "y": "data/higgs_150K_y_train.npy" - }, - "testing": - { - "x": "data/higgs_150K_x_test.npy", - "y": "data/higgs_150K_y_test.npy" - } - } - ], - "n-neighbors": 5 - }, - { - "dataset": [ - { - "source": "npy", - "name": "cifar", - "training": - { - "x": "data/cifar_x_train.npy", - "y": "data/cifar_y_train.npy" - }, - "testing": - { - "x": "data/cifar_x_test.npy", - "y": "data/cifar_y_test.npy" - } - } - ], - "n-neighbors": 7 - } - ] -} diff --git a/configs/xpu/svm.json b/configs/xpu/svm.json deleted file mode 100644 index a98377532..000000000 --- a/configs/xpu/svm.json +++ /dev/null @@ -1,192 +0,0 @@ -{ - "common": { - "lib": "sklearn", - "algorithm": "svm", - "data-format": "pandas", - "data-order": "F", - "dtype": ["float32", "float64"], - "device": ["host", "cpu", "gpu", "none"] - }, - "cases": [ - { - "dataset": [ - { - "source": "npy", - "name": "gisette", - "training": - { - "x": "data/gisette_x_train.npy", - "y": "data/gisette_y_train.npy" - }, - "testing": - { - "x": "data/gisette_x_test.npy", - "y": "data/gisette_y_test.npy" - } - } - ], - "C": 1.5e-3, - "kernel": "linear" - }, - { - "dataset": [ - { - "source": "npy", - "name": "higgs_150K", - "training": - { - "x": "data/higgs_150K_x_train.npy", - "y": "data/higgs_150K_y_train.npy" - }, - "testing": - { - "x": "data/higgs_150K_x_test.npy", - "y": "data/higgs_150K_y_test.npy" - } - } - ], - "C": 1.0, - "kernel": "linear" - }, - { - "dataset": [ - { - "source": "npy", - "name": "epsilon_80K", - "training": - { - "x": "data/epsilon_80K_x_train.npy", - "y": "data/epsilon_80K_y_train.npy" - }, - "testing": - { - "x": "data/epsilon_80K_x_test.npy", - "y": "data/epsilon_80K_y_test.npy" - } - } - ], - "C": 1.0, - "kernel": "linear" - }, - { - "dataset": [ - { - "source": "npy", - "name": "cifar", - "training": - { - "x": "data/cifar_x_train.npy", - "y": "data/cifar_y_train.npy" - }, - "testing": - { - "x": "data/cifar_x_test.npy", - "y": "data/cifar_y_test.npy" - } - } - ], - "C": 1.0e-7, - "kernel": "linear" - }, - { - "dataset": [ - { - "source": "npy", - "name": "imb_drama", - "training": - { - "x": "data/imb_drama_x_train.npy", - "y": "data/imb_drama_y_train.npy" - }, - "testing": - { - "x": "data/imb_drama_x_train.npy", - "y": "data/imb_drama_y_train.npy" - } - } - ], - "C": 1e-3, - "kernel": "linear" - }, - { - "dataset": [ - { - "source": "npy", - "name": "epsilon_16K", - "training": - { - "x": "data/epsilon_16K_x_train.npy", - "y": "data/epsilon_16K_y_train.npy" - }, - "testing": - { - "x": "data/epsilon_16K_x_test.npy", - "y": "data/epsilon_16K_y_test.npy" - } - } - ], - "C": 9.0e2, - "kernel": "rbf" - }, - { - "dataset": [ - { - "source": "npy", - "name": "covtype_binary", - "training": - { - "x": "data/covtype_binary_x_train.npy", - "y": "data/covtype_binary_y_train.npy" - }, - "testing": - { - "x": "data/covtype_binary_x_test.npy", - "y": "data/covtype_binary_y_test.npy" - } - } - ], - "C": 1000.0, - "kernel": "rbf" - }, - { - "dataset": [ - { - "source": "npy", - "name": "epsilon_80K", - "training": - { - "x": "data/epsilon_80K_x_train.npy", - "y": "data/epsilon_80K_y_train.npy" - }, - "testing": - 
{ - "x": "data/epsilon_80K_x_test.npy", - "y": "data/epsilon_80K_y_test.npy" - } - } - ], - "C": 1000.0, - "kernel": "rbf" - }, - { - "dataset": [ - { - "source": "npy", - "name": "imb_drama", - "training": - { - "x": "data/imb_drama_x_train.npy", - "y": "data/imb_drama_y_train.npy" - }, - "testing": - { - "x": "data/imb_drama_x_train.npy", - "y": "data/imb_drama_y_train.npy" - } - } - ], - "C": 50, - "kernel": "rbf" - } - ] -} diff --git a/datasets/load_datasets.py b/datasets/load_datasets.py index fbd7685d4..0a7874d92 100644 --- a/datasets/load_datasets.py +++ b/datasets/load_datasets.py @@ -22,10 +22,8 @@ from typing import Callable, Dict from .loader_classification import (a_nine_a, airline, airline_ohe, bosch, - census, cifar, codrnanorm, covtype_binary, creditcard, - epsilon_16K, epsilon_80K, epsilon, epsilon_100K, - fraud, gisette, hepmass_150K, - higgs, higgs_one_m, higgs_150K, ijcnn, imb_drama, + census, codrnanorm, creditcard, epsilon, fraud, + gisette, hepmass_150K, higgs, higgs_one_m, ijcnn, klaverjas, santander, skin_segmentation, susy) from .loader_multiclass import (connect, covertype, covtype, letters, mlsr, mnist, msrank, plasticc, sensit) @@ -42,25 +40,19 @@ "bosch": bosch, "california_housing": california_housing, "census": census, - "cifar": cifar, "codrnanorm": codrnanorm, "connect": connect, - "covtype_binary": covtype_binary, + "covertype": covertype, "covtype": covtype, "creditcard": creditcard, "epsilon": epsilon, - "epsilon_16K": epsilon_16K, - "epsilon_80K": epsilon_80K, - "epsilon_100K": epsilon_100K, "fraud": fraud, "fried": fried, "gisette": gisette, "hepmass_150K": hepmass_150K, "higgs": higgs, "higgs1m": higgs_one_m, - "higgs_150K": higgs_150K, "ijcnn": ijcnn, - "imb_drama": imb_drama, "klaverjas": klaverjas, "letters": letters, "mlsr": mlsr, diff --git a/datasets/loader_classification.py b/datasets/loader_classification.py index ffb84f12f..fc3cb892d 100644 --- a/datasets/loader_classification.py +++ b/datasets/loader_classification.py @@ -22,7 +22,7 @@ import numpy as np import pandas as pd -from sklearn.datasets import fetch_openml, load_svmlight_file, fetch_covtype +from sklearn.datasets import fetch_openml, load_svmlight_file from sklearn.model_selection import train_test_split from .loader_utils import retrieve @@ -261,41 +261,6 @@ def codrnanorm(dataset_dir: Path) -> bool: return True -def covtype_binary(dataset_dir: Path) -> bool: - """ - Cover type dataset from UCI machine learning repository - https://archive.ics.uci.edu/ml/datasets/covertype - - y contains 7 unique class labels from 1 to 7 inclusive. - Classification task. n_classes = 7. 
- covtype X train dataset (464809, 54) - covtype y train dataset (464809, 1) - covtype X test dataset (116203, 54) - covtype y test dataset (116203, 1) - """ - dataset_name = 'covtype_binary' - os.makedirs(dataset_dir, exist_ok=True) - - nrows_train, nrows_test = 100000, 100000 - logging.info(f'Started loading {dataset_name}') - X, y = fetch_covtype(return_X_y=True) # pylint: disable=unexpected-keyword-arg - logging.info(f'{dataset_name} is loaded, started parsing...') - - y = (y > 3).astype(int) - - X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77, - train_size=nrows_train, - test_size=nrows_test, - shuffle=False - ) - for data, name in zip((X_train, X_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - def creditcard(dataset_dir: Path) -> bool: """ Classification task. n_classes = 2. @@ -369,150 +334,6 @@ def epsilon(dataset_dir: Path) -> bool: return True -def epsilon_16K(dataset_dir: Path) -> bool: - """ - Epsilon dataset - https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html - - Classification task. n_classes = 2. - epsilon_100K x train dataset (16000, 2000) - epsilon_100K y train dataset (16000, 1) - epsilon_100K x test dataset (16000, 2000) - epsilon_100K y test dataset (16000, 1) - """ - dataset_name = 'epsilon_16K' - os.makedirs(dataset_dir, exist_ok=True) - - url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ - '/epsilon_normalized.bz2' - url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ - '/epsilon_normalized.t.bz2' - local_url_train = os.path.join(dataset_dir, os.path.basename(url_train)) - local_url_test = os.path.join(dataset_dir, os.path.basename(url_test)) - - num_train, num_test, dtype = 16000, 16000, np.float32 - if not os.path.isfile(local_url_train): - logging.info(f'Started loading {dataset_name}, train') - retrieve(url_train, local_url_train) - if not os.path.isfile(local_url_test): - logging.info(f'Started loading {dataset_name}, test') - retrieve(url_test, local_url_test) - logging.info(f'{dataset_name} is loaded, started parsing...') - X_train, y_train = load_svmlight_file(local_url_train, - dtype=dtype) - X_test, y_test = load_svmlight_file(local_url_test, - dtype=dtype) - X_train = X_train.toarray()[:num_train] - X_test = X_test.toarray()[:num_test] - y_train = y_train[:num_train] - y_train[y_train <= 0] = 0 - y_test = y_test[:num_test] - y_test[y_test <= 0] = 0 - - for data, name in zip((X_train, X_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def epsilon_100K(dataset_dir: Path) -> bool: - """ - Epsilon dataset - https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html - - Classification task. n_classes = 2. 
- epsilon_100K x train dataset (50000, 2000) - epsilon_100K y train dataset (50000, 1) - epsilon_100K x test dataset (50000, 2000) - epsilon_100K y test dataset (50000, 1) - """ - dataset_name = 'epsilon_100K' - os.makedirs(dataset_dir, exist_ok=True) - - url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ - '/epsilon_normalized.bz2' - url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ - '/epsilon_normalized.t.bz2' - local_url_train = os.path.join(dataset_dir, os.path.basename(url_train)) - local_url_test = os.path.join(dataset_dir, os.path.basename(url_test)) - - num_train, num_test, dtype = 50000, 50000, np.float32 - if not os.path.isfile(local_url_train): - logging.info(f'Started loading {dataset_name}, train') - retrieve(url_train, local_url_train) - if not os.path.isfile(local_url_test): - logging.info(f'Started loading {dataset_name}, test') - retrieve(url_test, local_url_test) - logging.info(f'{dataset_name} is loaded, started parsing...') - X_train, y_train = load_svmlight_file(local_url_train, - dtype=dtype) - X_test, y_test = load_svmlight_file(local_url_test, - dtype=dtype) - X_train = X_train.toarray()[:num_train] - X_test = X_test.toarray()[:num_test] - y_train = y_train[:num_train] - y_train[y_train <= 0] = 0 - y_test = y_test[:num_test] - y_test[y_test <= 0] = 0 - - for data, name in zip((X_train, X_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def epsilon_80K(dataset_dir: Path) -> bool: - """ - Epsilon dataset - https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html - - Classification task. n_classes = 2. 
- epsilon_100K x train dataset (80000, 2000) - epsilon_100K y train dataset (80000, 1) - epsilon_100K x test dataset (80000, 2000) - epsilon_100K y test dataset (80000, 1) - """ - dataset_name = 'epsilon_80K' - os.makedirs(dataset_dir, exist_ok=True) - - url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ - '/epsilon_normalized.bz2' - url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ - '/epsilon_normalized.t.bz2' - local_url_train = os.path.join(dataset_dir, os.path.basename(url_train)) - local_url_test = os.path.join(dataset_dir, os.path.basename(url_test)) - - num_train, num_test, dtype = 80000, 80000, np.float32 - if not os.path.isfile(local_url_train): - logging.info(f'Started loading {dataset_name}, train') - retrieve(url_train, local_url_train) - if not os.path.isfile(local_url_test): - logging.info(f'Started loading {dataset_name}, test') - retrieve(url_test, local_url_test) - logging.info(f'{dataset_name} is loaded, started parsing...') - X_train, y_train = load_svmlight_file(local_url_train, - dtype=dtype) - X_test, y_test = load_svmlight_file(local_url_test, - dtype=dtype) - X_train = X_train.toarray()[:num_train] - X_test = X_test.toarray()[:num_test] - y_train = y_train[:num_train] - y_train[y_train <= 0] = 0 - y_test = y_test[:num_test] - y_test[y_test <= 0] = 0 - - for data, name in zip((X_train, X_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - def fraud(dataset_dir: Path) -> bool: """ Credit Card Fraud Detection contest @@ -755,46 +576,6 @@ def higgs_one_m(dataset_dir: Path) -> bool: return True -def higgs_150K(dataset_dir: Path) -> bool: - """ - Higgs dataset from UCI machine learning repository - https://archive.ics.uci.edu/ml/datasets/HIGGS - - Classification task. n_classes = 2. - higgs_150K X train dataset (100000, 28) - higgs_150K y train dataset (50000, 1) - higgs_150K X test dataset (100000, 28) - higgs_150K y test dataset (50000, 1) - """ - dataset_name = 'higgs_150K' - os.makedirs(dataset_dir, exist_ok=True) - - url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz' - local_url = os.path.join(dataset_dir, os.path.basename(url)) - if not os.path.isfile(local_url): - logging.info(f'Started loading {dataset_name}') - retrieve(url, local_url) - logging.info(f'{dataset_name} is loaded, started parsing...') - - nrows_train, nrows_test, dtype = 100000, 50000, np.float32 - data: Any = pd.read_csv(local_url, delimiter=",", header=None, - compression="gzip", dtype=dtype, - nrows=nrows_train + nrows_test) - - X = data[data.columns[1:]] - y = data[data.columns[0:1]] - - x_train, x_test, y_train, y_test = train_test_split( - X, y, train_size=nrows_train, test_size=nrows_test, shuffle=False) - - for data, name in zip((x_train, x_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - def ijcnn(dataset_dir: Path) -> bool: """ Author: Danil Prokhorov. @@ -830,28 +611,6 @@ def ijcnn(dataset_dir: Path) -> bool: logging.info(f'dataset {dataset_name} is ready.') return True -def imb_drama(dataset_dir: Path) -> bool: - """ - imdb_drama dataset from OpenML Datasets ( - https://www.openml.org/d/273) - - Classification task. 
- Number of features: 1001 - Number of instances: 120919 - """ - dataset_name = 'imb_drama' - os.makedirs(dataset_dir, exist_ok=True) - - x_train, y_train = fetch_openml('IMDB.drama', return_X_y=True, - as_frame=False, data_home=dataset_dir) - logging.info(f'{dataset_name} is loaded, started parsing...') - for data, name in zip((x_train.todense(), y_train), - ('x_train', 'y_train')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - def klaverjas(dataset_dir: Path) -> bool: """ @@ -967,49 +726,3 @@ def susy(dataset_dir: Path) -> bool: np.save(os.path.join(dataset_dir, filename), data) logging.info(f'dataset {dataset_name} is ready.') return True - - -def cifar(dataset_dir: Path) -> bool: - """ - Cifar dataset from LIBSVM Datasets ( - https://www.cs.toronto.edu/~kriz/cifar.html#cifar) - TaskType: Classification - cifar x train dataset (50000, 3072) - cifar y train dataset (50000, 1) - cifar x test dataset (10000, 3072) - cifar y test dataset (10000, 1) - """ - dataset_name = 'cifar' - os.makedirs(dataset_dir, exist_ok=True) - - url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/cifar10.bz2' - url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/cifar10.t.bz2' - local_url_train = os.path.join(dataset_dir, os.path.basename(url_train)) - local_url_test = os.path.join(dataset_dir, os.path.basename(url_test)) - - if not os.path.isfile(local_url_train): - logging.info(f'Started loading {dataset_name}, train') - retrieve(url_train, local_url_train) - logging.info(f'{dataset_name} is loaded, started parsing...') - x_train, y_train = load_svmlight_file(local_url_train, - dtype=np.float32) - - if not os.path.isfile(local_url_test): - logging.info(f'Started loading {dataset_name}, test') - retrieve(url_test, local_url_test) - logging.info(f'{dataset_name} is loaded, started parsing...') - x_test, y_test = load_svmlight_file(local_url_test, - dtype=np.float32) - - x_train = x_train.toarray() - y_train[y_train <= 0] = 0 - - x_test = x_test.toarray() - y_test[y_test <= 0] = 0 - - for data, name in zip((x_train, x_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - return True - From f1e74265cb0754f1c65016ee929da319a96ee1f1 Mon Sep 17 00:00:00 2001 From: dmitrii-kriukov Date: Mon, 31 Jan 2022 15:16:36 +0300 Subject: [PATCH 03/20] datasets root --- runner.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/runner.py b/runner.py index 980e40b87..b84a14b23 100755 --- a/runner.py +++ b/runner.py @@ -75,8 +75,15 @@ def get_configs(path: Path) -> List[str]: stream=sys.stdout, format='%(levelname)s: %(message)s', level=args.verbose) hostname = socket.gethostname() + env = os.environ.copy() + if 'SKBENCH_DATASETSROOT' in env: + datasets_path = env['SKBENCH_DATASETSROOT'] + logging.info(f'dataset folder at {datasets_path}') + else: + datasets_path = '' + # make directory for data if it doesn't exist - os.makedirs('data', exist_ok=True) + os.makedirs(os.path.join(datasets_path, 'data'), exist_ok=True) json_result: Dict[str, Union[Dict[str, Any], List[Any]]] = { 'hardware': utils.get_hw_parameters(), @@ -141,20 +148,20 @@ def get_configs(path: Path) -> List[str]: if 'training' not in dataset or \ 'x' not in dataset['training'] or \ not utils.find_the_dataset(dataset_name, - 
dataset['training']['x']): + os.path.join(datasets_path, dataset['training']['x'])): logging.warning( f'Dataset {dataset_name} could not be loaded. \n' 'Check the correct name or expand the download in ' 'the folder dataset.') continue - paths = '--file-X-train ' + dataset['training']["x"] + paths = '--file-X-train ' + os.path.join(datasets_path, dataset['training']["x"]) if 'y' in dataset['training']: - paths += ' --file-y-train ' + dataset['training']["y"] + paths += ' --file-y-train ' + os.path.join(datasets_path, dataset['training']["y"]) if 'testing' in dataset: - paths += ' --file-X-test ' + dataset["testing"]["x"] + paths += ' --file-X-test ' + os.path.join(datasets_path, dataset["testing"]["x"]) if 'y' in dataset['testing']: paths += ' --file-y-test ' + \ - dataset["testing"]["y"] + os.path.join(datasets_path, dataset["testing"]["y"]) elif dataset['source'] == 'synthetic': class GenerationArgs: classes: int @@ -190,7 +197,7 @@ class GenerationArgs: else: cls_num_for_file = '' - file_prefix = f'data/synthetic-{gen_args.type}{cls_num_for_file}-' + file_prefix = os.path.join(datasets_path, f'data/synthetic-{gen_args.type}{cls_num_for_file}-') file_postfix = f'-{gen_args.samples}x{gen_args.features}.npy' gen_args.filex = f'{file_prefix}X-train{file_postfix}' From a2336d83591ec6e4709db3796841cc53a0ca4091 Mon Sep 17 00:00:00 2001 From: dmitrii-kriukov Date: Mon, 31 Jan 2022 18:44:24 +0300 Subject: [PATCH 04/20] renaming --- runner.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/runner.py b/runner.py index b84a14b23..03f1878e4 100755 --- a/runner.py +++ b/runner.py @@ -76,9 +76,9 @@ def get_configs(path: Path) -> List[str]: hostname = socket.gethostname() env = os.environ.copy() - if 'SKBENCH_DATASETSROOT' in env: - datasets_path = env['SKBENCH_DATASETSROOT'] - logging.info(f'dataset folder at {datasets_path}') + if 'DATASETSROOT' in env: + datasets_path = env['DATASETSROOT'] + logging.info(f'datasets folder at {datasets_path}') else: datasets_path = '' From c9038e7d11e1959bc50eafb668636ba142c48456 Mon Sep 17 00:00:00 2001 From: dmitrii-kriukov Date: Mon, 31 Jan 2022 21:48:33 +0300 Subject: [PATCH 05/20] pep8 --- runner.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/runner.py b/runner.py index 03f1878e4..d56dbf0ce 100755 --- a/runner.py +++ b/runner.py @@ -81,7 +81,7 @@ def get_configs(path: Path) -> List[str]: logging.info(f'datasets folder at {datasets_path}') else: datasets_path = '' - + # make directory for data if it doesn't exist os.makedirs(os.path.join(datasets_path, 'data'), exist_ok=True) @@ -148,17 +148,21 @@ def get_configs(path: Path) -> List[str]: if 'training' not in dataset or \ 'x' not in dataset['training'] or \ not utils.find_the_dataset(dataset_name, - os.path.join(datasets_path, dataset['training']['x'])): + os.path.join(datasets_path, + dataset['training']['x'])): logging.warning( f'Dataset {dataset_name} could not be loaded. 
\n' 'Check the correct name or expand the download in ' 'the folder dataset.') continue - paths = '--file-X-train ' + os.path.join(datasets_path, dataset['training']["x"]) + paths = '--file-X-train ' + os.path.join(datasets_path, + dataset['training']["x"]) if 'y' in dataset['training']: - paths += ' --file-y-train ' + os.path.join(datasets_path, dataset['training']["y"]) + paths += ' --file-y-train ' + os.path.join(datasets_path, + dataset['training']["y"]) if 'testing' in dataset: - paths += ' --file-X-test ' + os.path.join(datasets_path, dataset["testing"]["x"]) + paths += ' --file-X-test ' + os.path.join(datasets_path, + dataset["testing"]["x"]) if 'y' in dataset['testing']: paths += ' --file-y-test ' + \ os.path.join(datasets_path, dataset["testing"]["y"]) @@ -197,7 +201,8 @@ class GenerationArgs: else: cls_num_for_file = '' - file_prefix = os.path.join(datasets_path, f'data/synthetic-{gen_args.type}{cls_num_for_file}-') + file_prefix = os.path.join(datasets_path, + f'data/synthetic-{gen_args.type}{cls_num_for_file}-') file_postfix = f'-{gen_args.samples}x{gen_args.features}.npy' gen_args.filex = f'{file_prefix}X-train{file_postfix}' From 6142eaf713e33b3ad0d75103a9b59e7cce4c0c2a Mon Sep 17 00:00:00 2001 From: dmitrii-kriukov Date: Thu, 10 Feb 2022 20:20:19 +0300 Subject: [PATCH 06/20] process case with read only datasets root --- runner.py | 36 +++++++++++++++++++----------------- utils.py | 19 ++++++++++++++++--- 2 files changed, 35 insertions(+), 20 deletions(-) diff --git a/runner.py b/runner.py index d56dbf0ce..1dafc343d 100755 --- a/runner.py +++ b/runner.py @@ -77,13 +77,10 @@ def get_configs(path: Path) -> List[str]: env = os.environ.copy() if 'DATASETSROOT' in env: - datasets_path = env['DATASETSROOT'] - logging.info(f'datasets folder at {datasets_path}') + datasets_root = env['DATASETSROOT'] + logging.info(f'datasets folder at {datasets_root}') else: - datasets_path = '' - - # make directory for data if it doesn't exist - os.makedirs(os.path.join(datasets_path, 'data'), exist_ok=True) + datasets_root = '' json_result: Dict[str, Union[Dict[str, Any], List[Any]]] = { 'hardware': utils.get_hw_parameters(), @@ -145,27 +142,32 @@ def get_configs(path: Path) -> List[str]: for dataset in params_set['dataset']: if dataset['source'] in ['csv', 'npy']: dataset_name = dataset['name'] if 'name' in dataset else 'unknown' - if 'training' not in dataset or \ - 'x' not in dataset['training'] or \ - not utils.find_the_dataset(dataset_name, - os.path.join(datasets_path, - dataset['training']['x'])): + if 'training' not in dataset or 'x' not in dataset['training']: + logging.warning( + f'Dataset {dataset_name} could not be loaded. \n' + 'Training data for algorithm is not specified' + ) + continue + dataset_path = utils.find_the_dataset(dataset_name, datasets_root, + dataset['training']['x']) + if (dataset_path is None): logging.warning( f'Dataset {dataset_name} could not be loaded. \n' 'Check the correct name or expand the download in ' - 'the folder dataset.') + 'the folder dataset.' 
+ ) continue - paths = '--file-X-train ' + os.path.join(datasets_path, + paths = '--file-X-train ' + os.path.join(dataset_path, dataset['training']["x"]) if 'y' in dataset['training']: - paths += ' --file-y-train ' + os.path.join(datasets_path, + paths += ' --file-y-train ' + os.path.join(dataset_path, dataset['training']["y"]) if 'testing' in dataset: - paths += ' --file-X-test ' + os.path.join(datasets_path, + paths += ' --file-X-test ' + os.path.join(dataset_path, dataset["testing"]["x"]) if 'y' in dataset['testing']: paths += ' --file-y-test ' + \ - os.path.join(datasets_path, dataset["testing"]["y"]) + os.path.join(dataset_path, dataset["testing"]["y"]) elif dataset['source'] == 'synthetic': class GenerationArgs: classes: int @@ -201,7 +203,7 @@ class GenerationArgs: else: cls_num_for_file = '' - file_prefix = os.path.join(datasets_path, + file_prefix = os.path.join(dataset_path, f'data/synthetic-{gen_args.type}{cls_num_for_file}-') file_postfix = f'-{gen_args.samples}x{gen_args.features}.npy' diff --git a/utils.py b/utils.py index 6e025b804..a89d0a3a2 100755 --- a/utils.py +++ b/utils.py @@ -20,6 +20,7 @@ import platform import subprocess import sys +import logging from typing import Any, Dict, List, Tuple, Union, cast from datasets.load_datasets import try_load_dataset @@ -51,9 +52,21 @@ def filter_stdout(text: str) -> Tuple[str, str]: return filtered, extra -def find_the_dataset(name: str, fullpath: str) -> bool: - return os.path.isfile(fullpath) or try_load_dataset( - dataset_name=name, output_directory=pathlib.Path(fullpath).parent) +def find_the_dataset(name: str, folder: str, file: str) -> str: + if os.path.isfile(file): + return "" + fullpath = os.path.join(folder, file) + if os.path.isfile(os.path.join(folder, file)) or \ + try_load_dataset(dataset_name=name, output_directory=folder): + return folder + logging.warning( + f'failed downloading {name} to {folder}, ' + 'downloading to local folder"' + ) + if try_load_dataset(dataset_name=name, + output_directory="data"): + return "" + return None def read_output_from_command(command: str, From 4c2b48aaff0bf1a1fc44b755938959eb1819743a Mon Sep 17 00:00:00 2001 From: dmitrii-kriukov Date: Tue, 22 Feb 2022 22:21:34 +0300 Subject: [PATCH 07/20] remove logging from utils --- runner.py | 6 +++++- utils.py | 17 +++++------------ 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/runner.py b/runner.py index 1dafc343d..5d2c97e86 100755 --- a/runner.py +++ b/runner.py @@ -150,13 +150,17 @@ def get_configs(path: Path) -> List[str]: continue dataset_path = utils.find_the_dataset(dataset_name, datasets_root, dataset['training']['x']) - if (dataset_path is None): + if dataset_path is None: logging.warning( f'Dataset {dataset_name} could not be loaded. \n' 'Check the correct name or expand the download in ' 'the folder dataset.' 
                 )
                 continue
+            elif not dataset_path and datasets_root:
+                logging.info(
+                    f'{dataset_name} is taken from local folder'
+                )

diff --git a/utils.py b/utils.py
index a89d0a3a2..47b712b31 100755
--- a/utils.py
+++ b/utils.py
@@ -16,11 +16,9 @@

 import json
 import os
-import pathlib
 import platform
 import subprocess
 import sys
-import logging
 from typing import Any, Dict, List, Tuple, Union, cast

 from datasets.load_datasets import try_load_dataset
@@ -55,16 +53,11 @@ def filter_stdout(text: str) -> Tuple[str, str]:
 def find_the_dataset(name: str, folder: str, file: str) -> str:
     if os.path.isfile(file):
         return ""
-    fullpath = os.path.join(folder, file)
-    if os.path.isfile(os.path.join(folder, file)) or \
-       try_load_dataset(dataset_name=name, output_directory=folder):
-        return folder
-    logging.warning(
-        f'failed downloading {name} to {folder}, '
-        'downloading to local folder"'
-    )
-    if try_load_dataset(dataset_name=name,
-                        output_directory="data"):
-        return ""
+    if folder:
+        if os.path.isfile(os.path.join(folder, file)) or \
+           try_load_dataset(dataset_name=name, output_directory=folder):
+            return folder
+    if try_load_dataset(dataset_name=name, output_directory="data"):
+        return ""
     return None

From ceb0ef646ce926751b60abd11ca9188c4540b981 Mon Sep 17 00:00:00 2001
From: dmitrii-kriukov
Date: Mon, 14 Mar 2022 23:28:37 +0300
Subject: [PATCH 08/20] synthetic

---
 datasets/make_datasets.py | 44 ++++++++++++++++++++++++++++-----------
 runner.py                 | 31 ++++++++++++++++-------------
 utils.py                  | 11 ++++++++++
 3 files changed, 59 insertions(+), 27 deletions(-)

diff --git a/datasets/make_datasets.py b/datasets/make_datasets.py
index 4b3629dd1..56d1c82ab 100644
--- a/datasets/make_datasets.py
+++ b/datasets/make_datasets.py
@@ -15,26 +15,43 @@

 import argparse
+import logging
+import os
 import numpy as np
 from sklearn.datasets import make_classification, make_regression, make_blobs
 from sklearn.utils import check_random_state
 import sys


+def try_gen_dataset(args, folder):
+    try:
+        if args.type == 'regression':
+            gen_regression(args, folder)
+        elif args.type == 'classification':
+            gen_classification(args, folder)
+        elif args.type == 'blobs':
+            gen_blobs(args, folder)
+        return True
+    except BaseException as ex:
+        logging.warning(f"Internal error generating dataset:\n{ex}")
+        return False

-def gen_blobs(args):
+def gen_blobs(args, folder):
+    os.makedirs(os.path.join(folder, "data"), exist_ok=True)
     X, y = make_blobs(n_samples=args.samples + args.test_samples,
                       n_features=args.features,
                       centers=args.clusters,
                       center_box=(-32, 32),
                       shuffle=True,
                       random_state=args.seed)
-    np.save(args.filex, X[:args.samples])
+    np.save(os.path.join(folder, args.filex), X[:args.samples])
     if args.test_samples != 0:
-        np.save(args.filextest, X[args.samples:])
+        np.save(os.path.join(folder, args.filextest), X[args.samples:])
     return 0


-def gen_regression(args):
+def gen_regression(args, folder):
+    os.makedirs(os.path.join(folder, "data"), exist_ok=True)
     rs = check_random_state(args.seed)
     X, y = make_regression(n_targets=1,
                            n_samples=args.samples + args.test_samples,
                            n_features=args.features,
                            n_informative=args.features,
                            bias=rs.normal(0, 3),
                            random_state=rs)
-    np.save(args.filex, X[:args.samples])
-    np.save(args.filey, y[:args.samples])
+    np.save(os.path.join(folder, args.filex), X[:args.samples])
+    np.save(os.path.join(folder, args.filey), y[:args.samples])
     if args.test_samples != 0:
-        np.save(args.filextest, X[args.samples:])
-        np.save(args.fileytest, y[args.samples:])
+        np.save(os.path.join(folder, args.filextest), X[args.samples:])
+        np.save(os.path.join(folder, args.fileytest), y[args.samples:])
     return 0


-def gen_classification(args):
+def gen_classification(args, folder):
+    os.makedirs(os.path.join(folder, "data"), exist_ok=True)
     X, y = make_classification(n_samples=args.samples + args.test_samples,
                                n_features=args.features,
                                n_informative=args.features,
@@ -58,11 +76,11 @@ def gen_classification(args):
                                n_redundant=0,
                                n_classes=args.classes,
                                random_state=args.seed)
-    np.save(args.filex, X[:args.samples])
-    np.save(args.filey, y[:args.samples])
+    np.save(os.path.join(folder, args.filex), X[:args.samples])
+    np.save(os.path.join(folder, args.filey), y[:args.samples])
     if args.test_samples != 0:
-        np.save(args.filextest, X[args.samples:])
-        np.save(args.fileytest, y[args.samples:])
+        np.save(os.path.join(folder, args.filextest), X[args.samples:])
+        np.save(os.path.join(folder, args.fileytest), y[args.samples:])
     return 0

diff --git a/runner.py b/runner.py
index 5d2c97e86..ab47603aa 100755
--- a/runner.py
+++ b/runner.py
@@ -186,7 +186,6 @@ class GenerationArgs:
             test_samples: int
             type: str
             gen_args = GenerationArgs()
-            paths = ''

             if 'seed' in params_set:
                 gen_args.seed = params_set['seed']
@@ -207,37 +206,41 @@
             else:
                 cls_num_for_file = ''

-            file_prefix = os.path.join(dataset_path,
-                                       f'data/synthetic-{gen_args.type}{cls_num_for_file}-')
+            file_prefix = f'data/synthetic-{gen_args.type}{cls_num_for_file}-'
             file_postfix = f'-{gen_args.samples}x{gen_args.features}.npy'

             gen_args.filex = f'{file_prefix}X-train{file_postfix}'
-            paths += f' --file-X-train {gen_args.filex}'
             if gen_args.type not in ['blobs']:
                 gen_args.filey = f'{file_prefix}y-train{file_postfix}'
-                paths += f' --file-y-train {gen_args.filey}'

             if 'testing' in dataset:
                 gen_args.test_samples = dataset['testing']['n_samples']
                 gen_args.filextest = f'{file_prefix}X-test{file_postfix}'
-                paths += f' --file-X-test {gen_args.filextest}'
                 if gen_args.type not in ['blobs']:
                     gen_args.fileytest = f'{file_prefix}y-test{file_postfix}'
-                    paths += f' --file-y-test {gen_args.fileytest}'
             else:
                 gen_args.test_samples = 0
                 gen_args.filextest = gen_args.filex
                 if gen_args.type not in ['blobs']:
                     gen_args.fileytest = gen_args.filey

-            if not args.dummy_run and not os.path.isfile(gen_args.filex):
-                if gen_args.type == 'regression':
-                    make_datasets.gen_regression(gen_args)
-                elif gen_args.type == 'classification':
-                    make_datasets.gen_classification(gen_args)
-                elif gen_args.type == 'blobs':
-                    make_datasets.gen_blobs(gen_args)
             dataset_name = f'synthetic_{gen_args.type}'
+
+            if not args.dummy_run:
+                dataset_path = utils.find_or_gen_dataset(gen_args, datasets_root)
+                if dataset_path is None:
+                    logging.warning(
+                        f'Dataset {dataset_name} could not be generated. \n'
+                    )
+                    continue
+
+            paths = f' --file-X-train {os.path.join(dataset_path, gen_args.filex)}'
+            if gen_args.type not in ['blobs']:
+                paths += f' --file-y-train {os.path.join(dataset_path, gen_args.filey)}'
+            if 'testing' in dataset:
+                paths += f' --file-X-test {os.path.join(dataset_path, gen_args.filextest)}'
+                if gen_args.type not in ['blobs']:
+                    paths += f' --file-y-test {os.path.join(dataset_path, gen_args.fileytest)}'
             else:
                 logging.warning('Unknown dataset source. 
Only synthetics datasets ' 'and csv/npy files are supported now') diff --git a/utils.py b/utils.py index 47b712b31..7a5b4f546 100755 --- a/utils.py +++ b/utils.py @@ -21,6 +21,7 @@ import sys from typing import Any, Dict, List, Tuple, Union, cast +from datasets.make_datasets import try_gen_dataset from datasets.load_datasets import try_load_dataset @@ -49,6 +50,16 @@ def filter_stdout(text: str) -> Tuple[str, str]: filtered += line + '\n' return filtered, extra +def find_or_gen_dataset(args, folder) : + if os.path.isfile(args.filex): + return "" + if folder: + if os.path.isfile(os.path.join(folder, args.filex)) or \ + try_gen_dataset(args, folder): + return folder + if try_gen_dataset(args, ""): + return "" + return None def find_the_dataset(name: str, folder: str, file: str) -> str: if os.path.isfile(file): From 19c2ea0b1cd2506df6e9484bc9bd56aa68a589eb Mon Sep 17 00:00:00 2001 From: dmitrii-kriukov Date: Tue, 15 Mar 2022 16:54:12 +0300 Subject: [PATCH 09/20] enable DAAL_DATASETS --- runner.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/runner.py b/runner.py index 3cedf4bb6..098089966 100755 --- a/runner.py +++ b/runner.py @@ -88,6 +88,9 @@ def get_configs(path: Path) -> List[str]: if 'DATASETSROOT' in env: datasets_root = env['DATASETSROOT'] logging.info(f'datasets folder at {datasets_root}') + else if 'DAAL_DATASETS': + datasets_root = env['DAAL_DATASETS'] + logging.info(f'datasets folder at {datasets_root}') else: datasets_root = '' From b94aa6ac7b967514f363fa58c5fc17feecc640d9 Mon Sep 17 00:00:00 2001 From: dmitrii-kriukov Date: Tue, 15 Mar 2022 16:56:29 +0300 Subject: [PATCH 10/20] fix --- runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runner.py b/runner.py index 098089966..d916c88c2 100755 --- a/runner.py +++ b/runner.py @@ -88,7 +88,7 @@ def get_configs(path: Path) -> List[str]: if 'DATASETSROOT' in env: datasets_root = env['DATASETSROOT'] logging.info(f'datasets folder at {datasets_root}') - else if 'DAAL_DATASETS': + elif 'DAAL_DATASETS': datasets_root = env['DAAL_DATASETS'] logging.info(f'datasets folder at {datasets_root}') else: From 618987b6d9eaa41ec8d58123e844038e27371092 Mon Sep 17 00:00:00 2001 From: dmitrii-kriukov Date: Tue, 15 Mar 2022 17:01:00 +0300 Subject: [PATCH 11/20] fix --- runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runner.py b/runner.py index d916c88c2..dfafaf6a8 100755 --- a/runner.py +++ b/runner.py @@ -88,7 +88,7 @@ def get_configs(path: Path) -> List[str]: if 'DATASETSROOT' in env: datasets_root = env['DATASETSROOT'] logging.info(f'datasets folder at {datasets_root}') - elif 'DAAL_DATASETS': + elif 'DAAL_DATASETS' in env: datasets_root = env['DAAL_DATASETS'] logging.info(f'datasets folder at {datasets_root}') else: From 8e9b42d132f7bc11291fb116e43f76e896b5c049 Mon Sep 17 00:00:00 2001 From: dmitrii-kriukov Date: Tue, 15 Mar 2022 17:55:08 +0300 Subject: [PATCH 12/20] pep8 --- datasets/make_datasets.py | 1 + runner.py | 6 +++--- utils.py | 11 +++++++---- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/datasets/make_datasets.py b/datasets/make_datasets.py index 56d1c82ab..db194c13d 100644 --- a/datasets/make_datasets.py +++ b/datasets/make_datasets.py @@ -22,6 +22,7 @@ from sklearn.utils import check_random_state import sys + def try_gen_dataset(args, folder): try: if args.type == 'regression': diff --git a/runner.py b/runner.py index dfafaf6a8..73cdff1d8 100755 --- a/runner.py +++ b/runner.py @@ -22,7 +22,6 @@ import sys from typing import Any, Dict, List, Union -import 
datasets.make_datasets as make_datasets import utils from pathlib import Path @@ -247,7 +246,7 @@ class GenerationArgs: dataset_path = utils.find_or_gen_dataset(gen_args, datasets_root) if dataset_path is None: logging.warning( - f'Dataset {dataset_name} could not be generated. \n' + f'Dataset {dataset_name} could not be generated. \n' ) continue @@ -257,7 +256,8 @@ class GenerationArgs: if 'testing' in dataset: paths += f' --file-X-test {os.path.join(dataset_path, gen_args.filextest)}' if gen_args.type not in ['blobs']: - paths += f' --file-y-test {os.path.join(dataset_path, gen_args.fileytest)}' + paths += f' --file-y-test \ + {os.path.join(dataset_path, gen_args.fileytest)}' else: logging.warning('Unknown dataset source. Only synthetics datasets ' 'and csv/npy files are supported now') diff --git a/utils.py b/utils.py index 7a5b4f546..0b841006f 100755 --- a/utils.py +++ b/utils.py @@ -19,6 +19,7 @@ import platform import subprocess import sys +from pathlib import Path from typing import Any, Dict, List, Tuple, Union, cast from datasets.make_datasets import try_gen_dataset @@ -50,23 +51,25 @@ def filter_stdout(text: str) -> Tuple[str, str]: filtered += line + '\n' return filtered, extra -def find_or_gen_dataset(args, folder) : + +def find_or_gen_dataset(args: dict, folder: Path): if os.path.isfile(args.filex): return "" if folder: if os.path.isfile(os.path.join(folder, args.filex)) or \ - try_gen_dataset(args, folder): + try_gen_dataset(args, folder): return folder if try_gen_dataset(args, ""): return "" return None -def find_the_dataset(name: str, folder: str, file: str) -> str: + +def find_the_dataset(name: str, folder: Path, file: Path) -> str: if os.path.isfile(file): return "" if folder: if os.path.isfile(os.path.join(folder, file)) or \ - try_load_dataset(dataset_name=name, output_directory=folder): + try_load_dataset(dataset_name=name, output_directory=folder): return folder if try_load_dataset(dataset_name=name, output_directory="data"): return "" From d9e064230ddb5628383eaa9000a246d1efc70e4e Mon Sep 17 00:00:00 2001 From: dmitrii-kriukov Date: Tue, 15 Mar 2022 19:53:28 +0300 Subject: [PATCH 13/20] pep8 --- runner.py | 2 +- utils.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/runner.py b/runner.py index 73cdff1d8..52eaccc97 100755 --- a/runner.py +++ b/runner.py @@ -243,7 +243,7 @@ class GenerationArgs: dataset_name = f'synthetic_{gen_args.type}' if not args.dummy_run: - dataset_path = utils.find_or_gen_dataset(gen_args, datasets_root) + dataset_path = utils.find_or_gen_dataset(gen_args, datasets_root, gen_args.filex) if dataset_path is None: logging.warning( f'Dataset {dataset_name} could not be generated. 
\n' diff --git a/utils.py b/utils.py index 0b841006f..6f5bcc511 100755 --- a/utils.py +++ b/utils.py @@ -52,11 +52,11 @@ def filter_stdout(text: str) -> Tuple[str, str]: return filtered, extra -def find_or_gen_dataset(args: dict, folder: Path): - if os.path.isfile(args.filex): +def find_or_gen_dataset(args: Any, folder: str, file: str): + if os.path.isfile(file): return "" if folder: - if os.path.isfile(os.path.join(folder, args.filex)) or \ + if os.path.isfile(os.path.join(folder, file)) or \ try_gen_dataset(args, folder): return folder if try_gen_dataset(args, ""): @@ -64,14 +64,14 @@ def find_or_gen_dataset(args: dict, folder: Path): return None -def find_the_dataset(name: str, folder: Path, file: Path) -> str: +def find_the_dataset(name: str, folder: str, file: str): if os.path.isfile(file): return "" if folder: if os.path.isfile(os.path.join(folder, file)) or \ - try_load_dataset(dataset_name=name, output_directory=folder): + try_load_dataset(dataset_name=name, output_directory=Path(folder)): return folder - if try_load_dataset(dataset_name=name, output_directory="data"): + if try_load_dataset(dataset_name=name, output_directory=Path("data")): return "" return None From e05a44f2cf64dafd120b448671a8c97971fdbe1b Mon Sep 17 00:00:00 2001 From: dmitrii-kriukov Date: Tue, 15 Mar 2022 20:01:53 +0300 Subject: [PATCH 14/20] pep8 --- runner.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/runner.py b/runner.py index 52eaccc97..7b5d07d9e 100755 --- a/runner.py +++ b/runner.py @@ -243,7 +243,8 @@ class GenerationArgs: dataset_name = f'synthetic_{gen_args.type}' if not args.dummy_run: - dataset_path = utils.find_or_gen_dataset(gen_args, datasets_root, gen_args.filex) + dataset_path = utils.find_or_gen_dataset(gen_args, + datasets_root, gen_args.filex) if dataset_path is None: logging.warning( f'Dataset {dataset_name} could not be generated. 
\n' From aff54b379dfc444581ed6d6a33ed34789b610e53 Mon Sep 17 00:00:00 2001 From: dmitrii-kriukov Date: Wed, 16 Mar 2022 00:58:23 +0300 Subject: [PATCH 15/20] try fix --- utils.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/utils.py b/utils.py index 6f5bcc511..7fdf0d62e 100755 --- a/utils.py +++ b/utils.py @@ -56,8 +56,9 @@ def find_or_gen_dataset(args: Any, folder: str, file: str): if os.path.isfile(file): return "" if folder: - if os.path.isfile(os.path.join(folder, file)) or \ - try_gen_dataset(args, folder): + if os.path.isfile(os.path.join(folder, file)): + return folder + elif try_gen_dataset(args, folder): return folder if try_gen_dataset(args, ""): return "" @@ -68,8 +69,9 @@ def find_the_dataset(name: str, folder: str, file: str): if os.path.isfile(file): return "" if folder: - if os.path.isfile(os.path.join(folder, file)) or \ - try_load_dataset(dataset_name=name, output_directory=Path(folder)): + if os.path.isfile(os.path.join(folder, file)): + return folder + elif try_load_dataset(dataset_name=name, output_directory=Path(folder)): return folder if try_load_dataset(dataset_name=name, output_directory=Path("data")): return "" From 44164b34ddd974d3a26ad4673feebc5c387e2e52 Mon Sep 17 00:00:00 2001 From: dmitrii-kriukov Date: Wed, 16 Mar 2022 11:32:38 +0300 Subject: [PATCH 16/20] download to data --- utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/utils.py b/utils.py index 7fdf0d62e..f0f30feaa 100755 --- a/utils.py +++ b/utils.py @@ -71,7 +71,8 @@ def find_the_dataset(name: str, folder: str, file: str): if folder: if os.path.isfile(os.path.join(folder, file)): return folder - elif try_load_dataset(dataset_name=name, output_directory=Path(folder)): + elif try_load_dataset(dataset_name=name, + output_directory=Path(os.path.join(folder, "data"))): return folder if try_load_dataset(dataset_name=name, output_directory=Path("data")): return "" From ba88a83a3576d40267c801064b58bee1ad024b1c Mon Sep 17 00:00:00 2001 From: dmitrii-kriukov Date: Thu, 17 Mar 2022 20:04:15 +0300 Subject: [PATCH 17/20] codefactor --- runner.py | 3 +-- utils.py | 21 ++++++++++----------- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/runner.py b/runner.py index 3b7fb626b..c8dceb446 100755 --- a/runner.py +++ b/runner.py @@ -244,7 +244,6 @@ class GenerationArgs: gen_args.test_samples = dataset['testing']['n_samples'] gen_args.filextest = f'{file_prefix}X-test{file_postfix}' files['file-X-test'] = gen_args.filextest - files.append(gen_args.filextest) if gen_args.type not in ['blobs']: gen_args.fileytest = f'{file_prefix}y-test{file_postfix}' files['file-y-test'] = gen_args.fileytest @@ -266,7 +265,7 @@ class GenerationArgs: f'Dataset {dataset_name} could not be generated. 
\n'
                 )
                 continue
-
+            paths = ''
             for data_path, data_file in files.items():
                 paths += f'--{data_path} {os.path.join(dataset_path, data_file)} '
diff --git a/utils.py b/utils.py
index 0c14c49ac..192a5f421 100755
--- a/utils.py
+++ b/utils.py
@@ -20,7 +20,7 @@ import subprocess
 import sys
 
 from pathlib import Path
-from typing import Any, Dict, List, Tuple, Union, cast
+from typing import Any, Dict, Iterable, List, Tuple, Union, cast
 
 from datasets.make_datasets import try_gen_dataset
 from datasets.load_datasets import try_load_dataset
@@ -51,34 +51,33 @@ def filter_stdout(text: str) -> Tuple[str, str]:
         filtered += line + '\n'
     return filtered, extra
 
-def files_in_folder(folder: str, files: List[str]) -> bool:
+
+def files_in_folder(folder: str, files: Iterable[str]) -> bool:
     for file in files:
         if not os.path.isfile(os.path.join(folder, file)):
             return False
     return True
 
 
-def find_or_gen_dataset(args: Any, folder: str, files: List[str]):
+def find_or_gen_dataset(args: Any, folder: str, files: Iterable[str]):
     if files_in_folder("", files):
         return ""
     if folder:
-        if files_in_folder(folder, files):
-            return folder
-        elif try_gen_dataset(args, folder):
+        if files_in_folder(folder, files) or \
+                try_gen_dataset(args, folder):
             return folder
     if try_gen_dataset(args, ""):
         return ""
     return None
 
 
-def find_the_dataset(name: str, folder: str, files: List[str]):
+def find_the_dataset(name: str, folder: str, files: Iterable[str]):
     if files_in_folder("", files):
         return ""
     if folder:
-        if files_in_folder(folder, files):
-            return folder
-        elif try_load_dataset(dataset_name=name,
-                              output_directory=Path(os.path.join(folder, "data"))):
+        if files_in_folder(folder, files) or \
+                try_load_dataset(dataset_name=name,
+                                 output_directory=Path(os.path.join(folder, "data"))):
             return folder
     if try_load_dataset(dataset_name=name, output_directory=Path("data")):
         return ""

From a3eb76a5b5be580e86f4904c4691f32562bf1a6a Mon Sep 17 00:00:00 2001
From: dmitrii-kriukov
Date: Fri, 18 Mar 2022 18:25:17 +0300
Subject: [PATCH 18/20] exception for unknown type

---
 datasets/make_datasets.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/datasets/make_datasets.py b/datasets/make_datasets.py
index db194c13d..0add738cb 100644
--- a/datasets/make_datasets.py
+++ b/datasets/make_datasets.py
@@ -31,6 +31,8 @@ def try_gen_dataset(args, folder):
             gen_classification(args, folder)
         elif args.type == 'blobs':
             gen_blobs(args, folder)
+        else:
+            raise ValueError(f'{args.type} is unknown dataset type')
         return True
     except BaseException as ex:
         logging.warning(f"Internal error generating dataset:\n{ex}")

From 21a2e52e58029b8e24ecf45aadb3ae80c4b87426 Mon Sep 17 00:00:00 2001
From: dmitrii-kriukov
Date: Fri, 18 Mar 2022 18:32:26 +0300
Subject: [PATCH 19/20] fix logging

---
 runner.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/runner.py b/runner.py
index c8dceb446..aad2f969e 100755
--- a/runner.py
+++ b/runner.py
@@ -86,12 +86,13 @@ def get_configs(path: Path) -> List[str]:
     env = os.environ.copy()
     if 'DATASETSROOT' in env:
         datasets_root = env['DATASETSROOT']
-        logging.info(f'datasets folder at {datasets_root}')
+        logging.info(f'Datasets folder at {datasets_root}')
     elif 'DAAL_DATASETS' in env:
         datasets_root = env['DAAL_DATASETS']
-        logging.info(f'datasets folder at {datasets_root}')
+        logging.info(f'Datasets folder at {datasets_root}')
     else:
         datasets_root = ''
+        logging.info(f'Datasets folder is not set, using local folder')
 
     json_result: Dict[str, Union[Dict[str, Any], List[Any]]] = {
         'hardware': utils.get_hw_parameters(),
@@ -189,7 +190,7 @@ def get_configs(path: Path) -> List[str]:
                 continue
             elif not dataset_path and datasets_root:
                 logging.info(
-                    f'{dataset_name} is taken from local folder'
+                f'{dataset_name} is taken from local folder'
                 )
 
             paths = ''

From 50fcf88f23c0e165c017cb5f490f2b18b48eed0e Mon Sep 17 00:00:00 2001
From: dmitrii-kriukov
Date: Fri, 18 Mar 2022 18:36:27 +0300
Subject: [PATCH 20/20] fix pep8

---
 runner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/runner.py b/runner.py
index aad2f969e..9a9d527c5 100755
--- a/runner.py
+++ b/runner.py
@@ -92,7 +92,7 @@ def get_configs(path: Path) -> List[str]:
         logging.info(f'Datasets folder at {datasets_root}')
     else:
         datasets_root = ''
-        logging.info(f'Datasets folder is not set, using local folder')
+        logging.info('Datasets folder is not set, using local folder')
 
     json_result: Dict[str, Union[Dict[str, Any], List[Any]]] = {
         'hardware': utils.get_hw_parameters(),
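
After PATCH 17/20, runner.py resolves a synthetic dataset through a files dict and utils.find_or_gen_dataset, then builds the benchmark's CLI arguments from the resolved folder. The sketch below restates that flow in one place for reference; resolve_synthetic_paths is a hypothetical name, the call assumes the final patched utils.py (returning "" when the files already exist locally, the root folder when they are found or generated under DATASETSROOT/DAAL_DATASETS, and None when generation fails), and none of it is part of the patch series itself.

# Hypothetical usage sketch of the final helpers; not part of the series.
import logging
import os

import utils  # the benchmark's utils.py in its post-PATCH-20 form


def resolve_synthetic_paths(gen_args, datasets_root):
    # Mirror the files dict runner.py builds for a synthetic case:
    # CLI flag name -> generated .npy file name.
    files = {'file-X-train': gen_args.filex}
    if gen_args.type not in ['blobs']:
        files['file-y-train'] = gen_args.filey

    # "" -> files already present locally; a folder -> found or generated
    # under the datasets root; None -> generation failed everywhere.
    dataset_path = utils.find_or_gen_dataset(gen_args, datasets_root,
                                             files.values())
    if dataset_path is None:
        logging.warning('Dataset could not be generated.')
        return None

    paths = ''
    for data_path, data_file in files.items():
        paths += f'--{data_path} {os.path.join(dataset_path, data_file)} '
    return paths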