From 6272ea525c8155efb896525e6d1672e779fca3b3 Mon Sep 17 00:00:00 2001 From: dmitrii-kriukov Date: Mon, 17 Jan 2022 19:11:30 +0300 Subject: [PATCH 01/10] knn_svm --- configs/xpu/knn_clsf.json | 162 +++++++++++++++++ configs/xpu/knn_regr.json | 69 +++++++ configs/xpu/svm.json | 192 ++++++++++++++++++++ datasets/load_datasets.py | 14 +- datasets/loader_classification.py | 289 +++++++++++++++++++++++++++++- 5 files changed, 722 insertions(+), 4 deletions(-) create mode 100644 configs/xpu/knn_clsf.json create mode 100644 configs/xpu/knn_regr.json create mode 100644 configs/xpu/svm.json diff --git a/configs/xpu/knn_clsf.json b/configs/xpu/knn_clsf.json new file mode 100644 index 000000000..2d72c4ade --- /dev/null +++ b/configs/xpu/knn_clsf.json @@ -0,0 +1,162 @@ +{ + "common": { + "lib": "sklearn", + "algorithm": "knn_clsf", + "data-format": "pandas", + "data-order": "F", + "dtype": ["float32", "float64"], + "device": ["host", "cpu", "gpu", "none"] + }, + "cases": [ + { + "dataset": [ + { + "source": "npy", + "name": "epsilon_100K", + "training": + { + "x": "data/epsilon_100K_x_train.npy", + "y": "data/epsilon_100K_y_train.npy" + }, + "testing": + { + "x": "data/epsilon_100K_x_test.npy", + "y": "data/epsilon_100K_y_test.npy" + } + } + ], + "n-neighbors": [2, 100] + }, + { + "dataset": [ + { + "source": "npy", + "name": "higgs_150K", + "training": + { + "x": "data/higgs_150K_x_train.npy", + "y": "data/higgs_150K_y_train.npy" + }, + "testing": + { + "x": "data/higgs_150K_x_test.npy", + "y": "data/higgs_150K_y_test.npy" + } + }, + { + "source": "npy", + "name": "hepmass_150K", + "training": + { + "x": "data/hepmass_150K_x_train.npy", + "y": "data/hepmass_150K_y_train.npy" + }, + "testing": + { + "x": "data/hepmass_150K_x_test.npy", + "y": "data/hepmass_150K_y_test.npy" + } + } + ], + "n-neighbors": [5, 100] + }, + { + "dataset": [ + { + "source": "npy", + "name": "cifar", + "training": + { + "x": "data/cifar_x_train.npy", + "y": "data/cifar_y_train.npy" + }, + "testing": + 
{ + "x": "data/cifar_x_test.npy", + "y": "data/cifar_y_test.npy" + } + } + ], + "n-neighbors": 7 + }, + { + "dataset": [ + { + "source": "npy", + "name": "mnist", + "training": + { + "x": "data/mnist_x_train.npy", + "y": "data/mnist_y_train.npy" + }, + "testing": + { + "x": "data/mnist_x_test.npy", + "y": "data/mnist_y_test.npy" + } + } + ], + "n-neighbors": 5 + }, + { + "dataset": [ + { + "source": "npy", + "name": "epsilon_100K", + "training": + { + "x": "data/epsilon_100K_x_train.npy", + "y": "data/epsilon_100K_y_train.npy" + }, + "testing": + { + "x": "data/epsilon_100K_x_test.npy", + "y": "data/epsilon_100K_y_test.npy" + } + } + ], + "task": "search", + "n-neighbors": 2 + }, + { + "dataset": [ + { + "source": "npy", + "name": "higgs_150K", + "training": + { + "x": "data/higgs_150K_x_train.npy", + "y": "data/higgs_150K_y_train.npy" + }, + "testing": + { + "x": "data/higgs_150K_x_test.npy", + "y": "data/higgs_150K_y_test.npy" + } + } + ], + "task": "search", + "n-neighbors": 5 + }, + { + "dataset": [ + { + "source": "npy", + "name": "cifar", + "training": + { + "x": "data/cifar_x_train.npy", + "y": "data/cifar_y_train.npy" + }, + "testing": + { + "x": "data/cifar_x_test.npy", + "y": "data/cifar_y_test.npy" + } + } + ], + "task": "search", + "n-neighbors": 7 + } + ] +} diff --git a/configs/xpu/knn_regr.json b/configs/xpu/knn_regr.json new file mode 100644 index 000000000..ec1fbc9a9 --- /dev/null +++ b/configs/xpu/knn_regr.json @@ -0,0 +1,69 @@ +{ + "common": { + "lib": "sklearn", + "algorithm": "knn_regr", + "data-format": "pandas", + "data-order": "F", + "dtype": ["float32", "float64"], + "device": ["host", "cpu", "gpu", "none"] + }, + "cases": [ + { + "dataset": [ + { + "source": "npy", + "name": "year_prediction_msd", + "training": + { + "x": "data/year_prediction_msd_x_train.npy", + "y": "data/year_prediction_msd_y_train.npy" + }, + "testing": + { + "x": "data/year_prediction_msd_x_test.npy", + "y": "data/year_prediction_msd_y_test.npy" + } + } + ], + 
"n-neighbors": 2 + }, + { + "dataset": [ + { + "source": "npy", + "name": "higgs_150K", + "training": + { + "x": "data/higgs_150K_x_train.npy", + "y": "data/higgs_150K_y_train.npy" + }, + "testing": + { + "x": "data/higgs_150K_x_test.npy", + "y": "data/higgs_150K_y_test.npy" + } + } + ], + "n-neighbors": 5 + }, + { + "dataset": [ + { + "source": "npy", + "name": "cifar", + "training": + { + "x": "data/cifar_x_train.npy", + "y": "data/cifar_y_train.npy" + }, + "testing": + { + "x": "data/cifar_x_test.npy", + "y": "data/cifar_y_test.npy" + } + } + ], + "n-neighbors": 7 + } + ] +} diff --git a/configs/xpu/svm.json b/configs/xpu/svm.json new file mode 100644 index 000000000..a98377532 --- /dev/null +++ b/configs/xpu/svm.json @@ -0,0 +1,192 @@ +{ + "common": { + "lib": "sklearn", + "algorithm": "svm", + "data-format": "pandas", + "data-order": "F", + "dtype": ["float32", "float64"], + "device": ["host", "cpu", "gpu", "none"] + }, + "cases": [ + { + "dataset": [ + { + "source": "npy", + "name": "gisette", + "training": + { + "x": "data/gisette_x_train.npy", + "y": "data/gisette_y_train.npy" + }, + "testing": + { + "x": "data/gisette_x_test.npy", + "y": "data/gisette_y_test.npy" + } + } + ], + "C": 1.5e-3, + "kernel": "linear" + }, + { + "dataset": [ + { + "source": "npy", + "name": "higgs_150K", + "training": + { + "x": "data/higgs_150K_x_train.npy", + "y": "data/higgs_150K_y_train.npy" + }, + "testing": + { + "x": "data/higgs_150K_x_test.npy", + "y": "data/higgs_150K_y_test.npy" + } + } + ], + "C": 1.0, + "kernel": "linear" + }, + { + "dataset": [ + { + "source": "npy", + "name": "epsilon_80K", + "training": + { + "x": "data/epsilon_80K_x_train.npy", + "y": "data/epsilon_80K_y_train.npy" + }, + "testing": + { + "x": "data/epsilon_80K_x_test.npy", + "y": "data/epsilon_80K_y_test.npy" + } + } + ], + "C": 1.0, + "kernel": "linear" + }, + { + "dataset": [ + { + "source": "npy", + "name": "cifar", + "training": + { + "x": "data/cifar_x_train.npy", + "y": 
"data/cifar_y_train.npy" + }, + "testing": + { + "x": "data/cifar_x_test.npy", + "y": "data/cifar_y_test.npy" + } + } + ], + "C": 1.0e-7, + "kernel": "linear" + }, + { + "dataset": [ + { + "source": "npy", + "name": "imb_drama", + "training": + { + "x": "data/imb_drama_x_train.npy", + "y": "data/imb_drama_y_train.npy" + }, + "testing": + { + "x": "data/imb_drama_x_train.npy", + "y": "data/imb_drama_y_train.npy" + } + } + ], + "C": 1e-3, + "kernel": "linear" + }, + { + "dataset": [ + { + "source": "npy", + "name": "epsilon_16K", + "training": + { + "x": "data/epsilon_16K_x_train.npy", + "y": "data/epsilon_16K_y_train.npy" + }, + "testing": + { + "x": "data/epsilon_16K_x_test.npy", + "y": "data/epsilon_16K_y_test.npy" + } + } + ], + "C": 9.0e2, + "kernel": "rbf" + }, + { + "dataset": [ + { + "source": "npy", + "name": "covtype_binary", + "training": + { + "x": "data/covtype_binary_x_train.npy", + "y": "data/covtype_binary_y_train.npy" + }, + "testing": + { + "x": "data/covtype_binary_x_test.npy", + "y": "data/covtype_binary_y_test.npy" + } + } + ], + "C": 1000.0, + "kernel": "rbf" + }, + { + "dataset": [ + { + "source": "npy", + "name": "epsilon_80K", + "training": + { + "x": "data/epsilon_80K_x_train.npy", + "y": "data/epsilon_80K_y_train.npy" + }, + "testing": + { + "x": "data/epsilon_80K_x_test.npy", + "y": "data/epsilon_80K_y_test.npy" + } + } + ], + "C": 1000.0, + "kernel": "rbf" + }, + { + "dataset": [ + { + "source": "npy", + "name": "imb_drama", + "training": + { + "x": "data/imb_drama_x_train.npy", + "y": "data/imb_drama_y_train.npy" + }, + "testing": + { + "x": "data/imb_drama_x_train.npy", + "y": "data/imb_drama_y_train.npy" + } + } + ], + "C": 50, + "kernel": "rbf" + } + ] +} diff --git a/datasets/load_datasets.py b/datasets/load_datasets.py index 0a7874d92..fbd7685d4 100644 --- a/datasets/load_datasets.py +++ b/datasets/load_datasets.py @@ -22,8 +22,10 @@ from typing import Callable, Dict from .loader_classification import (a_nine_a, airline, 
airline_ohe, bosch, - census, codrnanorm, creditcard, epsilon, fraud, - gisette, hepmass_150K, higgs, higgs_one_m, ijcnn, + census, cifar, codrnanorm, covtype_binary, creditcard, + epsilon_16K, epsilon_80K, epsilon, epsilon_100K, + fraud, gisette, hepmass_150K, + higgs, higgs_one_m, higgs_150K, ijcnn, imb_drama, klaverjas, santander, skin_segmentation, susy) from .loader_multiclass import (connect, covertype, covtype, letters, mlsr, mnist, msrank, plasticc, sensit) @@ -40,19 +42,25 @@ "bosch": bosch, "california_housing": california_housing, "census": census, + "cifar": cifar, "codrnanorm": codrnanorm, "connect": connect, - "covertype": covertype, + "covtype_binary": covtype_binary, "covtype": covtype, "creditcard": creditcard, "epsilon": epsilon, + "epsilon_16K": epsilon_16K, + "epsilon_80K": epsilon_80K, + "epsilon_100K": epsilon_100K, "fraud": fraud, "fried": fried, "gisette": gisette, "hepmass_150K": hepmass_150K, "higgs": higgs, "higgs1m": higgs_one_m, + "higgs_150K": higgs_150K, "ijcnn": ijcnn, + "imb_drama": imb_drama, "klaverjas": klaverjas, "letters": letters, "mlsr": mlsr, diff --git a/datasets/loader_classification.py b/datasets/loader_classification.py index fc3cb892d..ffb84f12f 100644 --- a/datasets/loader_classification.py +++ b/datasets/loader_classification.py @@ -22,7 +22,7 @@ import numpy as np import pandas as pd -from sklearn.datasets import fetch_openml, load_svmlight_file +from sklearn.datasets import fetch_openml, load_svmlight_file, fetch_covtype from sklearn.model_selection import train_test_split from .loader_utils import retrieve @@ -261,6 +261,41 @@ def codrnanorm(dataset_dir: Path) -> bool: return True +def covtype_binary(dataset_dir: Path) -> bool: + """ + Cover type dataset from UCI machine learning repository + https://archive.ics.uci.edu/ml/datasets/covertype + + y contains 7 unique class labels from 1 to 7 inclusive. + Classification task. n_classes = 7. 
+ covtype X train dataset (464809, 54) + covtype y train dataset (464809, 1) + covtype X test dataset (116203, 54) + covtype y test dataset (116203, 1) + """ + dataset_name = 'covtype_binary' + os.makedirs(dataset_dir, exist_ok=True) + + nrows_train, nrows_test = 100000, 100000 + logging.info(f'Started loading {dataset_name}') + X, y = fetch_covtype(return_X_y=True) # pylint: disable=unexpected-keyword-arg + logging.info(f'{dataset_name} is loaded, started parsing...') + + y = (y > 3).astype(int) + + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77, + train_size=nrows_train, + test_size=nrows_test, + shuffle=False + ) + for data, name in zip((X_train, X_test, y_train, y_test), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset {dataset_name} is ready.') + return True + + def creditcard(dataset_dir: Path) -> bool: """ Classification task. n_classes = 2. @@ -334,6 +369,150 @@ def epsilon(dataset_dir: Path) -> bool: return True +def epsilon_16K(dataset_dir: Path) -> bool: + """ + Epsilon dataset + https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html + + Classification task. n_classes = 2. 
+ epsilon_100K x train dataset (16000, 2000) + epsilon_100K y train dataset (16000, 1) + epsilon_100K x test dataset (16000, 2000) + epsilon_100K y test dataset (16000, 1) + """ + dataset_name = 'epsilon_16K' + os.makedirs(dataset_dir, exist_ok=True) + + url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ + '/epsilon_normalized.bz2' + url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ + '/epsilon_normalized.t.bz2' + local_url_train = os.path.join(dataset_dir, os.path.basename(url_train)) + local_url_test = os.path.join(dataset_dir, os.path.basename(url_test)) + + num_train, num_test, dtype = 16000, 16000, np.float32 + if not os.path.isfile(local_url_train): + logging.info(f'Started loading {dataset_name}, train') + retrieve(url_train, local_url_train) + if not os.path.isfile(local_url_test): + logging.info(f'Started loading {dataset_name}, test') + retrieve(url_test, local_url_test) + logging.info(f'{dataset_name} is loaded, started parsing...') + X_train, y_train = load_svmlight_file(local_url_train, + dtype=dtype) + X_test, y_test = load_svmlight_file(local_url_test, + dtype=dtype) + X_train = X_train.toarray()[:num_train] + X_test = X_test.toarray()[:num_test] + y_train = y_train[:num_train] + y_train[y_train <= 0] = 0 + y_test = y_test[:num_test] + y_test[y_test <= 0] = 0 + + for data, name in zip((X_train, X_test, y_train, y_test), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset {dataset_name} is ready.') + return True + + +def epsilon_100K(dataset_dir: Path) -> bool: + """ + Epsilon dataset + https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html + + Classification task. n_classes = 2. 
+ epsilon_100K x train dataset (50000, 2000) + epsilon_100K y train dataset (50000, 1) + epsilon_100K x test dataset (50000, 2000) + epsilon_100K y test dataset (50000, 1) + """ + dataset_name = 'epsilon_100K' + os.makedirs(dataset_dir, exist_ok=True) + + url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ + '/epsilon_normalized.bz2' + url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ + '/epsilon_normalized.t.bz2' + local_url_train = os.path.join(dataset_dir, os.path.basename(url_train)) + local_url_test = os.path.join(dataset_dir, os.path.basename(url_test)) + + num_train, num_test, dtype = 50000, 50000, np.float32 + if not os.path.isfile(local_url_train): + logging.info(f'Started loading {dataset_name}, train') + retrieve(url_train, local_url_train) + if not os.path.isfile(local_url_test): + logging.info(f'Started loading {dataset_name}, test') + retrieve(url_test, local_url_test) + logging.info(f'{dataset_name} is loaded, started parsing...') + X_train, y_train = load_svmlight_file(local_url_train, + dtype=dtype) + X_test, y_test = load_svmlight_file(local_url_test, + dtype=dtype) + X_train = X_train.toarray()[:num_train] + X_test = X_test.toarray()[:num_test] + y_train = y_train[:num_train] + y_train[y_train <= 0] = 0 + y_test = y_test[:num_test] + y_test[y_test <= 0] = 0 + + for data, name in zip((X_train, X_test, y_train, y_test), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset {dataset_name} is ready.') + return True + + +def epsilon_80K(dataset_dir: Path) -> bool: + """ + Epsilon dataset + https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html + + Classification task. n_classes = 2. 
+ epsilon_100K x train dataset (80000, 2000) + epsilon_100K y train dataset (80000, 1) + epsilon_100K x test dataset (80000, 2000) + epsilon_100K y test dataset (80000, 1) + """ + dataset_name = 'epsilon_80K' + os.makedirs(dataset_dir, exist_ok=True) + + url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ + '/epsilon_normalized.bz2' + url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ + '/epsilon_normalized.t.bz2' + local_url_train = os.path.join(dataset_dir, os.path.basename(url_train)) + local_url_test = os.path.join(dataset_dir, os.path.basename(url_test)) + + num_train, num_test, dtype = 80000, 80000, np.float32 + if not os.path.isfile(local_url_train): + logging.info(f'Started loading {dataset_name}, train') + retrieve(url_train, local_url_train) + if not os.path.isfile(local_url_test): + logging.info(f'Started loading {dataset_name}, test') + retrieve(url_test, local_url_test) + logging.info(f'{dataset_name} is loaded, started parsing...') + X_train, y_train = load_svmlight_file(local_url_train, + dtype=dtype) + X_test, y_test = load_svmlight_file(local_url_test, + dtype=dtype) + X_train = X_train.toarray()[:num_train] + X_test = X_test.toarray()[:num_test] + y_train = y_train[:num_train] + y_train[y_train <= 0] = 0 + y_test = y_test[:num_test] + y_test[y_test <= 0] = 0 + + for data, name in zip((X_train, X_test, y_train, y_test), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset {dataset_name} is ready.') + return True + + def fraud(dataset_dir: Path) -> bool: """ Credit Card Fraud Detection contest @@ -576,6 +755,46 @@ def higgs_one_m(dataset_dir: Path) -> bool: return True +def higgs_150K(dataset_dir: Path) -> bool: + """ + Higgs dataset from UCI machine learning repository + https://archive.ics.uci.edu/ml/datasets/HIGGS + + Classification task. n_classes = 2. 
+ higgs_150K X train dataset (100000, 28) + higgs_150K y train dataset (50000, 1) + higgs_150K X test dataset (100000, 28) + higgs_150K y test dataset (50000, 1) + """ + dataset_name = 'higgs_150K' + os.makedirs(dataset_dir, exist_ok=True) + + url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz' + local_url = os.path.join(dataset_dir, os.path.basename(url)) + if not os.path.isfile(local_url): + logging.info(f'Started loading {dataset_name}') + retrieve(url, local_url) + logging.info(f'{dataset_name} is loaded, started parsing...') + + nrows_train, nrows_test, dtype = 100000, 50000, np.float32 + data: Any = pd.read_csv(local_url, delimiter=",", header=None, + compression="gzip", dtype=dtype, + nrows=nrows_train + nrows_test) + + X = data[data.columns[1:]] + y = data[data.columns[0:1]] + + x_train, x_test, y_train, y_test = train_test_split( + X, y, train_size=nrows_train, test_size=nrows_test, shuffle=False) + + for data, name in zip((x_train, x_test, y_train, y_test), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset {dataset_name} is ready.') + return True + + def ijcnn(dataset_dir: Path) -> bool: """ Author: Danil Prokhorov. @@ -611,6 +830,28 @@ def ijcnn(dataset_dir: Path) -> bool: logging.info(f'dataset {dataset_name} is ready.') return True +def imb_drama(dataset_dir: Path) -> bool: + """ + imdb_drama dataset from OpenML Datasets ( + https://www.openml.org/d/273) + + Classification task. 
+ Number of features: 1001 + Number of instances: 120919 + """ + dataset_name = 'imb_drama' + os.makedirs(dataset_dir, exist_ok=True) + + x_train, y_train = fetch_openml('IMDB.drama', return_X_y=True, + as_frame=False, data_home=dataset_dir) + logging.info(f'{dataset_name} is loaded, started parsing...') + for data, name in zip((x_train.todense(), y_train), + ('x_train', 'y_train')): + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset {dataset_name} is ready.') + return True + def klaverjas(dataset_dir: Path) -> bool: """ @@ -726,3 +967,49 @@ def susy(dataset_dir: Path) -> bool: np.save(os.path.join(dataset_dir, filename), data) logging.info(f'dataset {dataset_name} is ready.') return True + + +def cifar(dataset_dir: Path) -> bool: + """ + Cifar dataset from LIBSVM Datasets ( + https://www.cs.toronto.edu/~kriz/cifar.html#cifar) + TaskType: Classification + cifar x train dataset (50000, 3072) + cifar y train dataset (50000, 1) + cifar x test dataset (10000, 3072) + cifar y test dataset (10000, 1) + """ + dataset_name = 'cifar' + os.makedirs(dataset_dir, exist_ok=True) + + url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/cifar10.bz2' + url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/cifar10.t.bz2' + local_url_train = os.path.join(dataset_dir, os.path.basename(url_train)) + local_url_test = os.path.join(dataset_dir, os.path.basename(url_test)) + + if not os.path.isfile(local_url_train): + logging.info(f'Started loading {dataset_name}, train') + retrieve(url_train, local_url_train) + logging.info(f'{dataset_name} is loaded, started parsing...') + x_train, y_train = load_svmlight_file(local_url_train, + dtype=np.float32) + + if not os.path.isfile(local_url_test): + logging.info(f'Started loading {dataset_name}, test') + retrieve(url_test, local_url_test) + logging.info(f'{dataset_name} is loaded, started parsing...') + x_test, y_test = 
load_svmlight_file(local_url_test, + dtype=np.float32) + + x_train = x_train.toarray() + y_train[y_train <= 0] = 0 + + x_test = x_test.toarray() + y_test[y_test <= 0] = 0 + + for data, name in zip((x_train, x_test, y_train, y_test), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + return True + From ad47e4e6a95928a2b066028ff186e3a317925429 Mon Sep 17 00:00:00 2001 From: dmitrii-kriukov Date: Wed, 19 Jan 2022 19:24:31 +0300 Subject: [PATCH 02/10] epsilon dataset uses only train --- configs/xpu/knn_clsf.json | 8 ++--- configs/xpu/svm.json | 8 ++--- datasets/loader_classification.py | 50 ++++++++----------------------- 3 files changed, 20 insertions(+), 46 deletions(-) diff --git a/configs/xpu/knn_clsf.json b/configs/xpu/knn_clsf.json index 2d72c4ade..bcc1eb452 100644 --- a/configs/xpu/knn_clsf.json +++ b/configs/xpu/knn_clsf.json @@ -20,8 +20,8 @@ }, "testing": { - "x": "data/epsilon_100K_x_test.npy", - "y": "data/epsilon_100K_y_test.npy" + "x": "data/epsilon_100K_x_train.npy", + "y": "data/epsilon_100K_y_train.npy" } } ], @@ -110,8 +110,8 @@ }, "testing": { - "x": "data/epsilon_100K_x_test.npy", - "y": "data/epsilon_100K_y_test.npy" + "x": "data/epsilon_100K_x_train.npy", + "y": "data/epsilon_100K_y_train.npy" } } ], diff --git a/configs/xpu/svm.json b/configs/xpu/svm.json index a98377532..6158ef642 100644 --- a/configs/xpu/svm.json +++ b/configs/xpu/svm.json @@ -60,8 +60,8 @@ }, "testing": { - "x": "data/epsilon_80K_x_test.npy", - "y": "data/epsilon_80K_y_test.npy" + "x": "data/epsilon_80K_x_train.npy", + "y": "data/epsilon_80K_y_train.npy" } } ], @@ -160,8 +160,8 @@ }, "testing": { - "x": "data/epsilon_80K_x_test.npy", - "y": "data/epsilon_80K_y_test.npy" + "x": "data/epsilon_80K_x_train.npy", + "y": "data/epsilon_80K_y_train.npy" } } ], diff --git a/datasets/loader_classification.py b/datasets/loader_classification.py index ffb84f12f..8e1e143ef 100644 --- 
a/datasets/loader_classification.py +++ b/datasets/loader_classification.py @@ -375,10 +375,10 @@ def epsilon_16K(dataset_dir: Path) -> bool: https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html Classification task. n_classes = 2. - epsilon_100K x train dataset (16000, 2000) - epsilon_100K y train dataset (16000, 1) - epsilon_100K x test dataset (16000, 2000) - epsilon_100K y test dataset (16000, 1) + epsilon_16K x train dataset (16000, 2000) + epsilon_16K y train dataset (16000, 1) + epsilon_16K x test dataset (16000, 2000) + epsilon_16K y test dataset (16000, 1) """ dataset_name = 'epsilon_16K' os.makedirs(dataset_dir, exist_ok=True) @@ -425,40 +425,27 @@ def epsilon_100K(dataset_dir: Path) -> bool: Classification task. n_classes = 2. epsilon_100K x train dataset (50000, 2000) epsilon_100K y train dataset (50000, 1) - epsilon_100K x test dataset (50000, 2000) - epsilon_100K y test dataset (50000, 1) """ dataset_name = 'epsilon_100K' os.makedirs(dataset_dir, exist_ok=True) url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ '/epsilon_normalized.bz2' - url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ - '/epsilon_normalized.t.bz2' local_url_train = os.path.join(dataset_dir, os.path.basename(url_train)) - local_url_test = os.path.join(dataset_dir, os.path.basename(url_test)) - num_train, num_test, dtype = 50000, 50000, np.float32 + num_train, dtype = 50000, np.float32 if not os.path.isfile(local_url_train): logging.info(f'Started loading {dataset_name}, train') retrieve(url_train, local_url_train) - if not os.path.isfile(local_url_test): - logging.info(f'Started loading {dataset_name}, test') - retrieve(url_test, local_url_test) logging.info(f'{dataset_name} is loaded, started parsing...') X_train, y_train = load_svmlight_file(local_url_train, dtype=dtype) - X_test, y_test = load_svmlight_file(local_url_test, - dtype=dtype) X_train = X_train.toarray()[:num_train] - X_test = 
X_test.toarray()[:num_test] y_train = y_train[:num_train] y_train[y_train <= 0] = 0 - y_test = y_test[:num_test] - y_test[y_test <= 0] = 0 - for data, name in zip((X_train, X_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): + for data, name in zip((X_train, y_train, y_test), + ('x_train', 'y_train')): filename = f'{dataset_name}_{name}.npy' np.save(os.path.join(dataset_dir, filename), data) logging.info(f'dataset {dataset_name} is ready.') @@ -471,42 +458,29 @@ def epsilon_80K(dataset_dir: Path) -> bool: https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html Classification task. n_classes = 2. - epsilon_100K x train dataset (80000, 2000) - epsilon_100K y train dataset (80000, 1) - epsilon_100K x test dataset (80000, 2000) - epsilon_100K y test dataset (80000, 1) + epsilon_80K x train dataset (80000, 2000) + epsilon_80K y train dataset (80000, 1) """ dataset_name = 'epsilon_80K' os.makedirs(dataset_dir, exist_ok=True) url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ '/epsilon_normalized.bz2' - url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ - '/epsilon_normalized.t.bz2' local_url_train = os.path.join(dataset_dir, os.path.basename(url_train)) - local_url_test = os.path.join(dataset_dir, os.path.basename(url_test)) - num_train, num_test, dtype = 80000, 80000, np.float32 + num_train, dtype = 80000, np.float32 if not os.path.isfile(local_url_train): logging.info(f'Started loading {dataset_name}, train') retrieve(url_train, local_url_train) - if not os.path.isfile(local_url_test): - logging.info(f'Started loading {dataset_name}, test') - retrieve(url_test, local_url_test) logging.info(f'{dataset_name} is loaded, started parsing...') X_train, y_train = load_svmlight_file(local_url_train, dtype=dtype) - X_test, y_test = load_svmlight_file(local_url_test, - dtype=dtype) X_train = X_train.toarray()[:num_train] - X_test = X_test.toarray()[:num_test] y_train = y_train[:num_train] 
y_train[y_train <= 0] = 0 - y_test = y_test[:num_test] - y_test[y_test <= 0] = 0 - for data, name in zip((X_train, X_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): + for data, name in zip((X_train, y_train, y_test), + ('x_train', 'y_train')): filename = f'{dataset_name}_{name}.npy' np.save(os.path.join(dataset_dir, filename), data) logging.info(f'dataset {dataset_name} is ready.') From a9f982625559abc2f138ce59e0c592161d3de168 Mon Sep 17 00:00:00 2001 From: dmitrii-kriukov Date: Wed, 19 Jan 2022 20:49:31 +0300 Subject: [PATCH 03/10] fix epsilon --- datasets/loader_classification.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datasets/loader_classification.py b/datasets/loader_classification.py index 8e1e143ef..8ffa70e40 100644 --- a/datasets/loader_classification.py +++ b/datasets/loader_classification.py @@ -444,7 +444,7 @@ def epsilon_100K(dataset_dir: Path) -> bool: y_train = y_train[:num_train] y_train[y_train <= 0] = 0 - for data, name in zip((X_train, y_train, y_test), + for data, name in zip((X_train, y_train), ('x_train', 'y_train')): filename = f'{dataset_name}_{name}.npy' np.save(os.path.join(dataset_dir, filename), data) @@ -479,7 +479,7 @@ def epsilon_80K(dataset_dir: Path) -> bool: y_train = y_train[:num_train] y_train[y_train <= 0] = 0 - for data, name in zip((X_train, y_train, y_test), + for data, name in zip((X_train, y_train), ('x_train', 'y_train')): filename = f'{dataset_name}_{name}.npy' np.save(os.path.join(dataset_dir, filename), data) From 8fa9824237fdcc204e9c5423adcb8706f6038ef6 Mon Sep 17 00:00:00 2001 From: dmitrii-kriukov Date: Thu, 20 Jan 2022 15:53:56 +0300 Subject: [PATCH 04/10] disable imb_drama --- configs/xpu/knn_clsf.json | 20 ++++++------ configs/xpu/knn_regr.json | 10 +++--- configs/xpu/svm.json | 52 ++++--------------------------- datasets/load_datasets.py | 9 +++--- datasets/loader_classification.py | 39 +++++------------------ 5 files changed, 33 insertions(+), 97 deletions(-) 
diff --git a/configs/xpu/knn_clsf.json b/configs/xpu/knn_clsf.json index bcc1eb452..d13128f2f 100644 --- a/configs/xpu/knn_clsf.json +++ b/configs/xpu/knn_clsf.json @@ -64,16 +64,16 @@ "dataset": [ { "source": "npy", - "name": "cifar", + "name": "cifar_binary", "training": { - "x": "data/cifar_x_train.npy", - "y": "data/cifar_y_train.npy" + "x": "data/cifar_binary_x_train.npy", + "y": "data/cifar_binary_y_train.npy" }, "testing": { - "x": "data/cifar_x_test.npy", - "y": "data/cifar_y_test.npy" + "x": "data/cifar_binary_x_test.npy", + "y": "data/cifar_binary_y_test.npy" } } ], @@ -142,16 +142,16 @@ "dataset": [ { "source": "npy", - "name": "cifar", + "name": "cifar_binary", "training": { - "x": "data/cifar_x_train.npy", - "y": "data/cifar_y_train.npy" + "x": "data/cifar_binary_x_train.npy", + "y": "data/cifar_binary_y_train.npy" }, "testing": { - "x": "data/cifar_x_test.npy", - "y": "data/cifar_y_test.npy" + "x": "data/cifar_binary_x_test.npy", + "y": "data/cifar_binary_y_test.npy" } } ], diff --git a/configs/xpu/knn_regr.json b/configs/xpu/knn_regr.json index ec1fbc9a9..32aa47c3b 100644 --- a/configs/xpu/knn_regr.json +++ b/configs/xpu/knn_regr.json @@ -50,16 +50,16 @@ "dataset": [ { "source": "npy", - "name": "cifar", + "name": "cifar_binary", "training": { - "x": "data/cifar_x_train.npy", - "y": "data/cifar_y_train.npy" + "x": "data/cifar_binary_x_train.npy", + "y": "data/cifar_binary_y_train.npy" }, "testing": { - "x": "data/cifar_x_test.npy", - "y": "data/cifar_y_test.npy" + "x": "data/cifar_binary_x_test.npy", + "y": "data/cifar_binary_y_test.npy" } } ], diff --git a/configs/xpu/svm.json b/configs/xpu/svm.json index 6158ef642..5f28bfe4a 100644 --- a/configs/xpu/svm.json +++ b/configs/xpu/svm.json @@ -4,7 +4,7 @@ "algorithm": "svm", "data-format": "pandas", "data-order": "F", - "dtype": ["float32", "float64"], + "dtype": "float32", "device": ["host", "cpu", "gpu", "none"] }, "cases": [ @@ -72,42 +72,22 @@ "dataset": [ { "source": "npy", - "name": "cifar", + 
"name": "cifar_binary", "training": { - "x": "data/cifar_x_train.npy", - "y": "data/cifar_y_train.npy" + "x": "data/cifar_binary_x_train.npy", + "y": "data/cifar_binary_y_train.npy" }, "testing": { - "x": "data/cifar_x_test.npy", - "y": "data/cifar_y_test.npy" + "x": "data/cifar_binary_x_test.npy", + "y": "data/cifar_binary_y_test.npy" } } ], "C": 1.0e-7, "kernel": "linear" }, - { - "dataset": [ - { - "source": "npy", - "name": "imb_drama", - "training": - { - "x": "data/imb_drama_x_train.npy", - "y": "data/imb_drama_y_train.npy" - }, - "testing": - { - "x": "data/imb_drama_x_train.npy", - "y": "data/imb_drama_y_train.npy" - } - } - ], - "C": 1e-3, - "kernel": "linear" - }, { "dataset": [ { @@ -167,26 +147,6 @@ ], "C": 1000.0, "kernel": "rbf" - }, - { - "dataset": [ - { - "source": "npy", - "name": "imb_drama", - "training": - { - "x": "data/imb_drama_x_train.npy", - "y": "data/imb_drama_y_train.npy" - }, - "testing": - { - "x": "data/imb_drama_x_train.npy", - "y": "data/imb_drama_y_train.npy" - } - } - ], - "C": 50, - "kernel": "rbf" } ] } diff --git a/datasets/load_datasets.py b/datasets/load_datasets.py index fbd7685d4..a5752963e 100644 --- a/datasets/load_datasets.py +++ b/datasets/load_datasets.py @@ -22,11 +22,11 @@ from typing import Callable, Dict from .loader_classification import (a_nine_a, airline, airline_ohe, bosch, - census, cifar, codrnanorm, covtype_binary, creditcard, + census, cifar_binary, codrnanorm, covtype_binary, creditcard, epsilon_16K, epsilon_80K, epsilon, epsilon_100K, fraud, gisette, hepmass_150K, - higgs, higgs_one_m, higgs_150K, ijcnn, imb_drama, - klaverjas, santander, skin_segmentation, susy) + higgs, higgs_one_m, higgs_150K, ijcnn, klaverjas, + santander, skin_segmentation, susy) from .loader_multiclass import (connect, covertype, covtype, letters, mlsr, mnist, msrank, plasticc, sensit) from .loader_regression import (abalone, california_housing, fried, @@ -42,7 +42,7 @@ "bosch": bosch, "california_housing": california_housing, 
"census": census, - "cifar": cifar, + "cifar_binary": cifar_binary, "codrnanorm": codrnanorm, "connect": connect, "covtype_binary": covtype_binary, @@ -60,7 +60,6 @@ "higgs1m": higgs_one_m, "higgs_150K": higgs_150K, "ijcnn": ijcnn, - "imb_drama": imb_drama, "klaverjas": klaverjas, "letters": letters, "mlsr": mlsr, diff --git a/datasets/loader_classification.py b/datasets/loader_classification.py index 8ffa70e40..16292ee81 100644 --- a/datasets/loader_classification.py +++ b/datasets/loader_classification.py @@ -804,28 +804,6 @@ def ijcnn(dataset_dir: Path) -> bool: logging.info(f'dataset {dataset_name} is ready.') return True -def imb_drama(dataset_dir: Path) -> bool: - """ - imdb_drama dataset from OpenML Datasets ( - https://www.openml.org/d/273) - - Classification task. - Number of features: 1001 - Number of instances: 120919 - """ - dataset_name = 'imb_drama' - os.makedirs(dataset_dir, exist_ok=True) - - x_train, y_train = fetch_openml('IMDB.drama', return_X_y=True, - as_frame=False, data_home=dataset_dir) - logging.info(f'{dataset_name} is loaded, started parsing...') - for data, name in zip((x_train.todense(), y_train), - ('x_train', 'y_train')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - def klaverjas(dataset_dir: Path) -> bool: """ @@ -943,17 +921,17 @@ def susy(dataset_dir: Path) -> bool: return True -def cifar(dataset_dir: Path) -> bool: +def cifar_binary(dataset_dir: Path) -> bool: """ Cifar dataset from LIBSVM Datasets ( https://www.cs.toronto.edu/~kriz/cifar.html#cifar) TaskType: Classification - cifar x train dataset (50000, 3072) - cifar y train dataset (50000, 1) - cifar x test dataset (10000, 3072) - cifar y test dataset (10000, 1) + cifar_binary x train dataset (50000, 3072) + cifar_binary y train dataset (50000, 1) + cifar_binary x test dataset (10000, 3072) + cifar_binary y test dataset (10000, 1) """ - dataset_name = 'cifar' + 
dataset_name = 'cifar_binary' os.makedirs(dataset_dir, exist_ok=True) url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/cifar10.bz2' @@ -976,14 +954,13 @@ def cifar(dataset_dir: Path) -> bool: dtype=np.float32) x_train = x_train.toarray() - y_train[y_train <= 0] = 0 + y_train = (y_train > 0).astype(int) x_test = x_test.toarray() - y_test[y_test <= 0] = 0 + y_test = (y_test > 0).astype(int) for data, name in zip((x_train, x_test, y_train, y_test), ('x_train', 'x_test', 'y_train', 'y_test')): filename = f'{dataset_name}_{name}.npy' np.save(os.path.join(dataset_dir, filename), data) return True - From f54fb17bad9076c7f034560f63d731971ff86ed6 Mon Sep 17 00:00:00 2001 From: dmitrii-kriukov Date: Thu, 20 Jan 2022 16:07:22 +0300 Subject: [PATCH 05/10] C-style format --- configs/xpu/svm.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/xpu/svm.json b/configs/xpu/svm.json index 5f28bfe4a..38928d841 100644 --- a/configs/xpu/svm.json +++ b/configs/xpu/svm.json @@ -3,7 +3,7 @@ "lib": "sklearn", "algorithm": "svm", "data-format": "pandas", - "data-order": "F", + "data-order": "С", "dtype": "float32", "device": ["host", "cpu", "gpu", "none"] }, From 2dcdc49685ac8d12c4f98d52b48dd9303fa456e4 Mon Sep 17 00:00:00 2001 From: dmitrii-kriukov Date: Thu, 20 Jan 2022 21:45:45 +0300 Subject: [PATCH 06/10] Latin C --- configs/xpu/svm.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/xpu/svm.json b/configs/xpu/svm.json index 38928d841..667622050 100644 --- a/configs/xpu/svm.json +++ b/configs/xpu/svm.json @@ -3,7 +3,7 @@ "lib": "sklearn", "algorithm": "svm", "data-format": "pandas", - "data-order": "С", + "data-order": "C", "dtype": "float32", "device": ["host", "cpu", "gpu", "none"] }, From 065fbf3f4bc9925a0251d743c76cd76f654c8091 Mon Sep 17 00:00:00 2001 From: dmitrii-kriukov Date: Fri, 21 Jan 2022 17:44:26 +0300 Subject: [PATCH 07/10] fix warning with higgs --- datasets/loader_classification.py | 13 
++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/datasets/loader_classification.py b/datasets/loader_classification.py index 16292ee81..59c936f66 100644 --- a/datasets/loader_classification.py +++ b/datasets/loader_classification.py @@ -755,11 +755,14 @@ def higgs_150K(dataset_dir: Path) -> bool: compression="gzip", dtype=dtype, nrows=nrows_train + nrows_test) - X = data[data.columns[1:]] - y = data[data.columns[0:1]] - - x_train, x_test, y_train, y_test = train_test_split( - X, y, train_size=nrows_train, test_size=nrows_test, shuffle=False) + x_train = np.ascontiguousarray(data.values[:nrows_train, 1:], dtype=dtype) + y_train = np.ascontiguousarray(data.values[:nrows_train, 0], dtype=dtype) + x_test = np.ascontiguousarray( + data.values[nrows_train: nrows_train + nrows_test, 1:], + dtype=dtype) + y_test = np.ascontiguousarray( + data.values[nrows_train: nrows_train + nrows_test, 0], + dtype=dtype) for data, name in zip((x_train, x_test, y_train, y_test), ('x_train', 'x_test', 'y_train', 'y_test')): From 0b708c0795e1fb07cb7dea63a45626757bafa36b Mon Sep 17 00:00:00 2001 From: dmitrii-kriukov Date: Thu, 27 Jan 2022 19:15:13 +0300 Subject: [PATCH 08/10] resolve conflictsx2 --- datasets/load_datasets.py | 3 -- datasets/loader_classification.py | 59 ++++++++++++++++++------------- 2 files changed, 34 insertions(+), 28 deletions(-) diff --git a/datasets/load_datasets.py b/datasets/load_datasets.py index f7ad40558..e51916243 100644 --- a/datasets/load_datasets.py +++ b/datasets/load_datasets.py @@ -66,12 +66,9 @@ "hepmass_10K_cluster": hepmass_10K_cluster, "higgs": higgs, "higgs1m": higgs_one_m, -<<<<<<< HEAD "higgs_150K": higgs_150K, -======= "higgs_10500K": higgs_10500K, "higgs_one_m_clustering": higgs_one_m_clustering, ->>>>>>> aa4705e4eec0183f831a5912b6cbd0e22c419fa2 "ijcnn": ijcnn, "klaverjas": klaverjas, "letters": letters, diff --git a/datasets/loader_classification.py b/datasets/loader_classification.py index 5c0a0ec80..aad50bd1c 100644 
--- a/datasets/loader_classification.py +++ b/datasets/loader_classification.py @@ -369,34 +369,22 @@ def epsilon(dataset_dir: Path) -> bool: return True -<<<<<<< HEAD def epsilon_16K(dataset_dir: Path) -> bool: -======= -def epsilon_30K(dataset_dir: Path) -> bool: ->>>>>>> aa4705e4eec0183f831a5912b6cbd0e22c419fa2 """ Epsilon dataset https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html Classification task. n_classes = 2. -<<<<<<< HEAD epsilon_16K x train dataset (16000, 2000) epsilon_16K y train dataset (16000, 1) epsilon_16K x test dataset (16000, 2000) epsilon_16K y test dataset (16000, 1) """ dataset_name = 'epsilon_16K' -======= - epsilon_30K x train dataset (30000, 2000) - epsilon_30K y train dataset (30000, 2000) - """ - dataset_name = 'epsilon_30K' ->>>>>>> aa4705e4eec0183f831a5912b6cbd0e22c419fa2 os.makedirs(dataset_dir, exist_ok=True) url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ '/epsilon_normalized.bz2' -<<<<<<< HEAD url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ '/epsilon_normalized.t.bz2' local_url_train = os.path.join(dataset_dir, os.path.basename(url_train)) @@ -429,6 +417,40 @@ def epsilon_30K(dataset_dir: Path) -> bool: return True +def epsilon_30K(dataset_dir: Path) -> bool: + """ + Epsilon dataset + https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html + + Classification task. n_classes = 2. 
+ epsilon_30K x train dataset (30000, 2000) + epsilon_30K y train dataset (30000, 1) + """ + dataset_name = 'epsilon_30K' + os.makedirs(dataset_dir, exist_ok=True) + + url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ + '/epsilon_normalized.bz2' + local_url_train = os.path.join(dataset_dir, os.path.basename(url_train)) + + num_train, dtype = 30000, np.float32 + if not os.path.isfile(local_url_train): + logging.info(f'Started loading {dataset_name}, train') + retrieve(url_train, local_url_train) + logging.info(f'{dataset_name} is loaded, started parsing...') + X_train, y_train = load_svmlight_file(local_url_train, + dtype=dtype) + X_train = X_train.toarray()[:num_train] + y_train = y_train[:num_train] + + for data, name in zip((X_train, y_train), + ('x_train', 'y_train')): + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset {dataset_name} is ready.') + return True + + def epsilon_100K(dataset_dir: Path) -> bool: """ Epsilon dataset @@ -490,19 +512,6 @@ def epsilon_80K(dataset_dir: Path) -> bool: X_train = X_train.toarray()[:num_train] y_train = y_train[:num_train] y_train[y_train <= 0] = 0 -======= - local_url_train = os.path.join(dataset_dir, os.path.basename(url_train)) - - num_train, dtype = 30000, np.float32 - if not os.path.isfile(local_url_train): - logging.info(f'Started loading {dataset_name}, train') - retrieve(url_train, local_url_train) - logging.info(f'{dataset_name} is loaded, started parsing...') - X_train, y_train = load_svmlight_file(local_url_train, - dtype=dtype) - X_train = X_train.toarray()[:num_train] - y_train = y_train[:num_train] ->>>>>>> aa4705e4eec0183f831a5912b6cbd0e22c419fa2 for data, name in zip((X_train, y_train), ('x_train', 'y_train')): From 9a0115d4b467ef100aa14706c0ac6f5a7fd7866f Mon Sep 17 00:00:00 2001 From: dmitrii-kriukov Date: Thu, 27 Jan 2022 19:18:31 +0300 Subject: [PATCH 09/10] return covertype --- datasets/load_datasets.py |
1 + 1 file changed, 1 insertion(+) diff --git a/datasets/load_datasets.py b/datasets/load_datasets.py index e51916243..9e8ad552a 100644 --- a/datasets/load_datasets.py +++ b/datasets/load_datasets.py @@ -49,6 +49,7 @@ "cifar_cluster": cifar_cluster, "codrnanorm": codrnanorm, "connect": connect, + "covertype": covertype, "covtype_binary": covtype_binary, "covtype": covtype, "creditcard": creditcard, From fb1341deeaea0171d97839ed7c7632193356075f Mon Sep 17 00:00:00 2001 From: dmitrii-kriukov Date: Thu, 27 Jan 2022 19:20:53 +0300 Subject: [PATCH 10/10] too much cifar --- datasets/loader_classification.py | 45 ------------------------------- 1 file changed, 45 deletions(-) diff --git a/datasets/loader_classification.py b/datasets/loader_classification.py index aad50bd1c..c7eb61dfd 100644 --- a/datasets/loader_classification.py +++ b/datasets/loader_classification.py @@ -1001,48 +1001,3 @@ def susy(dataset_dir: Path) -> bool: np.save(os.path.join(dataset_dir, filename), data) logging.info(f'dataset {dataset_name} is ready.') return True - - -def cifar_binary(dataset_dir: Path) -> bool: - """ - Cifar dataset from LIBSVM Datasets ( - https://www.cs.toronto.edu/~kriz/cifar.html#cifar) - TaskType: Classification - cifar_binary x train dataset (50000, 3072) - cifar_binary y train dataset (50000, 1) - cifar_binary x test dataset (10000, 3072) - cifar_binary y test dataset (10000, 1) - """ - dataset_name = 'cifar_binary' - os.makedirs(dataset_dir, exist_ok=True) - - url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/cifar10.bz2' - url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/cifar10.t.bz2' - local_url_train = os.path.join(dataset_dir, os.path.basename(url_train)) - local_url_test = os.path.join(dataset_dir, os.path.basename(url_test)) - - if not os.path.isfile(local_url_train): - logging.info(f'Started loading {dataset_name}, train') - retrieve(url_train, local_url_train) - logging.info(f'{dataset_name} is loaded, 
started parsing...') - x_train, y_train = load_svmlight_file(local_url_train, - dtype=np.float32) - - if not os.path.isfile(local_url_test): - logging.info(f'Started loading {dataset_name}, test') - retrieve(url_test, local_url_test) - logging.info(f'{dataset_name} is loaded, started parsing...') - x_test, y_test = load_svmlight_file(local_url_test, - dtype=np.float32) - - x_train = x_train.toarray() - y_train = (y_train > 0).astype(int) - - x_test = x_test.toarray() - y_test = (y_test > 0).astype(int) - - for data, name in zip((x_train, x_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - return True