From 6272ea525c8155efb896525e6d1672e779fca3b3 Mon Sep 17 00:00:00 2001 From: dmitrii-kriukov Date: Mon, 17 Jan 2022 19:11:30 +0300 Subject: [PATCH 1/6] knn_svm --- configs/xpu/knn_clsf.json | 162 +++++++++++++++++ configs/xpu/knn_regr.json | 69 +++++++ configs/xpu/svm.json | 192 ++++++++++++++++++++ datasets/load_datasets.py | 14 +- datasets/loader_classification.py | 289 +++++++++++++++++++++++++++++- 5 files changed, 722 insertions(+), 4 deletions(-) create mode 100644 configs/xpu/knn_clsf.json create mode 100644 configs/xpu/knn_regr.json create mode 100644 configs/xpu/svm.json diff --git a/configs/xpu/knn_clsf.json b/configs/xpu/knn_clsf.json new file mode 100644 index 000000000..2d72c4ade --- /dev/null +++ b/configs/xpu/knn_clsf.json @@ -0,0 +1,162 @@ +{ + "common": { + "lib": "sklearn", + "algorithm": "knn_clsf", + "data-format": "pandas", + "data-order": "F", + "dtype": ["float32", "float64"], + "device": ["host", "cpu", "gpu", "none"] + }, + "cases": [ + { + "dataset": [ + { + "source": "npy", + "name": "epsilon_100K", + "training": + { + "x": "data/epsilon_100K_x_train.npy", + "y": "data/epsilon_100K_y_train.npy" + }, + "testing": + { + "x": "data/epsilon_100K_x_test.npy", + "y": "data/epsilon_100K_y_test.npy" + } + } + ], + "n-neighbors": [2, 100] + }, + { + "dataset": [ + { + "source": "npy", + "name": "higgs_150K", + "training": + { + "x": "data/higgs_150K_x_train.npy", + "y": "data/higgs_150K_y_train.npy" + }, + "testing": + { + "x": "data/higgs_150K_x_test.npy", + "y": "data/higgs_150K_y_test.npy" + } + }, + { + "source": "npy", + "name": "hepmass_150K", + "training": + { + "x": "data/hepmass_150K_x_train.npy", + "y": "data/hepmass_150K_y_train.npy" + }, + "testing": + { + "x": "data/hepmass_150K_x_test.npy", + "y": "data/hepmass_150K_y_test.npy" + } + } + ], + "n-neighbors": [5, 100] + }, + { + "dataset": [ + { + "source": "npy", + "name": "cifar", + "training": + { + "x": "data/cifar_x_train.npy", + "y": "data/cifar_y_train.npy" + }, + "testing": + { + "x": "data/cifar_x_test.npy", + "y": "data/cifar_y_test.npy" + } + } + ], + "n-neighbors": 7 + }, + { + "dataset": [ + { + "source": "npy", + "name": "mnist", + "training": + { + "x": "data/mnist_x_train.npy", + "y": "data/mnist_y_train.npy" + }, + "testing": + { + "x": "data/mnist_x_test.npy", + "y": "data/mnist_y_test.npy" + } + } + ], + "n-neighbors": 5 + }, + { + "dataset": [ + { + "source": "npy", + "name": "epsilon_100K", + "training": + { + "x": "data/epsilon_100K_x_train.npy", + "y": "data/epsilon_100K_y_train.npy" + }, + "testing": + { + "x": "data/epsilon_100K_x_test.npy", + "y": "data/epsilon_100K_y_test.npy" + } + } + ], + "task": "search", + "n-neighbors": 2 + }, + { + "dataset": [ + { + "source": "npy", + "name": "higgs_150K", + "training": + { + "x": "data/higgs_150K_x_train.npy", + "y": "data/higgs_150K_y_train.npy" + }, + "testing": + { + "x": "data/higgs_150K_x_test.npy", + "y": "data/higgs_150K_y_test.npy" + } + } + ], + "task": "search", + "n-neighbors": 5 + }, + { + "dataset": [ + { + "source": "npy", + "name": "cifar", + "training": + { + "x": "data/cifar_x_train.npy", + "y": "data/cifar_y_train.npy" + }, + "testing": + { + "x": "data/cifar_x_test.npy", + "y": "data/cifar_y_test.npy" + } + } + ], + "task": "search", + "n-neighbors": 7 + } + ] +} diff --git a/configs/xpu/knn_regr.json b/configs/xpu/knn_regr.json new file mode 100644 index 000000000..ec1fbc9a9 --- /dev/null +++ b/configs/xpu/knn_regr.json @@ -0,0 +1,69 @@ +{ + "common": { + "lib": "sklearn", + "algorithm": "knn_regr", + "data-format": 
"pandas", + "data-order": "F", + "dtype": ["float32", "float64"], + "device": ["host", "cpu", "gpu", "none"] + }, + "cases": [ + { + "dataset": [ + { + "source": "npy", + "name": "year_prediction_msd", + "training": + { + "x": "data/year_prediction_msd_x_train.npy", + "y": "data/year_prediction_msd_y_train.npy" + }, + "testing": + { + "x": "data/year_prediction_msd_x_test.npy", + "y": "data/year_prediction_msd_y_test.npy" + } + } + ], + "n-neighbors": 2 + }, + { + "dataset": [ + { + "source": "npy", + "name": "higgs_150K", + "training": + { + "x": "data/higgs_150K_x_train.npy", + "y": "data/higgs_150K_y_train.npy" + }, + "testing": + { + "x": "data/higgs_150K_x_test.npy", + "y": "data/higgs_150K_y_test.npy" + } + } + ], + "n-neighbors": 5 + }, + { + "dataset": [ + { + "source": "npy", + "name": "cifar", + "training": + { + "x": "data/cifar_x_train.npy", + "y": "data/cifar_y_train.npy" + }, + "testing": + { + "x": "data/cifar_x_test.npy", + "y": "data/cifar_y_test.npy" + } + } + ], + "n-neighbors": 7 + } + ] +} diff --git a/configs/xpu/svm.json b/configs/xpu/svm.json new file mode 100644 index 000000000..a98377532 --- /dev/null +++ b/configs/xpu/svm.json @@ -0,0 +1,192 @@ +{ + "common": { + "lib": "sklearn", + "algorithm": "svm", + "data-format": "pandas", + "data-order": "F", + "dtype": ["float32", "float64"], + "device": ["host", "cpu", "gpu", "none"] + }, + "cases": [ + { + "dataset": [ + { + "source": "npy", + "name": "gisette", + "training": + { + "x": "data/gisette_x_train.npy", + "y": "data/gisette_y_train.npy" + }, + "testing": + { + "x": "data/gisette_x_test.npy", + "y": "data/gisette_y_test.npy" + } + } + ], + "C": 1.5e-3, + "kernel": "linear" + }, + { + "dataset": [ + { + "source": "npy", + "name": "higgs_150K", + "training": + { + "x": "data/higgs_150K_x_train.npy", + "y": "data/higgs_150K_y_train.npy" + }, + "testing": + { + "x": "data/higgs_150K_x_test.npy", + "y": "data/higgs_150K_y_test.npy" + } + } + ], + "C": 1.0, + "kernel": "linear" + }, + { + "dataset": [ + { + "source": "npy", + "name": "epsilon_80K", + "training": + { + "x": "data/epsilon_80K_x_train.npy", + "y": "data/epsilon_80K_y_train.npy" + }, + "testing": + { + "x": "data/epsilon_80K_x_test.npy", + "y": "data/epsilon_80K_y_test.npy" + } + } + ], + "C": 1.0, + "kernel": "linear" + }, + { + "dataset": [ + { + "source": "npy", + "name": "cifar", + "training": + { + "x": "data/cifar_x_train.npy", + "y": "data/cifar_y_train.npy" + }, + "testing": + { + "x": "data/cifar_x_test.npy", + "y": "data/cifar_y_test.npy" + } + } + ], + "C": 1.0e-7, + "kernel": "linear" + }, + { + "dataset": [ + { + "source": "npy", + "name": "imb_drama", + "training": + { + "x": "data/imb_drama_x_train.npy", + "y": "data/imb_drama_y_train.npy" + }, + "testing": + { + "x": "data/imb_drama_x_train.npy", + "y": "data/imb_drama_y_train.npy" + } + } + ], + "C": 1e-3, + "kernel": "linear" + }, + { + "dataset": [ + { + "source": "npy", + "name": "epsilon_16K", + "training": + { + "x": "data/epsilon_16K_x_train.npy", + "y": "data/epsilon_16K_y_train.npy" + }, + "testing": + { + "x": "data/epsilon_16K_x_test.npy", + "y": "data/epsilon_16K_y_test.npy" + } + } + ], + "C": 9.0e2, + "kernel": "rbf" + }, + { + "dataset": [ + { + "source": "npy", + "name": "covtype_binary", + "training": + { + "x": "data/covtype_binary_x_train.npy", + "y": "data/covtype_binary_y_train.npy" + }, + "testing": + { + "x": "data/covtype_binary_x_test.npy", + "y": "data/covtype_binary_y_test.npy" + } + } + ], + "C": 1000.0, + "kernel": "rbf" + }, + { + "dataset": [ + { + 
"source": "npy", + "name": "epsilon_80K", + "training": + { + "x": "data/epsilon_80K_x_train.npy", + "y": "data/epsilon_80K_y_train.npy" + }, + "testing": + { + "x": "data/epsilon_80K_x_test.npy", + "y": "data/epsilon_80K_y_test.npy" + } + } + ], + "C": 1000.0, + "kernel": "rbf" + }, + { + "dataset": [ + { + "source": "npy", + "name": "imb_drama", + "training": + { + "x": "data/imb_drama_x_train.npy", + "y": "data/imb_drama_y_train.npy" + }, + "testing": + { + "x": "data/imb_drama_x_train.npy", + "y": "data/imb_drama_y_train.npy" + } + } + ], + "C": 50, + "kernel": "rbf" + } + ] +} diff --git a/datasets/load_datasets.py b/datasets/load_datasets.py index 0a7874d92..fbd7685d4 100644 --- a/datasets/load_datasets.py +++ b/datasets/load_datasets.py @@ -22,8 +22,10 @@ from typing import Callable, Dict from .loader_classification import (a_nine_a, airline, airline_ohe, bosch, - census, codrnanorm, creditcard, epsilon, fraud, - gisette, hepmass_150K, higgs, higgs_one_m, ijcnn, + census, cifar, codrnanorm, covtype_binary, creditcard, + epsilon_16K, epsilon_80K, epsilon, epsilon_100K, + fraud, gisette, hepmass_150K, + higgs, higgs_one_m, higgs_150K, ijcnn, imb_drama, klaverjas, santander, skin_segmentation, susy) from .loader_multiclass import (connect, covertype, covtype, letters, mlsr, mnist, msrank, plasticc, sensit) @@ -40,19 +42,25 @@ "bosch": bosch, "california_housing": california_housing, "census": census, + "cifar": cifar, "codrnanorm": codrnanorm, "connect": connect, - "covertype": covertype, + "covtype_binary": covtype_binary, "covtype": covtype, "creditcard": creditcard, "epsilon": epsilon, + "epsilon_16K": epsilon_16K, + "epsilon_80K": epsilon_80K, + "epsilon_100K": epsilon_100K, "fraud": fraud, "fried": fried, "gisette": gisette, "hepmass_150K": hepmass_150K, "higgs": higgs, "higgs1m": higgs_one_m, + "higgs_150K": higgs_150K, "ijcnn": ijcnn, + "imb_drama": imb_drama, "klaverjas": klaverjas, "letters": letters, "mlsr": mlsr, diff --git a/datasets/loader_classification.py b/datasets/loader_classification.py index fc3cb892d..ffb84f12f 100644 --- a/datasets/loader_classification.py +++ b/datasets/loader_classification.py @@ -22,7 +22,7 @@ import numpy as np import pandas as pd -from sklearn.datasets import fetch_openml, load_svmlight_file +from sklearn.datasets import fetch_openml, load_svmlight_file, fetch_covtype from sklearn.model_selection import train_test_split from .loader_utils import retrieve @@ -261,6 +261,41 @@ def codrnanorm(dataset_dir: Path) -> bool: return True +def covtype_binary(dataset_dir: Path) -> bool: + """ + Cover type dataset from UCI machine learning repository + https://archive.ics.uci.edu/ml/datasets/covertype + + y contains 7 unique class labels from 1 to 7 inclusive. + Classification task. n_classes = 7. 
+    The 7 cover types are binarized below via y = (y > 3), so the
+    saved labels are 0/1 and the stored task has n_classes = 2.
+    covtype_binary X train dataset (100000, 54)
+    covtype_binary y train dataset (100000, 1)
+    covtype_binary X test dataset (100000, 54)
+    covtype_binary y test dataset (100000, 1)
+    """
+    dataset_name = 'covtype_binary'
+    os.makedirs(dataset_dir, exist_ok=True)
+
+    nrows_train, nrows_test = 100000, 100000
+    logging.info(f'Started loading {dataset_name}')
+    X, y = fetch_covtype(return_X_y=True)  # pylint: disable=unexpected-keyword-arg
+    logging.info(f'{dataset_name} is loaded, started parsing...')
+
+    y = (y > 3).astype(int)
+
+    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77,
+                                                        train_size=nrows_train,
+                                                        test_size=nrows_test,
+                                                        shuffle=False
+                                                        )
+    for data, name in zip((X_train, X_test, y_train, y_test),
+                          ('x_train', 'x_test', 'y_train', 'y_test')):
+        filename = f'{dataset_name}_{name}.npy'
+        np.save(os.path.join(dataset_dir, filename), data)
+    logging.info(f'dataset {dataset_name} is ready.')
+    return True
+
+
 def creditcard(dataset_dir: Path) -> bool:
     """
     Classification task. n_classes = 2.
@@ -334,6 +369,150 @@ def epsilon(dataset_dir: Path) -> bool:
     return True
 
 
+def epsilon_16K(dataset_dir: Path) -> bool:
+    """
+    Epsilon dataset
+    https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html
+
+    Classification task. n_classes = 2.
+    epsilon_16K x train dataset (16000, 2000)
+    epsilon_16K y train dataset (16000, 1)
+    epsilon_16K x test dataset (16000, 2000)
+    epsilon_16K y test dataset (16000, 1)
+    """
+    dataset_name = 'epsilon_16K'
+    os.makedirs(dataset_dir, exist_ok=True)
+
+    url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \
+                '/epsilon_normalized.bz2'
+    url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \
+               '/epsilon_normalized.t.bz2'
+    local_url_train = os.path.join(dataset_dir, os.path.basename(url_train))
+    local_url_test = os.path.join(dataset_dir, os.path.basename(url_test))
+
+    num_train, num_test, dtype = 16000, 16000, np.float32
+    if not os.path.isfile(local_url_train):
+        logging.info(f'Started loading {dataset_name}, train')
+        retrieve(url_train, local_url_train)
+    if not os.path.isfile(local_url_test):
+        logging.info(f'Started loading {dataset_name}, test')
+        retrieve(url_test, local_url_test)
+    logging.info(f'{dataset_name} is loaded, started parsing...')
+    X_train, y_train = load_svmlight_file(local_url_train,
+                                          dtype=dtype)
+    X_test, y_test = load_svmlight_file(local_url_test,
+                                        dtype=dtype)
+    X_train = X_train.toarray()[:num_train]
+    X_test = X_test.toarray()[:num_test]
+    y_train = y_train[:num_train]
+    y_train[y_train <= 0] = 0
+    y_test = y_test[:num_test]
+    y_test[y_test <= 0] = 0
+
+    for data, name in zip((X_train, X_test, y_train, y_test),
+                          ('x_train', 'x_test', 'y_train', 'y_test')):
+        filename = f'{dataset_name}_{name}.npy'
+        np.save(os.path.join(dataset_dir, filename), data)
+    logging.info(f'dataset {dataset_name} is ready.')
+    return True
+
+
+def epsilon_100K(dataset_dir: Path) -> bool:
+    """
+    Epsilon dataset
+    https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html
+
+    Classification task. n_classes = 2.
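+    Only the first 50000 train and 50000 test rows of the full epsilon
+    files are kept, i.e. 100K rows in total: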
+ epsilon_100K x train dataset (50000, 2000) + epsilon_100K y train dataset (50000, 1) + epsilon_100K x test dataset (50000, 2000) + epsilon_100K y test dataset (50000, 1) + """ + dataset_name = 'epsilon_100K' + os.makedirs(dataset_dir, exist_ok=True) + + url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ + '/epsilon_normalized.bz2' + url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ + '/epsilon_normalized.t.bz2' + local_url_train = os.path.join(dataset_dir, os.path.basename(url_train)) + local_url_test = os.path.join(dataset_dir, os.path.basename(url_test)) + + num_train, num_test, dtype = 50000, 50000, np.float32 + if not os.path.isfile(local_url_train): + logging.info(f'Started loading {dataset_name}, train') + retrieve(url_train, local_url_train) + if not os.path.isfile(local_url_test): + logging.info(f'Started loading {dataset_name}, test') + retrieve(url_test, local_url_test) + logging.info(f'{dataset_name} is loaded, started parsing...') + X_train, y_train = load_svmlight_file(local_url_train, + dtype=dtype) + X_test, y_test = load_svmlight_file(local_url_test, + dtype=dtype) + X_train = X_train.toarray()[:num_train] + X_test = X_test.toarray()[:num_test] + y_train = y_train[:num_train] + y_train[y_train <= 0] = 0 + y_test = y_test[:num_test] + y_test[y_test <= 0] = 0 + + for data, name in zip((X_train, X_test, y_train, y_test), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset {dataset_name} is ready.') + return True + + +def epsilon_80K(dataset_dir: Path) -> bool: + """ + Epsilon dataset + https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html + + Classification task. n_classes = 2. 
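+    Only the first 80000 train and 80000 test rows of the full epsilon
+    files are kept: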
+    epsilon_80K x train dataset (80000, 2000)
+    epsilon_80K y train dataset (80000, 1)
+    epsilon_80K x test dataset (80000, 2000)
+    epsilon_80K y test dataset (80000, 1)
+    """
+    dataset_name = 'epsilon_80K'
+    os.makedirs(dataset_dir, exist_ok=True)
+
+    url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \
+                '/epsilon_normalized.bz2'
+    url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \
+               '/epsilon_normalized.t.bz2'
+    local_url_train = os.path.join(dataset_dir, os.path.basename(url_train))
+    local_url_test = os.path.join(dataset_dir, os.path.basename(url_test))
+
+    num_train, num_test, dtype = 80000, 80000, np.float32
+    if not os.path.isfile(local_url_train):
+        logging.info(f'Started loading {dataset_name}, train')
+        retrieve(url_train, local_url_train)
+    if not os.path.isfile(local_url_test):
+        logging.info(f'Started loading {dataset_name}, test')
+        retrieve(url_test, local_url_test)
+    logging.info(f'{dataset_name} is loaded, started parsing...')
+    X_train, y_train = load_svmlight_file(local_url_train,
+                                          dtype=dtype)
+    X_test, y_test = load_svmlight_file(local_url_test,
+                                        dtype=dtype)
+    X_train = X_train.toarray()[:num_train]
+    X_test = X_test.toarray()[:num_test]
+    y_train = y_train[:num_train]
+    y_train[y_train <= 0] = 0
+    y_test = y_test[:num_test]
+    y_test[y_test <= 0] = 0
+
+    for data, name in zip((X_train, X_test, y_train, y_test),
+                          ('x_train', 'x_test', 'y_train', 'y_test')):
+        filename = f'{dataset_name}_{name}.npy'
+        np.save(os.path.join(dataset_dir, filename), data)
+    logging.info(f'dataset {dataset_name} is ready.')
+    return True
+
+
 def fraud(dataset_dir: Path) -> bool:
     """
     Credit Card Fraud Detection contest
@@ -576,6 +755,46 @@ def higgs_one_m(dataset_dir: Path) -> bool:
     return True
 
 
+def higgs_150K(dataset_dir: Path) -> bool:
+    """
+    Higgs dataset from UCI machine learning repository
+    https://archive.ics.uci.edu/ml/datasets/HIGGS
+
+    Classification task. n_classes = 2.
+    higgs_150K X train dataset (100000, 28)
+    higgs_150K y train dataset (100000, 1)
+    higgs_150K X test dataset (50000, 28)
+    higgs_150K y test dataset (50000, 1)
+    """
+    dataset_name = 'higgs_150K'
+    os.makedirs(dataset_dir, exist_ok=True)
+
+    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz'
+    local_url = os.path.join(dataset_dir, os.path.basename(url))
+    if not os.path.isfile(local_url):
+        logging.info(f'Started loading {dataset_name}')
+        retrieve(url, local_url)
+    logging.info(f'{dataset_name} is loaded, started parsing...')
+
+    nrows_train, nrows_test, dtype = 100000, 50000, np.float32
+    data: Any = pd.read_csv(local_url, delimiter=",", header=None,
+                            compression="gzip", dtype=dtype,
+                            nrows=nrows_train + nrows_test)
+
+    X = data[data.columns[1:]]
+    y = data[data.columns[0:1]]
+
+    x_train, x_test, y_train, y_test = train_test_split(
+        X, y, train_size=nrows_train, test_size=nrows_test, shuffle=False)
+
+    for data, name in zip((x_train, x_test, y_train, y_test),
+                          ('x_train', 'x_test', 'y_train', 'y_test')):
+        filename = f'{dataset_name}_{name}.npy'
+        np.save(os.path.join(dataset_dir, filename), data)
+    logging.info(f'dataset {dataset_name} is ready.')
+    return True
+
+
 def ijcnn(dataset_dir: Path) -> bool:
     """
     Author: Danil Prokhorov.
@@ -611,6 +830,28 @@ def ijcnn(dataset_dir: Path) -> bool:
     logging.info(f'dataset {dataset_name} is ready.')
     return True
 
+def imb_drama(dataset_dir: Path) -> bool:
+    """
+    imdb_drama dataset from OpenML Datasets (
+    https://www.openml.org/d/273)
+
+    Classification task.
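+    n_classes = 2. The OpenML data arrives as a sparse matrix and is
+    densified with todense() before being saved.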
+ Number of features: 1001 + Number of instances: 120919 + """ + dataset_name = 'imb_drama' + os.makedirs(dataset_dir, exist_ok=True) + + x_train, y_train = fetch_openml('IMDB.drama', return_X_y=True, + as_frame=False, data_home=dataset_dir) + logging.info(f'{dataset_name} is loaded, started parsing...') + for data, name in zip((x_train.todense(), y_train), + ('x_train', 'y_train')): + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset {dataset_name} is ready.') + return True + def klaverjas(dataset_dir: Path) -> bool: """ @@ -726,3 +967,49 @@ def susy(dataset_dir: Path) -> bool: np.save(os.path.join(dataset_dir, filename), data) logging.info(f'dataset {dataset_name} is ready.') return True + + +def cifar(dataset_dir: Path) -> bool: + """ + Cifar dataset from LIBSVM Datasets ( + https://www.cs.toronto.edu/~kriz/cifar.html#cifar) + TaskType: Classification + cifar x train dataset (50000, 3072) + cifar y train dataset (50000, 1) + cifar x test dataset (10000, 3072) + cifar y test dataset (10000, 1) + """ + dataset_name = 'cifar' + os.makedirs(dataset_dir, exist_ok=True) + + url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/cifar10.bz2' + url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/cifar10.t.bz2' + local_url_train = os.path.join(dataset_dir, os.path.basename(url_train)) + local_url_test = os.path.join(dataset_dir, os.path.basename(url_test)) + + if not os.path.isfile(local_url_train): + logging.info(f'Started loading {dataset_name}, train') + retrieve(url_train, local_url_train) + logging.info(f'{dataset_name} is loaded, started parsing...') + x_train, y_train = load_svmlight_file(local_url_train, + dtype=np.float32) + + if not os.path.isfile(local_url_test): + logging.info(f'Started loading {dataset_name}, test') + retrieve(url_test, local_url_test) + logging.info(f'{dataset_name} is loaded, started parsing...') + x_test, y_test = load_svmlight_file(local_url_test, + dtype=np.float32) + + x_train = x_train.toarray() + y_train[y_train <= 0] = 0 + + x_test = x_test.toarray() + y_test[y_test <= 0] = 0 + + for data, name in zip((x_train, x_test, y_train, y_test), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + return True + From 81868d37ec55ec0be33d1ebe62849e7c97305eb4 Mon Sep 17 00:00:00 2001 From: dmitrii-kriukov Date: Mon, 17 Jan 2022 19:16:10 +0300 Subject: [PATCH 2/6] Revert "knn_svm" This reverts commit 6272ea525c8155efb896525e6d1672e779fca3b3. 
--- configs/xpu/knn_clsf.json | 162 ----------------- configs/xpu/knn_regr.json | 69 ------- configs/xpu/svm.json | 192 -------------------- datasets/load_datasets.py | 14 +- datasets/loader_classification.py | 289 +----------------------------- 5 files changed, 4 insertions(+), 722 deletions(-) delete mode 100644 configs/xpu/knn_clsf.json delete mode 100644 configs/xpu/knn_regr.json delete mode 100644 configs/xpu/svm.json diff --git a/configs/xpu/knn_clsf.json b/configs/xpu/knn_clsf.json deleted file mode 100644 index 2d72c4ade..000000000 --- a/configs/xpu/knn_clsf.json +++ /dev/null @@ -1,162 +0,0 @@ -{ - "common": { - "lib": "sklearn", - "algorithm": "knn_clsf", - "data-format": "pandas", - "data-order": "F", - "dtype": ["float32", "float64"], - "device": ["host", "cpu", "gpu", "none"] - }, - "cases": [ - { - "dataset": [ - { - "source": "npy", - "name": "epsilon_100K", - "training": - { - "x": "data/epsilon_100K_x_train.npy", - "y": "data/epsilon_100K_y_train.npy" - }, - "testing": - { - "x": "data/epsilon_100K_x_test.npy", - "y": "data/epsilon_100K_y_test.npy" - } - } - ], - "n-neighbors": [2, 100] - }, - { - "dataset": [ - { - "source": "npy", - "name": "higgs_150K", - "training": - { - "x": "data/higgs_150K_x_train.npy", - "y": "data/higgs_150K_y_train.npy" - }, - "testing": - { - "x": "data/higgs_150K_x_test.npy", - "y": "data/higgs_150K_y_test.npy" - } - }, - { - "source": "npy", - "name": "hepmass_150K", - "training": - { - "x": "data/hepmass_150K_x_train.npy", - "y": "data/hepmass_150K_y_train.npy" - }, - "testing": - { - "x": "data/hepmass_150K_x_test.npy", - "y": "data/hepmass_150K_y_test.npy" - } - } - ], - "n-neighbors": [5, 100] - }, - { - "dataset": [ - { - "source": "npy", - "name": "cifar", - "training": - { - "x": "data/cifar_x_train.npy", - "y": "data/cifar_y_train.npy" - }, - "testing": - { - "x": "data/cifar_x_test.npy", - "y": "data/cifar_y_test.npy" - } - } - ], - "n-neighbors": 7 - }, - { - "dataset": [ - { - "source": "npy", - "name": "mnist", - "training": - { - "x": "data/mnist_x_train.npy", - "y": "data/mnist_y_train.npy" - }, - "testing": - { - "x": "data/mnist_x_test.npy", - "y": "data/mnist_y_test.npy" - } - } - ], - "n-neighbors": 5 - }, - { - "dataset": [ - { - "source": "npy", - "name": "epsilon_100K", - "training": - { - "x": "data/epsilon_100K_x_train.npy", - "y": "data/epsilon_100K_y_train.npy" - }, - "testing": - { - "x": "data/epsilon_100K_x_test.npy", - "y": "data/epsilon_100K_y_test.npy" - } - } - ], - "task": "search", - "n-neighbors": 2 - }, - { - "dataset": [ - { - "source": "npy", - "name": "higgs_150K", - "training": - { - "x": "data/higgs_150K_x_train.npy", - "y": "data/higgs_150K_y_train.npy" - }, - "testing": - { - "x": "data/higgs_150K_x_test.npy", - "y": "data/higgs_150K_y_test.npy" - } - } - ], - "task": "search", - "n-neighbors": 5 - }, - { - "dataset": [ - { - "source": "npy", - "name": "cifar", - "training": - { - "x": "data/cifar_x_train.npy", - "y": "data/cifar_y_train.npy" - }, - "testing": - { - "x": "data/cifar_x_test.npy", - "y": "data/cifar_y_test.npy" - } - } - ], - "task": "search", - "n-neighbors": 7 - } - ] -} diff --git a/configs/xpu/knn_regr.json b/configs/xpu/knn_regr.json deleted file mode 100644 index ec1fbc9a9..000000000 --- a/configs/xpu/knn_regr.json +++ /dev/null @@ -1,69 +0,0 @@ -{ - "common": { - "lib": "sklearn", - "algorithm": "knn_regr", - "data-format": "pandas", - "data-order": "F", - "dtype": ["float32", "float64"], - "device": ["host", "cpu", "gpu", "none"] - }, - "cases": [ - { - "dataset": [ - { - 
"source": "npy", - "name": "year_prediction_msd", - "training": - { - "x": "data/year_prediction_msd_x_train.npy", - "y": "data/year_prediction_msd_y_train.npy" - }, - "testing": - { - "x": "data/year_prediction_msd_x_test.npy", - "y": "data/year_prediction_msd_y_test.npy" - } - } - ], - "n-neighbors": 2 - }, - { - "dataset": [ - { - "source": "npy", - "name": "higgs_150K", - "training": - { - "x": "data/higgs_150K_x_train.npy", - "y": "data/higgs_150K_y_train.npy" - }, - "testing": - { - "x": "data/higgs_150K_x_test.npy", - "y": "data/higgs_150K_y_test.npy" - } - } - ], - "n-neighbors": 5 - }, - { - "dataset": [ - { - "source": "npy", - "name": "cifar", - "training": - { - "x": "data/cifar_x_train.npy", - "y": "data/cifar_y_train.npy" - }, - "testing": - { - "x": "data/cifar_x_test.npy", - "y": "data/cifar_y_test.npy" - } - } - ], - "n-neighbors": 7 - } - ] -} diff --git a/configs/xpu/svm.json b/configs/xpu/svm.json deleted file mode 100644 index a98377532..000000000 --- a/configs/xpu/svm.json +++ /dev/null @@ -1,192 +0,0 @@ -{ - "common": { - "lib": "sklearn", - "algorithm": "svm", - "data-format": "pandas", - "data-order": "F", - "dtype": ["float32", "float64"], - "device": ["host", "cpu", "gpu", "none"] - }, - "cases": [ - { - "dataset": [ - { - "source": "npy", - "name": "gisette", - "training": - { - "x": "data/gisette_x_train.npy", - "y": "data/gisette_y_train.npy" - }, - "testing": - { - "x": "data/gisette_x_test.npy", - "y": "data/gisette_y_test.npy" - } - } - ], - "C": 1.5e-3, - "kernel": "linear" - }, - { - "dataset": [ - { - "source": "npy", - "name": "higgs_150K", - "training": - { - "x": "data/higgs_150K_x_train.npy", - "y": "data/higgs_150K_y_train.npy" - }, - "testing": - { - "x": "data/higgs_150K_x_test.npy", - "y": "data/higgs_150K_y_test.npy" - } - } - ], - "C": 1.0, - "kernel": "linear" - }, - { - "dataset": [ - { - "source": "npy", - "name": "epsilon_80K", - "training": - { - "x": "data/epsilon_80K_x_train.npy", - "y": "data/epsilon_80K_y_train.npy" - }, - "testing": - { - "x": "data/epsilon_80K_x_test.npy", - "y": "data/epsilon_80K_y_test.npy" - } - } - ], - "C": 1.0, - "kernel": "linear" - }, - { - "dataset": [ - { - "source": "npy", - "name": "cifar", - "training": - { - "x": "data/cifar_x_train.npy", - "y": "data/cifar_y_train.npy" - }, - "testing": - { - "x": "data/cifar_x_test.npy", - "y": "data/cifar_y_test.npy" - } - } - ], - "C": 1.0e-7, - "kernel": "linear" - }, - { - "dataset": [ - { - "source": "npy", - "name": "imb_drama", - "training": - { - "x": "data/imb_drama_x_train.npy", - "y": "data/imb_drama_y_train.npy" - }, - "testing": - { - "x": "data/imb_drama_x_train.npy", - "y": "data/imb_drama_y_train.npy" - } - } - ], - "C": 1e-3, - "kernel": "linear" - }, - { - "dataset": [ - { - "source": "npy", - "name": "epsilon_16K", - "training": - { - "x": "data/epsilon_16K_x_train.npy", - "y": "data/epsilon_16K_y_train.npy" - }, - "testing": - { - "x": "data/epsilon_16K_x_test.npy", - "y": "data/epsilon_16K_y_test.npy" - } - } - ], - "C": 9.0e2, - "kernel": "rbf" - }, - { - "dataset": [ - { - "source": "npy", - "name": "covtype_binary", - "training": - { - "x": "data/covtype_binary_x_train.npy", - "y": "data/covtype_binary_y_train.npy" - }, - "testing": - { - "x": "data/covtype_binary_x_test.npy", - "y": "data/covtype_binary_y_test.npy" - } - } - ], - "C": 1000.0, - "kernel": "rbf" - }, - { - "dataset": [ - { - "source": "npy", - "name": "epsilon_80K", - "training": - { - "x": "data/epsilon_80K_x_train.npy", - "y": "data/epsilon_80K_y_train.npy" - }, - "testing": - 
{ - "x": "data/epsilon_80K_x_test.npy", - "y": "data/epsilon_80K_y_test.npy" - } - } - ], - "C": 1000.0, - "kernel": "rbf" - }, - { - "dataset": [ - { - "source": "npy", - "name": "imb_drama", - "training": - { - "x": "data/imb_drama_x_train.npy", - "y": "data/imb_drama_y_train.npy" - }, - "testing": - { - "x": "data/imb_drama_x_train.npy", - "y": "data/imb_drama_y_train.npy" - } - } - ], - "C": 50, - "kernel": "rbf" - } - ] -} diff --git a/datasets/load_datasets.py b/datasets/load_datasets.py index fbd7685d4..0a7874d92 100644 --- a/datasets/load_datasets.py +++ b/datasets/load_datasets.py @@ -22,10 +22,8 @@ from typing import Callable, Dict from .loader_classification import (a_nine_a, airline, airline_ohe, bosch, - census, cifar, codrnanorm, covtype_binary, creditcard, - epsilon_16K, epsilon_80K, epsilon, epsilon_100K, - fraud, gisette, hepmass_150K, - higgs, higgs_one_m, higgs_150K, ijcnn, imb_drama, + census, codrnanorm, creditcard, epsilon, fraud, + gisette, hepmass_150K, higgs, higgs_one_m, ijcnn, klaverjas, santander, skin_segmentation, susy) from .loader_multiclass import (connect, covertype, covtype, letters, mlsr, mnist, msrank, plasticc, sensit) @@ -42,25 +40,19 @@ "bosch": bosch, "california_housing": california_housing, "census": census, - "cifar": cifar, "codrnanorm": codrnanorm, "connect": connect, - "covtype_binary": covtype_binary, + "covertype": covertype, "covtype": covtype, "creditcard": creditcard, "epsilon": epsilon, - "epsilon_16K": epsilon_16K, - "epsilon_80K": epsilon_80K, - "epsilon_100K": epsilon_100K, "fraud": fraud, "fried": fried, "gisette": gisette, "hepmass_150K": hepmass_150K, "higgs": higgs, "higgs1m": higgs_one_m, - "higgs_150K": higgs_150K, "ijcnn": ijcnn, - "imb_drama": imb_drama, "klaverjas": klaverjas, "letters": letters, "mlsr": mlsr, diff --git a/datasets/loader_classification.py b/datasets/loader_classification.py index ffb84f12f..fc3cb892d 100644 --- a/datasets/loader_classification.py +++ b/datasets/loader_classification.py @@ -22,7 +22,7 @@ import numpy as np import pandas as pd -from sklearn.datasets import fetch_openml, load_svmlight_file, fetch_covtype +from sklearn.datasets import fetch_openml, load_svmlight_file from sklearn.model_selection import train_test_split from .loader_utils import retrieve @@ -261,41 +261,6 @@ def codrnanorm(dataset_dir: Path) -> bool: return True -def covtype_binary(dataset_dir: Path) -> bool: - """ - Cover type dataset from UCI machine learning repository - https://archive.ics.uci.edu/ml/datasets/covertype - - y contains 7 unique class labels from 1 to 7 inclusive. - Classification task. n_classes = 7. 
- covtype X train dataset (464809, 54) - covtype y train dataset (464809, 1) - covtype X test dataset (116203, 54) - covtype y test dataset (116203, 1) - """ - dataset_name = 'covtype_binary' - os.makedirs(dataset_dir, exist_ok=True) - - nrows_train, nrows_test = 100000, 100000 - logging.info(f'Started loading {dataset_name}') - X, y = fetch_covtype(return_X_y=True) # pylint: disable=unexpected-keyword-arg - logging.info(f'{dataset_name} is loaded, started parsing...') - - y = (y > 3).astype(int) - - X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77, - train_size=nrows_train, - test_size=nrows_test, - shuffle=False - ) - for data, name in zip((X_train, X_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - def creditcard(dataset_dir: Path) -> bool: """ Classification task. n_classes = 2. @@ -369,150 +334,6 @@ def epsilon(dataset_dir: Path) -> bool: return True -def epsilon_16K(dataset_dir: Path) -> bool: - """ - Epsilon dataset - https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html - - Classification task. n_classes = 2. - epsilon_100K x train dataset (16000, 2000) - epsilon_100K y train dataset (16000, 1) - epsilon_100K x test dataset (16000, 2000) - epsilon_100K y test dataset (16000, 1) - """ - dataset_name = 'epsilon_16K' - os.makedirs(dataset_dir, exist_ok=True) - - url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ - '/epsilon_normalized.bz2' - url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ - '/epsilon_normalized.t.bz2' - local_url_train = os.path.join(dataset_dir, os.path.basename(url_train)) - local_url_test = os.path.join(dataset_dir, os.path.basename(url_test)) - - num_train, num_test, dtype = 16000, 16000, np.float32 - if not os.path.isfile(local_url_train): - logging.info(f'Started loading {dataset_name}, train') - retrieve(url_train, local_url_train) - if not os.path.isfile(local_url_test): - logging.info(f'Started loading {dataset_name}, test') - retrieve(url_test, local_url_test) - logging.info(f'{dataset_name} is loaded, started parsing...') - X_train, y_train = load_svmlight_file(local_url_train, - dtype=dtype) - X_test, y_test = load_svmlight_file(local_url_test, - dtype=dtype) - X_train = X_train.toarray()[:num_train] - X_test = X_test.toarray()[:num_test] - y_train = y_train[:num_train] - y_train[y_train <= 0] = 0 - y_test = y_test[:num_test] - y_test[y_test <= 0] = 0 - - for data, name in zip((X_train, X_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def epsilon_100K(dataset_dir: Path) -> bool: - """ - Epsilon dataset - https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html - - Classification task. n_classes = 2. 
- epsilon_100K x train dataset (50000, 2000) - epsilon_100K y train dataset (50000, 1) - epsilon_100K x test dataset (50000, 2000) - epsilon_100K y test dataset (50000, 1) - """ - dataset_name = 'epsilon_100K' - os.makedirs(dataset_dir, exist_ok=True) - - url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ - '/epsilon_normalized.bz2' - url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ - '/epsilon_normalized.t.bz2' - local_url_train = os.path.join(dataset_dir, os.path.basename(url_train)) - local_url_test = os.path.join(dataset_dir, os.path.basename(url_test)) - - num_train, num_test, dtype = 50000, 50000, np.float32 - if not os.path.isfile(local_url_train): - logging.info(f'Started loading {dataset_name}, train') - retrieve(url_train, local_url_train) - if not os.path.isfile(local_url_test): - logging.info(f'Started loading {dataset_name}, test') - retrieve(url_test, local_url_test) - logging.info(f'{dataset_name} is loaded, started parsing...') - X_train, y_train = load_svmlight_file(local_url_train, - dtype=dtype) - X_test, y_test = load_svmlight_file(local_url_test, - dtype=dtype) - X_train = X_train.toarray()[:num_train] - X_test = X_test.toarray()[:num_test] - y_train = y_train[:num_train] - y_train[y_train <= 0] = 0 - y_test = y_test[:num_test] - y_test[y_test <= 0] = 0 - - for data, name in zip((X_train, X_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - -def epsilon_80K(dataset_dir: Path) -> bool: - """ - Epsilon dataset - https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html - - Classification task. n_classes = 2. 
- epsilon_100K x train dataset (80000, 2000) - epsilon_100K y train dataset (80000, 1) - epsilon_100K x test dataset (80000, 2000) - epsilon_100K y test dataset (80000, 1) - """ - dataset_name = 'epsilon_80K' - os.makedirs(dataset_dir, exist_ok=True) - - url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ - '/epsilon_normalized.bz2' - url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ - '/epsilon_normalized.t.bz2' - local_url_train = os.path.join(dataset_dir, os.path.basename(url_train)) - local_url_test = os.path.join(dataset_dir, os.path.basename(url_test)) - - num_train, num_test, dtype = 80000, 80000, np.float32 - if not os.path.isfile(local_url_train): - logging.info(f'Started loading {dataset_name}, train') - retrieve(url_train, local_url_train) - if not os.path.isfile(local_url_test): - logging.info(f'Started loading {dataset_name}, test') - retrieve(url_test, local_url_test) - logging.info(f'{dataset_name} is loaded, started parsing...') - X_train, y_train = load_svmlight_file(local_url_train, - dtype=dtype) - X_test, y_test = load_svmlight_file(local_url_test, - dtype=dtype) - X_train = X_train.toarray()[:num_train] - X_test = X_test.toarray()[:num_test] - y_train = y_train[:num_train] - y_train[y_train <= 0] = 0 - y_test = y_test[:num_test] - y_test[y_test <= 0] = 0 - - for data, name in zip((X_train, X_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - def fraud(dataset_dir: Path) -> bool: """ Credit Card Fraud Detection contest @@ -755,46 +576,6 @@ def higgs_one_m(dataset_dir: Path) -> bool: return True -def higgs_150K(dataset_dir: Path) -> bool: - """ - Higgs dataset from UCI machine learning repository - https://archive.ics.uci.edu/ml/datasets/HIGGS - - Classification task. n_classes = 2. - higgs_150K X train dataset (100000, 28) - higgs_150K y train dataset (50000, 1) - higgs_150K X test dataset (100000, 28) - higgs_150K y test dataset (50000, 1) - """ - dataset_name = 'higgs_150K' - os.makedirs(dataset_dir, exist_ok=True) - - url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz' - local_url = os.path.join(dataset_dir, os.path.basename(url)) - if not os.path.isfile(local_url): - logging.info(f'Started loading {dataset_name}') - retrieve(url, local_url) - logging.info(f'{dataset_name} is loaded, started parsing...') - - nrows_train, nrows_test, dtype = 100000, 50000, np.float32 - data: Any = pd.read_csv(local_url, delimiter=",", header=None, - compression="gzip", dtype=dtype, - nrows=nrows_train + nrows_test) - - X = data[data.columns[1:]] - y = data[data.columns[0:1]] - - x_train, x_test, y_train, y_test = train_test_split( - X, y, train_size=nrows_train, test_size=nrows_test, shuffle=False) - - for data, name in zip((x_train, x_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - - def ijcnn(dataset_dir: Path) -> bool: """ Author: Danil Prokhorov. @@ -830,28 +611,6 @@ def ijcnn(dataset_dir: Path) -> bool: logging.info(f'dataset {dataset_name} is ready.') return True -def imb_drama(dataset_dir: Path) -> bool: - """ - imdb_drama dataset from OpenML Datasets ( - https://www.openml.org/d/273) - - Classification task. 
- Number of features: 1001 - Number of instances: 120919 - """ - dataset_name = 'imb_drama' - os.makedirs(dataset_dir, exist_ok=True) - - x_train, y_train = fetch_openml('IMDB.drama', return_X_y=True, - as_frame=False, data_home=dataset_dir) - logging.info(f'{dataset_name} is loaded, started parsing...') - for data, name in zip((x_train.todense(), y_train), - ('x_train', 'y_train')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - logging.info(f'dataset {dataset_name} is ready.') - return True - def klaverjas(dataset_dir: Path) -> bool: """ @@ -967,49 +726,3 @@ def susy(dataset_dir: Path) -> bool: np.save(os.path.join(dataset_dir, filename), data) logging.info(f'dataset {dataset_name} is ready.') return True - - -def cifar(dataset_dir: Path) -> bool: - """ - Cifar dataset from LIBSVM Datasets ( - https://www.cs.toronto.edu/~kriz/cifar.html#cifar) - TaskType: Classification - cifar x train dataset (50000, 3072) - cifar y train dataset (50000, 1) - cifar x test dataset (10000, 3072) - cifar y test dataset (10000, 1) - """ - dataset_name = 'cifar' - os.makedirs(dataset_dir, exist_ok=True) - - url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/cifar10.bz2' - url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/cifar10.t.bz2' - local_url_train = os.path.join(dataset_dir, os.path.basename(url_train)) - local_url_test = os.path.join(dataset_dir, os.path.basename(url_test)) - - if not os.path.isfile(local_url_train): - logging.info(f'Started loading {dataset_name}, train') - retrieve(url_train, local_url_train) - logging.info(f'{dataset_name} is loaded, started parsing...') - x_train, y_train = load_svmlight_file(local_url_train, - dtype=np.float32) - - if not os.path.isfile(local_url_test): - logging.info(f'Started loading {dataset_name}, test') - retrieve(url_test, local_url_test) - logging.info(f'{dataset_name} is loaded, started parsing...') - x_test, y_test = load_svmlight_file(local_url_test, - dtype=np.float32) - - x_train = x_train.toarray() - y_train[y_train <= 0] = 0 - - x_test = x_test.toarray() - y_test[y_test <= 0] = 0 - - for data, name in zip((x_train, x_test, y_train, y_test), - ('x_train', 'x_test', 'y_train', 'y_test')): - filename = f'{dataset_name}_{name}.npy' - np.save(os.path.join(dataset_dir, filename), data) - return True - From b1394ea4b8ac9c849823d3f3df0075f74663bb7f Mon Sep 17 00:00:00 2001 From: dmitrii-kriukov Date: Wed, 26 Jan 2022 19:58:16 +0300 Subject: [PATCH 3/6] test copy --- configs/no_copy_test.json | 92 +++++++++++++++++++++++++++++++++++++ sklearn_bench/elasticnet.py | 4 +- sklearn_bench/lasso.py | 2 +- sklearn_bench/linear.py | 2 +- 4 files changed, 96 insertions(+), 4 deletions(-) create mode 100644 configs/no_copy_test.json diff --git a/configs/no_copy_test.json b/configs/no_copy_test.json new file mode 100644 index 000000000..b2e890297 --- /dev/null +++ b/configs/no_copy_test.json @@ -0,0 +1,92 @@ +{ + "common": { + "lib": "sklearn", + "data-format": "pandas", + "data-order": "F", + "dtype": "float64", + "device": ["host", "cpu", "gpu", "none"] + }, + "cases": [ + { + "algorithm": "elasticnet", + "dataset": [ + { + "source": "npy", + "name": "year_prediction_msd", + "training": + { + "x": "data/year_prediction_msd_x_train.npy", + "y": "data/year_prediction_msd_y_train.npy" + }, + "testing": + { + "x": "data/year_prediction_msd_x_test.npy", + "y": "data/year_prediction_msd_y_test.npy" + } + } + ], + "alpha": 2.0, + "l1_ratio": 0.5, + "tol": 1e-4 + }, + { 
+ "algorithm": "linear", + "dataset": [ + { + "source": "synthetic", + "type": "regression", + "n_features": 20, + "training": { + "n_samples": 10000000 + } + }, + { + "source": "synthetic", + "type": "regression", + "n_features": 100, + "training": { + "n_samples": 2000000 + } + } + ] + }, + { + "algorithm": "linear", + "dataset": [ + { + "source": "npy", + "name": "year_prediction_msd", + "training": + { + "x": "data/year_prediction_msd_x_train.npy", + "y": "data/year_prediction_msd_y_train.npy" + }, + "testing": + { + "x": "data/year_prediction_msd_x_test.npy", + "y": "data/year_prediction_msd_y_test.npy" + } + } + ] + }, + { + "algorithm": "linear", + "dataset": [ + { + "source": "npy", + "name": "higgs1m", + "training": + { + "x": "data/higgs1m_x_train.npy", + "y": "data/higgs1m_y_train.npy" + }, + "testing": + { + "x": "data/higgs1m_x_test.npy", + "y": "data/higgs1m_y_test.npy" + } + } + ] + } + ] +} \ No newline at end of file diff --git a/sklearn_bench/elasticnet.py b/sklearn_bench/elasticnet.py index 89e820e6f..4cf6bb740 100755 --- a/sklearn_bench/elasticnet.py +++ b/sklearn_bench/elasticnet.py @@ -28,7 +28,7 @@ def main(): # Create our regression object regr = ElasticNet(fit_intercept=params.fit_intercept, l1_ratio=params.l1_ratio, alpha=params.alpha, tol=params.tol, - max_iter=params.maxiter, copy_X=False) + max_iter=params.maxiter, copy_X=True) # Time fit fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train, params=params) @@ -63,7 +63,7 @@ def main(): if __name__ == "__main__": parser = argparse.ArgumentParser(description='scikit-learn elastic-net regression ' 'benchmark') - parser.add_argument('--no-fit-intercept', dest='fit_intercept', default=False, + parser.add_argument('--no-fit-intercept', dest='fit_intercept', default=True, action='store_false', help="Don't fit intercept (assume data already centered)") parser.add_argument('--alpha', dest='alpha', type=float, default=1.0, diff --git a/sklearn_bench/lasso.py b/sklearn_bench/lasso.py index 6346d5e8a..dedafbe6e 100755 --- a/sklearn_bench/lasso.py +++ b/sklearn_bench/lasso.py @@ -27,7 +27,7 @@ def main(): # Create our regression object regr = Lasso(fit_intercept=params.fit_intercept, alpha=params.alpha, - tol=params.tol, max_iter=params.maxiter, copy_X=False) + tol=params.tol, max_iter=params.maxiter) # Time fit fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train, params=params) diff --git a/sklearn_bench/linear.py b/sklearn_bench/linear.py index b97d49a6e..1cc3577ac 100644 --- a/sklearn_bench/linear.py +++ b/sklearn_bench/linear.py @@ -27,7 +27,7 @@ def main(): # Create our regression object regr = LinearRegression(fit_intercept=params.fit_intercept, - n_jobs=params.n_jobs, copy_X=False) + n_jobs=params.n_jobs, copy_X=True) # Time fit fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train, params=params) From c925f9c269089e21bb9d1d1de7e9e9c35d65aa62 Mon Sep 17 00:00:00 2001 From: dmitrii-kriukov Date: Thu, 27 Jan 2022 20:49:48 +0300 Subject: [PATCH 4/6] remove temp config --- configs/no_copy_test.json | 92 --------------------------------------- 1 file changed, 92 deletions(-) delete mode 100644 configs/no_copy_test.json diff --git a/configs/no_copy_test.json b/configs/no_copy_test.json deleted file mode 100644 index b2e890297..000000000 --- a/configs/no_copy_test.json +++ /dev/null @@ -1,92 +0,0 @@ -{ - "common": { - "lib": "sklearn", - "data-format": "pandas", - "data-order": "F", - "dtype": "float64", - "device": ["host", "cpu", "gpu", "none"] - }, - "cases": [ - { - 
"algorithm": "elasticnet", - "dataset": [ - { - "source": "npy", - "name": "year_prediction_msd", - "training": - { - "x": "data/year_prediction_msd_x_train.npy", - "y": "data/year_prediction_msd_y_train.npy" - }, - "testing": - { - "x": "data/year_prediction_msd_x_test.npy", - "y": "data/year_prediction_msd_y_test.npy" - } - } - ], - "alpha": 2.0, - "l1_ratio": 0.5, - "tol": 1e-4 - }, - { - "algorithm": "linear", - "dataset": [ - { - "source": "synthetic", - "type": "regression", - "n_features": 20, - "training": { - "n_samples": 10000000 - } - }, - { - "source": "synthetic", - "type": "regression", - "n_features": 100, - "training": { - "n_samples": 2000000 - } - } - ] - }, - { - "algorithm": "linear", - "dataset": [ - { - "source": "npy", - "name": "year_prediction_msd", - "training": - { - "x": "data/year_prediction_msd_x_train.npy", - "y": "data/year_prediction_msd_y_train.npy" - }, - "testing": - { - "x": "data/year_prediction_msd_x_test.npy", - "y": "data/year_prediction_msd_y_test.npy" - } - } - ] - }, - { - "algorithm": "linear", - "dataset": [ - { - "source": "npy", - "name": "higgs1m", - "training": - { - "x": "data/higgs1m_x_train.npy", - "y": "data/higgs1m_y_train.npy" - }, - "testing": - { - "x": "data/higgs1m_x_test.npy", - "y": "data/higgs1m_y_test.npy" - } - } - ] - } - ] -} \ No newline at end of file From 91d4ca200276528b5a04b48e37b10c89fcdaa3bd Mon Sep 17 00:00:00 2001 From: dmitrii-kriukov Date: Thu, 27 Jan 2022 20:52:53 +0300 Subject: [PATCH 5/6] default parametrs --- sklearn_bench/elasticnet.py | 2 +- sklearn_bench/linear.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn_bench/elasticnet.py b/sklearn_bench/elasticnet.py index 4cf6bb740..3467e0dda 100755 --- a/sklearn_bench/elasticnet.py +++ b/sklearn_bench/elasticnet.py @@ -28,7 +28,7 @@ def main(): # Create our regression object regr = ElasticNet(fit_intercept=params.fit_intercept, l1_ratio=params.l1_ratio, alpha=params.alpha, tol=params.tol, - max_iter=params.maxiter, copy_X=True) + max_iter=params.maxiter) # Time fit fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train, params=params) diff --git a/sklearn_bench/linear.py b/sklearn_bench/linear.py index 1cc3577ac..7da0dba45 100644 --- a/sklearn_bench/linear.py +++ b/sklearn_bench/linear.py @@ -27,7 +27,7 @@ def main(): # Create our regression object regr = LinearRegression(fit_intercept=params.fit_intercept, - n_jobs=params.n_jobs, copy_X=True) + n_jobs=params.n_jobs) # Time fit fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train, params=params) From 98fb20949cd9563477fe9e5a7710bc8429d7b9c1 Mon Sep 17 00:00:00 2001 From: dmitrii-kriukov Date: Fri, 28 Jan 2022 11:58:26 +0300 Subject: [PATCH 6/6] same problem on lasso --- sklearn_bench/lasso.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn_bench/lasso.py b/sklearn_bench/lasso.py index dedafbe6e..c167bc359 100755 --- a/sklearn_bench/lasso.py +++ b/sklearn_bench/lasso.py @@ -63,7 +63,7 @@ def main(): if __name__ == "__main__": parser = argparse.ArgumentParser(description='scikit-learn lasso regression ' 'benchmark') - parser.add_argument('--no-fit-intercept', dest='fit_intercept', default=False, + parser.add_argument('--no-fit-intercept', dest='fit_intercept', default=True, action='store_false', help="Don't fit intercept (assume data already centered)") parser.add_argument('--alpha', dest='alpha', type=float, default=1.0,