From a07e639ec72a2240ee21b4468834943160805c7c Mon Sep 17 00:00:00 2001 From: dmitrii-kriukov Date: Mon, 10 Jan 2022 19:01:01 +0300 Subject: [PATCH 1/8] regression xpu configs --- configs/xpu/df_regr.json | 133 ++++++++++++++++++++++++++++++ configs/xpu/linear.json | 48 +++++++++++ configs/xpu/log_reg.json | 89 ++++++++++++++++++++ datasets/load_datasets.py | 9 +- datasets/loader_classification.py | 89 ++++++++++++++++++++ datasets/loader_regression.py | 39 +++++++++ 6 files changed, 404 insertions(+), 3 deletions(-) create mode 100644 configs/xpu/df_regr.json create mode 100644 configs/xpu/linear.json create mode 100644 configs/xpu/log_reg.json diff --git a/configs/xpu/df_regr.json b/configs/xpu/df_regr.json new file mode 100644 index 000000000..7ad6165b0 --- /dev/null +++ b/configs/xpu/df_regr.json @@ -0,0 +1,133 @@ +{ + "common": { + "lib": "sklearn", + "algorithm": "df_regr", + "data-format": "pandas", + "data-order": "F", + "dtype": ["float32", "float64"], + "max-features": 0.33, + "device": ["host", "cpu", "gpu", "none"] + }, + "cases": [ + { + "dataset": [ + { + "source": "npy", + "name": "year_prediction_msd", + "training": + { + "x": "data/year_prediction_msd_x_train.npy", + "y": "data/year_prediction_msd_y_train.npy" + }, + "testing": + { + "x": "data/year_prediction_msd_x_test.npy", + "y": "data/year_prediction_msd_y_test.npy" + } + } + ], + "num-trees": [10, 100], + "max-depth": 5 + }, + { + "dataset": [ + { + "source": "npy", + "name": "year_prediction_msd", + "training": + { + "x": "data/year_prediction_msd_x_train.npy", + "y": "data/year_prediction_msd_y_train.npy" + }, + "testing": + { + "x": "data/year_prediction_msd_x_test.npy", + "y": "data/year_prediction_msd_y_test.npy" + } + } + ], + "num-trees": [100, 20], + "max-depth": 8 + }, + { + "dataset": [ + { + "source": "npy", + "name": "year_prediction_msd", + "training": + { + "x": "data/year_prediction_msd_x_train.npy", + "y": "data/year_prediction_msd_y_train.npy" + }, + "testing": + { + "x": "data/year_prediction_msd_x_test.npy", + "y": "data/year_prediction_msd_y_test.npy" + } + } + ], + "num-trees": 20, + "max-depth": 16 + }, + { + "dataset": [ + { + "source": "npy", + "name": "higgs1m", + "training": + { + "x": "data/higgs1m_x_train.npy", + "y": "data/higgs1m_y_train.npy" + }, + "testing": + { + "x": "data/higgs1m_x_test.npy", + "y": "data/higgs1m_y_test.npy" + } + } + ], + "num-trees": [15, 20, 100], + "max-depth": 8 + }, + { + "dataset": [ + { + "source": "npy", + "name": "higgs_10500K", + "training": + { + "x": "data/higgs_10500K_x_train.npy", + "y": "data/higgs_10500K_y_train.npy" + }, + "testing": + { + "x": "data/higgs_10500K_x_test.npy", + "y": "data/higgs_10500K_y_test.npy" + } + } + ], + "num-trees": 100, + "max-depth": 8 + }, + { + "dataset": [ + { + "source": "npy", + "name": "higgs_10500K", + "training": + { + "x": "data/higgs_10500K_x_train.npy", + "y": "data/higgs_10500K_y_train.npy" + }, + "testing": + { + "x": "data/higgs_10500K_x_test.npy", + "y": "data/higgs_10500K_y_test.npy" + } + } + ], + "num-trees": 20, + "max-depth": 16 + } + ] +} diff --git a/configs/xpu/linear.json b/configs/xpu/linear.json new file mode 100644 index 000000000..f829a5f97 --- /dev/null +++ b/configs/xpu/linear.json @@ -0,0 +1,48 @@ +{ + "common": { + "lib": "sklearn", + "algorithm": "linear", + "data-format": "pandas", + "data-order": "F", + "dtype": "float32", + "device": "gpu" + }, + "cases": [ + { + "dataset": [ + { + "source": "npy", + "name": "year_prediction_msd", + "training": + { + "x": 
"data/year_prediction_msd_x_train.npy", + "y": "data/year_prediction_msd_y_train.npy" + }, + "testing": + { + "x": "data/year_prediction_msd_x_test.npy", + "y": "data/year_prediction_msd_y_test.npy" + } + } + ] + }, + { + "dataset": [ + { + "source": "npy", + "name": "higgs1m", + "training": + { + "x": "data/higgs1m_x_train.npy", + "y": "data/higgs1m_y_train.npy" + }, + "testing": + { + "x": "data/higgs1m_x_test.npy", + "y": "data/higgs1m_y_test.npy" + } + } + ] + } + ] +} diff --git a/configs/xpu/log_reg.json b/configs/xpu/log_reg.json new file mode 100644 index 000000000..4702da87e --- /dev/null +++ b/configs/xpu/log_reg.json @@ -0,0 +1,89 @@ +{ + "common": { + "lib": "sklearn", + "algorithm": "log_reg", + "data-format": "pandas", + "data-order": "F", + "dtype": "float32", + "device": "gpu" + }, + "cases": [ + { + "dataset": [ + { + "source": "npy", + "name": "susy", + "training": + { + "x": "data/susy_x_train.npy", + "y": "data/susy_y_train.npy" + }, + "testing": + { + "x": "data/susy_x_test.npy", + "y": "data/susy_y_test.npy" + } + } + ], + "maxiter": "20" + }, + { + "dataset": [ + { + "source": "npy", + "name": "susy", + "training": + { + "x": "data/susy_x_train.npy", + "y": "data/susy_y_train.npy" + }, + "testing": + { + "x": "data/susy_x_test.npy", + "y": "data/susy_y_test.npy" + } + } + ], + "maxiter": "10" + }, + { + "dataset": [ + { + "source": "npy", + "name": "mnist", + "training": + { + "x": "data/mnist_x_train.npy", + "y": "data/mnist_y_train.npy" + }, + "testing": + { + "x": "data/mnist_x_test.npy", + "y": "data/mnist_y_test.npy" + } + } + ], + "no-fit-intercept": "", + "maxiter": "50" + }, + { + "dataset": [ + { + "source": "npy", + "name": "mnist", + "training": + { + "x": "data/mnist_x_train.npy", + "y": "data/mnist_y_train.npy" + }, + "testing": + { + "x": "data/mnist_x_test.npy", + "y": "data/mnist_y_test.npy" + } + } + ], + "maxiter": "500" + } + ] +} diff --git a/datasets/load_datasets.py b/datasets/load_datasets.py index a3bc15187..80a7d581c 100644 --- a/datasets/load_datasets.py +++ b/datasets/load_datasets.py @@ -23,11 +23,11 @@ from .loader_classification import (a_nine_a, airline, airline_ohe, bosch, census, codrnanorm, creditcard, epsilon, fraud, - gisette, higgs, higgs_one_m, ijcnn, - klaverjas, santander, skin_segmentation) + gisette, hepmass_150K, higgs, higgs_one_m, ijcnn, + klaverjas, santander, skin_segmentation, susy) from .loader_multiclass import (connect, covertype, covtype, letters, mlsr, mnist, msrank, plasticc, sensit) -from .loader_regression import (abalone, california_housing, fried, +from .loader_regression import (abalone, california_housing, fried, higgs_10500K, medical_charges_nominal, mortgage_first_q, twodplanes, year_prediction_msd, yolanda, airline_regression) @@ -49,8 +49,10 @@ "fraud": fraud, "fried": fried, "gisette": gisette, + "hepmass_150K": hepmass_150K, "higgs": higgs, "higgs1m": higgs_one_m, + "higgs_10500K": higgs_10500K, "ijcnn": ijcnn, "klaverjas": klaverjas, "letters": letters, @@ -63,6 +65,7 @@ "santander": santander, "sensit": sensit, "skin_segmentation": skin_segmentation, + "susy": susy, "twodplanes": twodplanes, "year_prediction_msd": year_prediction_msd, "yolanda": yolanda, diff --git a/datasets/loader_classification.py b/datasets/loader_classification.py index 5a5d9df74..fc3cb892d 100644 --- a/datasets/loader_classification.py +++ b/datasets/loader_classification.py @@ -446,6 +446,55 @@ def gisette(dataset_dir: Path) -> bool: return True +def hepmass_150K(dataset_dir: Path) -> bool: + """ + HEPMASS dataset from UCI 
machine learning repository ( + https://archive.ics.uci.edu/ml/datasets/HEPMASS). + + Classification task. n_classes = 2. + hepmass_150K X train dataset (100000, 28) + hepmass_150K y train dataset (100000, 1) + hepmass_150K X test dataset (50000, 28) + hepmass_150K y test dataset (50000, 1) + """ + dataset_name = 'hepmass_150K' + os.makedirs(dataset_dir, exist_ok=True) + + url_test = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00347/all_test.csv.gz' + url_train = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00347/all_train.csv.gz' + + local_url_test = os.path.join(dataset_dir, os.path.basename(url_test)) + local_url_train = os.path.join(dataset_dir, os.path.basename(url_train)) + + if not os.path.isfile(local_url_test): + logging.info(f'Started loading {dataset_name}, test') + retrieve(url_test, local_url_test) + if not os.path.isfile(local_url_train): + logging.info(f'Started loading {dataset_name}, train') + retrieve(url_train, local_url_train) + logging.info(f'{dataset_name} is loaded, started parsing...') + + nrows_train, nrows_test, dtype = 100000, 50000, np.float32 + data_test: Any = pd.read_csv(local_url_test, delimiter=",", + compression="gzip", dtype=dtype, + nrows=nrows_test) + data_train: Any = pd.read_csv(local_url_train, delimiter=",", + compression="gzip", dtype=dtype, + nrows=nrows_train) + + x_test = np.ascontiguousarray(data_test.values[:nrows_test, 1:], dtype=dtype) + y_test = np.ascontiguousarray(data_test.values[:nrows_test, 0], dtype=dtype) + x_train = np.ascontiguousarray(data_train.values[:nrows_train, 1:], dtype=dtype) + y_train = np.ascontiguousarray(data_train.values[:nrows_train, 0], dtype=dtype) + + for data, name in zip((x_train, x_test, y_train, y_test), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset {dataset_name} is ready.') + return True + + def higgs(dataset_dir: Path) -> bool: """ Higgs dataset from UCI machine learning repository @@ -637,3 +686,43 @@ def skin_segmentation(dataset_dir: Path) -> bool: np.save(os.path.join(dataset_dir, filename), data) logging.info(f'dataset {dataset_name} is ready.') return True + + +def susy(dataset_dir: Path) -> bool: + """ + SUSY dataset from UCI machine learning repository ( + https://archive.ics.uci.edu/ml/datasets/SUSY). + + Classification task. n_classes = 2. 
+ susy X train dataset (4500000, 28) + susy y train dataset (4500000, 1) + susy X test dataset (500000, 28) + susy y test dataset (500000, 1) + """ + dataset_name = 'susy' + os.makedirs(dataset_dir, exist_ok=True) + + url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00279/SUSY.csv.gz' + local_url = os.path.join(dataset_dir, os.path.basename(url)) + if not os.path.isfile(local_url): + logging.info(f'Started loading {dataset_name}') + retrieve(url, local_url) + logging.info(f'{dataset_name} is loaded, started parsing...') + + nrows_train, nrows_test, dtype = 4500000, 500000, np.float32 + data: Any = pd.read_csv(local_url, delimiter=",", header=None, + compression="gzip", dtype=dtype, + nrows=nrows_train + nrows_test) + + X = data[data.columns[1:]] + y = data[data.columns[0:1]] + + x_train, x_test, y_train, y_test = train_test_split( + X, y, train_size=nrows_train, test_size=nrows_test, shuffle=False) + + for data, name in zip((x_train, x_test, y_train, y_test), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset {dataset_name} is ready.') + return True diff --git a/datasets/loader_regression.py b/datasets/loader_regression.py index e915ee9cf..70df12988 100644 --- a/datasets/loader_regression.py +++ b/datasets/loader_regression.py @@ -295,3 +295,42 @@ def airline_regression(dataset_dir: Path) -> bool: np.save(os.path.join(dataset_dir, filename), data) logging.info(f'dataset {dataset_name} is ready.') return True + +def higgs_10500K(dataset_dir: Path) -> bool: + """ + Higgs dataset from UCI machine learning repository + https://archive.ics.uci.edu/ml/datasets/HIGGS + + Classification task. n_classes = 2. + higgs_10500K X train dataset (10500000, 28) + higgs_10500K y train dataset (10500000, 1) + higgs_10500K X test dataset (500000, 28) + higgs_10500K y test dataset (500000, 1) + """ + dataset_name = 'higgs_10500K' + os.makedirs(dataset_dir, exist_ok=True) + + url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz' + local_url = os.path.join(dataset_dir, os.path.basename(url)) + if not os.path.isfile(local_url): + logging.info(f'Started loading {dataset_name}') + retrieve(url, local_url) + logging.info(f'{dataset_name} is loaded, started parsing...') + + nrows_train, nrows_test, dtype = 10500000, 500000, np.float32 + data: Any = pd.read_csv(local_url, delimiter=",", header=None, + compression="gzip", dtype=dtype, + nrows=nrows_train + nrows_test) + + X = data[data.columns[1:]] + y = data[data.columns[0:1]] + + x_train, x_test, y_train, y_test = train_test_split( + X, y, train_size=nrows_train, test_size=nrows_test, shuffle=False) + + for data, name in zip((x_train, x_test, y_train, y_test), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset {dataset_name} is ready.') + return True \ No newline at end of file From 8cc810f30429701f37e3f6ba6eb5ec68d2504bba Mon Sep 17 00:00:00 2001 From: dmitrii-kriukov Date: Mon, 10 Jan 2022 19:03:27 +0300 Subject: [PATCH 2/8] blank line --- datasets/loader_regression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/loader_regression.py b/datasets/loader_regression.py index 70df12988..34f384e54 100644 --- a/datasets/loader_regression.py +++ b/datasets/loader_regression.py @@ -333,4 +333,4 @@ def higgs_10500K(dataset_dir: Path) -> bool: filename = 
f'{dataset_name}_{name}.npy' np.save(os.path.join(dataset_dir, filename), data) logging.info(f'dataset {dataset_name} is ready.') - return True \ No newline at end of file + return True From 213ee6b5ca33e23c0cee1658833ab0aaf358fda5 Mon Sep 17 00:00:00 2001 From: dmitrii-kriukov Date: Thu, 13 Jan 2022 13:08:40 +0300 Subject: [PATCH 3/8] enable devices&dtypes --- configs/xpu/linear.json | 4 ++-- configs/xpu/log_reg.json | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/configs/xpu/linear.json b/configs/xpu/linear.json index f829a5f97..93dce36e4 100644 --- a/configs/xpu/linear.json +++ b/configs/xpu/linear.json @@ -4,8 +4,8 @@ "algorithm": "linear", "data-format": "pandas", "data-order": "F", - "dtype": "float32", - "device": "gpu" + "dtype": ["float32", "float64"], + "device": ["host", "cpu", "gpu", "none"] }, "cases": [ { diff --git a/configs/xpu/log_reg.json b/configs/xpu/log_reg.json index 4702da87e..bf62fee5d 100644 --- a/configs/xpu/log_reg.json +++ b/configs/xpu/log_reg.json @@ -4,8 +4,8 @@ "algorithm": "log_reg", "data-format": "pandas", "data-order": "F", - "dtype": "float32", - "device": "gpu" + "dtype": ["float32", "float64"], + "device": ["host", "cpu", "gpu", "none"] }, "cases": [ { From b613c75da4a757768c83d32a1df44fc678994cf6 Mon Sep 17 00:00:00 2001 From: dmitrii-kriukov Date: Mon, 10 Jan 2022 19:01:01 +0300 Subject: [PATCH 4/8] regression xpu configs --- configs/xpu/df_regr.json | 133 ++++++++++++++++++++++++++++++++++ configs/xpu/linear.json | 48 ++++++++++++ configs/xpu/log_reg.json | 89 +++++++++++++++++++++++ datasets/load_datasets.py | 3 +- datasets/loader_regression.py | 39 ++++++++++ 5 files changed, 311 insertions(+), 1 deletion(-) create mode 100644 configs/xpu/df_regr.json create mode 100644 configs/xpu/linear.json create mode 100644 configs/xpu/log_reg.json diff --git a/configs/xpu/df_regr.json b/configs/xpu/df_regr.json new file mode 100644 index 000000000..7ad6165b0 --- /dev/null +++ b/configs/xpu/df_regr.json @@ -0,0 +1,133 @@ +{ + "common": { + "lib": "sklearn", + "algorithm": "df_regr", + "data-format": "pandas", + "data-order": "F", + "dtype": ["float32", "float64"], + "max-features": 0.33, + "device": ["host", "cpu", "gpu", "none"] + }, + "cases": [ + { + "dataset": [ + { + "source": "npy", + "name": "year_prediction_msd", + "training": + { + "x": "data/year_prediction_msd_x_train.npy", + "y": "data/year_prediction_msd_y_train.npy" + }, + "testing": + { + "x": "data/year_prediction_msd_x_test.npy", + "y": "data/year_prediction_msd_y_test.npy" + } + } + ], + "num-trees": [10, 100], + "max-depth": 5 + }, + { + "dataset": [ + { + "source": "npy", + "name": "year_prediction_msd", + "training": + { + "x": "data/year_prediction_msd_x_train.npy", + "y": "data/year_prediction_msd_y_train.npy" + }, + "testing": + { + "x": "data/year_prediction_msd_x_test.npy", + "y": "data/year_prediction_msd_y_test.npy" + } + } + ], + "num-trees": [100, 20], + "max-depth": 8 + }, + { + "dataset": [ + { + "source": "npy", + "name": "year_prediction_msd", + "training": + { + "x": "data/year_prediction_msd_x_train.npy", + "y": "data/year_prediction_msd_y_train.npy" + }, + "testing": + { + "x": "data/year_prediction_msd_x_test.npy", + "y": "data/year_prediction_msd_y_test.npy" + } + } + ], + "num-trees": 20, + "max-depth": 16 + }, + { + "dataset": [ + { + "source": "npy", + "name": "higgs1m", + "training": + { + "x": "data/higgs1m_x_train.npy", + "y": "data/higgs1m_y_train.npy" + }, + "testing": + { + "x": "data/higgs1m_x_test.npy", + "y": "data/higgs1m_y_test.npy" 
+ } + } + ], + "num-trees": [15, 20, 100], + "max-depth": 8 + }, + { + "dataset": [ + { + "source": "npy", + "name": "higgs_10500K", + "training": + { + "x": "data/higgs_10500K_x_train.npy", + "y": "data/higgs_10500K_y_train.npy" + }, + "testing": + { + "x": "data/higgs_10500K_x_test.npy", + "y": "data/higgs_10500K_y_test.npy" + } + } + ], + "num-trees": 100, + "max-depth": 8 + }, + { + "dataset": [ + { + "source": "npy", + "name": "higgs_10500K", + "training": + { + "x": "data/higgs_10500K_x_train.npy", + "y": "data/higgs_10500K_y_train.npy" + }, + "testing": + { + "x": "data/higgs_10500K_x_test.npy", + "y": "data/higgs_10500K_y_test.npy" + } + } + ], + "num-trees": 20, + "max-depth": 16 + } + ] +} diff --git a/configs/xpu/linear.json b/configs/xpu/linear.json new file mode 100644 index 000000000..f829a5f97 --- /dev/null +++ b/configs/xpu/linear.json @@ -0,0 +1,48 @@ +{ + "common": { + "lib": "sklearn", + "algorithm": "linear", + "data-format": "pandas", + "data-order": "F", + "dtype": "float32", + "device": "gpu" + }, + "cases": [ + { + "dataset": [ + { + "source": "npy", + "name": "year_prediction_msd", + "training": + { + "x": "data/year_prediction_msd_x_train.npy", + "y": "data/year_prediction_msd_y_train.npy" + }, + "testing": + { + "x": "data/year_prediction_msd_x_test.npy", + "y": "data/year_prediction_msd_y_test.npy" + } + } + ] + }, + { + "dataset": [ + { + "source": "npy", + "name": "higgs1m", + "training": + { + "x": "data/higgs1m_x_train.npy", + "y": "data/higgs1m_y_train.npy" + }, + "testing": + { + "x": "data/higgs1m_x_test.npy", + "y": "data/higgs1m_y_test.npy" + } + } + ] + } + ] +} diff --git a/configs/xpu/log_reg.json b/configs/xpu/log_reg.json new file mode 100644 index 000000000..4702da87e --- /dev/null +++ b/configs/xpu/log_reg.json @@ -0,0 +1,89 @@ +{ + "common": { + "lib": "sklearn", + "algorithm": "log_reg", + "data-format": "pandas", + "data-order": "F", + "dtype": "float32", + "device": "gpu" + }, + "cases": [ + { + "dataset": [ + { + "source": "npy", + "name": "susy", + "training": + { + "x": "data/susy_x_train.npy", + "y": "data/susy_y_train.npy" + }, + "testing": + { + "x": "data/susy_x_test.npy", + "y": "data/susy_y_test.npy" + } + } + ], + "maxiter": "20" + }, + { + "dataset": [ + { + "source": "npy", + "name": "susy", + "training": + { + "x": "data/susy_x_train.npy", + "y": "data/susy_y_train.npy" + }, + "testing": + { + "x": "data/susy_x_test.npy", + "y": "data/susy_y_test.npy" + } + } + ], + "maxiter": "10" + }, + { + "dataset": [ + { + "source": "npy", + "name": "mnist", + "training": + { + "x": "data/mnist_x_train.npy", + "y": "data/mnist_y_train.npy" + }, + "testing": + { + "x": "data/mnist_x_test.npy", + "y": "data/mnist_y_test.npy" + } + } + ], + "no-fit-intercept": "", + "maxiter": "50" + }, + { + "dataset": [ + { + "source": "npy", + "name": "mnist", + "training": + { + "x": "data/mnist_x_train.npy", + "y": "data/mnist_y_train.npy" + }, + "testing": + { + "x": "data/mnist_x_test.npy", + "y": "data/mnist_y_test.npy" + } + } + ], + "maxiter": "500" + } + ] +} diff --git a/datasets/load_datasets.py b/datasets/load_datasets.py index 0a7874d92..80a7d581c 100644 --- a/datasets/load_datasets.py +++ b/datasets/load_datasets.py @@ -27,7 +27,7 @@ klaverjas, santander, skin_segmentation, susy) from .loader_multiclass import (connect, covertype, covtype, letters, mlsr, mnist, msrank, plasticc, sensit) -from .loader_regression import (abalone, california_housing, fried, +from .loader_regression import (abalone, california_housing, fried, higgs_10500K, 
medical_charges_nominal, mortgage_first_q, twodplanes, year_prediction_msd, yolanda, airline_regression) @@ -52,6 +52,7 @@ "hepmass_150K": hepmass_150K, "higgs": higgs, "higgs1m": higgs_one_m, + "higgs_10500K": higgs_10500K, "ijcnn": ijcnn, "klaverjas": klaverjas, "letters": letters, diff --git a/datasets/loader_regression.py b/datasets/loader_regression.py index e915ee9cf..70df12988 100644 --- a/datasets/loader_regression.py +++ b/datasets/loader_regression.py @@ -295,3 +295,42 @@ def airline_regression(dataset_dir: Path) -> bool: np.save(os.path.join(dataset_dir, filename), data) logging.info(f'dataset {dataset_name} is ready.') return True + +def higgs_10500K(dataset_dir: Path) -> bool: + """ + Higgs dataset from UCI machine learning repository + https://archive.ics.uci.edu/ml/datasets/HIGGS + + Classification task. n_classes = 2. + higgs_10500K X train dataset (10500000, 28) + higgs_10500K y train dataset (10500000, 1) + higgs_10500K X test dataset (500000, 28) + higgs_10500K y test dataset (500000, 1) + """ + dataset_name = 'higgs_10500K' + os.makedirs(dataset_dir, exist_ok=True) + + url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz' + local_url = os.path.join(dataset_dir, os.path.basename(url)) + if not os.path.isfile(local_url): + logging.info(f'Started loading {dataset_name}') + retrieve(url, local_url) + logging.info(f'{dataset_name} is loaded, started parsing...') + + nrows_train, nrows_test, dtype = 10500000, 500000, np.float32 + data: Any = pd.read_csv(local_url, delimiter=",", header=None, + compression="gzip", dtype=dtype, + nrows=nrows_train + nrows_test) + + X = data[data.columns[1:]] + y = data[data.columns[0:1]] + + x_train, x_test, y_train, y_test = train_test_split( + X, y, train_size=nrows_train, test_size=nrows_test, shuffle=False) + + for data, name in zip((x_train, x_test, y_train, y_test), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset {dataset_name} is ready.') + return True \ No newline at end of file From d0ef134e57b11f1104991db2223a4779c8521be7 Mon Sep 17 00:00:00 2001 From: dmitrii-kriukov Date: Mon, 10 Jan 2022 19:03:27 +0300 Subject: [PATCH 5/8] blank line --- datasets/loader_regression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/loader_regression.py b/datasets/loader_regression.py index 70df12988..34f384e54 100644 --- a/datasets/loader_regression.py +++ b/datasets/loader_regression.py @@ -333,4 +333,4 @@ def higgs_10500K(dataset_dir: Path) -> bool: filename = f'{dataset_name}_{name}.npy' np.save(os.path.join(dataset_dir, filename), data) logging.info(f'dataset {dataset_name} is ready.') - return True \ No newline at end of file + return True From 37643eaadd25fe825c4a67150ad77a126bcf081b Mon Sep 17 00:00:00 2001 From: dmitrii-kriukov Date: Thu, 13 Jan 2022 13:08:40 +0300 Subject: [PATCH 6/8] enable devices&dtypes --- configs/xpu/linear.json | 4 ++-- configs/xpu/log_reg.json | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/configs/xpu/linear.json b/configs/xpu/linear.json index f829a5f97..93dce36e4 100644 --- a/configs/xpu/linear.json +++ b/configs/xpu/linear.json @@ -4,8 +4,8 @@ "algorithm": "linear", "data-format": "pandas", "data-order": "F", - "dtype": "float32", - "device": "gpu" + "dtype": ["float32", "float64"], + "device": ["host", "cpu", "gpu", "none"] }, "cases": [ { diff --git a/configs/xpu/log_reg.json b/configs/xpu/log_reg.json index 
4702da87e..bf62fee5d 100644 --- a/configs/xpu/log_reg.json +++ b/configs/xpu/log_reg.json @@ -4,8 +4,8 @@ "algorithm": "log_reg", "data-format": "pandas", "data-order": "F", - "dtype": "float32", - "device": "gpu" + "dtype": ["float32", "float64"], + "device": ["host", "cpu", "gpu", "none"] }, "cases": [ { From 099da17bed7f75dd9c553525b11b2391213a6410 Mon Sep 17 00:00:00 2001 From: dmitrii-kriukov Date: Fri, 14 Jan 2022 12:36:05 +0300 Subject: [PATCH 7/8] pep8 --- datasets/loader_regression.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/datasets/loader_regression.py b/datasets/loader_regression.py index 34f384e54..e2fd31c47 100644 --- a/datasets/loader_regression.py +++ b/datasets/loader_regression.py @@ -296,6 +296,7 @@ def airline_regression(dataset_dir: Path) -> bool: logging.info(f'dataset {dataset_name} is ready.') return True + def higgs_10500K(dataset_dir: Path) -> bool: """ Higgs dataset from UCI machine learning repository @@ -323,7 +324,7 @@ def higgs_10500K(dataset_dir: Path) -> bool: nrows=nrows_train + nrows_test) X = data[data.columns[1:]] - y = data[data.columns[0:1]] + y = data[data.columns[0:1]] x_train, x_test, y_train, y_test = train_test_split( X, y, train_size=nrows_train, test_size=nrows_test, shuffle=False) From fb83752e40d2ea3ba1ee9b6dc64f2fc0b75cff0e Mon Sep 17 00:00:00 2001 From: dmitrii-kriukov Date: Fri, 14 Jan 2022 16:09:44 +0300 Subject: [PATCH 8/8] fix susy y shape --- datasets/loader_classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/loader_classification.py b/datasets/loader_classification.py index fc3cb892d..c1ef16329 100644 --- a/datasets/loader_classification.py +++ b/datasets/loader_classification.py @@ -715,7 +715,7 @@ def susy(dataset_dir: Path) -> bool: nrows=nrows_train + nrows_test) X = data[data.columns[1:]] - y = data[data.columns[0:1]] + y = data[data.columns[0:1]].values.ravel() x_train, x_test, y_train, y_test = train_test_split( X, y, train_size=nrows_train, test_size=nrows_test, shuffle=False)
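
A minimal sketch of what the final "fix susy y shape" change does, using a hypothetical toy DataFrame in place of the real parsed SUSY CSV (column 0 standing in for the class label); the variable names below are illustrative only, not part of the patch:

import numpy as np
import pandas as pd

# Toy stand-in for the parsed SUSY CSV: column 0 is the label, the rest are features.
data = pd.DataFrame(np.random.rand(10, 4))

y_before = data[data.columns[0:1]]                # DataFrame, shape (10, 1)
y_after = data[data.columns[0:1]].values.ravel()  # ndarray, shape (10,)

assert y_before.shape == (10, 1)
assert y_after.shape == (10,)

With the ravel applied before train_test_split, the saved susy_y_train.npy / susy_y_test.npy arrays are one-dimensional, which is the label shape scikit-learn estimators expect and avoids the column-vector conversion warning at fit time.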