diff --git a/configs/xpu/df_regr.json b/configs/xpu/df_regr.json
new file mode 100644
index 000000000..7ad6165b0
--- /dev/null
+++ b/configs/xpu/df_regr.json
@@ -0,0 +1,133 @@
+{
+    "common": {
+        "lib": "sklearn",
+        "algorithm": "df_regr",
+        "data-format": "pandas",
+        "data-order": "F",
+        "dtype": ["float32", "float64"],
+        "max-features": 0.33,
+        "device": ["host", "cpu", "gpu", "none"]
+    },
+    "cases": [
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "year_prediction_msd",
+                    "training":
+                    {
+                        "x": "data/year_prediction_msd_x_train.npy",
+                        "y": "data/year_prediction_msd_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/year_prediction_msd_x_test.npy",
+                        "y": "data/year_prediction_msd_y_test.npy"
+                    }
+                }
+            ],
+            "num-trees": [10, 100],
+            "max-depth": 5
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "year_prediction_msd",
+                    "training":
+                    {
+                        "x": "data/year_prediction_msd_x_train.npy",
+                        "y": "data/year_prediction_msd_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/year_prediction_msd_x_test.npy",
+                        "y": "data/year_prediction_msd_y_test.npy"
+                    }
+                }
+            ],
+            "num-trees": [100, 20],
+            "max-depth": 8
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "year_prediction_msd",
+                    "training":
+                    {
+                        "x": "data/year_prediction_msd_x_train.npy",
+                        "y": "data/year_prediction_msd_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/year_prediction_msd_x_test.npy",
+                        "y": "data/year_prediction_msd_y_test.npy"
+                    }
+                }
+            ],
+            "num-trees": 20,
+            "max-depth": 16
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "higgs1m",
+                    "training":
+                    {
+                        "x": "data/higgs1m_x_train.npy",
+                        "y": "data/higgs1m_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/higgs1m_x_test.npy",
+                        "y": "data/higgs1m_y_test.npy"
+                    }
+                }
+            ],
+            "num-trees": [15, 20, 100],
+            "max-depth": 8
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "higgs_10500K",
+                    "training":
+                    {
+                        "x": "data/higgs_10500K_x_train.npy",
+                        "y": "data/higgs_10500K_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/higgs_10500K_x_test.npy",
+                        "y": "data/higgs_10500K_y_test.npy"
+                    }
+                }
+            ],
+            "num-trees": 100,
+            "max-depth": 8
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "higgs_10500K",
+                    "training":
+                    {
+                        "x": "data/higgs_10500K_x_train.npy",
+                        "y": "data/higgs_10500K_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/higgs_10500K_x_test.npy",
+                        "y": "data/higgs_10500K_y_test.npy"
+                    }
+                }
+            ],
+            "num-trees": 20,
+            "max-depth": 16
+        }
+    ]
+}
diff --git a/configs/xpu/linear.json b/configs/xpu/linear.json
new file mode 100644
index 000000000..93dce36e4
--- /dev/null
+++ b/configs/xpu/linear.json
@@ -0,0 +1,48 @@
+{
+    "common": {
+        "lib": "sklearn",
+        "algorithm": "linear",
+        "data-format": "pandas",
+        "data-order": "F",
+        "dtype": ["float32", "float64"],
+        "device": ["host", "cpu", "gpu", "none"]
+    },
+    "cases": [
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "year_prediction_msd",
+                    "training":
+                    {
+                        "x": "data/year_prediction_msd_x_train.npy",
+                        "y": "data/year_prediction_msd_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/year_prediction_msd_x_test.npy",
+                        "y": "data/year_prediction_msd_y_test.npy"
+                    }
+                }
+            ]
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "higgs1m",
+                    "training":
+                    {
+                        "x": "data/higgs1m_x_train.npy",
+                        "y": "data/higgs1m_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/higgs1m_x_test.npy",
+                        "y": "data/higgs1m_y_test.npy"
+                    }
+                }
+            ]
+        }
+    ]
+}
diff --git a/configs/xpu/log_reg.json b/configs/xpu/log_reg.json
new file mode 100644
index 000000000..bf62fee5d
--- /dev/null
+++ b/configs/xpu/log_reg.json
@@ -0,0 +1,89 @@
+{
+    "common": {
+        "lib": "sklearn",
+        "algorithm": "log_reg",
+        "data-format": "pandas",
+        "data-order": "F",
+        "dtype": ["float32", "float64"],
+        "device": ["host", "cpu", "gpu", "none"]
+    },
+    "cases": [
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "susy",
+                    "training":
+                    {
+                        "x": "data/susy_x_train.npy",
+                        "y": "data/susy_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/susy_x_test.npy",
+                        "y": "data/susy_y_test.npy"
+                    }
+                }
+            ],
+            "maxiter": "20"
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "susy",
+                    "training":
+                    {
+                        "x": "data/susy_x_train.npy",
+                        "y": "data/susy_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/susy_x_test.npy",
+                        "y": "data/susy_y_test.npy"
+                    }
+                }
+            ],
+            "maxiter": "10"
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "mnist",
+                    "training":
+                    {
+                        "x": "data/mnist_x_train.npy",
+                        "y": "data/mnist_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/mnist_x_test.npy",
+                        "y": "data/mnist_y_test.npy"
+                    }
+                }
+            ],
+            "no-fit-intercept": "",
+            "maxiter": "50"
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "mnist",
+                    "training":
+                    {
+                        "x": "data/mnist_x_train.npy",
+                        "y": "data/mnist_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/mnist_x_test.npy",
+                        "y": "data/mnist_y_test.npy"
+                    }
+                }
+            ],
+            "maxiter": "500"
+        }
+    ]
+}
diff --git a/datasets/load_datasets.py b/datasets/load_datasets.py
index 0a7874d92..80a7d581c 100644
--- a/datasets/load_datasets.py
+++ b/datasets/load_datasets.py
@@ -27,7 +27,7 @@
                                     klaverjas, santander, skin_segmentation, susy)
 from .loader_multiclass import (connect, covertype, covtype, letters, mlsr,
                                 mnist, msrank, plasticc, sensit)
-from .loader_regression import (abalone, california_housing, fried,
+from .loader_regression import (abalone, california_housing, fried, higgs_10500K,
                                 medical_charges_nominal, mortgage_first_q,
                                 twodplanes, year_prediction_msd, yolanda,
                                 airline_regression)
@@ -52,6 +52,7 @@
     "hepmass_150K": hepmass_150K,
     "higgs": higgs,
     "higgs1m": higgs_one_m,
+    "higgs_10500K": higgs_10500K,
     "ijcnn": ijcnn,
     "klaverjas": klaverjas,
     "letters": letters,
diff --git a/datasets/loader_classification.py b/datasets/loader_classification.py
index fc3cb892d..c1ef16329 100644
--- a/datasets/loader_classification.py
+++ b/datasets/loader_classification.py
@@ -715,7 +715,7 @@ def susy(dataset_dir: Path) -> bool:
                             nrows=nrows_train + nrows_test)
 
     X = data[data.columns[1:]]
-    y = data[data.columns[0:1]]
+    y = data[data.columns[0:1]].values.ravel()
 
     x_train, x_test, y_train, y_test = train_test_split(
         X, y, train_size=nrows_train, test_size=nrows_test, shuffle=False)
diff --git a/datasets/loader_regression.py b/datasets/loader_regression.py
index e915ee9cf..e2fd31c47 100644
--- a/datasets/loader_regression.py
+++ b/datasets/loader_regression.py
@@ -295,3 +295,43 @@ def airline_regression(dataset_dir: Path) -> bool:
         np.save(os.path.join(dataset_dir, filename), data)
     logging.info(f'dataset {dataset_name} is ready.')
     return True
+
+
+def higgs_10500K(dataset_dir: Path) -> bool:
+    """
+    Higgs dataset from UCI machine learning repository
+    https://archive.ics.uci.edu/ml/datasets/HIGGS
+
+    Classification task. n_classes = 2.
+    higgs_10500K X train dataset (10500000, 28)
+    higgs_10500K y train dataset (10500000, 1)
+    higgs_10500K X test dataset (500000, 28)
+    higgs_10500K y test dataset (500000, 1)
+    """
+    dataset_name = 'higgs_10500K'
+    os.makedirs(dataset_dir, exist_ok=True)
+
+    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz'
+    local_url = os.path.join(dataset_dir, os.path.basename(url))
+    if not os.path.isfile(local_url):
+        logging.info(f'Started loading {dataset_name}')
+        retrieve(url, local_url)
+    logging.info(f'{dataset_name} is loaded, started parsing...')
+
+    nrows_train, nrows_test, dtype = 10500000, 500000, np.float32
+    data: Any = pd.read_csv(local_url, delimiter=",", header=None,
+                            compression="gzip", dtype=dtype,
+                            nrows=nrows_train + nrows_test)
+
+    X = data[data.columns[1:]]
+    y = data[data.columns[0:1]]
+
+    x_train, x_test, y_train, y_test = train_test_split(
+        X, y, train_size=nrows_train, test_size=nrows_test, shuffle=False)
+
+    for data, name in zip((x_train, x_test, y_train, y_test),
+                          ('x_train', 'x_test', 'y_train', 'y_test')):
+        filename = f'{dataset_name}_{name}.npy'
+        np.save(os.path.join(dataset_dir, filename), data)
+    logging.info(f'dataset {dataset_name} is ready.')
+    return True
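
Note (not part of the patch above): the snippet below is a minimal sketch of how the new higgs_10500K loader could be exercised on its own to materialize the .npy files that the higgs_10500K cases in configs/xpu/df_regr.json reference. Running it from the repository root and using a data/ target directory are assumptions inferred from the config paths; inside the benchmark harness the loader appears to be reached instead through the name-to-function mapping extended in datasets/load_datasets.py.

# Hedged sketch: calls the loader added in datasets/loader_regression.py directly.
# Assumes execution from the repository root; 'data' matches the paths used in
# configs/xpu/df_regr.json.
from pathlib import Path

from datasets.loader_regression import higgs_10500K

# The first call downloads HIGGS.csv.gz (a multi-gigabyte file) and then writes
# higgs_10500K_x_train.npy, higgs_10500K_y_train.npy, higgs_10500K_x_test.npy
# and higgs_10500K_y_test.npy into the given directory.
assert higgs_10500K(Path('data'))

As a side note on the susy change: storing the labels via .values.ravel() keeps y as a 1-D array rather than a single-column frame, which is the shape scikit-learn estimators expect for targets and avoids the DataConversionWarning raised for column-vector y.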