diff --git a/bench.py b/bench.py
index 9c1033271..cbc969bb4 100644
--- a/bench.py
+++ b/bench.py
@@ -19,6 +19,7 @@
 import logging
 import sys
 import timeit
+import re
 
 import numpy as np
 import sklearn
@@ -64,8 +65,17 @@ def _parse_size(string, dim=2):
     return tup
 
 
+def is_float(string):
+    return bool(re.match(r"^[-+]?(?:[0-9]+(?:\.[0-9]*)?|\.[0-9]+)(?:[eE][-+]?[0-9]+)?$",
+                         string))
+
+
 def float_or_int(string):
-    return float(string) if '.' in string else int(string)
+    return int(string) if string.lstrip('+-').isdigit() else float(string)
+
+
+def float_or_int_or_str(string):
+    return int(string) if string.lstrip('+-').isdigit() else float(string) if is_float(string) else string
 
 
 def get_optimal_cache_size(n_rows, dtype=np.double, max_cache=64):
diff --git a/configs/xpu/df_clsf.json b/configs/xpu/df_clsf.json
new file mode 100644
index 000000000..4d14763b8
--- /dev/null
+++ b/configs/xpu/df_clsf.json
@@ -0,0 +1,113 @@
+{
+    "common": {
+        "lib": "sklearn",
+        "algorithm": "df_clsf",
+        "data-format": "pandas",
+        "data-order": "F",
+        "dtype": ["float32", "float64"],
+        "max-features": "sqrt",
+        "device": ["host", "cpu", "gpu", "none"]
+    },
+    "cases": [
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "susy",
+                    "training":
+                    {
+                        "x": "data/susy_x_train.npy",
+                        "y": "data/susy_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/susy_x_test.npy",
+                        "y": "data/susy_y_test.npy"
+                    }
+                }
+            ],
+            "num-trees": 10,
+            "max-depth": 5
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "susy",
+                    "training":
+                    {
+                        "x": "data/susy_x_train.npy",
+                        "y": "data/susy_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/susy_x_test.npy",
+                        "y": "data/susy_y_test.npy"
+                    }
+                }
+            ],
+            "num-trees": 100,
+            "max-depth": 8
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "susy",
+                    "training":
+                    {
+                        "x": "data/susy_x_train.npy",
+                        "y": "data/susy_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/susy_x_test.npy",
+                        "y": "data/susy_y_test.npy"
+                    }
+                }
+            ],
+            "num-trees": 20,
+            "max-depth": 16
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "mnist",
+                    "training":
+                    {
+                        "x": "data/mnist_x_train.npy",
+                        "y": "data/mnist_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/mnist_x_test.npy",
+                        "y": "data/mnist_y_test.npy"
+                    }
+                }
+            ],
+            "num-trees": 100,
+            "max-depth": 10
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "hepmass_150K",
+                    "training":
+                    {
+                        "x": "data/hepmass_150K_x_train.npy",
+                        "y": "data/hepmass_150K_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/hepmass_150K_x_test.npy",
+                        "y": "data/hepmass_150K_y_test.npy"
+                    }
+                }
+            ],
+            "num-trees": 50,
+            "max-depth": 15
+        }
+    ]
+}
diff --git a/datasets/load_datasets.py b/datasets/load_datasets.py
index a3bc15187..0a7874d92 100644
--- a/datasets/load_datasets.py
+++ b/datasets/load_datasets.py
@@ -23,8 +23,8 @@
 from .loader_classification import (a_nine_a, airline, airline_ohe, bosch,
                                     census, codrnanorm, creditcard, epsilon, fraud,
-                                    gisette, higgs, higgs_one_m, ijcnn,
-                                    klaverjas, santander, skin_segmentation)
+                                    gisette, hepmass_150K, higgs, higgs_one_m, ijcnn,
+                                    klaverjas, santander, skin_segmentation, susy)
 from .loader_multiclass import (connect, covertype, covtype, letters, mlsr,
                                 mnist, msrank, plasticc, sensit)
 from .loader_regression import (abalone, california_housing, fried,
@@ -49,6 +49,7 @@
     "fraud": fraud,
     "fried": fried,
     "gisette": gisette,
+    "hepmass_150K": hepmass_150K,
     "higgs": higgs,
     "higgs1m": higgs_one_m,
     "ijcnn": ijcnn,
@@ -63,6 +64,7 @@
     "santander": santander,
     "sensit": sensit,
     "skin_segmentation": skin_segmentation,
+    "susy": susy,
     "twodplanes": twodplanes,
"year_prediction_msd": year_prediction_msd, "yolanda": yolanda, diff --git a/datasets/loader_classification.py b/datasets/loader_classification.py index 5a5d9df74..fc3cb892d 100644 --- a/datasets/loader_classification.py +++ b/datasets/loader_classification.py @@ -446,6 +446,55 @@ def gisette(dataset_dir: Path) -> bool: return True +def hepmass_150K(dataset_dir: Path) -> bool: + """ + HEPMASS dataset from UCI machine learning repository ( + https://archive.ics.uci.edu/ml/datasets/HEPMASS). + + Classification task. n_classes = 2. + hepmass_150K X train dataset (100000, 28) + hepmass_150K y train dataset (100000, 1) + hepmass_150K X test dataset (50000, 28) + hepmass_150K y test dataset (50000, 1) + """ + dataset_name = 'hepmass_150K' + os.makedirs(dataset_dir, exist_ok=True) + + url_test = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00347/all_test.csv.gz' + url_train = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00347/all_train.csv.gz' + + local_url_test = os.path.join(dataset_dir, os.path.basename(url_test)) + local_url_train = os.path.join(dataset_dir, os.path.basename(url_train)) + + if not os.path.isfile(local_url_test): + logging.info(f'Started loading {dataset_name}, test') + retrieve(url_test, local_url_test) + if not os.path.isfile(local_url_train): + logging.info(f'Started loading {dataset_name}, train') + retrieve(url_train, local_url_train) + logging.info(f'{dataset_name} is loaded, started parsing...') + + nrows_train, nrows_test, dtype = 100000, 50000, np.float32 + data_test: Any = pd.read_csv(local_url_test, delimiter=",", + compression="gzip", dtype=dtype, + nrows=nrows_test) + data_train: Any = pd.read_csv(local_url_train, delimiter=",", + compression="gzip", dtype=dtype, + nrows=nrows_train) + + x_test = np.ascontiguousarray(data_test.values[:nrows_test, 1:], dtype=dtype) + y_test = np.ascontiguousarray(data_test.values[:nrows_test, 0], dtype=dtype) + x_train = np.ascontiguousarray(data_train.values[:nrows_train, 1:], dtype=dtype) + y_train = np.ascontiguousarray(data_train.values[:nrows_train, 0], dtype=dtype) + + for data, name in zip((x_train, x_test, y_train, y_test), + ('x_train', 'x_test', 'y_train', 'y_test')): + filename = f'{dataset_name}_{name}.npy' + np.save(os.path.join(dataset_dir, filename), data) + logging.info(f'dataset {dataset_name} is ready.') + return True + + def higgs(dataset_dir: Path) -> bool: """ Higgs dataset from UCI machine learning repository @@ -637,3 +686,43 @@ def skin_segmentation(dataset_dir: Path) -> bool: np.save(os.path.join(dataset_dir, filename), data) logging.info(f'dataset {dataset_name} is ready.') return True + + +def susy(dataset_dir: Path) -> bool: + """ + SUSY dataset from UCI machine learning repository ( + https://archive.ics.uci.edu/ml/datasets/SUSY). + + Classification task. n_classes = 2. 
+    susy X train dataset (4500000, 18)
+    susy y train dataset (4500000, 1)
+    susy X test dataset (500000, 18)
+    susy y test dataset (500000, 1)
+    """
+    dataset_name = 'susy'
+    os.makedirs(dataset_dir, exist_ok=True)
+
+    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00279/SUSY.csv.gz'
+    local_url = os.path.join(dataset_dir, os.path.basename(url))
+    if not os.path.isfile(local_url):
+        logging.info(f'Started loading {dataset_name}')
+        retrieve(url, local_url)
+    logging.info(f'{dataset_name} is loaded, started parsing...')
+
+    nrows_train, nrows_test, dtype = 4500000, 500000, np.float32
+    data: Any = pd.read_csv(local_url, delimiter=",", header=None,
+                            compression="gzip", dtype=dtype,
+                            nrows=nrows_train + nrows_test)
+
+    X = data[data.columns[1:]]
+    y = data[data.columns[0:1]]
+
+    x_train, x_test, y_train, y_test = train_test_split(
+        X, y, train_size=nrows_train, test_size=nrows_test, shuffle=False)
+
+    for data, name in zip((x_train, x_test, y_train, y_test),
+                          ('x_train', 'x_test', 'y_train', 'y_test')):
+        filename = f'{dataset_name}_{name}.npy'
+        np.save(os.path.join(dataset_dir, filename), data)
+    logging.info(f'dataset {dataset_name} is ready.')
+    return True
diff --git a/sklearn_bench/df_clsf.py b/sklearn_bench/df_clsf.py
index d99ffa898..5d35ef02d 100644
--- a/sklearn_bench/df_clsf.py
+++ b/sklearn_bench/df_clsf.py
@@ -81,7 +81,7 @@ def main():
                         help='The function to measure the quality of a split')
     parser.add_argument('--num-trees', type=int, default=100,
                         help='Number of trees in the forest')
-    parser.add_argument('--max-features', type=bench.float_or_int, default=None,
+    parser.add_argument('--max-features', type=bench.float_or_int_or_str, default=None,
                         help='Upper bound on features used at each split')
     parser.add_argument('--max-depth', type=int, default=None,
                         help='Upper bound on depth of constructed trees')
diff --git a/sklearn_bench/df_regr.py b/sklearn_bench/df_regr.py
index 460a28804..4c7491af3 100644
--- a/sklearn_bench/df_regr.py
+++ b/sklearn_bench/df_regr.py
@@ -72,7 +72,7 @@ def main():
                         help='The function to measure the quality of a split')
     parser.add_argument('--num-trees', type=int, default=100,
                         help='Number of trees in the forest')
-    parser.add_argument('--max-features', type=bench.float_or_int, default=None,
+    parser.add_argument('--max-features', type=bench.float_or_int_or_str, default=None,
                         help='Upper bound on features used at each split')
     parser.add_argument('--max-depth', type=int, default=None,
                         help='Upper bound on depth of constructed trees')
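
For the record, the parsing chain the two argparse changes rely on: float_or_int_or_str tries an integer first, then a float via is_float (which also accepts scientific notation that the old "'.' in string" check rejected), and otherwise returns the raw string, so values like "sqrt" reach scikit-learn untouched. A minimal standalone sketch of the intended behavior; the asserts are illustrative, not part of the patch:

    import re

    def is_float(string):
        return bool(re.match(r"^[-+]?(?:[0-9]+(?:\.[0-9]*)?|\.[0-9]+)(?:[eE][-+]?[0-9]+)?$",
                             string))

    def float_or_int_or_str(string):
        # ints first, then floats, then raw strings such as 'sqrt' or 'log2'
        return int(string) if string.lstrip('+-').isdigit() \
            else float(string) if is_float(string) else string

    assert float_or_int_or_str("8") == 8          # integer count of features
    assert float_or_int_or_str("0.5") == 0.5      # fraction of features
    assert float_or_int_or_str("1e-2") == 0.01    # scientific notation parses as float
    assert float_or_int_or_str("sqrt") == "sqrt"  # forwarded verbatim to sklearn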
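A quick way to sanity-check the new loaders is to call them directly and inspect the saved arrays. This sketch assumes it is run from the repository root so the datasets package is importable; note that both loaders download the full UCI archives even though hepmass_150K parses only 150K rows:

    from pathlib import Path
    import numpy as np
    from datasets.loader_classification import hepmass_150K

    data_dir = Path('data')
    hepmass_150K(data_dir)  # downloads all_train/all_test, parses 100K + 50K rows

    x_train = np.load(data_dir / 'hepmass_150K_x_train.npy')
    y_train = np.load(data_dir / 'hepmass_150K_y_train.npy')
    print(x_train.shape, y_train.shape)  # expected: (100000, 28) (100000,)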
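Downstream, the only consumer of the string value is scikit-learn itself: the df_clsf and df_regr benchmarks forward max_features to the forest estimator. A sketch of the resulting call with random stand-in data shaped like SUSY (18 features); real runs would instead load the npy files referenced in configs/xpu/df_clsf.json through the benchmark driver, e.g. something like `python runner.py --configs configs/xpu/df_clsf.json`, assuming the repository's usual runner entry point:

    import numpy as np
    from sklearn.ensemble import RandomForestClassifier

    # stand-in for data/susy_x_train.npy and data/susy_y_train.npy
    x_train = np.random.rand(10000, 18).astype(np.float32)
    y_train = np.random.randint(0, 2, size=10000)

    # matches the first config case: 10 trees, depth 5, max-features 'sqrt';
    # the string arrives intact thanks to float_or_int_or_str
    clf = RandomForestClassifier(n_estimators=10, max_depth=5, max_features='sqrt')
    clf.fit(x_train, y_train)
    print(clf.score(x_train, y_train))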