@@ -16,6 +16,8 @@
 
 import logging
 import os
+import re
+import tarfile
 from pathlib import Path
 from typing import Any
 from urllib.request import urlretrieve
@@ -46,6 +48,42 @@ def _retrieve(url: str, filename: str) -> None:
     urlretrieve(url, filename, reporthook=_show_progress)
 
 
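+# Dense parser for the LIBSVM-style lines in the MSRank files
+# ("<label> qid:<q> 1:<v1> 2:<v2> ..."): each "name:" prefix, including
+# "qid:", is stripped, so the query id lands in the first feature column.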
+def _read_libsvm_msrank(file_obj, n_samples, n_features, dtype):
+    X = np.zeros((n_samples, n_features))
+    y = np.zeros((n_samples,))
+
+    counter = 0
+
+    # Match "name:value" pairs and capture only the numeric value.
+    regexp = re.compile(r'[A-Za-z0-9]+:(-?\d*\.?\d+)')
+
+    for line in file_obj:
+        # file_obj is opened in text mode, so each line is already a str.
+        line = regexp.sub(r'\g<1>', line)
+        line = line.rstrip(" \n\r").split(' ')
+
+        y[counter] = int(line[0])
+        X[counter] = [float(i) for i in line[1:]]
+
+        counter += 1
+        if counter == n_samples:
+            break
+
+    return np.array(X, dtype=dtype), np.array(y, dtype=dtype)
+
+
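+# Yield a file's contents in 1 MiB chunks so _count_lines can scan
+# large files without loading them into memory.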
+def _make_gen(reader):
+    b = reader(1024 * 1024)
+    while b:
+        yield b
+        b = reader(1024 * 1024)
+
+
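+# Count lines by counting newline bytes in buffered chunks, which is
+# faster than iterating over a large file line by line.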
+def _count_lines(filename):
+    with open(filename, 'rb') as f:
+        f_gen = _make_gen(f.read)
+        return sum(buf.count(b'\n') for buf in f_gen)
+
+
 def a_nine_a(dataset_dir: Path) -> bool:
     """
     Author: Ronny Kohavi, Barry Becker
@@ -136,7 +174,56 @@ def airline(dataset_dir: Path) -> bool:
 
 
 def airline_ohe(dataset_dir: Path) -> bool:
-    return False
| 177 | + """ |
| 178 | + Dataset from szilard benchmarks: https://github.com/szilard/GBM-perf |
| 179 | + TaskType:binclass |
| 180 | + NumberOfFeatures:700 |
| 181 | + NumberOfInstances:10100000 |
| 182 | + """ |
| 183 | + dataset_name = 'airline-ohe' |
| 184 | + os.makedirs(dataset_dir, exist_ok=True) |
| 185 | + |
| 186 | + url_train = 'https://s3.amazonaws.com/benchm-ml--main/train-10m.csv' |
| 187 | + url_test = 'https://s3.amazonaws.com/benchm-ml--main/test.csv' |
| 188 | + local_url_train = os.path.join(dataset_dir, os.path.basename(url_train)) |
| 189 | + local_url_test = os.path.join(dataset_dir, os.path.basename(url_test)) |
| 190 | + if not os.path.isfile(local_url_train): |
| 191 | + logging.info(f'Started loading {dataset_name}') |
| 192 | + _retrieve(url_train, local_url_train) |
| 193 | + if not os.path.isfile(local_url_test): |
| 194 | + logging.info(f'Started loading {dataset_name}') |
| 195 | + _retrieve(url_test, local_url_test) |
| 196 | + logging.info(f'{dataset_name} is loaded, started parsing...') |
| 197 | + |
| 198 | + sets = [] |
| 199 | + labels = [] |
| 200 | + |
| 201 | + categorical_names = ["Month", "DayofMonth", |
| 202 | + "DayOfWeek", "UniqueCarrier", "Origin", "Dest"] |
| 203 | + |
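+    # Read both CSVs (the 10M-row train file is capped at 1M rows) and map
+    # the "Y"/"N" label column "dep_delayed_15min" to 1/0.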
+    for local_url in [local_url_train, local_url_test]:
+        df = pd.read_csv(local_url,
+                         nrows=1000000 if local_url.endswith('train-10m.csv') else None)
+        X = df.drop(columns='dep_delayed_15min')
+        y = df['dep_delayed_15min']
+
+        y_num = np.where(y == 'Y', 1, 0)
+
+        sets.append(X)
+        labels.append(y_num)
+
+    n_samples_train = sets[0].shape[0]
+
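+    # One-hot encode train and test together so both get an identical set of
+    # dummy columns, then split back at the original train/test boundary.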
+    X_final: Any = pd.concat(sets)
+    X_final = pd.get_dummies(X_final, columns=categorical_names)
+    sets = [X_final[:n_samples_train], X_final[n_samples_train:]]
+
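+    # Persist each array as '<dataset_name>_<split>.npy' for the benchmarks.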
+    for data, name in zip((sets[0], sets[1], labels[0], labels[1]),
+                          ('x_train', 'x_test', 'y_train', 'y_test')):
+        filename = f'{dataset_name}_{name}.npy'
+        np.save(os.path.join(dataset_dir, filename), data)
+    logging.info(f'dataset {dataset_name} is ready.')
+    return True
 
 
 def bosch(dataset_dir: Path) -> bool:
@@ -454,7 +541,43 @@ def higgs(dataset_dir: Path) -> bool:
 
 
 def higgs_one_m(dataset_dir: Path) -> bool:
-    return False
| 544 | + """ |
| 545 | + Higgs dataset from UCI machine learning repository ( |
| 546 | + https://archive.ics.uci.edu/ml/datasets/HIGGS). |
| 547 | + TaskType:binclass |
| 548 | + NumberOfFeatures:28 |
| 549 | + NumberOfInstances:11M |
| 550 | + """ |
| 551 | + dataset_name = 'higgs1m' |
| 552 | + os.makedirs(dataset_dir, exist_ok=True) |
| 553 | + |
| 554 | + url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz' |
| 555 | + local_url = os.path.join(dataset_dir, os.path.basename(url)) |
| 556 | + if not os.path.isfile(local_url): |
| 557 | + logging.info(f'Started loading {dataset_name}') |
| 558 | + _retrieve(url, local_url) |
| 559 | + logging.info(f'{dataset_name} is loaded, started parsing...') |
| 560 | + |
+    nrows_train, nrows_test, dtype = 1000000, 500000, np.float32
+    data: Any = pd.read_csv(local_url, delimiter=",", header=None,
+                            compression="gzip", dtype=dtype,
+                            nrows=nrows_train + nrows_test)
+
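+    # Column 0 holds the label; move it to the end so the first n_features
+    # columns are the features.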
+    data = data[list(data.columns[1:]) + list(data.columns[0:1])]
+    n_features = data.shape[1] - 1
+    train_data = np.ascontiguousarray(data.values[:nrows_train, :n_features],
+                                      dtype=dtype)
+    train_label = np.ascontiguousarray(data.values[:nrows_train, n_features],
+                                       dtype=dtype)
+    test_data = np.ascontiguousarray(
+        data.values[nrows_train:nrows_train + nrows_test, :n_features],
+        dtype=dtype)
+    test_label = np.ascontiguousarray(
+        data.values[nrows_train:nrows_train + nrows_test, n_features],
+        dtype=dtype)
+    for data, name in zip((train_data, test_data, train_label, test_label),
+                          ('x_train', 'x_test', 'y_train', 'y_test')):
+        filename = f'{dataset_name}_{name}.npy'
+        np.save(os.path.join(dataset_dir, filename), data)
+    logging.info(f'dataset {dataset_name} is ready.')
+    return True
 
 
 def ijcnn(dataset_dir: Path) -> bool:
@@ -576,7 +699,51 @@ def mortgage_first_q(dataset_dir: Path) -> bool:
 
 
 def msrank(dataset_dir: Path) -> bool:
-    return False
| 702 | + """ |
| 703 | + Dataset from szilard benchmarks: https://github.com/szilard/GBM-perf |
| 704 | + TaskType:binclass |
| 705 | + NumberOfFeatures:700 |
| 706 | + NumberOfInstances:10100000 |
| 707 | + """ |
+    dataset_name = 'msrank'
+    os.makedirs(dataset_dir, exist_ok=True)
+    url = "https://storage.mds.yandex.net/get-devtools-opensource/471749/msrank.tar.gz"
+    local_url = os.path.join(dataset_dir, os.path.basename(url))
+    if not os.path.isfile(local_url):
+        logging.info(f'Started loading {dataset_name}')
+        _retrieve(url, local_url)
+    logging.info(f'{dataset_name} is loaded, unzipping...')
+    with tarfile.open(local_url, "r:gz") as tar:
+        tar.extractall(dataset_dir)
+    logging.info(f'{dataset_name} is unzipped, started parsing...')
+
+    sets = []
+    labels = []
+    n_features = 137
+
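+    # Pre-count lines in each split so the parser can allocate dense arrays
+    # up front.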
+    for set_name in ['train.txt', 'vali.txt', 'test.txt']:
+        file_name = os.path.join(dataset_dir, 'MSRank', set_name)
+
+        n_samples = _count_lines(file_name)
+        with open(file_name, 'r') as file_obj:
+            X, y = _read_libsvm_msrank(file_obj, n_samples, n_features, np.float32)
+
+        sets.append(X)
+        labels.append(y)
+
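+    # Fold the validation split into training; the test split stays separate.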
+    sets[0] = np.vstack((sets[0], sets[1]))
+    labels[0] = np.hstack((labels[0], labels[1]))
+
+    sets = [np.ascontiguousarray(sets[i]) for i in [0, 2]]
+    labels = [np.ascontiguousarray(labels[i]) for i in [0, 2]]
+
+    for data, name in zip((sets[0], sets[1], labels[0], labels[1]),
+                          ('x_train', 'x_test', 'y_train', 'y_test')):
+        filename = f'{dataset_name}_{name}.npy'
+        np.save(os.path.join(dataset_dir, filename), data)
+    logging.info(f'dataset {dataset_name} is ready.')
+    return True
 
 
 def plasticc(dataset_dir: Path) -> bool: