Skip to content

Commit 7e780bb

Browse files
author
Igor Rukhovich
committed
Added higgs, msrank and airline fetching
1 parent b6a7eb0 commit 7e780bb

File tree

2 files changed

+194
-12
lines changed

2 files changed

+194
-12
lines changed

configs/xgb_cpu_config.json

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -79,12 +79,17 @@
7979
{
8080
"dataset": [
8181
{
82-
"source": "csv",
82+
"source": "npy",
8383
"name": "airline-ohe",
8484
"training":
8585
{
86-
"x": "data/airline-ohe_x_train.csv",
87-
"y": "data/airline-ohe_y_train.csv"
86+
"x": "data/airline-ohe_x_train.npy",
87+
"y": "data/airline-ohe_y_train.npy"
88+
},
89+
"testing":
90+
{
91+
"x": "data/airline-ohe_x_test.npy",
92+
"y": "data/airline-ohe_y_test.npy"
8893
}
8994
}
9095
],
@@ -103,12 +108,17 @@
103108
{
104109
"dataset": [
105110
{
106-
"source": "csv",
111+
"source": "npy",
107112
"name": "higgs1m",
108113
"training":
109114
{
110-
"x": "data/higgs1m_x_train.csv",
111-
"y": "data/higgs1m_y_train.csv"
115+
"x": "data/higgs1m_x_train.npy",
116+
"y": "data/higgs1m_y_train.npy"
117+
},
118+
"testing":
119+
{
120+
"x": "data/higgs1m_x_test.npy",
121+
"y": "data/higgs1m_y_test.npy"
112122
}
113123
}
114124
],
@@ -129,12 +139,17 @@
129139
{
130140
"dataset": [
131141
{
132-
"source": "csv",
142+
"source": "npy",
133143
"name": "msrank",
134144
"training":
135145
{
136-
"x": "data/mlsr_x_train.csv",
137-
"y": "data/mlsr_y_train.csv"
146+
"x": "data/msrank_x_train.npy",
147+
"y": "data/msrank_y_train.npy"
148+
},
149+
"testing":
150+
{
151+
"x": "data/msrank_x_test.npy",
152+
"y": "data/msrank_y_test.npy"
138153
}
139154
}
140155
],

datasets/loader.py

Lines changed: 170 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616

1717
import logging
1818
import os
19+
import re
20+
import tarfile
1921
from pathlib import Path
2022
from typing import Any
2123
from urllib.request import urlretrieve
@@ -46,6 +48,42 @@ def _retrieve(url: str, filename: str) -> None:
4648
urlretrieve(url, filename, reporthook=_show_progress)
4749

4850

51+
def _read_libsvm_msrank(file_obj, n_samples, n_features, dtype):
52+
X = np.zeros((n_samples, n_features))
53+
y = np.zeros((n_samples,))
54+
55+
counter = 0
56+
57+
regexp = re.compile(r'[A-Za-z0-9]+:(-?\d*\.?\d+)')
58+
59+
for line in file_obj:
60+
line = str(line).replace("\\n'", "")
61+
line = regexp.sub('\g<1>', line)
62+
line = line.rstrip(" \n\r").split(' ')
63+
64+
y[counter] = int(line[0])
65+
X[counter] = [float(i) for i in line[1:]]
66+
67+
counter += 1
68+
if counter == n_samples:
69+
break
70+
71+
return np.array(X, dtype=dtype), np.array(y, dtype=dtype)
72+
73+
74+
def _make_gen(reader):
75+
b = reader(1024 * 1024)
76+
while b:
77+
yield b
78+
b = reader(1024 * 1024)
79+
80+
81+
def _count_lines(filename):
    """Count newline bytes in *filename*, streaming it in 1 MiB binary chunks."""
    with open(filename, 'rb') as stream:
        return sum(buf.count(b'\n') for buf in _make_gen(stream.read))
85+
86+
4987
def a_nine_a(dataset_dir: Path) -> bool:
5088
"""
5189
Author: Ronny Kohavi","Barry Becker
@@ -136,7 +174,56 @@ def airline(dataset_dir: Path) -> bool:
136174

137175

138176
def airline_ohe(dataset_dir: Path) -> bool:
    """
    Dataset from szilard benchmarks: https://github.com/szilard/GBM-perf
    TaskType:binclass
    NumberOfFeatures:700
    NumberOfInstances:10100000

    Downloads the train/test CSVs, one-hot encodes the categorical columns
    jointly (so both splits share one column layout), and saves four .npy
    files (x/y, train/test) into *dataset_dir*. Returns True on success.
    """
    dataset_name = 'airline-ohe'
    os.makedirs(dataset_dir, exist_ok=True)

    url_train = 'https://s3.amazonaws.com/benchm-ml--main/train-10m.csv'
    url_test = 'https://s3.amazonaws.com/benchm-ml--main/test.csv'
    local_url_train = os.path.join(dataset_dir, os.path.basename(url_train))
    local_url_test = os.path.join(dataset_dir, os.path.basename(url_test))
    if not os.path.isfile(local_url_train):
        logging.info(f'Started loading {dataset_name}')
        _retrieve(url_train, local_url_train)
    if not os.path.isfile(local_url_test):
        logging.info(f'Started loading {dataset_name}')
        _retrieve(url_test, local_url_test)
    logging.info(f'{dataset_name} is loaded, started parsing...')

    sets = []
    labels = []

    categorical_names = ["Month", "DayofMonth",
                         "DayOfWeek", "UniqueCarrier", "Origin", "Dest"]

    # BUG FIX: the original iterated [local_url_train, local_url_train], so
    # the "test" split was a duplicate of train and test.csv was never parsed.
    for local_url in [local_url_train, local_url_test]:
        # Sample only 1M of the 10M train rows; read the test file fully.
        nrows = 1000000 if local_url.endswith('train-10m.csv') else None
        df = pd.read_csv(local_url, nrows=nrows)
        # BUG FIX: positional `axis` argument to drop() was removed in
        # pandas 2.0; use the explicit `columns=` keyword.
        X = df.drop(columns='dep_delayed_15min')
        y = df["dep_delayed_15min"]

        # Binarize the label: "Y" (delayed) -> 1, everything else -> 0.
        y_num = np.where(y == "Y", 1, 0)

        sets.append(X)
        labels.append(y_num)

    n_samples_train = sets[0].shape[0]

    # Encode train+test together so both splits get identical dummy columns.
    X_final: Any = pd.concat(sets)
    X_final = pd.get_dummies(X_final, columns=categorical_names)
    sets = [X_final[:n_samples_train], X_final[n_samples_train:]]

    for data, name in zip((sets[0], sets[1], labels[0], labels[1]),
                          ('x_train', 'x_test', 'y_train', 'y_test')):
        filename = f'{dataset_name}_{name}.npy'
        np.save(os.path.join(dataset_dir, filename), data)
    logging.info(f'dataset {dataset_name} is ready.')
    return True
140227

141228

142229
def bosch(dataset_dir: Path) -> bool:
@@ -454,7 +541,43 @@ def higgs(dataset_dir: Path) -> bool:
454541

455542

456543
def higgs_one_m(dataset_dir: Path) -> bool:
    """
    Higgs dataset from UCI machine learning repository (
    https://archive.ics.uci.edu/ml/datasets/HIGGS).
    TaskType:binclass
    NumberOfFeatures:28
    NumberOfInstances:11M

    Reads the first 1M rows as train and the next 500K as test, then saves
    four .npy files (x/y, train/test) into *dataset_dir*.
    """
    dataset_name = 'higgs1m'
    os.makedirs(dataset_dir, exist_ok=True)

    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz'
    local_url = os.path.join(dataset_dir, os.path.basename(url))
    if not os.path.isfile(local_url):
        logging.info(f'Started loading {dataset_name}')
        _retrieve(url, local_url)
    logging.info(f'{dataset_name} is loaded, started parsing...')

    nrows_train, nrows_test, dtype = 1000000, 500000, np.float32
    frame: Any = pd.read_csv(local_url, delimiter=",", header=None,
                             compression="gzip", dtype=dtype,
                             nrows=nrows_train + nrows_test)

    # Move the label (first column) to the end so features come first.
    frame = frame[list(frame.columns[1:]) + list(frame.columns[0:1])]
    n_features = frame.shape[1] - 1
    values = frame.values
    train_rows = slice(0, nrows_train)
    test_rows = slice(nrows_train, nrows_train + nrows_test)

    x_train = np.ascontiguousarray(values[train_rows, :n_features], dtype=dtype)
    y_train = np.ascontiguousarray(values[train_rows, n_features], dtype=dtype)
    x_test = np.ascontiguousarray(values[test_rows, :n_features], dtype=dtype)
    y_test = np.ascontiguousarray(values[test_rows, n_features], dtype=dtype)

    for part, name in zip((x_train, x_test, y_train, y_test),
                          ('x_train', 'x_test', 'y_train', 'y_test')):
        filename = f'{dataset_name}_{name}.npy'
        np.save(os.path.join(dataset_dir, filename), part)
    logging.info(f'dataset {dataset_name} is ready.')
    return True
458581

459582

460583
def ijcnn(dataset_dir: Path) -> bool:
@@ -576,7 +699,51 @@ def mortgage_first_q(dataset_dir: Path) -> bool:
576699

577700

578701
def msrank(dataset_dir: Path) -> bool:
    """
    MSRank learning-to-rank dataset (MSLR mirror hosted by Yandex).
    TaskType:ranking
    NumberOfFeatures:137 (as parsed here: 136 features + query id column)
    NumberOfInstances: train+vali merged as the train split; test kept as-is
    # NOTE(review): the original docstring was copy-pasted from the airline
    # dataset (binclass / 700 features) and did not match this code.

    Downloads and extracts the archive, parses the three LIBSVM files, merges
    train+vali, and saves four .npy files (x/y, train/test). Returns True.
    """
    dataset_name = 'msrank'
    os.makedirs(dataset_dir, exist_ok=True)
    url = "https://storage.mds.yandex.net/get-devtools-opensource/471749/msrank.tar.gz"
    local_url = os.path.join(dataset_dir, os.path.basename(url))
    if not os.path.isfile(local_url):
        logging.info(f'Started loading {dataset_name}')
        _retrieve(url, local_url)
    logging.info(f'{dataset_name} is loaded, unzipping...')
    # Context manager guarantees the archive is closed even if extraction fails.
    with tarfile.open(local_url, "r:gz") as tar:
        tar.extractall(dataset_dir)
    logging.info(f'{dataset_name} is unzipped, started parsing...')

    sets = []
    labels = []
    n_features = 137

    for set_name in ['train.txt', 'vali.txt', 'test.txt']:
        # BUG FIX: the original concatenated str(dataset_dir) directly with
        # 'MSRank/<name>' without a separator (e.g. 'dataMSRank/train.txt');
        # join all path components properly instead.
        file_name = os.path.join(dataset_dir, 'MSRank', set_name)

        n_samples = _count_lines(file_name)
        with open(file_name, 'r') as file_obj:
            X, y = _read_libsvm_msrank(file_obj, n_samples, n_features, np.float32)

        sets.append(X)
        labels.append(y)

    # Fold the validation split into training.
    sets[0] = np.vstack((sets[0], sets[1]))
    labels[0] = np.hstack((labels[0], labels[1]))

    sets = [np.ascontiguousarray(sets[i]) for i in [0, 2]]
    labels = [np.ascontiguousarray(labels[i]) for i in [0, 2]]

    for data, name in zip((sets[0], sets[1], labels[0], labels[1]),
                          ('x_train', 'x_test', 'y_train', 'y_test')):
        filename = f'{dataset_name}_{name}.npy'
        np.save(os.path.join(dataset_dir, filename), data)
    logging.info(f'dataset {dataset_name} is ready.')
    return True
580747

581748

582749
def plasticc(dataset_dir: Path) -> bool:

0 commit comments

Comments
 (0)