Skip to content

Commit 6e47423

Browse files
author
Igor Rukhovich
committed
Added links and descriptions for new datasets
1 parent 340402e commit 6e47423

File tree

5 files changed

+117
-49
lines changed

5 files changed

+117
-49
lines changed

configs/xgb_cpu_config.json

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -173,11 +173,6 @@
173173
{
174174
"x": "data/plasticc_x_train.csv",
175175
"y": "data/plasticc_y_train.csv"
176-
},
177-
"testing":
178-
{
179-
"x": "data/plasticc_x_test.csv",
180-
"y": "data/plasticc_y_test.csv"
181176
}
182177
}
183178
],

configs/xgb_gpu_config.json

Lines changed: 31 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -77,18 +77,18 @@
7777
}
7878
}
7979
],
80-
"reg-alpha": 0.9,
81-
"max-bin": 256,
82-
"scale-pos-weight": 2,
83-
"learning-rate": 0.1,
84-
"subsample": 1,
85-
"reg-lambda": 1,
86-
"min-child-weight": 0,
87-
"max-depth": 8,
88-
"max-leaves": 256,
89-
"n-estimators": 1000,
90-
"objective": "binary:logistic",
91-
"inplace-predict": ""
80+
"reg-alpha": 0.9,
81+
"max-bin": 256,
82+
"scale-pos-weight": 2,
83+
"learning-rate": 0.1,
84+
"subsample": 1,
85+
"reg-lambda": 1,
86+
"min-child-weight": 0,
87+
"max-depth": 8,
88+
"max-leaves": 256,
89+
"n-estimators": 1000,
90+
"objective": "binary:logistic",
91+
"inplace-predict": ""
9292
},
9393
{
9494
"dataset": [
@@ -107,10 +107,10 @@
107107
}
108108
}
109109
],
110-
"learning-rate": 0.03,
111-
"max-depth": 6,
112-
"n-estimators": 1000,
113-
"objective": "multi:softprob"
110+
"learning-rate":0.03,
111+
"max-depth": 6,
112+
"n-estimators": 1000,
113+
"objective": "multi:softprob"
114114
},
115115
{
116116
"dataset": [
@@ -152,15 +152,15 @@
152152
}
153153
}
154154
],
155-
"max-bin": 256,
156-
"learning-rate": 0.3,
157-
"subsample": 1,
158-
"reg-lambda": 2,
159-
"min-child-weight": 1,
160-
"min-split-loss": 0.1,
161-
"max-depth": 8,
162-
"n-estimators": 200,
163-
"objective": "multi:softprob"
155+
"max-bin": 256,
156+
"learning-rate": 0.3,
157+
"subsample": 1,
158+
"reg-lambda": 2,
159+
"min-child-weight": 1,
160+
"min-split-loss": 0.1,
161+
"max-depth": 8,
162+
"n-estimators": 200,
163+
"objective": "multi:softprob"
164164
},
165165
{
166166
"dataset": [
@@ -171,11 +171,6 @@
171171
{
172172
"x": "data/plasticc_x_train.csv",
173173
"y": "data/plasticc_y_train.csv"
174-
},
175-
"testing":
176-
{
177-
"x": "data/plasticc_x_test.csv",
178-
"y": "data/plasticc_y_test.csv"
179174
}
180175
}
181176
],
@@ -197,12 +192,12 @@
197192
}
198193
}
199194
],
200-
"n-estimators": 10000,
201-
"objective": "binary:logistic",
202-
"max-depth": 1,
203-
"subsample": 0.5,
204-
"eta": 0.1,
205-
"colsample-bytree": 0.05
195+
"n-estimators": 10000,
196+
"objective": "binary:logistic",
197+
"max-depth": 1,
198+
"subsample": 0.5,
199+
"eta": 0.1,
200+
"colsample-bytree": 0.05
206201
}
207202
]
208203
}

datasets/loader_clf.py

Lines changed: 58 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ def a_nine_a(dataset_dir: Path) -> bool:
3232
Author: Ronny Kohavi","Barry Becker
3333
libSVM","AAD group
3434
Source: original - Date unknown
35-
Cite: http://archive.ics.uci.edu/ml/datasets/Adult
35+
Site: http://archive.ics.uci.edu/ml/datasets/Adult
3636
3737
Classification task. n_classes = 2.
3838
a9a X train dataset (39073, 123)
@@ -65,6 +65,14 @@ def a_nine_a(dataset_dir: Path) -> bool:
6565

6666

6767
def airline(dataset_dir: Path) -> bool:
68+
"""
69+
Airline dataset
70+
http://kt.ijs.si/elena_ikonomovska/data.html
71+
72+
TaskType:binclass
73+
NumberOfFeatures:13
74+
NumberOfInstances:115M
75+
"""
6876
dataset_name = 'airline'
6977
os.makedirs(dataset_dir, exist_ok=True)
7078

@@ -131,10 +139,10 @@ def airline_ohe(dataset_dir: Path) -> bool:
131139
local_url_train = os.path.join(dataset_dir, os.path.basename(url_train))
132140
local_url_test = os.path.join(dataset_dir, os.path.basename(url_test))
133141
if not os.path.isfile(local_url_train):
134-
logging.info(f'Started loading {dataset_name}')
142+
logging.info(f'Started loading {dataset_name} train')
135143
retrieve(url_train, local_url_train)
136144
if not os.path.isfile(local_url_test):
137-
logging.info(f'Started loading {dataset_name}')
145+
logging.info(f'Started loading {dataset_name} test')
138146
retrieve(url_test, local_url_test)
139147
logging.info(f'{dataset_name} is loaded, started parsing...')
140148

@@ -170,6 +178,17 @@ def airline_ohe(dataset_dir: Path) -> bool:
170178

171179

172180
def bosch(dataset_dir: Path) -> bool:
181+
"""
182+
Bosch Production Line Performance data set
183+
https://www.kaggle.com/c/bosch-production-line-performance
184+
185+
Requires Kaggle API and API token (https://github.com/Kaggle/kaggle-api)
186+
Contains missing values as NaN.
187+
188+
TaskType:binclass
189+
NumberOfFeatures:968
190+
NumberOfInstances:1.184M
191+
"""
173192
dataset_name = 'bosch'
174193
os.makedirs(dataset_dir, exist_ok=True)
175194

@@ -233,6 +252,14 @@ def codrnanorm(dataset_dir: Path) -> bool:
233252

234253

235254
def epsilon(dataset_dir: Path) -> bool:
255+
"""
256+
Epsilon dataset
257+
https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html
258+
259+
TaskType:binclass
260+
NumberOfFeatures:2000
261+
NumberOfInstances:500K
262+
"""
236263
dataset_name = 'epsilon'
237264
os.makedirs(dataset_dir, exist_ok=True)
238265

@@ -268,6 +295,17 @@ def epsilon(dataset_dir: Path) -> bool:
268295

269296

270297
def fraud(dataset_dir: Path) -> bool:
298+
"""
299+
Credit Card Fraud Detection contest
300+
https://www.kaggle.com/mlg-ulb/creditcardfraud
301+
302+
Requires Kaggle API and API token (https://github.com/Kaggle/kaggle-api)
303+
Contains missing values as NaN.
304+
305+
TaskType:binclass
306+
NumberOfFeatures:30
307+
NumberOfInstances:285K
308+
"""
271309
dataset_name = 'fraud'
272310
os.makedirs(dataset_dir, exist_ok=True)
273311

@@ -371,6 +409,14 @@ def gisette(dataset_dir: Path) -> bool:
371409

372410

373411
def higgs(dataset_dir: Path) -> bool:
412+
"""
413+
Higgs dataset from UCI machine learning repository
414+
https://archive.ics.uci.edu/ml/datasets/HIGGS
415+
416+
TaskType:binclass
417+
NumberOfFeatures:28
418+
NumberOfInstances:11M
419+
"""
374420
dataset_name = 'higgs'
375421
os.makedirs(dataset_dir, exist_ok=True)
376422

@@ -397,11 +443,14 @@ def higgs(dataset_dir: Path) -> bool:
397443

398444
def higgs_one_m(dataset_dir: Path) -> bool:
399445
"""
400-
Higgs dataset from UCI machine learning repository (
401-
https://archive.ics.uci.edu/ml/datasets/HIGGS).
446+
Higgs dataset from UCI machine learning repository
447+
https://archive.ics.uci.edu/ml/datasets/HIGGS
448+
449+
Only the first 1.5M samples are taken
450+
402451
TaskType:binclass
403452
NumberOfFeatures:28
404-
NumberOfInstances:11M
453+
NumberOfInstances:1.5M
405454
"""
406455
dataset_name = 'higgs1m'
407456
os.makedirs(dataset_dir, exist_ok=True)
@@ -511,6 +560,9 @@ def klaverjas(dataset_dir: Path) -> bool:
511560

512561

513562
def santander(dataset_dir: Path) -> bool:
563+
"""
564+
Still doesn't have a loading instruction
565+
"""
514566
return False
515567

516568

datasets/loader_mul.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,15 @@ def covertype(dataset_dir: Path) -> bool:
9898

9999

100100
def covtype(dataset_dir: Path) -> bool:
101+
"""
102+
Cover type dataset from UCI machine learning repository
103+
https://archive.ics.uci.edu/ml/datasets/covertype
104+
105+
y contains 7 unique class labels from 1 to 7 inclusive.
106+
TaskType:multiclass
107+
NumberOfFeatures:54
108+
NumberOfInstances:581012
109+
"""
101110
dataset_name = 'covtype'
102111
os.makedirs(dataset_dir, exist_ok=True)
103112

@@ -192,6 +201,7 @@ def mnist(dataset_dir: Path) -> bool:
192201
def msrank(dataset_dir: Path) -> bool:
193202
"""
194203
Dataset from szilard benchmarks: https://github.com/szilard/GBM-perf
204+
195205
TaskType:binclass
196206
NumberOfFeatures:700
197207
NumberOfInstances:10100000
@@ -200,21 +210,23 @@ def msrank(dataset_dir: Path) -> bool:
200210
os.makedirs(dataset_dir, exist_ok=True)
201211
url = "https://storage.mds.yandex.net/get-devtools-opensource/471749/msrank.tar.gz"
202212
local_url = os.path.join(dataset_dir, os.path.basename(url))
213+
unzipped_url = os.path.join(dataset_dir, "MSRank")
203214
if not os.path.isfile(local_url):
204215
logging.info(f'Started loading {dataset_name}')
205216
retrieve(url, local_url)
217+
if not os.path.isdir(unzipped_url):
206218
logging.info(f'{dataset_name} is loaded, unzipping...')
207219
tar = tarfile.open(local_url, "r:gz")
208220
tar.extractall(dataset_dir)
209221
tar.close()
210-
logging.info(f'{dataset_name} is unzipped, started parsing...')
222+
logging.info(f'{dataset_name} is unzipped, started parsing...')
211223

212224
sets = []
213225
labels = []
214226
n_features = 137
215227

216228
for set_name in ['train.txt', 'vali.txt', 'test.txt']:
217-
file_name = str(dataset_dir) + os.path.join('MSRank', set_name)
229+
file_name = os.path.join(unzipped_url, set_name)
218230

219231
n_samples = count_lines(file_name)
220232
with open(file_name, 'r') as file_obj:
@@ -238,6 +250,9 @@ def msrank(dataset_dir: Path) -> bool:
238250

239251

240252
def plasticc(dataset_dir: Path) -> bool:
253+
"""
254+
Still doesn't have a loading instruction
255+
"""
241256
return False
242257

243258

datasets/loader_reg.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,10 +60,21 @@ def abalone(dataset_dir: Path) -> bool:
6060

6161

6262
def mortgage_first_q(dataset_dir: Path) -> bool:
63+
"""
64+
Still doesn't have a loading instruction
65+
"""
6366
return False
6467

6568

6669
def year_prediction_msd(dataset_dir: Path) -> bool:
70+
"""
71+
YearPredictionMSD dataset from UCI repository
72+
https://archive.ics.uci.edu/ml/datasets/yearpredictionmsd
73+
74+
TaskType:regression
75+
NumberOfFeatures:90
76+
NumberOfInstances:515345
77+
"""
6778
dataset_name = 'year_prediction_msd'
6879
os.makedirs(dataset_dir, exist_ok=True)
6980

0 commit comments

Comments
 (0)