Skip to content

Commit aa4705e

Browse files
Configs for xpu: dbscan, kmeans, pca (#105)
* add new cases & fix * susy, disable float32, fixes * knn_svm * fix susy_cluster * fix datasets * add cifar * fix road network * epsilon dataset uses only train * fix epsilon * delete trash * pep8 * cifar binary * disable imb_drama * C-style format * C-style matrix * Latin C * fix warning with higgs * revert * cifar binary * indent * pandas version
1 parent 14cf153 commit aa4705e

File tree

7 files changed

+622
-4
lines changed

7 files changed

+622
-4
lines changed

configs/xpu/dbscan.json

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
{
2+
"common": {
3+
"lib": "sklearn",
4+
"algorithm": "dbscan",
5+
"data-format": "pandas",
6+
"data-order": "F",
7+
"dtype": ["float64"],
8+
"device": ["host", "cpu", "gpu", "none"]
9+
},
10+
"cases": [
11+
{
12+
"dataset": [
13+
{
14+
"source": "npy",
15+
"name": "hepmass_10K_cluster",
16+
"training":
17+
{
18+
"x": "data/hepmass_10K_cluster.npy"
19+
}
20+
}
21+
],
22+
"eps": 5,
23+
"min-samples": 3
24+
},
25+
{
26+
"dataset": [
27+
{
28+
"source": "npy",
29+
"name": "mnist_10K_cluster",
30+
"training":
31+
{
32+
"x": "data/mnist_10K_cluster.npy"
33+
}
34+
}
35+
],
36+
"eps": 1.7e3,
37+
"min-samples": 3
38+
},
39+
{
40+
"dataset": [
41+
{
42+
"source": "npy",
43+
"name": "road_network_20K_cluster",
44+
"training":
45+
{
46+
"x": "data/road_network_20K_cluster.npy"
47+
}
48+
}
49+
],
50+
"eps": 1.0e3,
51+
"min-samples": 220
52+
}
53+
]
54+
}

configs/xpu/kmeans.json

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
{
2+
"common": {
3+
"lib": "sklearn",
4+
"algorithm": "kmeans",
5+
"data-format": "pandas",
6+
"data-order": "C",
7+
"dtype": ["float64"],
8+
"device": ["host", "cpu", "gpu", "none"]
9+
},
10+
"cases": [
11+
{
12+
"dataset": [
13+
{
14+
"source": "npy",
15+
"name": "higgs_one_m_clustering",
16+
"training":
17+
{
18+
"x": "data/higgs_one_m_clustering.npy"
19+
}
20+
}
21+
],
22+
"n-clusters": 10,
23+
"maxiter": 100
24+
},
25+
{
26+
"dataset": [
27+
{
28+
"source": "npy",
29+
"name": "higgs_one_m_clustering",
30+
"training":
31+
{
32+
"x": "data/higgs_one_m_clustering.npy"
33+
}
34+
}
35+
],
36+
"n-clusters": [100, 250],
37+
"maxiter": 10
38+
},
39+
{
40+
"dataset": [
41+
{
42+
"source": "npy",
43+
"name": "epsilon_50K_cluster",
44+
"training":
45+
{
46+
"x": "data/epsilon_50K_cluster.npy"
47+
}
48+
}
49+
],
50+
"n-clusters": [512, 1024, 2048],
51+
"maxiter": 10
52+
},
53+
{
54+
"dataset": [
55+
{
56+
"source": "npy",
57+
"name": "hepmass_1M_cluster",
58+
"training":
59+
{
60+
"x": "data/hepmass_1M_cluster.npy"
61+
}
62+
}
63+
],
64+
"n-clusters": [100, 250],
65+
"maxiter": 10
66+
},
67+
{
68+
"dataset": [
69+
{
70+
"source": "npy",
71+
"name": "hepmass_1M_cluster",
72+
"training":
73+
{
74+
"x": "data/hepmass_1M_cluster.npy"
75+
}
76+
}
77+
],
78+
"n-clusters": 10,
79+
"maxiter": 100
80+
},
81+
{
82+
"dataset": [
83+
{
84+
"source": "npy",
85+
"name": "susy_cluster",
86+
"training":
87+
{
88+
"x": "data/susy_cluster.npy"
89+
}
90+
}
91+
],
92+
"n-clusters": 10,
93+
"maxiter": 100
94+
},
95+
{
96+
"dataset": [
97+
{
98+
"source": "npy",
99+
"name": "susy_cluster",
100+
"training":
101+
{
102+
"x": "data/susy_cluster.npy"
103+
}
104+
}
105+
],
106+
"n-clusters": [100, 250],
107+
"maxiter": 10
108+
},
109+
{
110+
"dataset": [
111+
{
112+
"source": "npy",
113+
"name": "cifar_cluster",
114+
"training":
115+
{
116+
"x": "data/cifar_cluster.npy"
117+
}
118+
}
119+
],
120+
"n-clusters": [512, 1024, 2048],
121+
"maxiter": 10
122+
}
123+
]
124+
}

configs/xpu/pca.json

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
{
2+
"common": {
3+
"lib": "sklearn",
4+
"algorithm": "pca",
5+
"data-format": "pandas",
6+
"data-order": "F",
7+
"dtype": ["float64"],
8+
"device": ["host", "cpu", "none"]
9+
},
10+
"cases": [
11+
{
12+
"dataset": [
13+
{
14+
"source": "npy",
15+
"name": "higgs1m",
16+
"training":
17+
{
18+
"x": "data/higgs1m_x_train.npy",
19+
"y": "data/higgs1m_y_train.npy"
20+
}
21+
}
22+
]
23+
},
24+
{
25+
"device": ["host", "cpu", "gpu", "none"],
26+
"dataset": [
27+
{
28+
"source": "npy",
29+
"name": "epsilon_30K",
30+
"training":
31+
{
32+
"x": "data/epsilon_30K_x_train.npy",
33+
"y": "data/epsilon_30K_y_train.npy"
34+
}
35+
}
36+
]
37+
},
38+
{
39+
"dataset": [
40+
{
41+
"source": "npy",
42+
"name": "cifar_binary",
43+
"training":
44+
{
45+
"x": "data/cifar_binary_x_train.npy",
46+
"y": "data/cifar_binary_y_train.npy"
47+
}
48+
}
49+
]
50+
}
51+
]
52+
}

datasets/load_datasets.py

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,14 +22,18 @@
2222
from typing import Callable, Dict
2323

2424
from .loader_classification import (a_nine_a, airline, airline_ohe, bosch,
25-
census, codrnanorm, creditcard, epsilon, fraud,
26-
gisette, hepmass_150K, higgs, higgs_one_m, ijcnn,
27-
klaverjas, santander, skin_segmentation, susy)
25+
census, codrnanorm, cifar_binary, creditcard, epsilon,
26+
epsilon_30K, fraud, gisette, hepmass_150K, higgs,
27+
higgs_one_m, ijcnn, klaverjas, santander, skin_segmentation,
28+
susy)
2829
from .loader_multiclass import (connect, covertype, covtype, letters, mlsr,
2930
mnist, msrank, plasticc, sensit)
3031
from .loader_regression import (abalone, california_housing, fried, higgs_10500K,
3132
medical_charges_nominal, mortgage_first_q,
3233
twodplanes, year_prediction_msd, yolanda, airline_regression)
34+
from .loader_clustering import (cifar_cluster, epsilon_50K_cluster, higgs_one_m_clustering,
35+
hepmass_1M_cluster, hepmass_10K_cluster, mnist_10K_cluster,
36+
road_network_20K_cluster, susy_cluster)
3337

3438
dataset_loaders: Dict[str, Callable[[Path], bool]] = {
3539
"a9a": a_nine_a,
@@ -40,31 +44,41 @@
4044
"bosch": bosch,
4145
"california_housing": california_housing,
4246
"census": census,
47+
"cifar_binary": cifar_binary,
48+
"cifar_cluster": cifar_cluster,
4349
"codrnanorm": codrnanorm,
4450
"connect": connect,
4551
"covertype": covertype,
4652
"covtype": covtype,
4753
"creditcard": creditcard,
4854
"epsilon": epsilon,
55+
"epsilon_30K": epsilon_30K,
56+
"epsilon_50K_cluster": epsilon_50K_cluster,
4957
"fraud": fraud,
5058
"fried": fried,
5159
"gisette": gisette,
5260
"hepmass_150K": hepmass_150K,
61+
"hepmass_1M_cluster": hepmass_1M_cluster,
62+
"hepmass_10K_cluster": hepmass_10K_cluster,
5363
"higgs": higgs,
5464
"higgs1m": higgs_one_m,
5565
"higgs_10500K": higgs_10500K,
66+
"higgs_one_m_clustering": higgs_one_m_clustering,
5667
"ijcnn": ijcnn,
5768
"klaverjas": klaverjas,
5869
"letters": letters,
5970
"mlsr": mlsr,
6071
"medical_charges_nominal": medical_charges_nominal,
6172
"mnist": mnist,
73+
"mnist_10K_cluster": mnist_10K_cluster,
6274
"mortgage1Q": mortgage_first_q,
6375
"msrank": msrank,
6476
"plasticc": plasticc,
77+
"road_network_20K_cluster": road_network_20K_cluster,
6578
"santander": santander,
6679
"sensit": sensit,
6780
"skin_segmentation": skin_segmentation,
81+
"susy_cluster": susy_cluster,
6882
"susy": susy,
6983
"twodplanes": twodplanes,
7084
"year_prediction_msd": year_prediction_msd,

datasets/loader_classification.py

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,40 @@ def epsilon(dataset_dir: Path) -> bool:
334334
return True
335335

336336

337+
def epsilon_30K(dataset_dir: Path) -> bool:
    """
    Epsilon dataset
    https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html

    Classification task. n_classes = 2.
    epsilon_30K x train dataset (30000, 2000)
    epsilon_30K y train dataset (30000,)

    Downloads the raw archive into *dataset_dir* (skipped when already
    cached), keeps the first 30000 samples and saves them as .npy files.
    Returns True on success.
    """
    dataset_name = 'epsilon_30K'
    os.makedirs(dataset_dir, exist_ok=True)

    url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \
                '/epsilon_normalized.bz2'
    local_url_train = os.path.join(dataset_dir, os.path.basename(url_train))

    num_train, dtype = 30000, np.float32
    # Download only when the archive is not already cached locally.
    if not os.path.isfile(local_url_train):
        logging.info(f'Started loading {dataset_name}, train')
        retrieve(url_train, local_url_train)
    logging.info(f'{dataset_name} is loaded, started parsing...')
    X_train, y_train = load_svmlight_file(local_url_train,
                                          dtype=dtype)
    # Keep only the first num_train samples; densifying the full dataset
    # would be far larger than this benchmark config needs.
    X_train = X_train.toarray()[:num_train]
    y_train = y_train[:num_train]

    for data, name in zip((X_train, y_train),
                          ('x_train', 'y_train')):
        filename = f'{dataset_name}_{name}.npy'
        np.save(os.path.join(dataset_dir, filename), data)
    logging.info(f'dataset {dataset_name} is ready.')
    return True
369+
370+
337371
def fraud(dataset_dir: Path) -> bool:
338372
"""
339373
Credit Card Fraud Detection contest
@@ -688,6 +722,51 @@ def skin_segmentation(dataset_dir: Path) -> bool:
688722
return True
689723

690724

725+
def cifar_binary(dataset_dir: Path) -> bool:
    """
    Cifar dataset from LIBSVM Datasets (
    https://www.cs.toronto.edu/~kriz/cifar.html#cifar)
    TaskType: Classification
    cifar_binary x train dataset (50000, 3072)
    cifar_binary y train dataset (50000,)
    cifar_binary x test dataset (10000, 3072)
    cifar_binary y test dataset (10000,)

    The original 10-class labels are binarized (class 0 vs. the rest).
    Downloads the raw archives into *dataset_dir* (skipped when already
    cached) and saves the parsed splits as .npy files. Returns True on
    success.
    """
    dataset_name = 'cifar_binary'
    os.makedirs(dataset_dir, exist_ok=True)

    url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/cifar10.bz2'
    url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/cifar10.t.bz2'
    local_url_train = os.path.join(dataset_dir, os.path.basename(url_train))
    local_url_test = os.path.join(dataset_dir, os.path.basename(url_test))

    # Download only when the archives are not already cached locally.
    if not os.path.isfile(local_url_train):
        logging.info(f'Started loading {dataset_name}, train')
        retrieve(url_train, local_url_train)
    logging.info(f'{dataset_name} is loaded, started parsing...')
    x_train, y_train = load_svmlight_file(local_url_train,
                                          dtype=np.float32)

    if not os.path.isfile(local_url_test):
        logging.info(f'Started loading {dataset_name}, test')
        retrieve(url_test, local_url_test)
    logging.info(f'{dataset_name} is loaded, started parsing...')
    x_test, y_test = load_svmlight_file(local_url_test,
                                        dtype=np.float32)

    # Densify features and binarize labels: any class > 0 becomes 1.
    x_train = x_train.toarray()
    y_train = (y_train > 0).astype(int)

    x_test = x_test.toarray()
    y_test = (y_test > 0).astype(int)

    for data, name in zip((x_train, x_test, y_train, y_test),
                          ('x_train', 'x_test', 'y_train', 'y_test')):
        filename = f'{dataset_name}_{name}.npy'
        np.save(os.path.join(dataset_dir, filename), data)
    # Final readiness log, consistent with the other loaders in this module.
    logging.info(f'dataset {dataset_name} is ready.')
    return True
768+
769+
691770
def susy(dataset_dir: Path) -> bool:
692771
"""
693772
SUSY dataset from UCI machine learning repository (

0 commit comments

Comments
 (0)