Skip to content

Commit 14cf153

Browse files
Configs for xpu: decision forest regressor, linear regression, logistic regression (#104)
Adds XPU benchmark configs for the decision forest regressor, linear regression and logistic regression; enables device/dtype option lists; PEP 8 cleanup; fixes the shape of the SUSY y array.
1 parent 2c1fec5 commit 14cf153

File tree

6 files changed

+313
-2
lines changed

6 files changed

+313
-2
lines changed

configs/xpu/df_regr.json

Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
{
2+
"common": {
3+
"lib": "sklearn",
4+
"algorithm": "df_regr",
5+
"data-format": "pandas",
6+
"data-order": "F",
7+
"dtype": ["float32", "float64"],
8+
"max-features": 0.33,
9+
"device": ["host", "cpu", "gpu", "none"]
10+
},
11+
"cases": [
12+
{
13+
"dataset": [
14+
{
15+
"source": "npy",
16+
"name": "year_prediction_msd",
17+
"training":
18+
{
19+
"x": "data/year_prediction_msd_x_train.npy",
20+
"y": "data/year_prediction_msd_y_train.npy"
21+
},
22+
"testing":
23+
{
24+
"x": "data/year_prediction_msd_x_test.npy",
25+
"y": "data/year_prediction_msd_y_test.npy"
26+
}
27+
}
28+
],
29+
"num-trees": [10, 100],
30+
"max-depth": 5
31+
},
32+
{
33+
"dataset": [
34+
{
35+
"source": "npy",
36+
"name": "year_prediction_msd",
37+
"training":
38+
{
39+
"x": "data/year_prediction_msd_x_train.npy",
40+
"y": "data/year_prediction_msd_y_train.npy"
41+
},
42+
"testing":
43+
{
44+
"x": "data/year_prediction_msd_x_test.npy",
45+
"y": "data/year_prediction_msd_y_test.npy"
46+
}
47+
}
48+
],
49+
"num-trees": [100, 20],
50+
"max-depth": 8
51+
},
52+
{
53+
"dataset": [
54+
{
55+
"source": "npy",
56+
"name": "year_prediction_msd",
57+
"training":
58+
{
59+
"x": "data/year_prediction_msd_x_train.npy",
60+
"y": "data/year_prediction_msd_y_train.npy"
61+
},
62+
"testing":
63+
{
64+
"x": "data/year_prediction_msd_x_test.npy",
65+
"y": "data/year_prediction_msd_y_test.npy"
66+
}
67+
}
68+
],
69+
"num-trees": 20,
70+
"max-depth": 16
71+
},
72+
{
73+
"dataset": [
74+
{
75+
"source": "npy",
76+
"name": "higgs1m",
77+
"training":
78+
{
79+
"x": "data/higgs1m_x_train.npy",
80+
"y": "data/higgs1m_y_train.npy"
81+
},
82+
"testing":
83+
{
84+
"x": "data/higgs1m_x_test.npy",
85+
"y": "data/higgs1m_y_test.npy"
86+
}
87+
}
88+
],
89+
"num-trees": [15, 20, 100],
90+
"max-depth": 8
91+
},
92+
{
93+
"dataset": [
94+
{
95+
"source": "npy",
96+
"name": "higgs_10500K",
97+
"training":
98+
{
99+
"x": "data/higgs_10500K_x_train.npy",
100+
"y": "data/higgs_10500K_y_train.npy"
101+
},
102+
"testing":
103+
{
104+
"x": "data/higgs_10500K_x_test.npy",
105+
"y": "data/higgs_10500K_y_test.npy"
106+
}
107+
}
108+
],
109+
"num-trees": 100,
110+
"max-depth": 8
111+
},
112+
{
113+
"dataset": [
114+
{
115+
"source": "npy",
116+
"name": "higgs_10500K",
117+
"training":
118+
{
119+
"x": "data/higgs_10500K_x_train.npy",
120+
"y": "data/higgs_10500K_y_train.npy"
121+
},
122+
"testing":
123+
{
124+
"x": "data/higgs_10500K_x_test.npy",
125+
"y": "data/higgs_10500K_y_test.npy"
126+
}
127+
}
128+
],
129+
"num-trees": 20,
130+
"max-depth": 16
131+
}
132+
]
133+
}

configs/xpu/linear.json

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
{
2+
"common": {
3+
"lib": "sklearn",
4+
"algorithm": "linear",
5+
"data-format": "pandas",
6+
"data-order": "F",
7+
"dtype": ["float32", "float64"],
8+
"device": ["host", "cpu", "gpu", "none"]
9+
},
10+
"cases": [
11+
{
12+
"dataset": [
13+
{
14+
"source": "npy",
15+
"name": "year_prediction_msd",
16+
"training":
17+
{
18+
"x": "data/year_prediction_msd_x_train.npy",
19+
"y": "data/year_prediction_msd_y_train.npy"
20+
},
21+
"testing":
22+
{
23+
"x": "data/year_prediction_msd_x_test.npy",
24+
"y": "data/year_prediction_msd_y_test.npy"
25+
}
26+
}
27+
]
28+
},
29+
{
30+
"dataset": [
31+
{
32+
"source": "npy",
33+
"name": "higgs1m",
34+
"training":
35+
{
36+
"x": "data/higgs1m_x_train.npy",
37+
"y": "data/higgs1m_y_train.npy"
38+
},
39+
"testing":
40+
{
41+
"x": "data/higgs1m_x_test.npy",
42+
"y": "data/higgs1m_y_test.npy"
43+
}
44+
}
45+
]
46+
}
47+
]
48+
}

configs/xpu/log_reg.json

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
{
2+
"common": {
3+
"lib": "sklearn",
4+
"algorithm": "log_reg",
5+
"data-format": "pandas",
6+
"data-order": "F",
7+
"dtype": ["float32", "float64"],
8+
"device": ["host", "cpu", "gpu", "none"]
9+
},
10+
"cases": [
11+
{
12+
"dataset": [
13+
{
14+
"source": "npy",
15+
"name": "susy",
16+
"training":
17+
{
18+
"x": "data/susy_x_train.npy",
19+
"y": "data/susy_y_train.npy"
20+
},
21+
"testing":
22+
{
23+
"x": "data/susy_x_test.npy",
24+
"y": "data/susy_y_test.npy"
25+
}
26+
}
27+
],
28+
"maxiter": "20"
29+
},
30+
{
31+
"dataset": [
32+
{
33+
"source": "npy",
34+
"name": "susy",
35+
"training":
36+
{
37+
"x": "data/susy_x_train.npy",
38+
"y": "data/susy_y_train.npy"
39+
},
40+
"testing":
41+
{
42+
"x": "data/susy_x_test.npy",
43+
"y": "data/susy_y_test.npy"
44+
}
45+
}
46+
],
47+
"maxiter": "10"
48+
},
49+
{
50+
"dataset": [
51+
{
52+
"source": "npy",
53+
"name": "mnist",
54+
"training":
55+
{
56+
"x": "data/mnist_x_train.npy",
57+
"y": "data/mnist_y_train.npy"
58+
},
59+
"testing":
60+
{
61+
"x": "data/mnist_x_test.npy",
62+
"y": "data/mnist_y_test.npy"
63+
}
64+
}
65+
],
66+
"no-fit-intercept": "",
67+
"maxiter": "50"
68+
},
69+
{
70+
"dataset": [
71+
{
72+
"source": "npy",
73+
"name": "mnist",
74+
"training":
75+
{
76+
"x": "data/mnist_x_train.npy",
77+
"y": "data/mnist_y_train.npy"
78+
},
79+
"testing":
80+
{
81+
"x": "data/mnist_x_test.npy",
82+
"y": "data/mnist_y_test.npy"
83+
}
84+
}
85+
],
86+
"maxiter": "500"
87+
}
88+
]
89+
}

datasets/load_datasets.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
klaverjas, santander, skin_segmentation, susy)
2828
from .loader_multiclass import (connect, covertype, covtype, letters, mlsr,
2929
mnist, msrank, plasticc, sensit)
30-
from .loader_regression import (abalone, california_housing, fried,
30+
from .loader_regression import (abalone, california_housing, fried, higgs_10500K,
3131
medical_charges_nominal, mortgage_first_q,
3232
twodplanes, year_prediction_msd, yolanda, airline_regression)
3333

@@ -52,6 +52,7 @@
5252
"hepmass_150K": hepmass_150K,
5353
"higgs": higgs,
5454
"higgs1m": higgs_one_m,
55+
"higgs_10500K": higgs_10500K,
5556
"ijcnn": ijcnn,
5657
"klaverjas": klaverjas,
5758
"letters": letters,

datasets/loader_classification.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -715,7 +715,7 @@ def susy(dataset_dir: Path) -> bool:
715715
nrows=nrows_train + nrows_test)
716716

717717
X = data[data.columns[1:]]
718-
y = data[data.columns[0:1]]
718+
y = data[data.columns[0:1]].values.ravel()
719719

720720
x_train, x_test, y_train, y_test = train_test_split(
721721
X, y, train_size=nrows_train, test_size=nrows_test, shuffle=False)

datasets/loader_regression.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -295,3 +295,43 @@ def airline_regression(dataset_dir: Path) -> bool:
295295
np.save(os.path.join(dataset_dir, filename), data)
296296
logging.info(f'dataset {dataset_name} is ready.')
297297
return True
298+
299+
300+
def higgs_10500K(dataset_dir: Path) -> bool:
    """
    Higgs dataset from the UCI machine learning repository:
    https://archive.ics.uci.edu/ml/datasets/HIGGS

    Binary-labelled physics dataset (n_classes = 2), used here as a
    large-scale regression benchmark (referenced by configs/xpu/df_regr.json).
    Downloads the archive if missing, parses the first 11M rows, splits them
    deterministically and saves four .npy files into *dataset_dir*:
      higgs_10500K X train dataset (10500000, 28)
      higgs_10500K y train dataset (10500000, 1)
      higgs_10500K X test dataset  (500000, 28)
      higgs_10500K y test dataset  (500000, 1)

    Returns True on success.
    """
    dataset_name = 'higgs_10500K'
    os.makedirs(dataset_dir, exist_ok=True)

    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz'
    local_url = os.path.join(dataset_dir, os.path.basename(url))
    # Download only once; later runs reuse the cached archive.
    if not os.path.isfile(local_url):
        logging.info(f'Started loading {dataset_name}')
        retrieve(url, local_url)
    logging.info(f'{dataset_name} is loaded, started parsing...')

    nrows_train, nrows_test, dtype = 10500000, 500000, np.float32
    data: Any = pd.read_csv(local_url, delimiter=",", header=None,
                            compression="gzip", dtype=dtype,
                            nrows=nrows_train + nrows_test)

    # Column 0 is the label; the remaining 28 columns are the features.
    X = data[data.columns[1:]]
    y = data[data.columns[0:1]]

    # shuffle=False keeps the train/test split deterministic across runs.
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, train_size=nrows_train, test_size=nrows_test, shuffle=False)

    # NOTE: loop variable renamed from `data` — the original shadowed the
    # parsed DataFrame above.
    for split, name in zip((x_train, x_test, y_train, y_test),
                           ('x_train', 'x_test', 'y_train', 'y_test')):
        filename = f'{dataset_name}_{name}.npy'
        np.save(os.path.join(dataset_dir, filename), split)
    logging.info(f'dataset {dataset_name} is ready.')
    return True

0 commit comments

Comments (0)